feat: webarm: normalize service alerts

This commit is contained in:
2026-01-04 19:02:11 +01:00
parent 336ddb13f8
commit e83aa3c893
11 changed files with 93 additions and 539 deletions

View File

@@ -0,0 +1,90 @@
{ lib, pkgs, config, ... }:
let
# Services to watch. Add new ones here - every entry yields one alert rule.
# Each entry is an attrset with:
#   name     - label used in the rule title, annotations and UID
#   service  - systemd unit name (or a regex, see the PHP-FPM entry)
#   instance - node exporter that reports the unit (hostname:9100)
monitoredServices =
  let
    # Curried constructor: fix the exporter first, then name and unit.
    entry = instance: name: service: { inherit name service instance; };
    fw = entry "fw:9100";
    mail = entry "mail:9100";
    amzebs01 = entry "amzebs-01:9100";
  in [
    (fw "AI-Mailer" "ai-mailer.service")
    (mail "Postfix" "postfix.service")
    (mail "Dovecot" "dovecot.service")
    (mail "OpenLDAP" "openldap.service")
    (fw "Gitea" "container@git.service")
    (fw "Gitea Runner" "microvm@git-runner-1.service")
    (mail "WireGuard" "wireguard-wg_cloonar.service")
    (amzebs01 "MySQL" "mysql.service")
    (amzebs01 "Nginx" "nginx.service")
    # Regex pattern: the Nix literal evaluates to phpfpm-.*\.service
    (amzebs01 "PHP-FPM" "phpfpm-.*\\.service")
  ];
# Host part of an exporter address: everything before the first ":"
# (e.g., "fw:9100" -> "fw"). A value without ":" is returned unchanged.
getHost = instance:
  let
    parts = lib.splitString ":" instance;
  in builtins.elemAt parts 0;
# Derive the alert rule UID from the display name: spaces, "@" and "." become
# "-", the result is lower-cased and suffixed with "-down-uid"
# (e.g., "Gitea Runner" -> "gitea-runner-down-uid").
# NOTE(review): the UID depends on the name only, so it is unique only while
# every monitored service has a distinct name - confirm before reusing a name
# on a second instance.
mkUid = name:
  let
    slug = lib.toLower (lib.replaceStrings [ " " "@" "." ] [ "-" "-" "-" ] name);
  in slug + "-down-uid";
# Heuristic: a unit pattern is treated as a regex when it contains ".*" or a
# backslash. Anything else (including the plain "." in unit names) is literal.
isRegex = svc:
  let
    regexMarkers = [ ".*" "\\" ];
  in lib.any (marker: lib.hasInfix marker svc) regexMarkers;
# Build the PromQL expression for one monitored service.
#
# Selects node_systemd_unit_state{state="active"} for the unit on the given
# exporter instance. The unit is matched with "=~" when isRegex says the
# pattern is a regex and with "=" otherwise. "OR on() vector(0)" supplies a
# constant 0 when the selector returns no series at all.
#
# Label values are embedded in double-quoted PromQL string literals, which
# follow Go escaping rules: a raw "\." (as in the phpfpm pattern) is not a
# valid escape sequence. Backslashes and double quotes are therefore escaped
# first, so the regex engine still receives the pattern exactly as written in
# monitoredServices. Values without those characters are emitted unchanged.
mkExpr = svc:
  let
    # Escape a value for use inside a double-quoted PromQL string literal.
    promQuote = lib.replaceStrings [ "\\" "\"" ] [ "\\\\" "\\\"" ];
    matchOp = if isRegex svc.service then "=~" else "=";
    selector = lib.concatStringsSep ", " [
      "state=\"active\""
      "name${matchOp}\"${promQuote svc.service}\""
      "instance=\"${promQuote svc.instance}\""
    ];
  in "node_systemd_unit_state{${selector}} OR on() vector(0)";
# Turn one monitoredServices entry into an alert rule definition.
#
# The rule evaluates three chained nodes:
#   A - the PromQL query produced by mkExpr, against "vm-datasource-uid"
#   B - reduce node taking the "last" value of A
#   C - math node "$B < 1"; this is the rule's condition
# Missing data and execution errors are both mapped to "Alerting".
mkServiceAlert = svc:
  let
    host = getHost svc.instance;

    # Expression nodes (B, C) differ only in their refId and model.
    exprNode = refId: model: {
      inherit refId model;
      datasourceUid = "__expr__";
    };

    metricQuery = {
      refId = "A";
      # presumably seconds relative to evaluation time - TODO confirm
      relativeTimeRange = { from = 300; to = 0; };
      datasourceUid = "vm-datasource-uid";
      model = {
        refId = "A";
        editorMode = "code";
        expr = mkExpr svc;
        range = true;
        hide = false;
        intervalMs = 1000;
        maxDataPoints = 43200;
        legendFormat = "__auto";
      };
    };

    lastValue = exprNode "B" {
      type = "reduce";
      expression = "A";
      reducer = "last";
    };

    belowOne = exprNode "C" {
      type = "math";
      expression = "$B < 1";
    };
  in {
    uid = mkUid svc.name;
    title = "${svc.name} Service Down on ${host}";

    condition = "C";
    data = [ metricQuery lastValue belowOne ];

    noDataState = "Alerting";
    execErrState = "Alerting";
    for = "5m";

    annotations.summary = "${svc.name} Service Down";
    annotations.description = "${svc.name} service is down on ${host}";

    labels = {
      inherit host;
      severity = "critical";
    };
  };
in {
  # One alert rule definition per monitoredServices entry, in list order.
  grafanaAlertRuleDefinitions = builtins.map mkServiceAlert monitoredServices;
}