# Builds one Grafana alert rule per monitored systemd unit and exposes the
# resulting list as `grafanaAlertRuleDefinitions`.
{ lib, pkgs, config, ... }:
let
  # Add services here - each entry generates an alert rule.
  #   name     = label used in the rule title, the annotations and the UID
  #   service  = systemd unit name; treated as a regex when it contains ".*"
  #              or a backslash (see isRegex)
  #   instance = which node exporter to query (hostname:9100)
  monitoredServices = [
    { name = "AI-Mailer"; service = "ai-mailer.service"; instance = "fw:9100"; }
    { name = "Postfix"; service = "postfix.service"; instance = "mail:9100"; }
    { name = "Dovecot"; service = "dovecot.service"; instance = "mail:9100"; }
    { name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
    { name = "Forgejo"; service = "container@forgejo.service"; instance = "fw:9100"; }
    { name = "Forgejo Runner 1"; service = "microvm@fj-runner-1.service"; instance = "fw:9100"; }
    { name = "Forgejo Runner 2"; service = "microvm@fj-runner-2.service"; instance = "fw:9100"; }
    { name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "fw:9100"; }
    { name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
    { name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
    { name = "PHP-FPM"; service = "phpfpm-.*[.]service"; instance = "amzebs-01:9100"; }
  ];

  # Extract host from instance (e.g., "fw:9100" -> "fw").
  getHost = instance: lib.head (lib.splitString ":" instance);

  # Derive the rule UID from the display name: spaces, "@" and "." become "-",
  # the result is lower-cased and suffixed with "-down-uid"
  # (e.g., "Forgejo Runner 1" -> "forgejo-runner-1-down-uid").
  # Two entries sharing the same `name` would therefore share a UID.
  mkUid = name:
    "${lib.toLower (lib.replaceStrings [" " "@" "."] ["-" "-" "-"] name)}-down-uid";

  # A service pattern counts as a regex only when it contains ".*" or a
  # backslash; any other pattern is matched literally.
  isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;

  # Build the PromQL expression for one service entry.
  #   regex pattern : min(...) so the value drops below 1 if ANY matching
  #                   unit is not active
  #   single unit   : "OR on() vector(0)" so a missing series yields 0
  #                   instead of no data
  mkExpr = svc:
    let
      regex = isRegex svc.service;
      matchOp = if regex then "=~" else "=";
      nameMatch = "name${matchOp}\"${svc.service}\"";
      baseQuery = "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"}";
    in
      if regex
      then "min(${baseQuery})"
      else "${baseQuery} OR on() vector(0)";

  # Turn one monitoredServices entry into an alert rule definition.
  mkServiceAlert = svc:
    let
      host = getHost svc.instance;
    in {
      uid = mkUid svc.name;
      # Kept on a single line: a line break inside a "..." string would end up
      # in the rendered title.
      title = "${svc.name} Service Down on ${host}";
      # Query C (below) is the alert condition.
      condition = "C";
      data = [
        # A: the unit-state query, evaluated over the relative range 300 -> 0.
        {
          refId = "A";
          relativeTimeRange = { from = 300; to = 0; };
          datasourceUid = "vm-datasource-uid";
          model = {
            editorMode = "code";
            expr = mkExpr svc;
            hide = false;
            intervalMs = 1000;
            legendFormat = "__auto";
            maxDataPoints = 43200;
            range = true;
            refId = "A";
          };
        }
        # B: reduce A with the "last" reducer.
        {
          refId = "B";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        # C: math expression that is true when B is below 1.
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$B < 1";
          };
        }
      ];
      # Both missing data and query execution errors are mapped to "Alerting".
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";
      annotations = {
        description = "${svc.name} service is down on ${host}";
        summary = "${svc.name} Service Down";
      };
      labels = {
        severity = "critical";
        inherit host;
      };
    };
in {
  grafanaAlertRuleDefinitions = map mkServiceAlert monitoredServices;
}