diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix deleted file mode 100644 index 9416794..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix +++ /dev/null @@ -1,58 +0,0 @@ -{ lib, pkgs, config, ... }: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "amzebs-mysql-service-down-alert-uid"; - title = "MySQL Service Down on amzebs-01"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"mysql.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "MySQL service is down on amzebs-01"; - summary = "MySQL Service Down on amzebs-01"; - }; - labels = { - severity = "critical"; - host = "amzebs-01"; - }; - } - ]; -} diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix deleted file mode 100644 index a3b2119..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix +++ /dev/null @@ -1,58 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "amzebs-nginx-service-down-alert-uid"; - title = "Nginx Service Down on amzebs-01"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"nginx.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Nginx service is down on amzebs-01"; - summary = "Nginx Service Down on amzebs-01"; - }; - labels = { - severity = "critical"; - host = "amzebs-01"; - }; - } - ]; -} diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix deleted file mode 100644 index bfe52fa..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix +++ /dev/null @@ -1,58 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "amzebs-phpfpm-service-down-alert-uid"; - title = "PHP-FPM Service Down on amzebs-01"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=~\"phpfpm-.*\\\\.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "min"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "One or more PHP-FPM services are down on amzebs-01"; - summary = "PHP-FPM Service Down on amzebs-01"; - }; - labels = { - severity = "critical"; - host = "amzebs-01"; - }; - } - ]; -} diff --git a/hosts/web-arm/modules/grafana/alerting/service/default.nix b/hosts/web-arm/modules/grafana/alerting/service/default.nix index cdcd759..edb75fb 100644 --- a/hosts/web-arm/modules/grafana/alerting/service/default.nix +++ b/hosts/web-arm/modules/grafana/alerting/service/default.nix @@ -1,26 +1,6 @@ { lib, pkgs, config, ... 
}: let - giteaDownAlertRules = (import ./gitea_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - giteaRunnerDownAlertRules = (import ./gitea_runner_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - postfixDownAlertRules = (import ./postfix_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - dovecotDownAlertRules = (import ./dovecot_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - - # amzebs-01 service alerts - ambebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - ambebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - ambebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - - allServiceRules = giteaDownAlertRules - ++ giteaRunnerDownAlertRules - ++ postfixDownAlertRules - ++ dovecotDownAlertRules - ++ openldapDownAlertRules - ++ wireguardDownAlertRules - ++ ambebsMysqlDownAlertRules - ++ ambebsNginxDownAlertRules - ++ ambebsPhpfpmDownAlertRules; + servicesDownAlertRules = (import ./services_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; in { services.grafana.provision.alerting.rules.settings.groups = [ @@ -28,7 +8,7 @@ in name = "Service Alerts"; folder = "Service Monitoring"; interval = "1m"; - rules = allServiceRules; + rules = servicesDownAlertRules; } ]; -} \ No newline at end of file +} diff --git a/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix b/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix deleted file mode 100644 index 18645fd..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix 
+++ /dev/null @@ -1,57 +0,0 @@ -{ lib, pkgs, config, ... }: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "dovecot-service-down-alert-uid"; - title = "Dovecot Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"dovecot.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Dovecot service is down on {{ $labels.instance }}"; - summary = "Dovecot Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix deleted file mode 100644 index f4b0741..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix +++ /dev/null @@ -1,57 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "gitea-service-down-alert-uid"; - title = "Gitea Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"container@git.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Gitea service is down on {{ $labels.instance }}"; - summary = "Gitea Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix deleted file mode 100644 index d4232a4..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix +++ /dev/null @@ -1,57 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "gitea-runner-service-down-alert-uid"; - title = "Gitea Runner Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"microvm@git-runner-1.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Gitea Runner service is down on {{ $labels.instance }}"; - summary = "Gitea Runner Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix b/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix deleted file mode 100644 index 35172a8..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix +++ /dev/null @@ -1,57 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "openldap-service-down-alert-uid"; - title = "OpenLDAP Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"openldap.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "OpenLDAP service is down on {{ $labels.instance }}"; - summary = "OpenLDAP Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix b/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix deleted file mode 100644 index cfd5247..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix +++ /dev/null @@ -1,57 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "postfix-service-down-alert-uid"; - title = "Postfix Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"postfix.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Postfix service is down on {{ $labels.instance }}"; - summary = "Postfix Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/services_down.nix b/hosts/web-arm/modules/grafana/alerting/service/services_down.nix new file mode 100644 index 0000000..bc2df22 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/services_down.nix @@ -0,0 +1,90 @@ +{ lib, pkgs, config, ... 
}:
+let
+  # Add services here - each entry generates an alert rule
+  # instance = which node exporter to query (hostname:9100)
+  # reducer  = optional reduce function for refId B (defaults to "last")
+  monitoredServices = [
+    { name = "AI-Mailer"; service = "ai-mailer.service"; instance = "fw:9100"; }
+    { name = "Postfix"; service = "postfix.service"; instance = "mail:9100"; }
+    { name = "Dovecot"; service = "dovecot.service"; instance = "mail:9100"; }
+    { name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
+    { name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; }
+    { name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; }
+    { name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "mail:9100"; }
+    { name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
+    { name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
+    { name = "PHP-FPM"; service = "phpfpm-.*\\.service"; instance = "amzebs-01:9100"; reducer = "min"; }
+  ];
+
+  # Extract host from instance (e.g., "fw:9100" -> "fw")
+  getHost = instance: lib.head (lib.splitString ":" instance);
+
+  # Generate a unique UID from service name
+  mkUid = name: "${lib.toLower (lib.replaceStrings [" " "@" "."] ["-" "-" "-"] name)}-down-uid";
+
+  # Check if service pattern uses regex (contains special chars)
+  isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;
+
+  # Build the PromQL expression. The unit name sits inside a double-quoted
+  # PromQL string, so backslashes/quotes are escaped first: the PHP-FPM regex
+  # is sent as phpfpm-.*\\.service, as the previous hand-written rule did.
+  mkExpr = svc:
+    let
+      unit = lib.escape [ "\\" "\"" ] svc.service;
+      op = if isRegex svc.service then "=~" else "=";
+    in "node_systemd_unit_state{state=\"active\", name${op}\"${unit}\", instance=\"${svc.instance}\"} OR on() vector(0)";
+
+  # One Grafana rule per service: A = query, B = reduce A, C = fire when B < 1
+  mkServiceAlert = svc: {
+    uid = mkUid svc.name;
+    title = "${svc.name} Service Down on ${getHost svc.instance}";
+    condition = "C";
+    data = [
+      {
+        refId = "A";
+        relativeTimeRange = { from = 300; to = 0; };
+        datasourceUid = "vm-datasource-uid";
+        model = {
+          editorMode = "code";
+          expr = mkExpr svc;
+          hide = false;
+          intervalMs = 1000;
+          legendFormat = "__auto";
+          maxDataPoints = 43200;
+          range = true;
+          refId = "A";
+        };
+      }
+      {
+        refId = "B";
+        datasourceUid = "__expr__";
+        model = {
+          type = "reduce";
+          expression = "A";
+          reducer = svc.reducer or "last"; # per-service override, e.g. "min" for PHP-FPM
+        };
+      }
+      {
+        refId = "C";
+        datasourceUid = "__expr__";
+        model = {
+          type = "math";
+          expression = "$B < 1";
+        };
+      }
+    ];
+    noDataState = "Alerting";
+    execErrState = "Alerting";
+    for = "5m";
+    annotations = {
+      description = "${svc.name} service is down on ${getHost svc.instance}";
+      summary = "${svc.name} Service Down";
+    };
+    labels = {
+      severity = "critical";
+      host = getHost svc.instance;
+    };
+  };
+in {
+  grafanaAlertRuleDefinitions = map mkServiceAlert monitoredServices;
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix b/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix
deleted file mode 100644
index b7be698..0000000
--- a/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix
+++ /dev/null
@@ -1,57 +0,0 @@
-{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - { - uid = "wireguard-service-down-alert-uid"; - title = "WireGuard Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"wireguard-wg_cloonar.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "WireGuard service is down on {{ $labels.instance }}"; - summary = "WireGuard Service Down"; - }; - labels = { - severity = "critical"; - }; - } - ]; -} \ No newline at end of file