diff --git a/hosts/fw/modules/fwmetrics.nix b/hosts/fw/modules/fwmetrics.nix index 665775b..e1de095 100644 --- a/hosts/fw/modules/fwmetrics.nix +++ b/hosts/fw/modules/fwmetrics.nix @@ -2,41 +2,21 @@ let configure_prom = builtins.toFile "prometheus.yml" '' scrape_configs: - # System metrics - - job_name: 'node' + - job_name: 'server' stream_parse: true static_configs: - targets: - ${config.networking.hostName}:9100 - - # Systemd service monitoring - - job_name: 'systemd' - metrics_path: /metrics - params: - collect[]: - - 'systemd.service.state' - - 'systemd.service.start_time_seconds' - - 'systemd.unit_file.state' - static_configs: - - targets: - - ${config.networking.hostName}:9100 - relabel_configs: - - source_labels: [__name__] - regex: 'node_systemd_unit_state' - action: keep - - source_labels: [name] - regex: '(ai-mailer|container@git|microvm@git-runner-).*\.service' - action: keep ''; in { sops.secrets.victoria-agent-env = { sopsFile = ../utils/modules/victoriametrics/secrets.yaml; }; services.prometheus.exporters.node = { enable = true; enabledCollectors = [ "systemd" ]; }; systemd.services.export-fw-to-prometheus = { path = with pkgs; [victoriametrics]; diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix new file mode 100644 index 0000000..9416794 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix @@ -0,0 +1,58 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "amzebs-mysql-service-down-alert-uid"; + title = "MySQL Service Down on amzebs-01"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"mysql.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "MySQL service is down on amzebs-01"; + summary = "MySQL Service Down on amzebs-01"; + }; + labels = { + severity = "critical"; + host = "amzebs-01"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix new file mode 100644 index 0000000..a3b2119 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix @@ -0,0 +1,58 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "amzebs-nginx-service-down-alert-uid"; + title = "Nginx Service Down on amzebs-01"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"nginx.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Nginx service is down on amzebs-01"; + summary = "Nginx Service Down on amzebs-01"; + }; + labels = { + severity = "critical"; + host = "amzebs-01"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix new file mode 100644 index 0000000..bfe52fa --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix @@ -0,0 +1,58 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "amzebs-phpfpm-service-down-alert-uid"; + title = "PHP-FPM Service Down on amzebs-01"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=~\"phpfpm-.*\\\\.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "min"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "One or more PHP-FPM services are down on amzebs-01"; + summary = "PHP-FPM Service Down on amzebs-01"; + }; + labels = { + severity = "critical"; + host = "amzebs-01"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/service/default.nix b/hosts/web-arm/modules/grafana/alerting/service/default.nix index edb75fb..cdcd759 100644 --- a/hosts/web-arm/modules/grafana/alerting/service/default.nix +++ b/hosts/web-arm/modules/grafana/alerting/service/default.nix @@ -1,6 +1,26 @@ { lib, pkgs, config, ... 
}: let - servicesDownAlertRules = (import ./services_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + giteaDownAlertRules = (import ./gitea_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + giteaRunnerDownAlertRules = (import ./gitea_runner_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + postfixDownAlertRules = (import ./postfix_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + dovecotDownAlertRules = (import ./dovecot_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + + # amzebs-01 service alerts + amzebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + amzebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + amzebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + + allServiceRules = giteaDownAlertRules + ++ giteaRunnerDownAlertRules + ++ postfixDownAlertRules + ++ dovecotDownAlertRules + ++ openldapDownAlertRules + ++ wireguardDownAlertRules + ++ amzebsMysqlDownAlertRules + ++ amzebsNginxDownAlertRules + ++ amzebsPhpfpmDownAlertRules; in { services.grafana.provision.alerting.rules.settings.groups = [ @@ -8,7 +28,7 @@ in name = "Service Alerts"; folder = "Service Monitoring"; interval = "1m"; - rules = servicesDownAlertRules; + rules = allServiceRules; } ]; -} +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix b/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix new file mode 100644 index 0000000..18645fd --- /dev/null +++ 
b/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "dovecot-service-down-alert-uid"; + title = "Dovecot Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"dovecot.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Dovecot service is down on {{ $labels.instance }}"; + summary = "Dovecot Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix new file mode 100644 index 0000000..f4b0741 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "gitea-service-down-alert-uid"; + title = "Gitea Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"container@git.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Gitea service is down on {{ $labels.instance }}"; + summary = "Gitea Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix new file mode 100644 index 0000000..d4232a4 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "gitea-runner-service-down-alert-uid"; + title = "Gitea Runner Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"microvm@git-runner-1.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Gitea Runner service is down on {{ $labels.instance }}"; + summary = "Gitea Runner Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix b/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix new file mode 100644 index 0000000..35172a8 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "openldap-service-down-alert-uid"; + title = "OpenLDAP Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"openldap.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "OpenLDAP service is down on {{ $labels.instance }}"; + summary = "OpenLDAP Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix b/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix new file mode 100644 index 0000000..cfd5247 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "postfix-service-down-alert-uid"; + title = "Postfix Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"postfix.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Postfix service is down on {{ $labels.instance }}"; + summary = "Postfix Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/services_down.nix b/hosts/web-arm/modules/grafana/alerting/service/services_down.nix deleted file mode 100644 index bc2df22..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/services_down.nix +++ /dev/null @@ -1,90 +0,0 @@ -{ lib, pkgs, config, ... 
}: -let - # Add services here - each entry generates an alert rule - # instance = which node exporter to query (hostname:9100) - monitoredServices = [ - { name = "AI-Mailer"; service = "ai-mailer.service"; instance = "fw:9100"; } - { name = "Postfix"; service = "postfix.service"; instance = "mail:9100"; } - { name = "Dovecot"; service = "dovecot.service"; instance = "mail:9100"; } - { name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; } - { name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; } - { name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; } - { name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "mail:9100"; } - { name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; } - { name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; } - { name = "PHP-FPM"; service = "phpfpm-.*\\.service"; instance = "amzebs-01:9100"; } - ]; - - # Extract host from instance (e.g., "fw:9100" -> "fw") - getHost = instance: lib.head (lib.splitString ":" instance); - - # Generate a unique UID from service name - mkUid = name: "${lib.toLower (lib.replaceStrings [" " "@" "."] ["-" "-" "-"] name)}-down-uid"; - - # Check if service pattern uses regex (contains special chars) - isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc; - - # Build the PromQL expression - mkExpr = svc: - let - nameMatch = if isRegex svc.service - then "name=~\"${svc.service}\"" - else "name=\"${svc.service}\""; - in "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"} OR on() vector(0)"; - - mkServiceAlert = svc: { - uid = mkUid svc.name; - title = "${svc.name} Service Down on ${getHost svc.instance}"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = mkExpr svc; - hide = false; - intervalMs = 1000; - 
legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "${svc.name} service is down on ${getHost svc.instance}"; - summary = "${svc.name} Service Down"; - }; - labels = { - severity = "critical"; - host = getHost svc.instance; - }; - }; -in { - grafanaAlertRuleDefinitions = map mkServiceAlert monitoredServices; -} diff --git a/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix b/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix new file mode 100644 index 0000000..b7be698 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "wireguard-service-down-alert-uid"; + title = "WireGuard Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"wireguard-wg_cloonar.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "WireGuard service is down on {{ $labels.instance }}"; + summary = "WireGuard Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file