diff --git a/hosts/web-arm/modules/grafana/alerting/service/default.nix b/hosts/web-arm/modules/grafana/alerting/service/default.nix index 57d98d9..07f1848 100644 --- a/hosts/web-arm/modules/grafana/alerting/service/default.nix +++ b/hosts/web-arm/modules/grafana/alerting/service/default.nix @@ -1,8 +1,18 @@ { lib, pkgs, config, ... }: let - serviceDownAlertRules = (import ./service_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + giteaDownAlertRules = (import ./gitea_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + giteaRunnerDownAlertRules = (import ./gitea_runner_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + postfixDownAlertRules = (import ./postfix_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + dovecotDownAlertRules = (import ./dovecot_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - allServiceRules = serviceDownAlertRules; + allServiceRules = giteaDownAlertRules + ++ giteaRunnerDownAlertRules + ++ postfixDownAlertRules + ++ dovecotDownAlertRules + ++ openldapDownAlertRules + ++ wireguardDownAlertRules; in { services.grafana.provision.alerting.rules.settings.groups = [ diff --git a/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix b/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix new file mode 100644 index 0000000..2a03cb9 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/dovecot_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "dovecot-service-down-alert-uid"; + title = "Dovecot Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"dovecot2.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Dovecot service is down on {{ $labels.instance }}"; + summary = "Dovecot Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix new file mode 100644 index 0000000..f4b0741 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/gitea_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "gitea-service-down-alert-uid"; + title = "Gitea Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"container@git.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Gitea service is down on {{ $labels.instance }}"; + summary = "Gitea Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix b/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix new file mode 100644 index 0000000..d4232a4 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/gitea_runner_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "gitea-runner-service-down-alert-uid"; + title = "Gitea Runner Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"microvm@git-runner-1.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Gitea Runner service is down on {{ $labels.instance }}"; + summary = "Gitea Runner Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix b/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix new file mode 100644 index 0000000..35172a8 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/openldap_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "openldap-service-down-alert-uid"; + title = "OpenLDAP Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"openldap.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "OpenLDAP service is down on {{ $labels.instance }}"; + summary = "OpenLDAP Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix b/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix new file mode 100644 index 0000000..cfd5247 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/postfix_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... 
}: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "postfix-service-down-alert-uid"; + title = "Postfix Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"postfix.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "Postfix service is down on {{ $labels.instance }}"; + summary = "Postfix Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/service/service_down.nix b/hosts/web-arm/modules/grafana/alerting/service/service_down.nix deleted file mode 100644 index 8ca3c36..0000000 --- a/hosts/web-arm/modules/grafana/alerting/service/service_down.nix +++ /dev/null @@ -1,318 +0,0 @@ -{ lib, pkgs, config, ... 
}: -{ - grafanaAlertRuleDefinitions = [ - # Systemd service monitoring alerts - { - uid = "gitea-service-down-alert-uid"; - title = "Gitea Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"container@git.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Gitea service is down on {{ $labels.instance }}"; - summary = "Gitea Service Down"; - }; - labels = { - severity = "critical"; - }; - } - { - uid = "gitea-runner-service-down-alert-uid"; - title = "Gitea Runner Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"microvm@git-runner-1.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Gitea Runner service is down on {{ $labels.instance }}"; - summary = "Gitea Runner Service 
Down"; - }; - labels = { - severity = "critical"; - }; - } - { - uid = "postfix-service-down-alert-uid"; - title = "Postfix Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"postfix.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Postfix service is down on {{ $labels.instance }}"; - summary = "Postfix Service Down"; - }; - labels = { - severity = "critical"; - }; - } - { - uid = "dovecot-service-down-alert-uid"; - title = "Dovecot Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"dovecot2.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "Dovecot service is down on {{ $labels.instance }}"; - summary = "Dovecot Service Down"; - }; - labels = { - severity = 
"critical"; - }; - } - { - uid = "openldap-service-down-alert-uid"; - title = "OpenLDAP Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"openldap.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "OpenLDAP service is down on {{ $labels.instance }}"; - summary = "OpenLDAP Service Down"; - }; - labels = { - severity = "critical"; - }; - } - { - uid = "wireguard-service-down-alert-uid"; - title = "WireGuard Service Down"; - condition = "C"; - data = [ - { - refId = "A"; - relativeTimeRange = { - from = 300; - to = 0; - }; - datasourceUid = "vm-datasource-uid"; - model = { - editorMode = "code"; - expr = "node_systemd_unit_state{state=\"active\", name=\"wireguard-wg_cloonar.service\"} OR on() vector(0)"; - hide = false; - intervalMs = 1000; - legendFormat = "__auto"; - maxDataPoints = 43200; - range = true; - refId = "A"; - }; - } - { - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; - reducer = "last"; - }; - } - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B < 1"; - }; - } - ]; - noDataState = "Alerting"; - execErrState = "Alerting"; - for = "5m"; - annotations = { - description = "WireGuard service is down on {{ $labels.instance }}"; - summary = "WireGuard Service Down"; - }; - labels = { - severity = "critical"; - }; - } - 
]; -} diff --git a/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix b/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix new file mode 100644 index 0000000..b7be698 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/service/wireguard_down.nix @@ -0,0 +1,57 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + { + uid = "wireguard-service-down-alert-uid"; + title = "WireGuard Service Down"; + condition = "C"; + data = [ + { + refId = "A"; + relativeTimeRange = { + from = 300; + to = 0; + }; + datasourceUid = "vm-datasource-uid"; + model = { + editorMode = "code"; + expr = "node_systemd_unit_state{state=\"active\", name=\"wireguard-wg_cloonar.service\"} OR on() vector(0)"; + hide = false; + intervalMs = 1000; + legendFormat = "__auto"; + maxDataPoints = 43200; + range = true; + refId = "A"; + }; + } + { + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B < 1"; + }; + } + ]; + noDataState = "Alerting"; + execErrState = "Alerting"; + for = "5m"; + annotations = { + description = "WireGuard service is down on {{ $labels.instance }}"; + summary = "WireGuard Service Down"; + }; + labels = { + severity = "critical"; + }; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/system/cpu_usage.nix similarity index 100% rename from hosts/web-arm/modules/grafana/alerting/cpu_usage.nix rename to hosts/web-arm/modules/grafana/alerting/system/cpu_usage.nix diff --git a/hosts/web-arm/modules/grafana/alerting/system/default.nix b/hosts/web-arm/modules/grafana/alerting/system/default.nix index 26db06d..ff0826d 100644 --- a/hosts/web-arm/modules/grafana/alerting/system/default.nix +++ b/hosts/web-arm/modules/grafana/alerting/system/default.nix @@ -1,11 +1,11 @@ { lib, 
pkgs, config, ... }: let - # Import rule definitions from refactored alert files in the parent 'alerting' directory + # Import rule definitions from the refactored alert files that live alongside this module in 'alerting/system' - cpuAlertRules = (import ../cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - diskAlertRules = (import ../disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - hostDownAlertRules = (import ../host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - inodeAlertRules = (import ../inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; - ramAlertRules = (import ../ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + cpuAlertRules = (import ./cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + diskAlertRules = (import ./disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + hostDownAlertRules = (import ./host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + inodeAlertRules = (import ./inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + ramAlertRules = (import ./ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules ++ inodeAlertRules ++ ramAlertRules; in diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/system/disk_usage.nix similarity index 100% rename from hosts/web-arm/modules/grafana/alerting/disk_usage.nix rename to hosts/web-arm/modules/grafana/alerting/system/disk_usage.nix diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/system/host_down.nix similarity index 100% rename from hosts/web-arm/modules/grafana/alerting/host_down.nix rename to hosts/web-arm/modules/grafana/alerting/system/host_down.nix diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/system/inode_usage.nix similarity index 100% rename 
from hosts/web-arm/modules/grafana/alerting/inode_usage.nix rename to hosts/web-arm/modules/grafana/alerting/system/inode_usage.nix diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/system/ram_usage.nix similarity index 100% rename from hosts/web-arm/modules/grafana/alerting/ram_usage.nix rename to hosts/web-arm/modules/grafana/alerting/system/ram_usage.nix diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix index 54787e7..8bf0771 100644 --- a/hosts/web-arm/modules/grafana/default.nix +++ b/hosts/web-arm/modules/grafana/default.nix @@ -29,11 +29,6 @@ in { imports = [ # Individual alert files removed, now handled by alerting/system/default.nix - # ./alerting/disk_usage.nix - # ./alerting/cpu_usage.nix - # ./alerting/host_down.nix - # ./alerting/inode_usage.nix - # ./alerting/ram_usage.nix ./alerting/system/default.nix # Added: Imports the consolidated system alerts module ./alerting/service/default.nix # Added: Imports the new service alerts module # ... other rule files can be added here ...