feat: restructure Grafana configuration, migrate alert rules to new format and add VictoriaMetrics datasource

2025-05-31 09:27:25 +02:00 · 2025-05-31 09:27:25 +02:00 · 8b5fb0861d
commit 8b5fb0861d
parent 17a3602d3c
14 changed files with 384 additions and 166 deletions
--- a/hosts/web-arm/modules/grafana/alerting/host_down.nix
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@@ -0,0 +1,62 @@
+{ lib, pkgs, config, ... }: # module arguments; none of them are referenced in this file
+{
+  services.grafana.provision.alerting.rules.settings.groups = [ # provisions the "HostDown" alert rule group
+    {
+      name = "HostStatusAlerts";
+      folder = "System Alerts";
+      interval = "1m"; # evaluation interval of this rule group
+
+      rules = [
+        {
+          uid = "host-down-alert-uid";
+          title = "HostDown";
+          condition = "C"; # refId of the data entry whose result decides the alert (expression C below)
+
+          data = [
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid"; # presumably the VictoriaMetrics datasource from this commit; defined outside this file
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; }; # seconds relative to now: query the last minute
+              model = {
+                expr = ''up''; # raw `up` series; a value of 0 is treated as "down" by expression C
+                legendFormat = "{{instance}} ({{job}})";
+                instant = false; # not an instant query; evaluated over relativeTimeRange and reduced by B
+              };
+            }
+            { # Expression B: reduce query A to a single value per series
+              refId = "B";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # input is query A
+                reducer = "last"; # keep the last value of each series in A
+              };
+            }
+            { # Expression C: math condition evaluated on B (referenced by `condition` above)
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$B == 0"; # true when the last value from B is 0
+              };
+            }
+          ];
+
+          for = "2m"; # pending period; matches the "more than 2 minutes" wording in the description
+          noDataState = "Alerting"; # rule state to use when the query returns no data
+          execErrState = "Error"; # rule state to use when evaluation fails
+
+          annotations = {
+            summary = "Host {{ $labels.instance }} is down";
+            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          };
+          labels = {
+            severity = "critical";
+            category = "availability";
+          };
+        }
+      ];
+    }
+  ];
+}