refactor: consolidate and reorganize Grafana alerting rule definitions
- Deleted individual alert files for host down, inode usage, and RAM usage.
- Merged service down alerts into a new structure with separate files for each service (Gitea, Postfix, Dovecot, OpenLDAP, WireGuard).
- Introduced a new system alert structure consolidating CPU, disk, host down, inode, and RAM usage alerts.
- Updated alert conditions to use 'D' for thresholds and adjusted expressions accordingly.
- Improved annotations and labels for clarity and consistency across alerts.
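The per-service "service down" files mentioned above (Gitea, Postfix, Dovecot, OpenLDAP, WireGuard) are not part of this excerpt. As an illustration only, one of them would presumably follow the same query → reduce → math pattern used by host_down.nix below; the file contents, UID, job label, and thresholds in this sketch are assumptions, not the actual committed code.

{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      # Hypothetical sketch -- the real per-service file is not shown in this diff.
      uid = "gitea-down-alert-uid";   # assumed UID
      title = "GiteaDown";
      condition = "C";

      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          queryType = "prometheus";
          relativeTimeRange = { from = 60; to = 0; };
          model = {
            expr = ''up{job="gitea"}'';   # assumed job label
            instant = false;
          };
        }
        { # Reduce Query A to its last value
          refId = "B";
          datasourceUid = "__expr__";
          model = { type = "reduce"; expression = "A"; reducer = "last"; };
        }
        { # Fire when the service target is down
          refId = "C";
          datasourceUid = "__expr__";
          model = { type = "math"; expression = "$B == 0"; };
        }
      ];

      for = "2m";
      noDataState = "Alerting";
      execErrState = "Error";
      annotations = {
        summary = "Gitea is down on {{ $labels.instance }}";
      };
      labels = { severity = "critical"; category = "availability"; };
    }
  ];
}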
hosts/web-arm/modules/grafana/alerting/system/cpu_usage.nix (new file, 58 lines)
@@ -0,0 +1,58 @@
{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      uid = "high-cpu-usage-alert-uid";
      title = "HighCPUUsage";
      condition = "D"; # Condition is now D

      data = [
        # Query A: Calculate CPU usage percentage
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          queryType = "prometheus";
          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
          model = {
            # Calculate average CPU usage over 1m, grouped by instance and job
            expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
            legendFormat = "CPU usage on {{instance}} ({{job}})";
            instant = false; # This is a range query
          };
        }
        # Expression C: Reduce Query A to its last value, preserving labels
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A"; # Input is Query A
            reducer = "last"; # Get the last value of each series in A
          };
        }
        # Expression D: Apply math condition to the reduced values from C
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$C > 90"; # Alert if CPU usage from C is > 90%
          };
        }
      ];

      for = "5m"; # Duration the condition must be met
      noDataState = "NoData";
      execErrState = "Error";

      annotations = {
        summary = "High CPU usage on {{ $labels.instance }}";
        description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
      };
      labels = {
        severity = "warning";
        category = "performance";
      };
    }
  ];
}
@@ -1,11 +1,11 @@
 { lib, pkgs, config, ... }:
 let
   # Import rule definitions from refactored alert files in the parent 'alerting' directory
-  cpuAlertRules = (import ../cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  diskAlertRules = (import ../disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  hostDownAlertRules = (import ../host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  inodeAlertRules = (import ../inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  ramAlertRules = (import ../ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  cpuAlertRules = (import ./cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  diskAlertRules = (import ./disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  hostDownAlertRules = (import ./host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  inodeAlertRules = (import ./inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  ramAlertRules = (import ./ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;

   allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules ++ inodeAlertRules ++ ramAlertRules;
 in
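The rest of this file is not shown in the hunk. Since the attribute names used in the rule definitions (uid, title, condition, data, for, annotations, labels, noDataState, execErrState) match Grafana's file-provisioning schema for alert rules, allSystemRules is presumably handed to Grafana provisioning roughly as sketched below; the NixOS option path, group name, folder, and interval here are assumptions, not part of this diff.

# Hypothetical continuation after `in` -- not part of the shown hunk.
{
  services.grafana.provision.alerting.rules.settings = {
    apiVersion = 1;
    groups = [
      {
        orgId = 1;
        name = "system";           # assumed rule group name
        folder = "System Alerts";  # assumed folder
        interval = "1m";           # assumed evaluation interval
        rules = allSystemRules;    # list assembled in the `let` block above
      }
    ];
  };
}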
hosts/web-arm/modules/grafana/alerting/system/disk_usage.nix (new file, 76 lines)
@@ -0,0 +1,76 @@
{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
      title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert)

      # Condition for the alert to fire. 'D' refers to the refId of the threshold expression.
      condition = "D"; # Condition is now D
      # Removed rule-level relativeTimeRange

      # Data queries and expressions
      data = [
        # Query A: Calculate disk usage percentage
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
          queryType = "prometheus"; # Explicitly set, though often inferred
          relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
          model = {
            expr = ''
              (
                node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
              ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
              and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
              and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
            '';
            legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
            instant = false; # For range queries, default is false
          };
        }
        # Expression C: Reduce Query A to its last value, preserving labels
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A"; # Input is Query A
            reducer = "last"; # Get the last value of each series in A
          };
        }
        # Expression D: Apply math condition to the reduced values from C
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$C > 85"; # Check if the last value from each series in C is > 85
          };
        }
      ];

      for = "15m"; # Duration the condition must be met (same as vmalert)

      # How to handle states where data is missing or query errors
      noDataState = "NoData"; # Options: NoData, Alerting, OK
      execErrState = "Error"; # Options: Error, Alerting, OK

      annotations = {
        summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
        description = ''
          Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
          (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
          Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.
        ''; # Using $values.C as it's the input to the math condition D
      };
      labels = {
        severity = "warning";
        category = "capacity";
        # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
        # and labels from the rule group/folder.
      };
      # isPaused = false; # Default is not paused
    }
  ];
}
hosts/web-arm/modules/grafana/alerting/system/host_down.nix (new file, 54 lines)
@@ -0,0 +1,54 @@
{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      uid = "host-down-alert-uid";
      title = "HostDown";
      condition = "C";

      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          queryType = "prometheus";
          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
          model = {
            expr = ''up'';
            legendFormat = "{{instance}} ({{job}})";
            instant = false; # Changed from true, as relativeTimeRange is used
          };
        }
        { # New Expression B: Reduce Query A
          refId = "B";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A"; # Input is Query A
            reducer = "last"; # Get the last value of each series in A
          };
        }
        { # Modified Expression C: Math condition based on B
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$B == 0"; # Check if the last value from B is 0
          };
        }
      ];

      for = "2m";
      noDataState = "Alerting";
      execErrState = "Error";

      annotations = {
        summary = "Host {{ $labels.instance }} is down";
        description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
      };
      labels = {
        severity = "critical";
        category = "availability";
      };
    }
  ];
}
hosts/web-arm/modules/grafana/alerting/system/inode_usage.nix (new file, 63 lines)
@@ -0,0 +1,63 @@
{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      uid = "high-inode-usage-alert-uid";
      title = "HighInodeUsage";
      condition = "D"; # Condition is now D

      data = [
        # Query A: Calculate inode usage percentage
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          queryType = "prometheus";
          relativeTimeRange = { from = 60; to = 0; };
          model = {
            expr = ''
              (
                node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
              ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
              and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
              and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
            '';
            legendFormat = "{{mountpoint}} on {{instance}}";
            instant = false;
          };
        }
        # Expression C: Reduce Query A to its last value, preserving labels
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A"; # Input is Query A
            reducer = "last"; # Get the last value of each series in A
          };
        }
        # Expression D: Apply math condition to the reduced values from C
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$C > 80"; # Alert if inode usage from C is > 80%
          };
        }
      ];

      for = "30m"; # Duration the condition must be met
      noDataState = "NoData";
      execErrState = "Error";

      annotations = {
        summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
        description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
      };
      labels = {
        severity = "warning";
        category = "capacity";
      };
    }
  ];
}
hosts/web-arm/modules/grafana/alerting/system/ram_usage.nix (new file, 61 lines)
@@ -0,0 +1,61 @@
{ lib, pkgs, config, ... }:

{
  grafanaAlertRuleDefinitions = [
    {
      uid = "high-ram-usage-alert-uid";
      title = "HighRAMUsage";
      condition = "D"; # Condition is now D

      data = [
        # Query A: Calculate RAM usage percentage
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          queryType = "prometheus";
          relativeTimeRange = { from = 60; to = 0; };
          model = {
            expr = ''
              (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
              and node_memory_MemAvailable_bytes
              and node_memory_MemTotal_bytes
            '';
            legendFormat = "RAM usage on {{instance}} ({{job}})";
            instant = false;
          };
        }
        # Expression C: Reduce Query A to its last value, preserving labels
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A"; # Input is Query A
            reducer = "last"; # Get the last value of each series in A
          };
        }
        # Expression D: Apply math condition to the reduced values from C
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$C > 90"; # Alert if RAM usage from C is > 90%
          };
        }
      ];

      for = "10m"; # Duration the condition must be met
      noDataState = "NoData";
      execErrState = "Error";

      annotations = {
        summary = "High RAM usage on {{ $labels.instance }}";
        description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
      };
      labels = {
        severity = "warning";
        category = "performance";
      };
    }
  ];
}