feat: refactor Grafana alerting rules into a consolidated system module and update individual alert files

2025-05-31 09:57:03 +02:00 · 2025-05-31 09:57:03 +02:00 · 35fa61ef34
commit 35fa61ef34
parent 8b5fb0861d
7 changed files with 281 additions and 299 deletions
--- a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
@ -1,66 +1,58 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
    {
-      name = "CPUUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-cpu-usage-alert-uid";
+      title = "HighCPUUsage";
+      condition = "D"; # Condition is now D

-      rules = [
+      data = [
+        # Query A: Calculate CPU usage percentage
        {
-          uid = "high-cpu-usage-alert-uid";
-          title = "HighCPUUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate CPU usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                # Calculate average CPU usage over 1m, grouped by instance and job
-                expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
-                legendFormat = "CPU usage on {{instance}} ({{job}})";
-                instant = false; # This is a range query
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if CPU usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "5m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High CPU usage on {{ $labels.instance }}";
-            description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            # Calculate average CPU usage over 1m, grouped by instance and job
+            expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
+            legendFormat = "CPU usage on {{instance}} ({{job}})";
+            instant = false; # This is a range query
          };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if CPU usage from C is > 90%
          };
        }
      ];
+
+      for = "5m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # C is already a 0-100 percentage (query A multiplies by 100), so it is printed directly
+        summary = "High CPU usage on {{ $labels.instance }}";
+        description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ printf "%.1f" $values.C.Value }}%{{ else }}N/A{{ end }}.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
@ -1,85 +1,76 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
    {
-      # orgId = 1; # Defaults to 1 for provisioned rules
-      name = "DiskUsageAlerts";      # Name of the rule group
-      folder = "System Alerts";       # The folder these rules belong to in Grafana UI
-      interval = "1m";              # How often to evaluate rules in this group
+      uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
+      title = "HighDiskUsage";          # Name of the alert rule (was 'alert' in vmalert)

-      rules = [
+      # Condition for the alert to fire. 'D' refers to the refId of the threshold expression.
+      condition = "D"; # Condition is now D
+      # No rule-level relativeTimeRange here; the time range is set per query inside 'data'.
+
+      # Data queries and expressions
+      data = [
+        # Query A: Calculate disk usage percentage
        {
-          uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
-          title = "HighDiskUsage";          # Name of the alert rule (was 'alert' in vmalert)
-
-          # Condition for the alert to fire. 'C' refers to the refId of the threshold expression.
-          condition = "D"; # Condition is now D
-          # Removed rule-level relativeTimeRange
-
-          # Data queries and expressions
-          data = [
-            # Query A: Calculate disk usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
-              queryType = "prometheus"; # Explicitly set, though often inferred
-              relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
-                instant = false; # For range queries, default is false
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 85"; # Check if the last value from each series in C is > 85
-              };
-            }
-          ];
-          
-          for = "15m"; # Duration the condition must be met (same as vmalert)
-          
-          # How to handle states where data is missing or query errors
-          noDataState = "NoData";   # Options: NoData, Alerting, OK
-          execErrState = "Error"; # Options: Error, Alerting, OK
-
-          annotations = {
-            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''
-              Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
-              (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
-              Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.
-            ''; # Using $values.C as it's the input to the math condition D
+          refId = "A";
+          datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
+          queryType = "prometheus"; # Explicitly set, though often inferred
+          relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
+          model = {
+            expr = ''
+              (
+                node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
+            instant = false; # For range queries, default is false
          };
-          labels = {
-            severity = "warning";
-            category = "capacity";
-            # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
-            # and labels from the rule group/folder.
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 85"; # Check if the last value from each series in C is > 85
          };
-          # isPaused = false; # Default is not paused
        }
      ];
+      
+      for = "15m"; # Duration the condition must be met (same as vmalert)
+      
+      # How to handle states where data is missing or query errors
+      noDataState = "NoData";   # Options: NoData, Alerting, OK
+      execErrState = "Error"; # Options: Error, Alerting, OK
+
+      annotations = {
+        summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''
+          Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
+          (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
+          Current value: {{ if $values.C }}{{ printf "%.1f" $values.C.Value }}%{{ else }}N/A{{ end }}.
+        ''; # C is already a 0-100 percentage (query A multiplies by 100), so it is printed directly
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+        # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
+        # and labels from the rule group/folder.
+      };
+      # isPaused = false; # Default is not paused
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/host_down.nix
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@ -1,62 +1,54 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
    {
-      name = "HostStatusAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "host-down-alert-uid";
+      title = "HostDown";
+      condition = "C";

-      rules = [
+      data = [
        {
-          uid = "host-down-alert-uid";
-          title = "HostDown";
-          condition = "C";
-
-          data = [
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                expr = ''up'';
-                legendFormat = "{{instance}} ({{job}})";
-                instant = false; # Changed from true, as relativeTimeRange is used
-              };
-            }
-            { # New Expression B: Reduce Query A
-              refId = "B";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            { # Modified Expression C: Math condition based on B
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$B == 0"; # Check if the last value from B is 0
-              };
-            }
-          ];
-
-          for = "2m";
-          noDataState = "Alerting";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "Host {{ $labels.instance }} is down";
-            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            expr = ''up'';
+            legendFormat = "{{instance}} ({{job}})";
+            instant = false; # Changed from true, as relativeTimeRange is used
          };
-          labels = {
-            severity = "critical";
-            category = "availability";
+        }
+        { # New Expression B: Reduce Query A
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        { # Modified Expression C: Math condition based on B
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B == 0"; # Check if the last value from B is 0
          };
        }
      ];
+
+      for = "2m";
+      noDataState = "Alerting";
+      execErrState = "Error";
+
+      annotations = {
+        summary = "Host {{ $labels.instance }} is down";
+        description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+      };
+      labels = {
+        severity = "critical";
+        category = "availability";
+      };
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
@ -1,71 +1,63 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
    {
-      name = "InodeUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-inode-usage-alert-uid";
+      title = "HighInodeUsage";
+      condition = "D"; # Condition is now D

-      rules = [
+      data = [
+        # Query A: Calculate inode usage percentage
        {
-          uid = "high-inode-usage-alert-uid";
-          title = "HighInodeUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate inode usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 80"; # Alert if inode usage from C is > 80%
-              };
-            }
-          ];
-
-          for = "30m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (
+                node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}";
+            instant = false;
          };
-          labels = {
-            severity = "warning";
-            category = "capacity";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 80"; # Alert if inode usage from C is > 80%
          };
        }
      ];
+
+      for = "30m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # C is already a 0-100 percentage (query A multiplies by 100), so it is printed directly
+        summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ printf "%.1f" $values.C.Value }}%{{ else }}N/A{{ end }}.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+      };
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
@ -1,69 +1,61 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
    {
-      name = "RAMUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-ram-usage-alert-uid";
+      title = "HighRAMUsage";
+      condition = "D"; # Condition is now D

-      rules = [
+      data = [
+        # Query A: Calculate RAM usage percentage
        {
-          uid = "high-ram-usage-alert-uid";
-          title = "HighRAMUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate RAM usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
-                  and node_memory_MemAvailable_bytes
-                  and node_memory_MemTotal_bytes
-                '';
-                legendFormat = "RAM usage on {{instance}} ({{job}})";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if RAM usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "10m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High RAM usage on {{ $labels.instance }}";
-            description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
+              and node_memory_MemAvailable_bytes
+              and node_memory_MemTotal_bytes
+            '';
+            legendFormat = "RAM usage on {{instance}} ({{job}})";
+            instant = false;
          };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if RAM usage from C is > 90%
          };
        }
      ];
+
+      for = "10m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # C is already a 0-100 percentage (query A multiplies by 100), so it is printed directly
+        summary = "High RAM usage on {{ $labels.instance }}";
+        description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ printf "%.1f" $values.C.Value }}%{{ else }}N/A{{ end }}.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/system/default.nix
+++ b/hosts/web-arm/modules/grafana/alerting/system/default.nix
@ -0,0 +1,21 @@
+{ lib, pkgs, config, ... }:
+let
+  # Alert files in the parent 'alerting' directory. Each one is called as a
+  # function and its `grafanaAlertRuleDefinitions` list is collected below.
+  ruleFiles = [
+    ../cpu_usage.nix
+    ../disk_usage.nix
+    ../host_down.nix
+    ../inode_usage.nix
+    ../ram_usage.nix
+  ];
+  moduleArgs = { inherit lib pkgs config; };
+
+  # Flatten the per-file rule lists into one list, keeping the order of ruleFiles.
+  allSystemRules = lib.concatMap (file: (import file moduleArgs).grafanaAlertRuleDefinitions) ruleFiles;
+in
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    { name = "System Alerts"; folder = "System Alerts"; interval = "1m"; rules = allSystemRules; }
+  ];
+}