From 35fa61ef34fe8e55849032cab6144cf5f716b904 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics
Date: Sat, 31 May 2025 09:57:03 +0200
Subject: [PATCH] feat: refactor Grafana alerting rules into a consolidated
 system module and update individual alert files

---
 .../modules/grafana/alerting/cpu_usage.nix    | 100 ++++++-------
 .../modules/grafana/alerting/disk_usage.nix   | 137 ++++++++----------
 .../modules/grafana/alerting/host_down.nix    |  92 ++++++------
 .../modules/grafana/alerting/inode_usage.nix  | 110 +++++++-------
 .../modules/grafana/alerting/ram_usage.nix    | 106 +++++++-------
 .../grafana/alerting/system/default.nix       |  21 +++
 hosts/web-arm/modules/grafana/default.nix     |  14 +-
 7 files changed, 281 insertions(+), 299 deletions(-)
 create mode 100644 hosts/web-arm/modules/grafana/alerting/system/default.nix

diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
index 515fabb..9c09881 100644
--- a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
@@ -1,66 +1,58 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "CPUUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-cpu-usage-alert-uid";
+      title = "HighCPUUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate CPU usage percentage
         {
-          uid = "high-cpu-usage-alert-uid";
-          title = "HighCPUUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate CPU usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                # Calculate average CPU usage over 1m, grouped by instance and job
-                expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
-                legendFormat = "CPU usage on {{instance}} ({{job}})";
-                instant = false; # This is a range query
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if CPU usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "5m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High CPU usage on {{ $labels.instance }}";
-            description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            # Calculate average CPU usage over 1m, grouped by instance and job
+            expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
+            legendFormat = "CPU usage on {{instance}} ({{job}})";
+            instant = false; # This is a range query
           };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if CPU usage from C is > 90%
           };
         }
       ];
+
+      for = "5m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = {
+        summary = "High CPU usage on {{ $labels.instance }}";
+        description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
index b30686b..020947b 100644
--- a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
@@ -1,85 +1,76 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      # orgId = 1; # Defaults to 1 for provisioned rules
-      name = "DiskUsageAlerts"; # Name of the rule group
-      folder = "System Alerts"; # The folder these rules belong to in Grafana UI
-      interval = "1m"; # How often to evaluate rules in this group
+      uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
+      title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert)
 
-      rules = [
+      # Condition for the alert to fire. 'D' refers to the refId of the threshold expression.
+      condition = "D"; # Condition is now D
+      # Removed rule-level relativeTimeRange
+
+      # Data queries and expressions
+      data = [
+        # Query A: Calculate disk usage percentage
         {
-          uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
-          title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert)
-
-          # Condition for the alert to fire. 'C' refers to the refId of the threshold expression.
-          condition = "D"; # Condition is now D
-          # Removed rule-level relativeTimeRange
-
-          # Data queries and expressions
-          data = [
-            # Query A: Calculate disk usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
-              queryType = "prometheus"; # Explicitly set, though often inferred
-              relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
-                instant = false; # For range queries, default is false
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 85"; # Check if the last value from each series in C is > 85
-              };
-            }
-          ];
-
-          for = "15m"; # Duration the condition must be met (same as vmalert)
-
-          # How to handle states where data is missing or query errors
-          noDataState = "NoData"; # Options: NoData, Alerting, OK
-          execErrState = "Error"; # Options: Error, Alerting, OK
-
-          annotations = {
-            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''
-              Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
-              (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
-              Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.
-            ''; # Using $values.C as it's the input to the math condition D
+          refId = "A";
+          datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
+          queryType = "prometheus"; # Explicitly set, though often inferred
+          relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
+          model = {
+            expr = ''
+              (
+                node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
+            instant = false; # For range queries, default is false
           };
-          labels = {
-            severity = "warning";
-            category = "capacity";
-            # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
-            # and labels from the rule group/folder.
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 85"; # Check if the last value from each series in C is > 85
           };
-          # isPaused = false; # Default is not paused
         }
       ];
+
+      for = "15m"; # Duration the condition must be met (same as vmalert)
+
+      # How to handle states where data is missing or query errors
+      noDataState = "NoData"; # Options: NoData, Alerting, OK
+      execErrState = "Error"; # Options: Error, Alerting, OK
+
+      annotations = {
+        summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''
+          Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
+          (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
+          Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.
+        ''; # Using $values.C as it's the input to the math condition D
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+        # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
+        # and labels from the rule group/folder.
+      };
+      # isPaused = false; # Default is not paused
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix
index a2d938f..1910b23 100644
--- a/hosts/web-arm/modules/grafana/alerting/host_down.nix
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@@ -1,62 +1,54 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "HostStatusAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "host-down-alert-uid";
+      title = "HostDown";
+      condition = "C";
 
-      rules = [
+      data = [
         {
-          uid = "host-down-alert-uid";
-          title = "HostDown";
-          condition = "C";
-
-          data = [
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                expr = ''up'';
-                legendFormat = "{{instance}} ({{job}})";
-                instant = false; # Changed from true, as relativeTimeRange is used
-              };
-            }
-            { # New Expression B: Reduce Query A
-              refId = "B";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            { # Modified Expression C: Math condition based on B
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$B == 0"; # Check if the last value from B is 0
-              };
-            }
-          ];
-
-          for = "2m";
-          noDataState = "Alerting";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "Host {{ $labels.instance }} is down";
-            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            expr = ''up'';
+            legendFormat = "{{instance}} ({{job}})";
+            instant = false; # Changed from true, as relativeTimeRange is used
          };
-          labels = {
-            severity = "critical";
-            category = "availability";
+        }
+        { # New Expression B: Reduce Query A
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        { # Modified Expression C: Math condition based on B
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B == 0"; # Check if the last value from B is 0
           };
         }
       ];
+
+      for = "2m";
+      noDataState = "Alerting";
+      execErrState = "Error";
+
+      annotations = {
+        summary = "Host {{ $labels.instance }} is down";
+        description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+      };
+      labels = {
+        severity = "critical";
+        category = "availability";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
index 8f67178..ba73f30 100644
--- a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
@@ -1,71 +1,63 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "InodeUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-inode-usage-alert-uid";
+      title = "HighInodeUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate inode usage percentage
         {
-          uid = "high-inode-usage-alert-uid";
-          title = "HighInodeUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate inode usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 80"; # Alert if inode usage from C is > 80%
-              };
-            }
-          ];
-
-          for = "30m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (
+                node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}";
+            instant = false;
           };
-          labels = {
-            severity = "warning";
-            category = "capacity";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 80"; # Alert if inode usage from C is > 80%
           };
         }
       ];
+
+      for = "30m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = {
+        summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
index 03dd931..14a2ea8 100644
--- a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
@@ -1,69 +1,61 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "RAMUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-ram-usage-alert-uid";
+      title = "HighRAMUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate RAM usage percentage
         {
-          uid = "high-ram-usage-alert-uid";
-          title = "HighRAMUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate RAM usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
-                  and node_memory_MemAvailable_bytes
-                  and node_memory_MemTotal_bytes
-                '';
-                legendFormat = "RAM usage on {{instance}} ({{job}})";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if RAM usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "10m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High RAM usage on {{ $labels.instance }}";
-            description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
+              and node_memory_MemAvailable_bytes
+              and node_memory_MemTotal_bytes
+            '';
+            legendFormat = "RAM usage on {{instance}} ({{job}})";
+            instant = false;
           };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if RAM usage from C is > 90%
          };
        }
      ];
+
+      for = "10m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = {
+        summary = "High RAM usage on {{ $labels.instance }}";
+        description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/system/default.nix b/hosts/web-arm/modules/grafana/alerting/system/default.nix
new file mode 100644
index 0000000..26db06d
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/system/default.nix
@@ -0,0 +1,21 @@
+{ lib, pkgs, config, ... }:
+let
+  # Import rule definitions from refactored alert files in the parent 'alerting' directory
+  cpuAlertRules = (import ../cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  diskAlertRules = (import ../disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  hostDownAlertRules = (import ../host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  inodeAlertRules = (import ../inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  ramAlertRules = (import ../ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+
+  allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules ++ inodeAlertRules ++ ramAlertRules;
+in
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "System Alerts"; # This is the Grafana alert group name
+      folder = "System Alerts"; # This is the Grafana folder name
+      interval = "1m";
+      rules = allSystemRules;
+    }
+  ];
+}
\ No newline at end of file
diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix
index 6d1394f..6f48794 100644
--- a/hosts/web-arm/modules/grafana/default.nix
+++ b/hosts/web-arm/modules/grafana/default.nix
@@ -28,11 +28,13 @@ let
 in
 {
   imports = [
-    ./alerting/disk_usage.nix
-    ./alerting/cpu_usage.nix
-    ./alerting/host_down.nix
-    ./alerting/inode_usage.nix
-    ./alerting/ram_usage.nix
+    # Individual alert files removed, now handled by alerting/system/default.nix
+    # ./alerting/disk_usage.nix
+    # ./alerting/cpu_usage.nix
+    # ./alerting/host_down.nix
+    # ./alerting/inode_usage.nix
+    # ./alerting/ram_usage.nix
+    ./alerting/system/default.nix # Added: Imports the consolidated system alerts module
     # ... other rule files can be added here ...
     ./datasources/victoriametrics.nix
   ];
@@ -99,7 +101,7 @@
     };
     provision = {
       alerting = {
-        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged
+        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged (including the one from system/default.nix)
         contactPoints = {
           settings = {
             apiVersion = 1; # As per Grafana provisioning API
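
Under this layout, adding another system alert only needs a new file that
exports grafanaAlertRuleDefinitions, plus one extra import in
alerting/system/default.nix. A minimal sketch for a hypothetical
swap_usage.nix (the file name, UID, and 80% threshold are illustrative and
not part of this patch); it reuses the same query -> reduce -> math pipeline
the rules above follow:

  # hosts/web-arm/modules/grafana/alerting/swap_usage.nix (hypothetical)
  { lib, pkgs, config, ... }:
  {
    grafanaAlertRuleDefinitions = [
      {
        uid = "high-swap-usage-alert-uid"; # hypothetical stable UID
        title = "HighSwapUsage";
        condition = "C";
        data = [
          # Query A: swap usage percentage, guarded against SwapTotal == 0
          {
            refId = "A";
            datasourceUid = "vm-datasource-uid";
            queryType = "prometheus";
            relativeTimeRange = { from = 60; to = 0; };
            model = {
              expr = ''(1 - node_memory_SwapFree_bytes / (node_memory_SwapTotal_bytes > 0)) * 100'';
              instant = false;
            };
          }
          # Expression B: reduce Query A to its last value per series
          {
            refId = "B";
            datasourceUid = "__expr__";
            model = { type = "reduce"; expression = "A"; reducer = "last"; };
          }
          # Expression C: fire when the reduced value exceeds the threshold
          {
            refId = "C";
            datasourceUid = "__expr__";
            model = { type = "math"; expression = "$B > 80"; };
          }
        ];
        for = "10m";
        noDataState = "NoData";
        execErrState = "Error";
        labels = { severity = "warning"; category = "capacity"; };
      }
    ];
  }

The consolidated module would then pick it up with one extra binding:

  swapAlertRules = (import ../swap_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
  allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules
    ++ inodeAlertRules ++ ramAlertRules ++ swapAlertRules;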