From 8b5fb0861d090e223736cda7a764052f96fd928e Mon Sep 17 00:00:00 2001
From: Dominik Polakovics
Date: Sat, 31 May 2025 09:27:25 +0200
Subject: [PATCH] feat: restructure Grafana configuration, migrate alert rules
 to new format and add VictoriaMetrics datasource

Move alerting from the standalone vmalert service to alert rules that are
provisioned directly in Grafana.

- hosts/web-arm/modules/grafana.nix becomes grafana/default.nix and imports
  one module per rule group (alerting/*.nix) plus the datasource module.
- Provision a VictoriaMetrics datasource (type "prometheus",
  http://localhost:8428) with the stable uid "vm-datasource-uid"; query A of
  every rule references that uid.
- Every rule is a three-step pipeline: a range query (A), a "reduce"
  expression taking the last value of each series, and a "math" expression
  holding the threshold, which is the rule condition.
- Remove ./modules/vmalert and its rule files.

The usage queries already return percentages in the 0-100 range, so the
annotations print the reduced value with printf "%.2f" (as the removed
vmalert rules did) instead of humanizePercentage, which expects a 0-1 ratio
and appends its own percent sign.
---
 hosts/web-arm/configuration.nix               |  3 +-
 .../modules/grafana/alerting/cpu_usage.nix    | 66 ++++++++++++++
 .../modules/grafana/alerting/disk_usage.nix   | 85 +++++++++++++++++++
 .../modules/grafana/alerting/host_down.nix    | 62 ++++++++++++++
 .../modules/grafana/alerting/inode_usage.nix  | 71 ++++++++++++++++
 .../modules/grafana/alerting/ram_usage.nix    | 69 +++++++++++++++
 .../grafana/datasources/victoriametrics.nix   | 18 ++++
 .../{grafana.nix => grafana/default.nix}      | 12 +++
 hosts/web-arm/modules/vmalert/default.nix     | 38 ---------
 .../modules/vmalert/rules/cpu_usage.nix       | 26 ------
 .../modules/vmalert/rules/disk_usage.nix      | 27 ------
 .../modules/vmalert/rules/host_down.nix       | 23 -----
 .../modules/vmalert/rules/inode_usage.nix     | 27 ------
 .../modules/vmalert/rules/ram_usage.nix       | 23 -----
 14 files changed, 384 insertions(+), 166 deletions(-)
 create mode 100644 hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/disk_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/host_down.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/inode_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/ram_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
 rename hosts/web-arm/modules/{grafana.nix => grafana/default.nix} (94%)
 delete mode 100644 hosts/web-arm/modules/vmalert/default.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/disk_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/host_down.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/inode_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/ram_usage.nix

diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix
index 68a0e0c..54c74d9 100644
--- a/hosts/web-arm/configuration.nix
+++ b/hosts/web-arm/configuration.nix
@@ -14,10 +14,9 @@
     ./modules/nextcloud
     ./modules/rustdesk.nix
     ./modules/postgresql.nix
-    ./modules/grafana.nix
+    ./modules/grafana/default.nix
     ./modules/loki.nix
     ./modules/victoriametrics.nix
-    ./modules/vmalert/default.nix # Added vmalert module
     ./modules/updns.nix
 
     ./utils/modules/autoupgrade.nix
diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
new file mode 100644
index 0000000..515fabb
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
@@ -0,0 +1,66 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "CPUUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-cpu-usage-alert-uid";
+          title = "HighCPUUsage";
+          condition = "D"; # Condition is now D
+
+          data = [
+            # Query A: Calculate CPU usage percentage
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+              model = {
+                # Calculate average CPU usage over 1m, grouped by instance and job
+                expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
+                legendFormat = "CPU usage on {{instance}} ({{job}})";
+                instant = false; # This is a range query
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 90"; # Alert if CPU usage from C is > 90%
+              };
+            }
+          ];
+
+          for = "5m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = {
+            summary = "High CPU usage on {{ $labels.instance }}";
+            description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it as-is
+          };
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
new file mode 100644
index 0000000..b30686b
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
@@ -0,0 +1,85 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      # orgId = 1; # Defaults to 1 for provisioned rules
+      name = "DiskUsageAlerts"; # Name of the rule group
+      folder = "System Alerts"; # The folder these rules belong to in Grafana UI
+      interval = "1m"; # How often to evaluate rules in this group
+
+      rules = [
+        {
+          uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
+          title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert)
+
+          # Condition for the alert to fire. 'D' refers to the refId of the math (threshold) expression.
+          condition = "D"; # Condition is now D
+          # Removed rule-level relativeTimeRange
+
+          # Data queries and expressions
+          data = [
+            # Query A: Calculate disk usage percentage
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
+              queryType = "prometheus"; # Explicitly set, though often inferred
+              relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
+              model = {
+                expr = ''
+                  (
+                    node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+                  and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                '';
+                legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
+                instant = false; # For range queries, default is false
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 85"; # Check if the last value from each series in C is > 85
+              };
+            }
+          ];
+
+          for = "15m"; # Duration the condition must be met (same as vmalert)
+
+          # How to handle states where data is missing or query errors
+          noDataState = "NoData"; # Options: NoData, Alerting, OK
+          execErrState = "Error"; # Options: Error, Alerting, OK
+
+          annotations = {
+            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = ''
+              Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
+              (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
+              Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.
+            ''; # Using $values.C as it's the input to the math condition D
+          };
+          labels = {
+            severity = "warning";
+            category = "capacity";
+            # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
+            # and labels from the rule group/folder.
+          };
+          # isPaused = false; # Default is not paused
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix
new file mode 100644
index 0000000..a2d938f
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@@ -0,0 +1,62 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "HostStatusAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "host-down-alert-uid";
+          title = "HostDown";
+          condition = "C";
+
+          data = [
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+              model = {
+                expr = ''up'';
+                legendFormat = "{{instance}} ({{job}})";
+                instant = false; # Changed from true, as relativeTimeRange is used
+              };
+            }
+            { # New Expression B: Reduce Query A
+              refId = "B";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            { # Modified Expression C: Math condition based on B
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$B == 0"; # Check if the last value from B is 0
+              };
+            }
+          ];
+
+          for = "2m";
+          noDataState = "Alerting";
+          execErrState = "Error";
+
+          annotations = {
+            summary = "Host {{ $labels.instance }} is down";
+            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          };
+          labels = {
+            severity = "critical";
+            category = "availability";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
new file mode 100644
index 0000000..8f67178
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
@@ -0,0 +1,71 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "InodeUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-inode-usage-alert-uid";
+          title = "HighInodeUsage";
+          condition = "D"; # Condition is now D
+
+          data = [
+            # Query A: Calculate inode usage percentage
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; };
+              model = {
+                expr = ''
+                  (
+                    node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+                  and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                '';
+                legendFormat = "{{mountpoint}} on {{instance}}";
+                instant = false;
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 80"; # Alert if inode usage from C is > 80%
+              };
+            }
+          ];
+
+          for = "30m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = {
+            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it as-is
+          };
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
new file mode 100644
index 0000000..03dd931
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
@@ -0,0 +1,69 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "RAMUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-ram-usage-alert-uid";
+          title = "HighRAMUsage";
+          condition = "D"; # Condition is now D
+
+          data = [
+            # Query A: Calculate RAM usage percentage
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; };
+              model = {
+                expr = ''
+                  (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
+                  and node_memory_MemAvailable_bytes
+                  and node_memory_MemTotal_bytes
+                '';
+                legendFormat = "RAM usage on {{instance}} ({{job}})";
+                instant = false;
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 90"; # Alert if RAM usage from C is > 90%
+              };
+            }
+          ];
+
+          for = "10m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = {
+            summary = "High RAM usage on {{ $labels.instance }}";
+            description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it as-is
+          };
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
new file mode 100644
index 0000000..57ea78a
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
@@ -0,0 +1,18 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.datasources.settings.datasources = [
+    {
+      name = "VictoriaMetrics";
+      uid = "vm-datasource-uid"; # Stable UID for referencing in alerts
+      type = "prometheus";
+      url = "http://localhost:8428"; # URL of VictoriaMetrics
+      access = "proxy"; # Grafana proxies requests
+      isDefault = true; # Optional: make this the default datasource
+      jsonData = {
+        # timeInterval = "30s"; # Optional: Scrape interval if different from Grafana's default
+        # httpMethod = "POST"; # Optional: if VictoriaMetrics prefers POST for queries
+      };
+      editable = false; # Recommended for provisioned datasources
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana/default.nix
similarity index 94%
rename from hosts/web-arm/modules/grafana.nix
rename to hosts/web-arm/modules/grafana/default.nix
index 34fcd37..6d1394f 100644
--- a/hosts/web-arm/modules/grafana.nix
+++ b/hosts/web-arm/modules/grafana/default.nix
@@ -27,6 +27,16 @@ let
   };
 in
 {
+  imports = [
+    ./alerting/disk_usage.nix
+    ./alerting/cpu_usage.nix
+    ./alerting/host_down.nix
+    ./alerting/inode_usage.nix
+    ./alerting/ram_usage.nix
+    # ... other rule files can be added here ...
+    ./datasources/victoriametrics.nix
+  ];
+
   systemd.services.grafana.script = lib.mkBefore ''
     export GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET=$(cat /run/secrets/grafana-oauth-secret)
     export PUSHOVER_API_TOKEN=$(cat /run/secrets/pushover-api-token)
@@ -89,6 +99,7 @@ in
     };
     provision = {
       alerting = {
+        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged
         contactPoints = {
           settings = {
             apiVersion = 1; # As per Grafana provisioning API
@@ -158,6 +169,7 @@ in
           };
         };
       };
+      datasources.settings.datasources = lib.mkMerge []; # Allows datasources to be merged
     };
   };
 
diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix
deleted file mode 100644
index 6615b58..0000000
--- a/hosts/web-arm/modules/vmalert/default.nix
+++ /dev/null
@@ -1,38 +0,0 @@
-{ config, pkgs, lib, ... }:
-{
-  imports = [
-    ./rules/cpu_usage.nix
-    ./rules/disk_usage.nix
-    ./rules/host_down.nix
-    ./rules/inode_usage.nix
-    ./rules/ram_usage.nix
-  ];
-
-  # Standard vmalert service configuration
-  services.vmalert = {
-    enable = true;
-    settings = {
-      "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address
-      "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings
-    };
-    # 'rules' is now set by the mkMerge block above.
-  };
-
-  # Override the User and Group for the systemd service managed by the official vmalert module.
-  systemd.services.vmalert = {
-    serviceConfig = {
-      User = "victoriametrics";
-      Group = "victoriametrics";
-    };
-  };
-
-  # Ensure the user/group itself exists on the system.
-  users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
-    isSystemUser = true;
-    group = "victoriametrics"; # Primary group for the user
-    home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components
-  };
-  users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
-    # Ensures the group exists.
-  };
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
deleted file mode 100644
index 71b8dbc..0000000
--- a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
+++ /dev/null
@@ -1,26 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  # This module contributes its rule group to a list that will be
-  # collected and processed by the main vmalert module.
-  services.vmalert.rules.groups = [
-    {
-      name = "CPUUsageAlerts";
-      # interval = "60s"; # Optional: group-level interval
-      rules = [ # This MUST be a list of rule attribute sets
-        {
-          alert = "HighCPUUsage";
-          expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90";
-          for = "5m";
-          labels = {
-            severity = "warning";
-            category = "performance";
-          };
-          annotations = {
-            summary = "High CPU usage on {{ $labels.instance }}";
-            description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
deleted file mode 100644
index 65570fd..0000000
--- a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
+++ /dev/null
@@ -1,27 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "DiskUsageAlerts";
-      rules = [
-        {
-          alert = "HighDiskUsage";
-          expr = ''
-            (
-              node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-            ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85
-          '';
-          for = "15m";
-          labels = {
-            severity = "warning";
-            category = "capacity";
-          };
-          annotations = {
-            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix
deleted file mode 100644
index 0960bc4..0000000
--- a/hosts/web-arm/modules/vmalert/rules/host_down.nix
+++ /dev/null
@@ -1,23 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "HostStatusAlerts";
-      rules = [
-        {
-          alert = "HostDown";
-          expr = "up == 0";
-          for = "2m";
-          labels = {
-            severity = "critical";
-            category = "availability";
-          };
-          annotations = {
-            summary = "Host {{ $labels.instance }} is down";
-            description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
deleted file mode 100644
index 2e2245e..0000000
--- a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
+++ /dev/null
@@ -1,27 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "InodeUsageAlerts";
-      rules = [
-        {
-          alert = "HighInodeUsage";
-          expr = ''
-            (
-              node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-            ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80
-          '';
-          for = "30m";
-          labels = {
-            severity = "warning";
-            category = "capacity";
-          };
-          annotations = {
-            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
deleted file mode 100644
index 4116b05..0000000
--- a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
+++ /dev/null
@@ -1,23 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "RAMUsageAlerts";
-      rules = [
-        {
-          alert = "HighRAMUsage";
-          expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90";
-          for = "10m";
-          labels = {
-            severity = "warning";
-            category = "performance";
-          };
-          annotations = {
-            summary = "High RAM usage on {{ $labels.instance }}";
-            description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}