feat: implement centralized alerting with vmalert and Grafana, add alert rules for CPU, disk, inode, RAM usage, and host status

This commit is contained in:
2025-05-30 21:39:58 +02:00
parent fa42667c2a
commit 17a3602d3c
8 changed files with 231 additions and 25 deletions

View File

@@ -17,6 +17,7 @@
./modules/grafana.nix ./modules/grafana.nix
./modules/loki.nix ./modules/loki.nix
./modules/victoriametrics.nix ./modules/victoriametrics.nix
./modules/vmalert/default.nix # Added vmalert module
./modules/updns.nix ./modules/updns.nix
./utils/modules/autoupgrade.nix ./utils/modules/autoupgrade.nix

View File

@@ -89,32 +89,73 @@ in
}; };
provision = { provision = {
alerting = { alerting = {
contactPoints.settings = { contactPoints = {
apiVersion = 1; settings = {
apiVersion = 1; # As per Grafana provisioning API
contactPoints = [{ contactPoints = [{
orgId = 1; orgId = 1;
name = "cp_dominik"; name = "cp_dominik";
receivers = [{ receivers = [{
uid = "dominik"; uid = "dominik_pushover_cp_receiver"; # Made UID even more specific
type = "pushover"; type = "pushover";
settings = { settings = {
security.apiToken = "$__file{${config.sops.secrets.pushover-api-token.path}}"; apiToken = "\${PUSHOVER_API_TOKEN}";
security.userKey = "$__file{${config.sops.secrets.pushover-user-key.path}}"; userKey = "\${PUSHOVER_USER_KEY}";
apiToken = "\${PUSHOVER_API_TOKEN}"; device = "iphone";
userKey = "\${PUSHOVER_USER_KEY}"; priority = 2;
device = "iphone"; retry = "30s";
priority = "2"; expire = "2m";
retry = "30"; sound = "siren";
expire = "120"; okSound = "magic";
sound = "siren"; message = ''
okSound = "magic"; {{ template "default.message" . }}
message = '' '';
{{ template "default.message" . }} };
''; }];
};
}]; }];
}]; };
};
policies = { # Corrected from notificationPolicies to policies
settings = {
apiVersion = 1; # As per Grafana provisioning API
# Grafana's new unified alerting expects a single policy tree per org.
# For OrgID 1 (default), this defines the root of that tree.
# The NixOS module should translate this into the correct YAML structure.
# The `policies` attribute within `settings` usually takes a list of policy trees.
# For a single default organization, we define one policy tree.
# Grafana's own YAML examples show a top-level 'route' for the default policy,
# or a list under 'policies' if you're managing multiple policy sets (less common for basic setup).
# Given the NixOS option `services.grafana.provision.alerting.policies.settings.policies`,
# it's likely expecting a list here.
policies = [{ # This outer list corresponds to the `policies` option
# orgId = 1; # Usually implicit for the default policy file, but can be specified
receiver = "cp_dominik"; # This sets the default receiver for the root route
# The actual routing tree starts here.
# For a simple setup where all alerts go to one receiver,
# just setting the top-level 'receiver' is often enough.
# If more complex routing is needed, 'routes' would be defined here.
# Example:
# route = {
# receiver = "cp_dominik";
# group_by = [ "alertname", "job" ];
# # ... other root route settings
# routes = [
# {
# matcher_re = { severity = "critical" };
# receiver = "critical_alerts_receiver"; # Another contact point
# continue = false;
# },
# # ... other specific routes
# ];
# };
# For the simplest case, just defining the receiver at this level should work
# as the root policy for the default organization.
}];
# resetPolicies = false; # Default, set to true to remove existing policies not in this config.
};
}; };
}; };
}; };

View File

@@ -0,0 +1,38 @@
{ config, pkgs, lib, ... }:
{
  # Each imported module contributes one alert-rule group to
  # `services.vmalert.rules.groups`; the NixOS module system merges the lists.
  imports = [
    ./rules/cpu_usage.nix
    ./rules/disk_usage.nix
    ./rules/host_down.nix
    ./rules/inode_usage.nix
    ./rules/ram_usage.nix
  ];

  # Standard vmalert service configuration: evaluate rules against the local
  # VictoriaMetrics instance and deliver firing alerts to Grafana's
  # Alertmanager-compatible endpoint.
  services.vmalert = {
    enable = true;
    settings = {
      "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address
      "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings
    };
    # NOTE(review): `rules` is populated by the imported rule modules listed in
    # `imports` above (a previous comment referred to a "mkMerge block" that
    # does not exist in this file).
  };

  # Override the User and Group for the systemd service managed by the official
  # vmalert module so vmalert runs under the VictoriaMetrics account.
  systemd.services.vmalert = {
    serviceConfig = {
      User = "victoriametrics";
      Group = "victoriametrics";
    };
  };

  # Ensure the user/group itself exists on the system whenever either
  # VictoriaMetrics or vmalert is enabled.
  users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
    isSystemUser = true;
    group = "victoriametrics"; # Primary group for the user
    home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components
  };
  users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
    # Ensures the group exists.
  };
}

View File

@@ -0,0 +1,26 @@
{ lib, pkgs, config, ... }: # Standard module arguments
let
  # Fires when average non-idle CPU time on a host exceeds 90% for 5 minutes.
  highCpuUsage = {
    alert = "HighCPUUsage";
    expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90";
    for = "5m";
    labels = {
      severity = "warning";
      category = "performance";
    };
    annotations = {
      summary = "High CPU usage on {{ $labels.instance }}";
      description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
    };
  };
in
{
  # Contributes one rule group; the main vmalert module collects and merges
  # the groups from every rule module.
  services.vmalert.rules.groups = [
    {
      name = "CPUUsageAlerts";
      # interval = "60s"; # Optional: group-level interval
      rules = [ highCpuUsage ];
    }
  ];
}

View File

@@ -0,0 +1,27 @@
{ lib, pkgs, config, ... }: # Standard module arguments
let
  # Used-space percentage per real filesystem (tmpfs/rootfs excluded),
  # compared against the 85% threshold.
  diskUsageExpr = ''
    (
      node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
    ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85
  '';
in
{
  # One rule group merged into services.vmalert.rules.groups by the module system.
  services.vmalert.rules.groups = [
    {
      name = "DiskUsageAlerts";
      rules = [
        {
          alert = "HighDiskUsage";
          expr = diskUsageExpr;
          for = "15m";
          labels = {
            severity = "warning";
            category = "capacity";
          };
          annotations = {
            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
            description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
          };
        }
      ];
    }
  ];
}

View File

@@ -0,0 +1,23 @@
{ lib, pkgs, config, ... }: # Standard module arguments
let
  # Group with a single availability alert: a scrape target reporting up == 0
  # for 2 minutes is considered down.
  hostStatusGroup = {
    name = "HostStatusAlerts";
    rules = [
      {
        alert = "HostDown";
        expr = "up == 0";
        for = "2m";
        labels = { severity = "critical"; category = "availability"; };
        annotations = {
          summary = "Host {{ $labels.instance }} is down";
          description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.";
        };
      }
    ];
  };
in
{
  # Merged with the groups contributed by the other rule modules.
  services.vmalert.rules.groups = [ hostStatusGroup ];
}

View File

@@ -0,0 +1,27 @@
{ lib, pkgs, config, ... }: # Standard module arguments
let
  # Used-inode percentage per real filesystem (tmpfs/rootfs excluded),
  # compared against the 80% threshold.
  inodeUsageExpr = ''
    (
      node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
    ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80
  '';
in
{
  # One rule group merged into services.vmalert.rules.groups by the module system.
  services.vmalert.rules.groups = [
    {
      name = "InodeUsageAlerts";
      rules = [
        {
          alert = "HighInodeUsage";
          expr = inodeUsageExpr;
          for = "30m";
          labels = {
            severity = "warning";
            category = "capacity";
          };
          annotations = {
            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
            description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
          };
        }
      ];
    }
  ];
}

View File

@@ -0,0 +1,23 @@
{ lib, pkgs, config, ... }: # Standard module arguments
let
  # Warns when less than 10% of total memory is reported available
  # (MemAvailable / MemTotal) for 10 consecutive minutes.
  highRamUsage = {
    alert = "HighRAMUsage";
    expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90";
    for = "10m";
    labels = { severity = "warning"; category = "performance"; };
    annotations = {
      summary = "High RAM usage on {{ $labels.instance }}";
      description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
    };
  };
in
{
  # Merged with the groups contributed by the other rule modules.
  services.vmalert.rules.groups = [
    { name = "RAMUsageAlerts"; rules = [ highRamUsage ]; }
  ];
}