diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix
index ac3d270..68a0e0c 100644
--- a/hosts/web-arm/configuration.nix
+++ b/hosts/web-arm/configuration.nix
@@ -17,6 +17,7 @@
     ./modules/grafana.nix
     ./modules/loki.nix
     ./modules/victoriametrics.nix
+    ./modules/vmalert/default.nix # Added vmalert module
     ./modules/updns.nix
     ./utils/modules/autoupgrade.nix
diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana.nix
index f8ef660..34fcd37 100644
--- a/hosts/web-arm/modules/grafana.nix
+++ b/hosts/web-arm/modules/grafana.nix
@@ -89,32 +89,73 @@ in
     };
     provision = {
       alerting = {
-        contactPoints.settings = {
-          apiVersion = 1;
-
-          contactPoints = [{
-            orgId = 1;
-            name = "cp_dominik";
-            receivers = [{
-              uid = "dominik";
-              type = "pushover";
-              settings = {
-                security.apiToken = "$__file{${config.sops.secrets.pushover-api-token.path}}";
-                security.userKey = "$__file{${config.sops.secrets.pushover-user-key.path}}";
-                apiToken = "\${PUSHOVER_API_TOKEN}";
-                userKey = "\${PUSHOVER_USER_KEY}";
-                device = "iphone";
-                priority = "2";
-                retry = "30";
-                expire = "120";
-                sound = "siren";
-                okSound = "magic";
-                message = ''
-                  {{ template "default.message" . }}
-                '';
-              };
-            }];
-          }];
-        };
+        contactPoints = {
+          settings = {
+            apiVersion = 1; # As per Grafana provisioning API
+            contactPoints = [{
+              orgId = 1;
+              name = "cp_dominik";
+              receivers = [{
+                uid = "dominik_pushover_cp_receiver"; # Made UID more specific
+                type = "pushover";
+                settings = {
+                  apiToken = "\${PUSHOVER_API_TOKEN}";
+                  userKey = "\${PUSHOVER_USER_KEY}";
+                  device = "iphone";
+                  priority = 2;
+                  retry = "30s";
+                  expire = "2m";
+                  sound = "siren";
+                  okSound = "magic";
+                  message = ''
+                    {{ template "default.message" . }}
+                  '';
+                };
+              }];
+            }];
+          };
+        };
+
+        policies = { # Corrected from notificationPolicies to policies
+          settings = {
+            apiVersion = 1; # As per Grafana provisioning API
+
+            # Grafana's unified alerting expects a single notification policy tree per org;
+            # for OrgID 1 (the default) this entry defines the root of that tree. The NixOS
+            # option `services.grafana.provision.alerting.policies.settings.policies` takes a
+            # list of policy trees, which the module renders into Grafana's provisioning YAML,
+            # so a single entry is enough for the default organization.
+            policies = [{ # This outer list corresponds to the `policies` option
+              # orgId = 1; # Usually implicit for the default policy file, but can be specified
+              receiver = "cp_dominik"; # Default receiver for the root route
+
+              # For a simple setup where every alert goes to one receiver, setting the
+              # top-level 'receiver' is enough. More specific routing would be added as
+              # nested routes, e.g.:
+              # group_by = [ "alertname" "job" ];
+              # routes = [{
+              #   object_matchers = [ [ "severity" "=" "critical" ] ];
+              #   receiver = "critical_alerts_receiver"; # Another contact point
+              #   continue = false;
+              # }];
+              # (A sketch of the YAML this block is expected to render to follows the diff.)
+            }];
+            # resetPolicies = false; # Default; set to true to remove existing policies not in this config.
+          };
+        };
       };
     };
   };
diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix
new file mode 100644
index 0000000..6615b58
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/default.nix
@@ -0,0 +1,38 @@
+{ config, pkgs, lib, ... }:
+{
+  imports = [
+    ./rules/cpu_usage.nix
+    ./rules/disk_usage.nix
+    ./rules/host_down.nix
+    ./rules/inode_usage.nix
+    ./rules/ram_usage.nix
+  ];
+
+  # Standard vmalert service configuration
+  services.vmalert = {
+    enable = true;
+    settings = {
+      "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address
+      "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings
+    };
+    # 'rules' is contributed by the rule modules imported above (see the sketch after the diff).
+  };
+
+  # Override the User and Group for the systemd service managed by the official vmalert module.
+  systemd.services.vmalert = {
+    serviceConfig = {
+      User = "victoriametrics";
+      Group = "victoriametrics";
+    };
+  };
+
+  # Ensure the user and group exist on the system.
+  users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
+    isSystemUser = true;
+    group = "victoriametrics"; # Primary group for the user
+    home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components
+  };
+  users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
+    # Empty attrset; just ensures the group exists.
+  };
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
new file mode 100644
index 0000000..71b8dbc
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
@@ -0,0 +1,26 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  # This module contributes its rule group to a list that is
+  # collected and processed by the main vmalert module.
+  services.vmalert.rules.groups = [
+    {
+      name = "CPUUsageAlerts";
+      # interval = "60s"; # Optional: group-level interval
+      rules = [ # This must be a list of rule attribute sets
+        {
+          alert = "HighCPUUsage";
+          expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90";
+          for = "5m";
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+          annotations = {
+            summary = "High CPU usage on {{ $labels.instance }}";
+            description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
new file mode 100644
index 0000000..65570fd
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
@@ -0,0 +1,27 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "DiskUsageAlerts";
+      rules = [
+        {
+          alert = "HighDiskUsage";
+          expr = ''
+            (
+              node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85
+          '';
+          for = "15m";
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+          annotations = {
+            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix
new file mode 100644
index 0000000..0960bc4
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/host_down.nix
@@ -0,0 +1,23 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "HostStatusAlerts";
+      rules = [
+        {
+          alert = "HostDown";
+          expr = "up == 0";
+          for = "2m";
+          labels = {
+            severity = "critical";
+            category = "availability";
+          };
+          annotations = {
+            summary = "Host {{ $labels.instance }} is down";
+            description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
new file mode 100644
index 0000000..2e2245e
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
@@ -0,0 +1,27 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "InodeUsageAlerts";
+      rules = [
+        {
+          alert = "HighInodeUsage";
+          expr = ''
+            (
+              node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80
+          '';
+          for = "30m";
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+          annotations = {
+            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
new file mode 100644
index 0000000..4116b05
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
@@ -0,0 +1,23 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "RAMUsageAlerts";
+      rules = [
+        {
+          alert = "HighRAMUsage";
+          expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90";
+          for = "10m";
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+          annotations = {
+            summary = "High RAM usage on {{ $labels.instance }}";
+            description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
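
For reference, a minimal sketch of the Grafana file-provisioning YAML that the two `settings` blocks in grafana.nix are expected to produce, assuming the NixOS module serializes them more or less verbatim; the file names are illustrative, the env-var placeholders are expanded by Grafana at startup, and the field names follow Grafana's contact-point and notification-policy provisioning schema.

# contactpoints.yaml (illustrative name)
apiVersion: 1
contactPoints:
  - orgId: 1
    name: cp_dominik
    receivers:
      - uid: dominik_pushover_cp_receiver
        type: pushover
        settings:
          apiToken: ${PUSHOVER_API_TOKEN}
          userKey: ${PUSHOVER_USER_KEY}
          device: iphone
          priority: 2
          sound: siren
          okSound: magic

# policies.yaml (illustrative name)
apiVersion: 1
policies:
  - orgId: 1
    receiver: cp_dominik   # root route: every alert defaults to this contact point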
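Likewise, a sketch of the rules file vmalert should end up loading once the module system merges the `services.vmalert.rules.groups` lists contributed by the five rule modules, assuming the NixOS vmalert module renders them as a Prometheus-compatible rules file; only the CPU group is spelled out here.

groups:
  - name: CPUUsageAlerts
    rules:
      - alert: HighCPUUsage
        expr: (1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100 > 90
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%."
  # DiskUsageAlerts, HostStatusAlerts, InodeUsageAlerts and RAMUsageAlerts follow the same shape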