From fa42667c2a73ef289fec23887ca366de35b24c91 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Fri, 30 May 2025 18:32:47 +0200 Subject: [PATCH 1/6] fix: update NixOS channel references to version 25.05 and adjust netdata configuration --- hosts/fw/channel | 2 +- hosts/mail/channel | 2 +- hosts/mail/modules/dovecot.nix | 4 +--- hosts/nb/configuration.nix | 1 + hosts/nb/modules/fingerprint.nix | 1 - hosts/web-arm/channel | 2 +- utils/modules/netdata.nix | 8 ++++++-- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/hosts/fw/channel b/hosts/fw/channel index ced117e..93f5df5 100644 --- a/hosts/fw/channel +++ b/hosts/fw/channel @@ -1 +1 @@ -https://channels.nixos.org/nixos-24.11 +https://channels.nixos.org/nixos-25.05 diff --git a/hosts/mail/channel b/hosts/mail/channel index ced117e..93f5df5 100644 --- a/hosts/mail/channel +++ b/hosts/mail/channel @@ -1 +1 @@ -https://channels.nixos.org/nixos-24.11 +https://channels.nixos.org/nixos-25.05 diff --git a/hosts/mail/modules/dovecot.nix b/hosts/mail/modules/dovecot.nix index a9ef486..cf8e8c1 100644 --- a/hosts/mail/modules/dovecot.nix +++ b/hosts/mail/modules/dovecot.nix @@ -88,6 +88,7 @@ in { environment.systemPackages = with pkgs; [ doveSync + dovecot_pigeonhole ]; services.dovecot2 = { @@ -215,9 +216,6 @@ in # Read multiple mails in parallel, improves performance mail_prefetch_count = 20 ''; - modules = [ - pkgs.dovecot_pigeonhole - ]; protocols = [ "sieve" ]; diff --git a/hosts/nb/configuration.nix b/hosts/nb/configuration.nix index 0e2bb86..c708175 100644 --- a/hosts/nb/configuration.nix +++ b/hosts/nb/configuration.nix @@ -146,6 +146,7 @@ in { "/var/lib/bluetooth" "/var/lib/docker" "/var/lib/flatpak" + "/var/lib/fprint" "/var/lib/nixos" "/var/lib/mysql" "/etc/NetworkManager/system-connections" diff --git a/hosts/nb/modules/fingerprint.nix b/hosts/nb/modules/fingerprint.nix index d26a50d..5ff0be2 100644 --- a/hosts/nb/modules/fingerprint.nix +++ b/hosts/nb/modules/fingerprint.nix @@ -5,7 +5,6 @@ 
security.pam.services.login.fprintAuth = true; security.pam.services.sudo.fprintAuth = true; - security.pam.services.sddm.fprintAuth = true; # If you use swaylock and want fingerprint auth for it: security.pam.services.swaylock.fprintAuth = true; # Add Polkit rule to allow locally active users to manage their own fingerprints diff --git a/hosts/web-arm/channel b/hosts/web-arm/channel index ced117e..93f5df5 100644 --- a/hosts/web-arm/channel +++ b/hosts/web-arm/channel @@ -1 +1 @@ -https://channels.nixos.org/nixos-24.11 +https://channels.nixos.org/nixos-25.05 diff --git a/utils/modules/netdata.nix b/utils/modules/netdata.nix index 1aab534..2160f1d 100644 --- a/utils/modules/netdata.nix +++ b/utils/modules/netdata.nix @@ -1,10 +1,14 @@ -{ config, pkgs, ... }: +{ config, lib, pkgs, ... }: let unstable = import (fetchTarball https://nixos.org/channels/nixos-unstable/nixexprs.tar.xz) { config = { allowUnfree = true; }; }; in { + nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ + "netdata" + ]; + services.netdata.configDir."python.d.conf" = pkgs.writeText "python.d.conf" '' postfix: yes ''; @@ -14,7 +18,7 @@ in python.enable = true; package = pkgs.netdata.override { - withCloud = true; + withCloudUi = true; }; config = { From 17a3602d3ce7154a498ffa06ccd21010259cc5a4 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Fri, 30 May 2025 21:39:58 +0200 Subject: [PATCH 2/6] feat: implement centralized alerting with vmalert and Grafana, add alert rules for CPU, disk, inode, RAM usage, and host status --- hosts/web-arm/configuration.nix | 1 + hosts/web-arm/modules/grafana.nix | 91 ++++++++++++++----- hosts/web-arm/modules/vmalert/default.nix | 38 ++++++++ .../modules/vmalert/rules/cpu_usage.nix | 26 ++++++ .../modules/vmalert/rules/disk_usage.nix | 27 ++++++ .../modules/vmalert/rules/host_down.nix | 23 +++++ .../modules/vmalert/rules/inode_usage.nix | 27 ++++++ .../modules/vmalert/rules/ram_usage.nix | 23 +++++ 8 files changed, 231 insertions(+), 
25 deletions(-) create mode 100644 hosts/web-arm/modules/vmalert/default.nix create mode 100644 hosts/web-arm/modules/vmalert/rules/cpu_usage.nix create mode 100644 hosts/web-arm/modules/vmalert/rules/disk_usage.nix create mode 100644 hosts/web-arm/modules/vmalert/rules/host_down.nix create mode 100644 hosts/web-arm/modules/vmalert/rules/inode_usage.nix create mode 100644 hosts/web-arm/modules/vmalert/rules/ram_usage.nix diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix index ac3d270..68a0e0c 100644 --- a/hosts/web-arm/configuration.nix +++ b/hosts/web-arm/configuration.nix @@ -17,6 +17,7 @@ ./modules/grafana.nix ./modules/loki.nix ./modules/victoriametrics.nix + ./modules/vmalert/default.nix # Added vmalert module ./modules/updns.nix ./utils/modules/autoupgrade.nix diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana.nix index f8ef660..34fcd37 100644 --- a/hosts/web-arm/modules/grafana.nix +++ b/hosts/web-arm/modules/grafana.nix @@ -89,32 +89,73 @@ in }; provision = { alerting = { - contactPoints.settings = { - apiVersion = 1; - - contactPoints = [{ - orgId = 1; - name = "cp_dominik"; - receivers = [{ - uid = "dominik"; - type = "pushover"; - settings = { - security.apiToken = "$__file{${config.sops.secrets.pushover-api-token.path}}"; - security.userKey = "$__file{${config.sops.secrets.pushover-user-key.path}}"; - apiToken = "\${PUSHOVER_API_TOKEN}"; - userKey = "\${PUSHOVER_USER_KEY}"; - device = "iphone"; - priority = "2"; - retry = "30"; - expire = "120"; - sound = "siren"; - okSound = "magic"; - message = '' - {{ template "default.message" . 
}} - ''; - }; + contactPoints = { + settings = { + apiVersion = 1; # As per Grafana provisioning API + contactPoints = [{ + orgId = 1; + name = "cp_dominik"; + receivers = [{ + uid = "dominik_pushover_cp_receiver"; # Made UID even more specific + type = "pushover"; + settings = { + apiToken = "\${PUSHOVER_API_TOKEN}"; + userKey = "\${PUSHOVER_USER_KEY}"; + device = "iphone"; + priority = 2; + retry = "30s"; + expire = "2m"; + sound = "siren"; + okSound = "magic"; + message = '' + {{ template "default.message" . }} + ''; + }; + }]; }]; - }]; + }; + }; + + policies = { # Corrected from notificationPolicies to policies + settings = { + apiVersion = 1; # As per Grafana provisioning API + + # Grafana's new unified alerting expects a single policy tree per org. + # For OrgID 1 (default), this defines the root of that tree. + # The NixOS module should translate this into the correct YAML structure. + # The `policies` attribute within `settings` usually takes a list of policy trees. + # For a single default organization, we define one policy tree. + # Grafana's own YAML examples show a top-level 'route' for the default policy, + # or a list under 'policies' if you're managing multiple policy sets (less common for basic setup). + # Given the NixOS option `services.grafana.provision.alerting.policies.settings.policies`, + # it's likely expecting a list here. + policies = [{ # This outer list corresponds to the `policies` option + # orgId = 1; # Usually implicit for the default policy file, but can be specified + receiver = "cp_dominik"; # This sets the default receiver for the root route + + # The actual routing tree starts here. + # For a simple setup where all alerts go to one receiver, + # just setting the top-level 'receiver' is often enough. + # If more complex routing is needed, 'routes' would be defined here. + # Example: + # route = { + # receiver = "cp_dominik"; + # group_by = [ "alertname", "job" ]; + # # ... 
other root route settings + # routes = [ + # { + # matcher_re = { severity = "critical" }; + # receiver = "critical_alerts_receiver"; # Another contact point + # continue = false; + # }, + # # ... other specific routes + # ]; + # }; + # For the simplest case, just defining the receiver at this level should work + # as the root policy for the default organization. + }]; + # resetPolicies = false; # Default, set to true to remove existing policies not in this config. + }; }; }; }; diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix new file mode 100644 index 0000000..6615b58 --- /dev/null +++ b/hosts/web-arm/modules/vmalert/default.nix @@ -0,0 +1,38 @@ +{ config, pkgs, lib, ... }: +{ + imports = [ + ./rules/cpu_usage.nix + ./rules/disk_usage.nix + ./rules/host_down.nix + ./rules/inode_usage.nix + ./rules/ram_usage.nix + ]; + + # Standard vmalert service configuration + services.vmalert = { + enable = true; + settings = { + "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address + "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings + }; + # 'rules' is now set by the mkMerge block above. + }; + + # Override the User and Group for the systemd service managed by the official vmalert module. + systemd.services.vmalert = { + serviceConfig = { + User = "victoriametrics"; + Group = "victoriametrics"; + }; + }; + + # Ensure the user/group itself exists on the system. + users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) { + isSystemUser = true; + group = "victoriametrics"; # Primary group for the user + home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components + }; + users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) { + # Ensures the group exists. 
+ }; +} diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix new file mode 100644 index 0000000..71b8dbc --- /dev/null +++ b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix @@ -0,0 +1,26 @@ +{ lib, pkgs, config, ... }: # Standard module arguments +{ + # This module contributes its rule group to a list that will be + # collected and processed by the main vmalert module. + services.vmalert.rules.groups = [ + { + name = "CPUUsageAlerts"; + # interval = "60s"; # Optional: group-level interval + rules = [ # This MUST be a list of rule attribute sets + { + alert = "HighCPUUsage"; + expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90"; + for = "5m"; + labels = { + severity = "warning"; + category = "performance"; + }; + annotations = { + summary = "High CPU usage on {{ $labels.instance }}"; + description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix new file mode 100644 index 0000000..65570fd --- /dev/null +++ b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix @@ -0,0 +1,27 @@ +{ lib, pkgs, config, ... 
}: # Standard module arguments +{ + services.vmalert.rules.groups = [ + { + name = "DiskUsageAlerts"; + rules = [ + { + alert = "HighDiskUsage"; + expr = '' + ( + node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85 + ''; + for = "15m"; + labels = { + severity = "warning"; + category = "capacity"; + }; + annotations = { + summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix new file mode 100644 index 0000000..0960bc4 --- /dev/null +++ b/hosts/web-arm/modules/vmalert/rules/host_down.nix @@ -0,0 +1,23 @@ +{ lib, pkgs, config, ... }: # Standard module arguments +{ + services.vmalert.rules.groups = [ + { + name = "HostStatusAlerts"; + rules = [ + { + alert = "HostDown"; + expr = "up == 0"; + for = "2m"; + labels = { + severity = "critical"; + category = "availability"; + }; + annotations = { + summary = "Host {{ $labels.instance }} is down"; + description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes."; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix new file mode 100644 index 0000000..2e2245e --- /dev/null +++ b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix @@ -0,0 +1,27 @@ +{ lib, pkgs, config, ... 
}: # Standard module arguments +{ + services.vmalert.rules.groups = [ + { + name = "InodeUsageAlerts"; + rules = [ + { + alert = "HighInodeUsage"; + expr = '' + ( + node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80 + ''; + for = "30m"; + labels = { + severity = "warning"; + category = "capacity"; + }; + annotations = { + summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix new file mode 100644 index 0000000..4116b05 --- /dev/null +++ b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix @@ -0,0 +1,23 @@ +{ lib, pkgs, config, ... }: # Standard module arguments +{ + services.vmalert.rules.groups = [ + { + name = "RAMUsageAlerts"; + rules = [ + { + alert = "HighRAMUsage"; + expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90"; + for = "10m"; + labels = { + severity = "warning"; + category = "performance"; + }; + annotations = { + summary = "High RAM usage on {{ $labels.instance }}"; + description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. 
Current value: {{ $value | printf \"%.2f\" }}%."; + }; + } + ]; + } + ]; +} From 8b5fb0861d090e223736cda7a764052f96fd928e Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sat, 31 May 2025 09:27:25 +0200 Subject: [PATCH 3/6] feat: restructure Grafana configuration, migrate alert rules to new format and add VictoriaMetrics datasource --- hosts/web-arm/configuration.nix | 3 +- .../modules/grafana/alerting/cpu_usage.nix | 66 ++++++++++++++ .../modules/grafana/alerting/disk_usage.nix | 85 +++++++++++++++++++ .../modules/grafana/alerting/host_down.nix | 62 ++++++++++++++ .../modules/grafana/alerting/inode_usage.nix | 71 ++++++++++++++++ .../modules/grafana/alerting/ram_usage.nix | 69 +++++++++++++++ .../grafana/datasources/victoriametrics.nix | 18 ++++ .../{grafana.nix => grafana/default.nix} | 12 +++ hosts/web-arm/modules/vmalert/default.nix | 38 --------- .../modules/vmalert/rules/cpu_usage.nix | 26 ------ .../modules/vmalert/rules/disk_usage.nix | 27 ------ .../modules/vmalert/rules/host_down.nix | 23 ----- .../modules/vmalert/rules/inode_usage.nix | 27 ------ .../modules/vmalert/rules/ram_usage.nix | 23 ----- 14 files changed, 384 insertions(+), 166 deletions(-) create mode 100644 hosts/web-arm/modules/grafana/alerting/cpu_usage.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/disk_usage.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/host_down.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/inode_usage.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/ram_usage.nix create mode 100644 hosts/web-arm/modules/grafana/datasources/victoriametrics.nix rename hosts/web-arm/modules/{grafana.nix => grafana/default.nix} (94%) delete mode 100644 hosts/web-arm/modules/vmalert/default.nix delete mode 100644 hosts/web-arm/modules/vmalert/rules/cpu_usage.nix delete mode 100644 hosts/web-arm/modules/vmalert/rules/disk_usage.nix delete mode 100644 hosts/web-arm/modules/vmalert/rules/host_down.nix delete mode 100644 
hosts/web-arm/modules/vmalert/rules/inode_usage.nix delete mode 100644 hosts/web-arm/modules/vmalert/rules/ram_usage.nix diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix index 68a0e0c..54c74d9 100644 --- a/hosts/web-arm/configuration.nix +++ b/hosts/web-arm/configuration.nix @@ -14,10 +14,9 @@ ./modules/nextcloud ./modules/rustdesk.nix ./modules/postgresql.nix - ./modules/grafana.nix + ./modules/grafana/default.nix ./modules/loki.nix ./modules/victoriametrics.nix - ./modules/vmalert/default.nix # Added vmalert module ./modules/updns.nix ./utils/modules/autoupgrade.nix diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix new file mode 100644 index 0000000..515fabb --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix @@ -0,0 +1,66 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "CPUUsageAlerts"; + folder = "System Alerts"; + interval = "1m"; + + rules = [ + { + uid = "high-cpu-usage-alert-uid"; + title = "HighCPUUsage"; + condition = "D"; # Condition is now D + + data = [ + # Query A: Calculate CPU usage percentage + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute + model = { + # Calculate average CPU usage over 1m, grouped by instance and job + expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100''; + legendFormat = "CPU usage on {{instance}} ({{job}})"; + instant = false; # This is a range query + }; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = 
"D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 90"; # Alert if CPU usage from C is > 90% + }; + } + ]; + + for = "5m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High CPU usage on {{ $labels.instance }}"; + description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + }; + labels = { + severity = "warning"; + category = "performance"; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix new file mode 100644 index 0000000..b30686b --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix @@ -0,0 +1,85 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + # orgId = 1; # Defaults to 1 for provisioned rules + name = "DiskUsageAlerts"; # Name of the rule group + folder = "System Alerts"; # The folder these rules belong to in Grafana UI + interval = "1m"; # How often to evaluate rules in this group + + rules = [ + { + uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself + title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert) + + # Condition for the alert to fire. 'C' refers to the refId of the threshold expression. 
+ condition = "D"; # Condition is now D + # Removed rule-level relativeTimeRange + + # Data queries and expressions + data = [ + # Query A: Calculate disk usage percentage + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource + queryType = "prometheus"; # Explicitly set, though often inferred + relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds + model = { + expr = '' + ( + node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 + and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + ''; + legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend + instant = false; # For range queries, default is false + }; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 85"; # Check if the last value from each series in C is > 85 + }; + } + ]; + + for = "15m"; # Duration the condition must be met (same as vmalert) + + # How to handle states where data is missing or query errors + noDataState = "NoData"; # Options: NoData, Alerting, OK + execErrState = "Error"; # Options: Error, Alerting, OK + + annotations = { + summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = '' + Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} + (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. 
+ Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%. + ''; # Using $values.C as it's the input to the math condition D + }; + labels = { + severity = "warning"; + category = "capacity"; + # Grafana automatically adds labels from the query result (instance, mountpoint, etc.) + # and labels from the rule group/folder. + }; + # isPaused = false; # Default is not paused + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix new file mode 100644 index 0000000..a2d938f --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix @@ -0,0 +1,62 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "HostStatusAlerts"; + folder = "System Alerts"; + interval = "1m"; + + rules = [ + { + uid = "host-down-alert-uid"; + title = "HostDown"; + condition = "C"; + + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute + model = { + expr = ''up''; + legendFormat = "{{instance}} ({{job}})"; + instant = false; # Changed from true, as relativeTimeRange is used + }; + } + { # New Expression B: Reduce Query A + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + { # Modified Expression C: Math condition based on B + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B == 0"; # Check if the last value from B is 0 + }; + } + ]; + + for = "2m"; + noDataState = "Alerting"; + execErrState = "Error"; + + annotations = { + summary = "Host {{ $labels.instance }} is down"; + description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.''; + }; + labels = { + severity = 
"critical"; + category = "availability"; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix new file mode 100644 index 0000000..8f67178 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix @@ -0,0 +1,71 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "InodeUsageAlerts"; + folder = "System Alerts"; + interval = "1m"; + + rules = [ + { + uid = "high-inode-usage-alert-uid"; + title = "HighInodeUsage"; + condition = "D"; # Condition is now D + + data = [ + # Query A: Calculate inode usage percentage + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; + model = { + expr = '' + ( + node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 + and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} + and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} + ''; + legendFormat = "{{mountpoint}} on {{instance}}"; + instant = false; + }; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 80"; # Alert if inode usage from C is > 80% + }; + } + ]; + + for = "30m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = 
''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + }; + labels = { + severity = "warning"; + category = "capacity"; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix new file mode 100644 index 0000000..03dd931 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix @@ -0,0 +1,69 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "RAMUsageAlerts"; + folder = "System Alerts"; + interval = "1m"; + + rules = [ + { + uid = "high-ram-usage-alert-uid"; + title = "HighRAMUsage"; + condition = "D"; # Condition is now D + + data = [ + # Query A: Calculate RAM usage percentage + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; + model = { + expr = '' + (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100 + and node_memory_MemAvailable_bytes + and node_memory_MemTotal_bytes + ''; + legendFormat = "RAM usage on {{instance}} ({{job}})"; + instant = false; + }; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 90"; # Alert if RAM usage from C is > 90% + }; + } + ]; + + for = "10m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High RAM usage on {{ 
$labels.instance }}"; + description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + }; + labels = { + severity = "warning"; + category = "performance"; + }; + } + ]; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix new file mode 100644 index 0000000..57ea78a --- /dev/null +++ b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix @@ -0,0 +1,18 @@ +{ lib, pkgs, config, ... }: +{ + services.grafana.provision.datasources.settings.datasources = [ + { + name = "VictoriaMetrics"; + uid = "vm-datasource-uid"; # Stable UID for referencing in alerts + type = "prometheus"; + url = "http://localhost:8428"; # URL of VictoriaMetrics + access = "proxy"; # Grafana proxies requests + isDefault = true; # Optional: make this the default datasource + jsonData = { + # timeInterval = "30s"; # Optional: Scrape interval if different from Grafana's default + # httpMethod = "POST"; # Optional: if VictoriaMetrics prefers POST for queries + }; + editable = false; # Recommended for provisioned datasources + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana/default.nix similarity index 94% rename from hosts/web-arm/modules/grafana.nix rename to hosts/web-arm/modules/grafana/default.nix index 34fcd37..6d1394f 100644 --- a/hosts/web-arm/modules/grafana.nix +++ b/hosts/web-arm/modules/grafana/default.nix @@ -27,6 +27,16 @@ let }; in { + imports = [ + ./alerting/disk_usage.nix + ./alerting/cpu_usage.nix + ./alerting/host_down.nix + ./alerting/inode_usage.nix + ./alerting/ram_usage.nix + # ... other rule files can be added here ... 
+ ./datasources/victoriametrics.nix + ]; + systemd.services.grafana.script = lib.mkBefore '' export GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET=$(cat /run/secrets/grafana-oauth-secret) export PUSHOVER_API_TOKEN=$(cat /run/secrets/pushover-api-token) @@ -89,6 +99,7 @@ in }; provision = { alerting = { + rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged contactPoints = { settings = { apiVersion = 1; # As per Grafana provisioning API @@ -158,6 +169,7 @@ in }; }; }; + datasources.settings.datasources = lib.mkMerge []; # Allows datasources to be merged }; }; diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix deleted file mode 100644 index 6615b58..0000000 --- a/hosts/web-arm/modules/vmalert/default.nix +++ /dev/null @@ -1,38 +0,0 @@ -{ config, pkgs, lib, ... }: -{ - imports = [ - ./rules/cpu_usage.nix - ./rules/disk_usage.nix - ./rules/host_down.nix - ./rules/inode_usage.nix - ./rules/ram_usage.nix - ]; - - # Standard vmalert service configuration - services.vmalert = { - enable = true; - settings = { - "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address - "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings - }; - # 'rules' is now set by the mkMerge block above. - }; - - # Override the User and Group for the systemd service managed by the official vmalert module. - systemd.services.vmalert = { - serviceConfig = { - User = "victoriametrics"; - Group = "victoriametrics"; - }; - }; - - # Ensure the user/group itself exists on the system. 
- users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) { - isSystemUser = true; - group = "victoriametrics"; # Primary group for the user - home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components - }; - users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) { - # Ensures the group exists. - }; -} diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix deleted file mode 100644 index 71b8dbc..0000000 --- a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix +++ /dev/null @@ -1,26 +0,0 @@ -{ lib, pkgs, config, ... }: # Standard module arguments -{ - # This module contributes its rule group to a list that will be - # collected and processed by the main vmalert module. - services.vmalert.rules.groups = [ - { - name = "CPUUsageAlerts"; - # interval = "60s"; # Optional: group-level interval - rules = [ # This MUST be a list of rule attribute sets - { - alert = "HighCPUUsage"; - expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90"; - for = "5m"; - labels = { - severity = "warning"; - category = "performance"; - }; - annotations = { - summary = "High CPU usage on {{ $labels.instance }}"; - description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; - }; - } - ]; - } - ]; -} diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix deleted file mode 100644 index 65570fd..0000000 --- a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix +++ /dev/null @@ -1,27 +0,0 @@ -{ lib, pkgs, config, ... 
}: # Standard module arguments -{ - services.vmalert.rules.groups = [ - { - name = "DiskUsageAlerts"; - rules = [ - { - alert = "HighDiskUsage"; - expr = '' - ( - node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85 - ''; - for = "15m"; - labels = { - severity = "warning"; - category = "capacity"; - }; - annotations = { - summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; - description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; - }; - } - ]; - } - ]; -} diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix deleted file mode 100644 index 0960bc4..0000000 --- a/hosts/web-arm/modules/vmalert/rules/host_down.nix +++ /dev/null @@ -1,23 +0,0 @@ -{ lib, pkgs, config, ... }: # Standard module arguments -{ - services.vmalert.rules.groups = [ - { - name = "HostStatusAlerts"; - rules = [ - { - alert = "HostDown"; - expr = "up == 0"; - for = "2m"; - labels = { - severity = "critical"; - category = "availability"; - }; - annotations = { - summary = "Host {{ $labels.instance }} is down"; - description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes."; - }; - } - ]; - } - ]; -} diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix deleted file mode 100644 index 2e2245e..0000000 --- a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix +++ /dev/null @@ -1,27 +0,0 @@ -{ lib, pkgs, config, ... 
}: # Standard module arguments -{ - services.vmalert.rules.groups = [ - { - name = "InodeUsageAlerts"; - rules = [ - { - alert = "HighInodeUsage"; - expr = '' - ( - node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} - ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80 - ''; - for = "30m"; - labels = { - severity = "warning"; - category = "capacity"; - }; - annotations = { - summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; - description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%."; - }; - } - ]; - } - ]; -} diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix deleted file mode 100644 index 4116b05..0000000 --- a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix +++ /dev/null @@ -1,23 +0,0 @@ -{ lib, pkgs, config, ... }: # Standard module arguments -{ - services.vmalert.rules.groups = [ - { - name = "RAMUsageAlerts"; - rules = [ - { - alert = "HighRAMUsage"; - expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90"; - for = "10m"; - labels = { - severity = "warning"; - category = "performance"; - }; - annotations = { - summary = "High RAM usage on {{ $labels.instance }}"; - description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. 
Current value: {{ $value | printf \"%.2f\" }}%."; - }; - } - ]; - } - ]; -} From 35fa61ef34fe8e55849032cab6144cf5f716b904 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sat, 31 May 2025 09:57:03 +0200 Subject: [PATCH 4/6] feat: refactor Grafana alerting rules into a consolidated system module and update individual alert files --- .../modules/grafana/alerting/cpu_usage.nix | 100 ++++++------- .../modules/grafana/alerting/disk_usage.nix | 137 ++++++++---------- .../modules/grafana/alerting/host_down.nix | 92 ++++++------ .../modules/grafana/alerting/inode_usage.nix | 110 +++++++------- .../modules/grafana/alerting/ram_usage.nix | 106 +++++++------- .../grafana/alerting/system/default.nix | 21 +++ hosts/web-arm/modules/grafana/default.nix | 14 +- 7 files changed, 281 insertions(+), 299 deletions(-) create mode 100644 hosts/web-arm/modules/grafana/alerting/system/default.nix diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix index 515fabb..9c09881 100644 --- a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix +++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix @@ -1,66 +1,58 @@ { lib, pkgs, config, ... 
}: { - services.grafana.provision.alerting.rules.settings.groups = [ + grafanaAlertRuleDefinitions = [ { - name = "CPUUsageAlerts"; - folder = "System Alerts"; - interval = "1m"; + uid = "high-cpu-usage-alert-uid"; + title = "HighCPUUsage"; + condition = "D"; # Condition is now D - rules = [ + data = [ + # Query A: Calculate CPU usage percentage { - uid = "high-cpu-usage-alert-uid"; - title = "HighCPUUsage"; - condition = "D"; # Condition is now D - - data = [ - # Query A: Calculate CPU usage percentage - { - refId = "A"; - datasourceUid = "vm-datasource-uid"; - queryType = "prometheus"; - relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute - model = { - # Calculate average CPU usage over 1m, grouped by instance and job - expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100''; - legendFormat = "CPU usage on {{instance}} ({{job}})"; - instant = false; # This is a range query - }; - } - # Expression C: Reduce Query A to its last value, preserving labels - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; # Input is Query A - reducer = "last"; # Get the last value of each series in A - }; - } - # Expression D: Apply math condition to the reduced values from C - { - refId = "D"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$C > 90"; # Alert if CPU usage from C is > 90% - }; - } - ]; - - for = "5m"; # Duration the condition must be met - noDataState = "NoData"; - execErrState = "Error"; - - annotations = { - summary = "High CPU usage on {{ $labels.instance }}"; - description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. 
Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute + model = { + # Calculate average CPU usage over 1m, grouped by instance and job + expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100''; + legendFormat = "CPU usage on {{instance}} ({{job}})"; + instant = false; # This is a range query }; - labels = { - severity = "warning"; - category = "performance"; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 90"; # Alert if CPU usage from C is > 90% }; } ]; + + for = "5m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High CPU usage on {{ $labels.instance }}"; + description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it directly (humanizePercentage expects a 0-1 ratio) + }; + labels = { + severity = "warning"; + category = "performance"; + }; } ]; } diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix index b30686b..020947b 100644 --- a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix +++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix @@ -1,85 +1,76 @@ { lib, pkgs, config, ... 
}: { - services.grafana.provision.alerting.rules.settings.groups = [ + grafanaAlertRuleDefinitions = [ { - # orgId = 1; # Defaults to 1 for provisioned rules - name = "DiskUsageAlerts"; # Name of the rule group - folder = "System Alerts"; # The folder these rules belong to in Grafana UI - interval = "1m"; # How often to evaluate rules in this group + uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself + title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert) - rules = [ + # Condition for the alert to fire. 'D' refers to the refId of the threshold expression. + condition = "D"; # Condition is now D + # Removed rule-level relativeTimeRange + + # Data queries and expressions + data = [ + # Query A: Calculate disk usage percentage { - uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself - title = "HighDiskUsage"; # Name of the alert rule (was 'alert' in vmalert) - - # Condition for the alert to fire. 'C' refers to the refId of the threshold expression. 
- condition = "D"; # Condition is now D - # Removed rule-level relativeTimeRange - - # Data queries and expressions - data = [ - # Query A: Calculate disk usage percentage - { - refId = "A"; - datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource - queryType = "prometheus"; # Explicitly set, though often inferred - relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds - model = { - expr = '' - ( - node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 - and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - ''; - legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend - instant = false; # For range queries, default is false - }; - } - # Expression C: Reduce Query A to its last value, preserving labels - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; # Input is Query A - reducer = "last"; # Get the last value of each series in A - }; - } - # Expression D: Apply math condition to the reduced values from C - { - refId = "D"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$C > 85"; # Check if the last value from each series in C is > 85 - }; - } - ]; - - for = "15m"; # Duration the condition must be met (same as vmalert) - - # How to handle states where data is missing or query errors - noDataState = "NoData"; # Options: NoData, Alerting, OK - execErrState = "Error"; # Options: Error, Alerting, OK - - annotations = { - summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; - description = '' - Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} - (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. 
- Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%. - ''; # Using $values.C as it's the input to the math condition D + refId = "A"; + datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource + queryType = "prometheus"; # Explicitly set, though often inferred + relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds + model = { + expr = '' + ( + node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 + and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} + ''; + legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend + instant = false; # For range queries, default is false }; - labels = { - severity = "warning"; - category = "capacity"; - # Grafana automatically adds labels from the query result (instance, mountpoint, etc.) - # and labels from the rule group/folder. 
+ } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 85"; # Check if the last value from each series in C is > 85 }; - # isPaused = false; # Default is not paused } ]; + + for = "15m"; # Duration the condition must be met (same as vmalert) + + # How to handle states where data is missing or query errors + noDataState = "NoData"; # Options: NoData, Alerting, OK + execErrState = "Error"; # Options: Error, Alerting, OK + + annotations = { + summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = '' + Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} + (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. + Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%. + ''; # Using $values.C (input to math condition D); it is already 0-100, so printf is used instead of humanizePercentage (which expects a 0-1 ratio) + }; + labels = { + severity = "warning"; + category = "capacity"; + # Grafana automatically adds labels from the query result (instance, mountpoint, etc.) + # and labels from the rule group/folder. + }; + # isPaused = false; # Default is not paused } ]; } diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix index a2d938f..1910b23 100644 --- a/hosts/web-arm/modules/grafana/alerting/host_down.nix +++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix @@ -1,62 +1,54 @@ { lib, pkgs, config, ... 
}: { - services.grafana.provision.alerting.rules.settings.groups = [ + grafanaAlertRuleDefinitions = [ { - name = "HostStatusAlerts"; - folder = "System Alerts"; - interval = "1m"; + uid = "host-down-alert-uid"; + title = "HostDown"; + condition = "C"; - rules = [ + data = [ { - uid = "host-down-alert-uid"; - title = "HostDown"; - condition = "C"; - - data = [ - { - refId = "A"; - datasourceUid = "vm-datasource-uid"; - queryType = "prometheus"; - relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute - model = { - expr = ''up''; - legendFormat = "{{instance}} ({{job}})"; - instant = false; # Changed from true, as relativeTimeRange is used - }; - } - { # New Expression B: Reduce Query A - refId = "B"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; # Input is Query A - reducer = "last"; # Get the last value of each series in A - }; - } - { # Modified Expression C: Math condition based on B - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$B == 0"; # Check if the last value from B is 0 - }; - } - ]; - - for = "2m"; - noDataState = "Alerting"; - execErrState = "Error"; - - annotations = { - summary = "Host {{ $labels.instance }} is down"; - description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.''; + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute + model = { + expr = ''up''; + legendFormat = "{{instance}} ({{job}})"; + instant = false; # Changed from true, as relativeTimeRange is used }; - labels = { - severity = "critical"; - category = "availability"; + } + { # New Expression B: Reduce Query A + refId = "B"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + { # Modified Expression C: Math condition 
based on B + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$B == 0"; # Check if the last value from B is 0 }; } ]; + + for = "2m"; + noDataState = "Alerting"; + execErrState = "Error"; + + annotations = { + summary = "Host {{ $labels.instance }} is down"; + description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.''; + }; + labels = { + severity = "critical"; + category = "availability"; + }; } ]; } diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix index 8f67178..ba73f30 100644 --- a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix +++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix @@ -1,71 +1,63 @@ { lib, pkgs, config, ... }: { - services.grafana.provision.alerting.rules.settings.groups = [ + grafanaAlertRuleDefinitions = [ { - name = "InodeUsageAlerts"; - folder = "System Alerts"; - interval = "1m"; + uid = "high-inode-usage-alert-uid"; + title = "HighInodeUsage"; + condition = "D"; # Condition is now D - rules = [ + data = [ + # Query A: Calculate inode usage percentage { - uid = "high-inode-usage-alert-uid"; - title = "HighInodeUsage"; - condition = "D"; # Condition is now D - - data = [ - # Query A: Calculate inode usage percentage - { - refId = "A"; - datasourceUid = "vm-datasource-uid"; - queryType = "prometheus"; - relativeTimeRange = { from = 60; to = 0; }; - model = { - expr = '' - ( - node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} - ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 - and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} - ''; - legendFormat = "{{mountpoint}} on {{instance}}"; - instant = false; - }; - } - # Expression C: Reduce Query A to its last value, preserving 
labels - { - refId = "C"; - datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; # Input is Query A - reducer = "last"; # Get the last value of each series in A - }; - } - # Expression D: Apply math condition to the reduced values from C - { - refId = "D"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$C > 80"; # Alert if inode usage from C is > 80% - }; - } - ]; - - for = "30m"; # Duration the condition must be met - noDataState = "NoData"; - execErrState = "Error"; - - annotations = { - summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; - description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; + model = { + expr = '' + ( + node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} + ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100 + and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} + and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""} + ''; + legendFormat = "{{mountpoint}} on {{instance}}"; + instant = false; }; - labels = { - severity = "warning"; - category = "capacity"; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 80"; # Alert if 
inode usage from C is > 80% }; } ]; + + for = "30m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}"; + description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it directly (humanizePercentage expects a 0-1 ratio) + }; + labels = { + severity = "warning"; + category = "capacity"; + }; } ]; } diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix index 03dd931..14a2ea8 100644 --- a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix +++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix @@ -1,69 +1,61 @@ { lib, pkgs, config, ... }: { - services.grafana.provision.alerting.rules.settings.groups = [ + grafanaAlertRuleDefinitions = [ { - name = "RAMUsageAlerts"; - folder = "System Alerts"; - interval = "1m"; + uid = "high-ram-usage-alert-uid"; + title = "HighRAMUsage"; + condition = "D"; # Condition is now D - rules = [ + data = [ + # Query A: Calculate RAM usage percentage { - uid = "high-ram-usage-alert-uid"; - title = "HighRAMUsage"; - condition = "D"; # Condition is now D - - data = [ - # Query A: Calculate RAM usage percentage - { - refId = "A"; - datasourceUid = "vm-datasource-uid"; - queryType = "prometheus"; - relativeTimeRange = { from = 60; to = 0; }; - model = { - expr = '' - (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100 - and node_memory_MemAvailable_bytes - and node_memory_MemTotal_bytes - ''; - legendFormat = "RAM usage on {{instance}} ({{job}})"; - instant = false; - }; - } - # Expression C: Reduce Query A to its last value, preserving labels - { - refId = "C"; - 
datasourceUid = "__expr__"; - model = { - type = "reduce"; - expression = "A"; # Input is 
Query A - reducer = "last"; # Get the last value of each series in A - }; - } - # Expression D: Apply math condition to the reduced values from C - { - refId = "D"; - datasourceUid = "__expr__"; - model = { - type = "math"; - expression = "$C > 90"; # Alert if RAM usage from C is > 90% - }; - } - ]; - - for = "10m"; # Duration the condition must be met - noDataState = "NoData"; - execErrState = "Error"; - - annotations = { - summary = "High RAM usage on {{ $labels.instance }}"; - description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.''; + refId = "A"; + datasourceUid = "vm-datasource-uid"; + queryType = "prometheus"; + relativeTimeRange = { from = 60; to = 0; }; + model = { + expr = '' + (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100 + and node_memory_MemAvailable_bytes + and node_memory_MemTotal_bytes + ''; + legendFormat = "RAM usage on {{instance}} ({{job}})"; + instant = false; }; - labels = { - severity = "warning"; - category = "performance"; + } + # Expression C: Reduce Query A to its last value, preserving labels + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; # Input is Query A + reducer = "last"; # Get the last value of each series in A + }; + } + # Expression D: Apply math condition to the reduced values from C + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 90"; # Alert if RAM usage from C is > 90% }; } ]; + + for = "10m"; # Duration the condition must be met + noDataState = "NoData"; + execErrState = "Error"; + + annotations = { + summary = "High RAM usage on {{ $labels.instance }}"; + description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. 
Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.''; # C is already 0-100, so print it directly (humanizePercentage expects a 0-1 ratio) + }; + labels = { + severity = "warning"; + category = "performance"; + }; } ]; } diff --git a/hosts/web-arm/modules/grafana/alerting/system/default.nix b/hosts/web-arm/modules/grafana/alerting/system/default.nix new file mode 100644 index 0000000..26db06d --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/system/default.nix @@ -0,0 +1,21 @@ +{ lib, pkgs, config, ... }: +let + # Import rule definitions from refactored alert files in the parent 'alerting' directory + cpuAlertRules = (import ../cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + diskAlertRules = (import ../disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + hostDownAlertRules = (import ../host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + inodeAlertRules = (import ../inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + ramAlertRules = (import ../ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + + allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules ++ inodeAlertRules ++ ramAlertRules; +in +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "System Alerts"; # This is the Grafana alert group name + folder = "System Alerts"; # This is the Grafana folder name + interval = "1m"; + rules = allSystemRules; + } + ]; +} \ No newline at end of file diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix index 6d1394f..6f48794 100644 --- a/hosts/web-arm/modules/grafana/default.nix +++ b/hosts/web-arm/modules/grafana/default.nix @@ -28,11 +28,13 @@ let in { imports = [ - ./alerting/disk_usage.nix - ./alerting/cpu_usage.nix - ./alerting/host_down.nix - ./alerting/inode_usage.nix - ./alerting/ram_usage.nix + # Individual alert files 
removed, now handled by alerting/system/default.nix + # 
./alerting/disk_usage.nix + # ./alerting/cpu_usage.nix + # ./alerting/host_down.nix + # ./alerting/inode_usage.nix + # ./alerting/ram_usage.nix + ./alerting/system/default.nix # Added: Imports the consolidated system alerts module # ... other rule files can be added here ... ./datasources/victoriametrics.nix ]; @@ -99,7 +101,7 @@ in }; provision = { alerting = { - rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged + rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged (including the one from system/default.nix) contactPoints = { settings = { apiVersion = 1; # As per Grafana provisioning API From d0c67baeb8b69e7b22c6d0e46112097738d1ff25 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sat, 31 May 2025 11:35:17 +0200 Subject: [PATCH 5/6] feat: add Grafana online status monitoring module with Pushover notifications --- hosts/fw/configuration.nix | 5 +- hosts/fw/modules/grafana-monitor.nix | 183 +++++++++++++++++++++++++++ hosts/fw/secrets.yaml | 27 ++-- 3 files changed, 198 insertions(+), 17 deletions(-) create mode 100644 hosts/fw/modules/grafana-monitor.nix diff --git a/hosts/fw/configuration.nix b/hosts/fw/configuration.nix index 5bd0338..fc47be8 100644 --- a/hosts/fw/configuration.nix +++ b/hosts/fw/configuration.nix @@ -65,8 +65,9 @@ # setup network ./modules/setupnetwork.nix ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel - - + ./modules/grafana-monitor.nix # Grafana online status monitor + + ./hardware-configuration.nix ]; diff --git a/hosts/fw/modules/grafana-monitor.nix b/hosts/fw/modules/grafana-monitor.nix new file mode 100644 index 0000000..b8effdb --- /dev/null +++ b/hosts/fw/modules/grafana-monitor.nix @@ -0,0 +1,183 @@ +{ config, pkgs, lib, ... 
}: + +let + grafanaMonitorUser = "grafana-monitor"; + grafanaMonitorGroup = "grafana-monitor"; + stateDir = "/var/lib/${grafanaMonitorUser}"; + + # Monitoring script will be defined here later + monitorScript = pkgs.writeShellScriptBin "grafana-online-check" '' + #!${pkgs.bash}/bin/bash + set -euo pipefail + + GRAFANA_URL="https://grafana.cloonar.com/api/health" + STATE_FILE="${stateDir}/status.env" + PUSHOVER_API_TOKEN_FILE="/run/secrets/pushover-api-token" + PUSHOVER_USER_KEY_FILE="/run/secrets/pushover-user-key" + MAX_FAILURES=5 + + # Ensure state directory exists (NixOS creates $HOME for the user, which is stateDir) + # The script runs as grafanaMonitorUser, so $HOME will be /var/lib/grafana-monitor + mkdir -p "''${HOME}" + + # Load current state or initialize + CONSECUTIVE_FAILURES=0 + ALERT_SENT="false" + LAST_KNOWN_STATUS="UP" # Assume UP initially if no state file + + # Note: STATE_FILE uses $stateDir which is /var/lib/grafana-monitor. + # The script will run with HOME=/var/lib/grafana-monitor. + # So, using ''${HOME}/status.env or ''${STATE_FILE} should resolve to the same path. + # Let's stick to ''${STATE_FILE} for consistency with its definition. + if [[ -f "''${STATE_FILE}" ]]; then + source "''${STATE_FILE}" + fi + + # Check secrets + if [[ ! -f "''${PUSHOVER_API_TOKEN_FILE}" ]] || [[ ! -r "''${PUSHOVER_API_TOKEN_FILE}" ]]; then + echo "Error: Pushover API token file (''${PUSHOVER_API_TOKEN_FILE}) not found or not readable." >&2 + exit 1 + fi + PUSHOVER_API_TOKEN=$(cat "''${PUSHOVER_API_TOKEN_FILE}") + + if [[ ! -f "''${PUSHOVER_USER_KEY_FILE}" ]] || [[ ! -r "''${PUSHOVER_USER_KEY_FILE}" ]]; then + echo "Error: Pushover user key file (''${PUSHOVER_USER_KEY_FILE}) not found or not readable." >&2 + exit 1 + fi + PUSHOVER_USER_KEY=$(cat "''${PUSHOVER_USER_KEY_FILE}") + + echo "Checking Grafana at ''${GRAFANA_URL}..." 
+ ACTUAL_HTTP_CODE="000" # Default if curl doesn't provide one + CURL_ERROR_MESSAGE="" + CURL_STDERR_OUTPUT=$(mktemp) + # Ensure temp file is cleaned up on exit, error, or interrupt + trap 'rm -f "''${CURL_STDERR_OUTPUT}"' EXIT TERM INT HUP + + # -L: follow redirects + # -sS: silent mode, but show errors + # --fail: curl exits with 22 on server errors (4xx, 5xx) + # --connect-timeout 5: max time to connect + # --max-time 10: max total time for operation + # --stderr: redirect stderr to a file to capture detailed errors + # -o /dev/null: discard response body + # --write-out "%{http_code}": output the HTTP status code + if ACTUAL_HTTP_CODE=$(${pkgs.curl}/bin/curl -L -sS --fail --connect-timeout 5 --max-time 10 \ + --stderr "''${CURL_STDERR_OUTPUT}" \ + -o /dev/null --write-out "%{http_code}" "''${GRAFANA_URL}"); then + # Curl exited with 0. With --fail, this means HTTP status was 2xx. + echo "Grafana is UP (HTTP ''${ACTUAL_HTTP_CODE})." + CURRENT_STATUS="UP" + if [[ "''${LAST_KNOWN_STATUS}" == "DOWN" && "''${ALERT_SENT}" == "true" ]]; then + echo "Grafana recovered. Sending recovery notification." + ${pkgs.curl}/bin/curl -sS -X POST \ + -F "token=''${PUSHOVER_API_TOKEN}" \ + -F "user=''${PUSHOVER_USER_KEY}" \ + -F "message=Grafana at ''${GRAFANA_URL} is back online (HTTP ''${ACTUAL_HTTP_CODE})." \ + -F "title=Grafana Recovered (fw)" \ + -F "priority=0" \ + https://api.pushover.net/1/messages.json + ALERT_SENT="false" + fi + CONSECUTIVE_FAILURES=0 + else + # Curl exited with a non-zero status. + CURL_EXIT_CODE=$? + CURL_ERROR_MESSAGE=$(cat "''${CURL_STDERR_OUTPUT}" | tr -d '\n' | sed 's/"/\\"/g') # Read, remove newlines, escape quotes for JSON + + echo "Grafana check failed. Curl Exit Code: ''${CURL_EXIT_CODE}. HTTP Code reported: ''${ACTUAL_HTTP_CODE}." 
+ echo "Curl Stderr: ''${CURL_ERROR_MESSAGE}" + CURRENT_STATUS="DOWN" + CONSECUTIVE_FAILURES=$(( ''${CONSECUTIVE_FAILURES} + 1 )) + echo "Consecutive failures: ''${CONSECUTIVE_FAILURES}" + + if [[ ''${CONSECUTIVE_FAILURES} -ge ''${MAX_FAILURES} && "''${ALERT_SENT}" == "false" ]]; then + echo "Grafana has been offline for ''${CONSECUTIVE_FAILURES} checks (>= ''${MAX_FAILURES}). Sending alert." + PUSHOVER_TITLE="Grafana OFFLINE (fw)" + PUSHOVER_MSG="Grafana ''${GRAFANA_URL} offline for ''${MAX_FAILURES}+ min. HTTP:''${ACTUAL_HTTP_CODE}, CurlExit:''${CURL_EXIT_CODE}." + if [[ -n "''${CURL_ERROR_MESSAGE}" ]]; then + PUSHOVER_MSG+=" Err: ''${CURL_ERROR_MESSAGE}" + fi + # Truncate message if too long for Pushover (1024 chars) + PUSHOVER_MSG=$(echo "''${PUSHOVER_MSG}" | cut -c 1-1024) + + ${pkgs.curl}/bin/curl -sS -X POST \ + -F "token=''${PUSHOVER_API_TOKEN}" \ + -F "user=''${PUSHOVER_USER_KEY}" \ + -F "message=''${PUSHOVER_MSG}" \ + -F "title=''${PUSHOVER_TITLE}" \ + -F "priority=1" \ + https://api.pushover.net/1/messages.json + ALERT_SENT="true" + fi + fi + # Temp file is removed by trap + + # Save current state + echo "Saving state: CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}, ALERT_SENT=''${ALERT_SENT}, LAST_KNOWN_STATUS=''${CURRENT_STATUS}" + ( + echo "CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}" + echo "ALERT_SENT=''${ALERT_SENT}" + echo "LAST_KNOWN_STATUS=''${CURRENT_STATUS}" + ) > "''${STATE_FILE}" # Using STATE_FILE which is ${stateDir}/status.env + chmod 600 "''${STATE_FILE}" + + echo "Grafana check finished." 
+ ''; +in +{ + # Module is now implicitly enabled when imported + config = { + users.users.${grafanaMonitorUser} = { + isSystemUser = true; + group = grafanaMonitorGroup; + home = stateDir; # Home directory for state + createHome = true; # NixOS will create this directory + description = "User for Grafana online monitoring service"; + }; + users.groups.${grafanaMonitorGroup} = {}; + + # Sops secrets for Pushover + sops.secrets."pushover-api-token" = { + owner = grafanaMonitorUser; + group = grafanaMonitorGroup; + mode = "0400"; # Read-only for the user + }; + sops.secrets."pushover-user-key" = { + owner = grafanaMonitorUser; + group = grafanaMonitorGroup; + mode = "0400"; # Read-only for the user + }; + + environment.systemPackages = [ + pkgs.curl + pkgs.coreutils # for mkdir, cat, echo, rm used in script (though bash builtins are often used) + ]; + + systemd.services.grafana-online-check = { + description = "Grafana Online Check Service"; + wantedBy = [ "multi-user.target" ]; # Or timers.target if only started by timer + after = [ "network-online.target" ]; # Ensure network is up and secrets are available + requires = [ "network-online.target" ]; + + serviceConfig = { + Type = "oneshot"; + User = grafanaMonitorUser; + Group = grafanaMonitorGroup; + ExecStart = "${monitorScript}/bin/grafana-online-check"; + # Permissions to write to its own home directory (stateDir) are implicit + # If using StateDirectory= in systemd, it would be different. + # For home directory usage, ensure the user has rights. `createHome = true` helps. 
+ }; + }; + + systemd.timers.grafana-online-check = { + description = "Timer to periodically check Grafana's online status"; + wantedBy = [ "timers.target" ]; + timerConfig = { + OnBootSec = "2min"; # Wait a bit after boot + OnUnitActiveSec = "1min"; # Run every 1 minute after the last run + Unit = "grafana-online-check.service"; + }; + }; + }; +} diff --git a/hosts/fw/secrets.yaml b/hosts/fw/secrets.yaml index 1605aaa..a372ff3 100644 --- a/hosts/fw/secrets.yaml +++ b/hosts/fw/secrets.yaml @@ -1,18 +1,20 @@ +ai-mailer-imap-password: ENC[AES256_GCM,data:kMxDPUK9rk7mbel5JDT03m3Y2w==,iv:cbnkNIVRXd7OLqueSrfYRzfaW9TzI+FauuQD8lgYIy0=,tag:63W7seIgt5TPVFQc84semQ==,type:str] +ai-mailer-openrouter-key: ENC[AES256_GCM,data:PCe8kt/M+7g087AKzYMY2H5WO4L+NGkHLsh47fMK36kz+Ju5kd/kpmM4GQcDbI3LgWm/P+T0/mv7kGGOL6KLmBFaFmGV/88cGw==,iv:ruVftGvnv+PX1Zd92tfOezpyaMbYrqCrexelyPUYFMc=,tag:z4JVUCfz/frehar6y+fOlQ==,type:str] borg-passphrase: ENC[AES256_GCM,data:jHb+yXK0RqNdVYtWiueztZFlHC/xQ6ZiAOUcLt6BxmZQewuL3mh4AZ+lQdmA/4EaaTTIhVMR3xFx5fU6b2CtNLiGb/0=,iv:IW09B1EE1OupMCOvv13MXRYiMsD4VmIfyYONUyrPX1c=,tag:3ankeLOaDJkwRUGCd72DuA==,type:str] borg-ssh-key: 
ENC[AES256_GCM,data:ir25XfzLBb/H/YWzxP501hCaLBB4jpiLW7WUcnvguzosT9QeOtBdJ0WB1IndEMtiEgQyE9kyGOJ3QJwzbQNkX6CG96Uzt2mKw8gw8ayUqC+B9zR8eIRYiDKOYs+YREVo7nA5pLLzIc/9jaRicDFMmw1Thmk7UUJKB1DNV49nU9K+nAfrCzk7ZQieY8oaasFD0cvNb4Ndj6f9PWSXkNBwKK52ig4hDeNBs1bdy8nDE8VqlwOo8H2DcYMzdMjKCZDBRccy8NofHEhakCW5OdliFyIHsLkcBHca3Bp46JN7wbo8avPPd9bXGuRiOSWYq50RcyZUovnB3g7Dk3swCyuiFztnStN63+g7ZnGFdYLYDYfuDSPN1W2HCkknmaoT910VNE8sEAMyfXk4tqJv4eW4qmFk2UwPlRCrsk9GtdRQ5wm8muNPHEZ8s2dGkn4WDcjy7SUpgF4UJJZV8iJe74W9BK1Ef+AWWNsNjYfZde3iw1+8Fz1u65u4seFWqQMok/noADpszbpk+YYRoM+5D/YVMx+KeDtoFqnZfULM/BqvAqdYYZtRzojndeNW6Ea4sxDE+XQ5b1OwGFlNAlnuS1fYYPvKojrKNgT9KMwbsvPijU5vFddY8Qpz2h6GKEv/OW87j5UeyDW4l32lvyawBuzczBfiFgCElggGSZHM5rjE4Deb06eQleTioZ79EDXTv5UsPQ6Bc1v5Wvnu8DvxJe4B10vxH70JIGIlmjwo0yhMkxDTN7BkAGQC0QAPhwtURDq+XVufQNjlTUjjH1Q1E4u0Vy19clMs8SStqFeMN02BfWZdS9mbueF5Ehc+8wTfAs43CQFublJ4wfG1PzEbqj9LZdimFe4hCnE2y6Gbf591shugVSAMA3UXQUuvFQmm69i9gz88YSYrkLlVStM+dtXCugZho72xgHtnI+5o19wuoZPRoxe47W0T2kJZZeomtqoAsSo5yr5JeYzYdaHYcK2fgRY0HWgWzOxnVEfX/gRPR3b20Tko6yp9lIDECkXVDQSxptxqIYk+VuETnD9YF2OpYeHZLGoo9OLdEHVZRcuy1S74aAOJGO9SAHLw3eukxG//AZlwcOYjOsYDVt3BjhYZEkYCLg8GkAqV/7bGsxT7pgckNEB2NRYQI9ckqEcEw9CdkYre67HwfPCvAble68VnRzgp+v5s0koVjTURF9FTxvVOXQEbvSpY828idyx6nOaAIHoqpIOFz4jsGE9L4FKamqnlnjzj2Ri/MboT9JQBj8bnIF/ej+dQGpfqZo7zqtu3d0B/9e0xuVTcqI9Bxlqn3D4108I8R37Ctr5OFKloeOZ8HHMsHcBUAzZC6/fWrOspru14YHW2YNj8nBxHve/P3oiTQ/nlXLcBGLoFfI+hOpofccQB8FnkKfTbLSRUGrGY6NJt9RCnZgm2+RUgel77XpsCsT/Q5ZGclBdyk8mSaqVjiNyHCbCV5tF/tWnuvf859S0tcmqbJ0FhIRAvwxFucmfi6FSPX5HEMdRbNV7szrHKSX60u7YA2DBBzv3c/+C2bxq70vhwFelqz7FqpVKwebbE4/a59lZpibzefCoji/TPDJB62/ox5NHHE5qenv7IPcEj3dEmdasbrApAw1UFsFlRCnlg4JIYley/AQx7OzUSImqkG8JWvSJ4JXijhsr9dPFR/cb0srUO88aFNh/ZUQhELZCVnzAsF81Y4w6LTGApMfUVN/yx9MqENGvObywzMls1UJphvzDZzvb+Ue6eqELogN1QcEI/WOirwVtJO6E7IevEtK4xxWsLfRHVjtbLc4QjCWuiyszAPTTttKJ+iC2h14Wj1XoiMpWRiVnj+jI9iWRen96P4glYEfuCYQS6vbGkNDEoZt/FnkLJDbLdjXatmhUoRpvExOtp26ULR/f1lwzLMJBt1qPvhuGur1ru2B1e8+AVte1Cfjmk+xrnxNwkTFLGe89Qjd77wPyQv9h0YrhZ6uDi2zLemhZs
2LjW5ZvzV5P4thMDxkhezJHatPHAGa8OfclJOyrRTyW2azdz2A45MNzZtCQcnQdQxBXf+XRskLnhquZfgv66hFITjuF/HeI9cq4HJcrgaOcVj+tBdK1bTCyL2kqKkCpSCbh/Pv6FuAlDXgLjsWwZgOKz8gfTIfXMapPLDYVTbS/PPPABylZflN98FFyeFDHB3Fwn1a6qAJ0mC7+4sowVZ1DIAoflaHqNs5TXyb3KeZGgXj5ZQwhv1z6NySvOS6cHxx0PvkFo99T1NHztxCRERNvBdWSwsr32DTwEvZo5iNPy3lvKI5A+rXc7jlQkUbufbddtLw2iPtt29XyMDOysK010fXzzQRjaz4R8ZaDtHNjqPrynvqFPXRB0VSIrwXS2utU7bmD+0dGX26t9k5qRBi7Gm+iZNKGMnSRsm17bVk5o8q0tb1P1eGL9mexZJJvxolfXVFJJtR8m6vLmUX1LSht/JhoWFElrINl0hviwd1dehmTqdQqWz5/imjF+pVOasrt7XVZ+7T/rDpuwNl375qSZptM1pMUExJ3CvzigpnarXXQxEBYkf0haGvQwPWNVHe/bR/1VooSQkH/mGg1g+rcTqp4yB5hsFu1lNK4ph04WQOqaafg40HBv6e5cOjLkFdEtYNpjyd6sRS+WHk7zzFlfPVlzijq8f+oDH9ALRzNnL1Y2DrX53wx4dBBWvxE1Yhb6Kj6Er4ZDiRLLXo+wJOGCpnNTPJMVaYskZ+LN2e9nS2/ZwbsNBnPHxSqCc1oP4d3yXH0j90VKnWg79aIEOagRvTF/9F6SkkGL9zVuUnoVSPwq97etWWtjGoEORMGY7jkGOK+U391p7Z69Hrv2AejS1BoSDeGcxXasFvINpmc+Hl2c+zOlFBySu2zA39cVlcStUFICA5GCmE5Eum4ED9DXP6RAuicD7YE0qSKbMkfLxIWMCZ6wBcwVUjdt43SI/ZqdpDm3E1kTRg07dE0R091rtfzEiIwBM4xFPJBafOx0L/Do61YMOHGzi6wgIQO7P7wIslv62M8MD1KKa/eH0tE2vhG/GyEGtKkg3P9vZRJwioifyshS1hvrt5pLinuCaDYyqMAl8Ro0OOm8di7+mBvXib0nRLfW7wBGDA4ADTipizNWAmbspQQl89kH5gdxgXO5U+N/qc0zXbpB+qeHVkPIK1DmrJ8pHLOE8mOpLy7eHUsSku/WtTt/RP4pcDbBU/43MCbk7NXKu/LjKjkQBjAL49LxnYmhEU7X//jtwSPE3gdx0x+wRJxzlbehM6rpfDRV5WQGSFf7yjLc/Ga1KwsgVdAstJEzDdv2vWSsjNzfJvHVBLrQPIC9fggi3DeLiHTAryCUcLUhNj4xtZWhSS1qmx07E4VzfjDJLMOsLY0vlimgngZ3YYCjC3Sw0frfQH2SZvmbLd3XfBdud67ZaMUobcRhnKzQnilldyD1jWVWLdVTup4RVxT4GYek9nmYflzpWWmwbXatz9Sgcw==,iv:9E1uiPqM3Hh4KWtL8haxm6PRm2VPc+DggrA135FvfB8=,tag:QSOgzVH9IBMgZxJvUhvY2w==,type:str] ddclient: ENC[AES256_GCM,data:EaXjXS/bwL3S/Fr+rzQ7dXA1eIzeFpHH7H+SvoNhVSg=,iv:3BzjnJG5yT1W8ob2nm0oUlr+sSJ73W/ctl48xyxeeWM=,tag:TqKSwfxF0V1v5T8VT/qblw==,type:str] +gitea-mailer-password: ENC[AES256_GCM,data:M4qCWNt1oQVJzxThIjocm2frwuVMyx+69TBpke25RwxJxEQnvHL1CM579OVroTm7+gGE/oOJqAwDIepfiDtyM1xm,iv:jayFZMbu3uDimS/rIKZSeoU0MsYwWp880iEMs1oQE4k=,tag:qGDncRkyuCWaELhcxUrqtQ==,type:str] +gitea-runner: 
ENC[AES256_GCM,data:NYG3qRLiMjmfA+oHYBXBbxpuX2ZjB/VgvLaS7yr5kJeDN/NukB/B3OZcEfsUWgbBS5IsLENESngWTFmK4W3htN4lSqdg/g4UsUr20beNov+pbyPN05rkBYmSCZZFwZ1L9POEE4GF4LuuoNpDlWIw0mrA8oV8MoI4W5QS2IGranBTIQQaYXU5TEGYa4XMVo4oC75iuH6DIq1KD6OgFAfMhm/wlbP8CP/Iaw2K8CNPxktk93pm3OSmggf22Z4JPEnvV25sc9iBkxLkDk9FXYFys0g=,iv:UzL5ncVOC/loJwcFSG1QJHnzLp3il4Hf3qDwLWxrIlo=,tag:w0Zn/E+02KyAsPXZdOLrew==,type:str] +gitea-runner-token: ENC[AES256_GCM,data:HpBjLS10w78ihbnAUrlCRGvwrXLBYKH5v/P7XggoUSWLoAazSVQArABxaK7PJas=,iv:q3Y6jV0gmug06O0EYqGVyIJ4AvMGr2ydwY17YKxo0Qw=,tag:Ws5HLbdaeYGGXzDZW/FX4w==,type:str] +home-assistant-ldap: ENC[AES256_GCM,data:uZEPbSnkgQYSd8ev6FD8TRHWWr+vusadtMcvP7KKL2AZAV0h1hga5fODN6I5u0DNL9hq2pNM+FwU0E/svWLRww==,iv:IhmUgSu34NaAY+kUZehx40uymydUYYAyte1aGqQ33/8=,tag:BKFCJPr7Vz4EG78ry/ZD7g==,type:str] +home-assistant-secrets.yaml: ENC[AES256_GCM,data:m7uOVo7hPk/RmqqRS6y7NKoMKsR9Bdi1ntatsZdDOAbJMjZmZL2FgPEHi/zF73zCfRfTOca3dwpulR3WXZ9Ic1sbUIggmusJMg4Gellw1CUhx7SbQN5nieAbPbB9GVxMuV4OakD1u7Swz8JggDT6IwojSnuD5omCRCyUH1wvKB+Re59q6EStderlm5MJNVFlVrbKVbLKLcw4yRgTh34BGnTTjcJmgSlQjO1ciu2B7YQmdl0Fw6d8AdbEzgB5TFG5ONc85UhJDE8Wlw==,iv:GCtpcVChN2UMWtfnWURozCfVj2YbRPqp/bH4Jjntybs=,tag:pcxP7gTBtXMNT5iyW5YXTw==,type:str] +pushover-api-token: ENC[AES256_GCM,data:W2ILPksaNeDvbSlSJztu1vu23kQKLDRHYKoUIvyd,iv:RYFAN6AU+DALphpqpiifhOoEQ8++6DEgo2wETSwxBCg=,tag:pRfaNuz4564LvRuaLggatg==,type:str] +pushover-user-key: ENC[AES256_GCM,data:mh3u3FAdFkGD1d4UKcTwLOsCB2vfhEADI5cd1aT4,iv:4bkR7ZNJwWAYBdu435SPZUovGsfb8qivuDOQdGkPd/U=,tag:5UO4vGt75CCFEM5jxTGkGg==,type:str] wrwks_vpn_key: ENC[AES256_GCM,data:gGipXC8JJO59b4KWMSo0+r761raQl7RzgBuUbXmPEKlZR21bs5XRAQalzDCFNtjcpNkXiGqAHCLkDTtjPagMsw==,iv:MH1EBJEOdQDEgm9E0F884fynhsH8KiS5QSc605XbASQ=,tag:FUM1eptHS0rpt6ILyQjGOg==,type:str] wg_cloonar_key: ENC[AES256_GCM,data:Dtp6I5J0jU5LLVwEFU4DFCpUngPRmFMebGXnk2oSwsKtsir/DtRBFG7ictM=,iv:1Abx/EAZRJrRQURljofzUYDgJpuREriX0nSrFbH5Npw=,tag:l4uFl9Uc+W0XeLVfLGmgZA==,type:str] wg_epicenter_works_key: 
ENC[AES256_GCM,data:LeLjfwfaz+loWyHYRgIMIPzHzlOnhl9tluKcQFgdes6r+deft1JfnUzDuF0=,iv:DKrc3I+U2hWDH8nnc8ZQeaVtA1eVXu7SXdTn1fxHoH4=,tag:V0PL0GrL2NEPVslAZa801A==,type:str] wg_epicenter_works_psk: ENC[AES256_GCM,data:Den3NDWdP013Or6/2Vll1igUahuRSNW4hu+nDa5vkr93bbveQTaWFT4TD4U=,iv:r3UsD3+3lUIP2X3Grti7wpXTQBXtu1/MdrycEmpZfsI=,tag:ghbAcxmjGVOe9jCZsmFzjA==,type:str] wg_ghetto_at_key: ENC[AES256_GCM,data:OIHmoy3SpIi9aefZnZ1PzpyHbEso18ceoTULf2eQkx1rJbaxC6PD1lma7eQ=,iv:u0eFjHHOBzPTmBvBEQsYY5flcBayiAQKd6e7RyiPwJI=,tag:731C9wvv8bA5fuuQq+weVQ==,type:str] -gitea-mailer-password: ENC[AES256_GCM,data:M4qCWNt1oQVJzxThIjocm2frwuVMyx+69TBpke25RwxJxEQnvHL1CM579OVroTm7+gGE/oOJqAwDIepfiDtyM1xm,iv:jayFZMbu3uDimS/rIKZSeoU0MsYwWp880iEMs1oQE4k=,tag:qGDncRkyuCWaELhcxUrqtQ==,type:str] -ai-mailer-imap-password: ENC[AES256_GCM,data:kMxDPUK9rk7mbel5JDT03m3Y2w==,iv:cbnkNIVRXd7OLqueSrfYRzfaW9TzI+FauuQD8lgYIy0=,tag:63W7seIgt5TPVFQc84semQ==,type:str] -ai-mailer-openrouter-key: ENC[AES256_GCM,data:PCe8kt/M+7g087AKzYMY2H5WO4L+NGkHLsh47fMK36kz+Ju5kd/kpmM4GQcDbI3LgWm/P+T0/mv7kGGOL6KLmBFaFmGV/88cGw==,iv:ruVftGvnv+PX1Zd92tfOezpyaMbYrqCrexelyPUYFMc=,tag:z4JVUCfz/frehar6y+fOlQ==,type:str] -gitea-runner: ENC[AES256_GCM,data:NYG3qRLiMjmfA+oHYBXBbxpuX2ZjB/VgvLaS7yr5kJeDN/NukB/B3OZcEfsUWgbBS5IsLENESngWTFmK4W3htN4lSqdg/g4UsUr20beNov+pbyPN05rkBYmSCZZFwZ1L9POEE4GF4LuuoNpDlWIw0mrA8oV8MoI4W5QS2IGranBTIQQaYXU5TEGYa4XMVo4oC75iuH6DIq1KD6OgFAfMhm/wlbP8CP/Iaw2K8CNPxktk93pm3OSmggf22Z4JPEnvV25sc9iBkxLkDk9FXYFys0g=,iv:UzL5ncVOC/loJwcFSG1QJHnzLp3il4Hf3qDwLWxrIlo=,tag:w0Zn/E+02KyAsPXZdOLrew==,type:str] -gitea-runner-token: ENC[AES256_GCM,data:HpBjLS10w78ihbnAUrlCRGvwrXLBYKH5v/P7XggoUSWLoAazSVQArABxaK7PJas=,iv:q3Y6jV0gmug06O0EYqGVyIJ4AvMGr2ydwY17YKxo0Qw=,tag:Ws5HLbdaeYGGXzDZW/FX4w==,type:str] -home-assistant-ldap: ENC[AES256_GCM,data:uZEPbSnkgQYSd8ev6FD8TRHWWr+vusadtMcvP7KKL2AZAV0h1hga5fODN6I5u0DNL9hq2pNM+FwU0E/svWLRww==,iv:IhmUgSu34NaAY+kUZehx40uymydUYYAyte1aGqQ33/8=,tag:BKFCJPr7Vz4EG78ry/ZD7g==,type:str] 
-home-assistant-secrets.yaml: ENC[AES256_GCM,data:m7uOVo7hPk/RmqqRS6y7NKoMKsR9Bdi1ntatsZdDOAbJMjZmZL2FgPEHi/zF73zCfRfTOca3dwpulR3WXZ9Ic1sbUIggmusJMg4Gellw1CUhx7SbQN5nieAbPbB9GVxMuV4OakD1u7Swz8JggDT6IwojSnuD5omCRCyUH1wvKB+Re59q6EStderlm5MJNVFlVrbKVbLKLcw4yRgTh34BGnTTjcJmgSlQjO1ciu2B7YQmdl0Fw6d8AdbEzgB5TFG5ONc85UhJDE8Wlw==,iv:GCtpcVChN2UMWtfnWURozCfVj2YbRPqp/bH4Jjntybs=,tag:pcxP7gTBtXMNT5iyW5YXTw==,type:str] matrix-shared-secret: ENC[AES256_GCM,data:67imd3m6WBeGP/5Msmjy8B6sP983jMyWzRIzWgNVV5jZslX+GBJyEYzm3OTDs1iTZf4ScvuYheTH0QFPfw==,iv:7ElCpESWumbIHmmFaedcpkFm5M58ZT3vW9wb9e1Sbh4=,tag:wr4FIymtJBtCerVqae+Xlw==,type:str] palworld: ENC[AES256_GCM,data:rdqChPt4gSJHS1D60+HJ+4m5mg35JbC+pOmevK21Y95QyAIeyBLVGhRYlOaUcqdZM2e4atyTTSf6z4nHsm539ddCbW7J2DCdF5PQkrAGDmmdTVq+jyJAT8gTrbXXCglT1wvFYY5dbf2NKA4ASJIA8bdVNuwRZU0CtFiishzLuc9m8ZcGCNwQ/+xkMZgkUAHYRlEJAZyMpXR6KkFftiR05JRAFczD4N7GXPPe+vyvgXg7QBGtf20Qd4SGBUw0zI/SNTRmifHUuc4Z6+Fe9JHgvTc3uFcTMVnty0fEuL+a29liaVdAFq8BnqJfc5CNV401ZSUeMbG41lCn1cegP/WChs9J6HXNrhWDgiXa6ln++NoKcfOHIfZVbYOCoOxFR6+YWeBU2+sHmdwI9j5XQf5Ly2hmg12j0Ds2Cn8k4PG5aQP+HT2bedqyxwSt6fi97A0Osnh4ig7+DzYAjSNLewbYLzVdK39VdvB9hqLto+yFS3gAaeYOHwPwtqa+COI85c55lHiyKHlSwPhBqYaaiDu00lQTUzq9R5vz6F/l+T3bUjuna5RryUu8yhnk5DyK834KycTOg4ETcZTqro6prfiEBxc+Utsc9JvEtZgwFv6fsVLOu7nHxuiYuvseZ4YA8LlYdwPJboMPO2XsuhwWtT1uz/rh2orH7/vsXvzA/kF8NFemWBEMVLYA8byC5ze8doiGDYp4T5AAf10nJB1ceQ==,iv:gs78fxhvo9KlTaR5nzs12/LdgPChSFPHD2k4VQp3ARo=,tag:lpWBOi9xh2cWkS+71KD/UQ==,type:str] ark: ENC[AES256_GCM,data:YYGyzoVIKI9Ac1zGOr0BEpd3fgBsvp1hSwAvfO07/EQdg8ufMWUkNvqNHDKN62ZK5A1NnY3JTA1p4gyZ4ryQeAOsbwqU1GSk2YKHFyPeEnpLz/Ml82KMsv7XPGXuKRXZ4v3UcLu0R8k1Q0gQsMWo4FjCs3FF5mVtJG/YWxxbCYHoBLJ/di5p0DgjuFgJBQknYBpuLzr+yIoeqEyN7XcGYAJO53trEJuOOxLILULifkqISHjZ66i5F1fHW0iUdRbmeWV4aOAeOrsQqXYv,iv:gJwV5ip84zHqpU0l0uESfWWOtcgihMvEEdLaeI+twcU=,tag:sy8udVQsKxV/jOqwhJmWAg==,type:str] @@ -21,10 +23,6 @@ knot-tsig-key: ENC[AES256_GCM,data:H2jEkRSVSIJl1dSolAXj9uUmzD6eEh9zPpoajZLxfuuFt mopidy-spotify: 
ENC[AES256_GCM,data:O3s6UvTP8z5KZPCq10GaaEQntWAEoxGFMnTkeUz9AfobrpsGZJcQgyazFX2u4DgAaIjNb34032MISotmuVQDJ14mi8xI5vC9w/Vf16v3TFu/dSKGZNb5ZPQwTUQ+iMJf7chgwOV9guThhutVJokb6pLxzt7fSht7,iv:j8+X1AmuWzIJdafzgrE7WBIlZ7coNNi0/Zn6JObR6rw=,tag:fiw6M2/6nfEPqEgV2YOWLg==,type:str] lms-spotify: ENC[AES256_GCM,data:gh5kx/MDSefNLbZsnovRc3rNWxp/RTrJ4A2WIs1QMi4JVGFj9SppdsErMXW4y/IFj/YxH1X7JtwvhptO/p3P2CFK0XL2I1vFVqPuj7LavDHJK7GXPAV6+x17ldvPXgym5NqHjzHi4gtj7U/bMJlz0NxrFsrrjMcY9nmNX2vVwKlINUFqWb1JRvQsJ8ujSutjJbGtAY/bVQI8OFtU29QGKw1CU3RH/bgXIzxGiLQsUd68w7N17oKYj8MiTpGVcovMCRKwwUbd9w==,iv:4aVy+r//s1Cs9q4GasR3vSAb8b/VB/8Mx5E1jWAUA+E=,tag:TgTSLLH1OG9ySi2tZ+hK1Q==,type:str] sops: - kms: [] - gcp_kms: [] - azure_kv: [] - hc_vault: [] age: - recipient: age14grjcxaq4h55yfnjxvnqhtswxhj9sfdcvyas4lwvpa8py27pjy2sv3g6v7 enc: | @@ -62,8 +60,7 @@ sops: WXJpUUxadERyYUExRFMzNzBXaUVET3cKG9ZwWy5YvTr/BAw/i+ZJos5trwRvaW5j eV/SHiEteZZtCuCVFAp3iolE/mJyu97nA2yFwWaLN86h+/xkOJsdqA== -----END AGE ENCRYPTED FILE----- - lastmodified: "2025-05-29T18:23:13Z" - mac: ENC[AES256_GCM,data:19U1KlPoC/hj8sGRjO3j/ONYcFvmUTul6qP6CaRE0BhJfpeaVYq5OvqdErVnw8UA/zBJ+zpSX/N13jcsx8QVqTljMha2fbx7iZxMbpVgzGZ+fhwICLri6PwT/sNLXKFrv8VZqNUYR5q+PWSlKCu8QQarDPvGR6qj4gm7VN7tVsI=,iv:udieJwN63LEeCRhZrLpMN6VCHBzAYt8BeJhbbLVxwCM=,tag:M6iYQb/b7vMoM+9e5is3hw==,type:str] - pgp: [] + lastmodified: "2025-05-31T08:08:02Z" + mac: ENC[AES256_GCM,data:p6FHDa6Xfd66pH4zB8s6nhGGk2Ha2YTC/wUsCrqu+9M01VQ7qv9tha1MpKMj9TUxSPSxPOI++5zkNi5LJbs4Y4q0KH4yd9w/guMmJB2+d2YUwNCTofvmQp3wS1KtaRbaai6mAXZELaVEsRkmwUdkdApNbSZkTZgDc+CMH7OmHbs=,iv:w/kv2wRO6N4k1U7y8efS7LXhrpMxkZ9kTs3lFo23MA8=,tag:F4rZGG00AQZLfGU3djgW8Q==,type:str] unencrypted_suffix: _unencrypted - version: 3.9.4 + version: 3.10.2 From 81f04c6c51ae6febcbe9e5673ca67d3627361a05 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sat, 31 May 2025 12:53:02 +0200 Subject: [PATCH 6/6] refactor: remove unused MAC address entry from dnsmasq configuration, update gitea-vm to include network settings, enhance 
grafana-monitor with internet connectivity check, and clean up web module imports --- hosts/fw/modules/dnsmasq.nix | 1 - hosts/fw/modules/gitea-vm.nix | 9 ++++++++- hosts/fw/modules/grafana-monitor.nix | 10 ++++++++++ hosts/fw/modules/web/default.nix | 1 - 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/hosts/fw/modules/dnsmasq.nix b/hosts/fw/modules/dnsmasq.nix index 8110621..8be86a7 100644 --- a/hosts/fw/modules/dnsmasq.nix +++ b/hosts/fw/modules/dnsmasq.nix @@ -70,7 +70,6 @@ "24:df:a7:b1:1b:74,${config.networkPrefix}.96.101,rmproplus-b1-1b-74" "1a:c4:04:6e:29:bd,${config.networkPrefix}.97.2,omada" - "02:00:00:00:00:03,${config.networkPrefix}.97.5,web-02" "02:00:00:00:00:04,${config.networkPrefix}.97.6,matrix" "ea:db:d4:c1:18:ba,${config.networkPrefix}.97.50,git" "c2:4f:64:dd:13:0c,${config.networkPrefix}.97.20,home-assistant" diff --git a/hosts/fw/modules/gitea-vm.nix b/hosts/fw/modules/gitea-vm.nix index b9c65b5..d202cd4 100644 --- a/hosts/fw/modules/gitea-vm.nix +++ b/hosts/fw/modules/gitea-vm.nix @@ -1,4 +1,4 @@ -{ lib, nixpkgs, pkgs, ... }: let +{ config, lib, nixpkgs, pkgs, ... 
}: let # hostname = "git-02"; # json = pkgs.formats.json { }; runners = ["git-runner-1" "git-runner-2"]; @@ -38,6 +38,13 @@ in { ]; }; + systemd.network.networks."10-lan" = { + matchConfig.PermanentMACAddress = "02:00:00:00:00:0${toString idx}"; + address = [ "${config.networkPrefix}.97.5${toString idx}/24" ]; + gateway = [ "${config.networkPrefix}.97.1" ]; + dns = [ "${config.networkPrefix}.97.1" ]; + }; + networking.hostName = runner; virtualisation.podman.enable = true; diff --git a/hosts/fw/modules/grafana-monitor.nix b/hosts/fw/modules/grafana-monitor.nix index b8effdb..c99b9fc 100644 --- a/hosts/fw/modules/grafana-monitor.nix +++ b/hosts/fw/modules/grafana-monitor.nix @@ -46,6 +46,16 @@ let fi PUSHOVER_USER_KEY=$(cat "''${PUSHOVER_USER_KEY_FILE}") +# Internet connectivity check + INTERNET_CHECK_URL="https://1.1.1.1" # Using a reliable IP to bypass potential DNS issues for the check itself + echo "Performing internet connectivity check to ''${INTERNET_CHECK_URL}..." + if ! ${pkgs.curl}/bin/curl --head --silent --fail --connect-timeout 3 --max-time 5 "''${INTERNET_CHECK_URL}" > /dev/null 2>&1; then + echo "Internet connectivity check failed. Cannot reach ''${INTERNET_CHECK_URL}. Skipping Grafana check and exiting successfully." + exit 0 + else + echo "Internet connectivity check successful. Proceeding with Grafana check." + fi + echo "" # Add a blank line for readability before Grafana check logs echo "Checking Grafana at ''${GRAFANA_URL}..." ACTUAL_HTTP_CODE="000" # Default if curl doesn't provide one CURL_ERROR_MESSAGE="" diff --git a/hosts/fw/modules/web/default.nix b/hosts/fw/modules/web/default.nix index 14b06c5..3fcfad8 100644 --- a/hosts/fw/modules/web/default.nix +++ b/hosts/fw/modules/web/default.nix @@ -52,7 +52,6 @@ in { ../network-prefix.nix ../../utils/modules/sops.nix ../../utils/modules/lego/lego.nix - ../../modules/tinder-api.nix # ../../utils/modules/borgbackup.nix ./zammad.nix