From fa42667c2a73ef289fec23887ca366de35b24c91 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Fri, 30 May 2025 18:32:47 +0200
Subject: [PATCH 1/6] fix: update NixOS channel references to version 25.05 and
 adjust netdata configuration

---
 hosts/fw/channel                 | 2 +-
 hosts/mail/channel               | 2 +-
 hosts/mail/modules/dovecot.nix   | 4 +---
 hosts/nb/configuration.nix       | 1 +
 hosts/nb/modules/fingerprint.nix | 1 -
 hosts/web-arm/channel            | 2 +-
 utils/modules/netdata.nix        | 8 ++++++--
 7 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/hosts/fw/channel b/hosts/fw/channel
index ced117e..93f5df5 100644
--- a/hosts/fw/channel
+++ b/hosts/fw/channel
@@ -1 +1 @@
-https://channels.nixos.org/nixos-24.11
+https://channels.nixos.org/nixos-25.05
diff --git a/hosts/mail/channel b/hosts/mail/channel
index ced117e..93f5df5 100644
--- a/hosts/mail/channel
+++ b/hosts/mail/channel
@@ -1 +1 @@
-https://channels.nixos.org/nixos-24.11
+https://channels.nixos.org/nixos-25.05
diff --git a/hosts/mail/modules/dovecot.nix b/hosts/mail/modules/dovecot.nix
index a9ef486..cf8e8c1 100644
--- a/hosts/mail/modules/dovecot.nix
+++ b/hosts/mail/modules/dovecot.nix
@@ -88,6 +88,7 @@ in
 {
   environment.systemPackages = with pkgs; [
     doveSync
+    dovecot_pigeonhole
   ];
 
   services.dovecot2 = {
@@ -215,9 +216,6 @@ in
       # Read multiple mails in parallel, improves performance
       mail_prefetch_count = 20
     '';
-    modules = [
-      pkgs.dovecot_pigeonhole
-    ];
     protocols = [
       "sieve"
     ];
diff --git a/hosts/nb/configuration.nix b/hosts/nb/configuration.nix
index 0e2bb86..c708175 100644
--- a/hosts/nb/configuration.nix
+++ b/hosts/nb/configuration.nix
@@ -146,6 +146,7 @@ in {
       "/var/lib/bluetooth"
       "/var/lib/docker"
       "/var/lib/flatpak"
+      "/var/lib/fprint"
       "/var/lib/nixos"
       "/var/lib/mysql"
       "/etc/NetworkManager/system-connections"
diff --git a/hosts/nb/modules/fingerprint.nix b/hosts/nb/modules/fingerprint.nix
index d26a50d..5ff0be2 100644
--- a/hosts/nb/modules/fingerprint.nix
+++ b/hosts/nb/modules/fingerprint.nix
@@ -5,7 +5,6 @@
 
   security.pam.services.login.fprintAuth = true;
   security.pam.services.sudo.fprintAuth = true;
-  security.pam.services.sddm.fprintAuth = true;
   # If you use swaylock and want fingerprint auth for it:
   security.pam.services.swaylock.fprintAuth = true;
   # Add Polkit rule to allow locally active users to manage their own fingerprints
diff --git a/hosts/web-arm/channel b/hosts/web-arm/channel
index ced117e..93f5df5 100644
--- a/hosts/web-arm/channel
+++ b/hosts/web-arm/channel
@@ -1 +1 @@
-https://channels.nixos.org/nixos-24.11
+https://channels.nixos.org/nixos-25.05
diff --git a/utils/modules/netdata.nix b/utils/modules/netdata.nix
index 1aab534..2160f1d 100644
--- a/utils/modules/netdata.nix
+++ b/utils/modules/netdata.nix
@@ -1,10 +1,14 @@
-{ config, pkgs, ... }:
+{ config, lib, pkgs, ... }:
 let
   unstable = import (fetchTarball https://nixos.org/channels/nixos-unstable/nixexprs.tar.xz) {
     config = { allowUnfree = true; };
   };
 in
 {
+  nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [
+    "netdata"
+  ];
+
   services.netdata.configDir."python.d.conf" = pkgs.writeText "python.d.conf" ''
     postfix: yes
   '';
@@ -14,7 +18,7 @@ in
     python.enable = true;
 
     package = pkgs.netdata.override {
-      withCloud = true;
+      withCloudUi = true;
     };
 
     config = {

From 17a3602d3ce7154a498ffa06ccd21010259cc5a4 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Fri, 30 May 2025 21:39:58 +0200
Subject: [PATCH 2/6] feat: implement centralized alerting with vmalert and
 Grafana, add alert rules for CPU, disk, inode, RAM usage, and host status

---
 hosts/web-arm/configuration.nix               |  1 +
 hosts/web-arm/modules/grafana.nix             | 91 ++++++++++++++-----
 hosts/web-arm/modules/vmalert/default.nix     | 38 ++++++++
 .../modules/vmalert/rules/cpu_usage.nix       | 26 ++++++
 .../modules/vmalert/rules/disk_usage.nix      | 27 ++++++
 .../modules/vmalert/rules/host_down.nix       | 23 +++++
 .../modules/vmalert/rules/inode_usage.nix     | 27 ++++++
 .../modules/vmalert/rules/ram_usage.nix       | 23 +++++
 8 files changed, 231 insertions(+), 25 deletions(-)
 create mode 100644 hosts/web-arm/modules/vmalert/default.nix
 create mode 100644 hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
 create mode 100644 hosts/web-arm/modules/vmalert/rules/disk_usage.nix
 create mode 100644 hosts/web-arm/modules/vmalert/rules/host_down.nix
 create mode 100644 hosts/web-arm/modules/vmalert/rules/inode_usage.nix
 create mode 100644 hosts/web-arm/modules/vmalert/rules/ram_usage.nix

diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix
index ac3d270..68a0e0c 100644
--- a/hosts/web-arm/configuration.nix
+++ b/hosts/web-arm/configuration.nix
@@ -17,6 +17,7 @@
     ./modules/grafana.nix
     ./modules/loki.nix
     ./modules/victoriametrics.nix
+    ./modules/vmalert/default.nix # Added vmalert module
     ./modules/updns.nix
 
     ./utils/modules/autoupgrade.nix
diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana.nix
index f8ef660..34fcd37 100644
--- a/hosts/web-arm/modules/grafana.nix
+++ b/hosts/web-arm/modules/grafana.nix
@@ -89,32 +89,73 @@ in
     };
     provision = {
       alerting = {
-        contactPoints.settings = {
-          apiVersion = 1;
-
-          contactPoints = [{
-            orgId = 1;
-            name = "cp_dominik";
-            receivers = [{
-              uid = "dominik";
-              type = "pushover";
-              settings = {
-                security.apiToken = "$__file{${config.sops.secrets.pushover-api-token.path}}";
-                security.userKey = "$__file{${config.sops.secrets.pushover-user-key.path}}";
-                apiToken = "\${PUSHOVER_API_TOKEN}";
-                userKey = "\${PUSHOVER_USER_KEY}";
-                device = "iphone";
-                priority = "2";
-                retry = "30";
-                expire = "120";
-                sound = "siren";
-                okSound = "magic";
-                message = ''
-                  {{ template "default.message" . }}
-                '';
-              };
+        contactPoints = {
+          settings = {
+            apiVersion = 1; # As per Grafana provisioning API
+            contactPoints = [{
+              orgId = 1;
+              name = "cp_dominik";
+              receivers = [{
+                uid = "dominik_pushover_cp_receiver"; # Stable UID of this receiver
+                type = "pushover";
+                settings = {
+                  apiToken = "\${PUSHOVER_API_TOKEN}"; # Escaped so Nix emits a literal ${...} for Grafana to expand
+                  userKey = "\${PUSHOVER_USER_KEY}";
+                  device = "iphone";
+                  priority = 2;
+                  retry = "30"; # Seconds; Grafana parses this as an integer, so "30s" is rejected
+                  expire = "120"; # Seconds; same integer parsing as retry, so "2m" is rejected
+                  sound = "siren";
+                  okSound = "magic";
+                  message = ''
+                    {{ template "default.message" . }}
+                  '';
+                };
+              }];
             }];
-          }];
+          };
+        };
+
+        policies = { # Notification policy tree; the option is `policies`, not `notificationPolicies`
+          settings = {
+            apiVersion = 1; # As per Grafana provisioning API
+
+            # Grafana unified alerting keeps one notification policy tree per
+            # organisation. Each entry of the `policies` list below is the root
+            # of one such tree.
+            #
+            # Only a single tree is declared here. It carries no `orgId`, so it
+            # presumably applies to the default organisation (orgId 1) -- TODO
+            # confirm against the Grafana file-provisioning documentation.
+            #
+            # The root entry names the default receiver for all alerts.
+            policies = [{ # Root policy; one list entry per policy tree
+              # orgId = 1; # Usually implicit for the default policy file, but can be specified
+              receiver = "cp_dominik"; # Default receiver: the contact point named "cp_dominik"
+
+              # No child routes are declared, so the root receiver is the only
+              # receiver referenced by this tree. If more complex routing is
+              # needed, child routes go into a `routes` list on this same
+              # attribute set (NOTE(review): not under a separate `route` key).
+              # Example (kept commented out):
+              #
+              # group_by = [ "alertname" "job" ];
+              # routes = [
+              #   {
+              #     # Send alerts labelled severity=critical elsewhere
+              #     object_matchers = [
+              #       [ "severity" "=" "critical" ]
+              #     ];
+              #     receiver = "critical_alerts_receiver"; # Another contact point
+              #     continue = false;
+              #   }
+              #   # ... other specific routes
+              # ];
+              # For the simplest case, just defining the receiver at this level should work
+              # as the root policy for the default organization.
+            }];
+            # resetPolicies = false; # Default, set to true to remove existing policies not in this config.
+          };
         };
       };
     };
diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix
new file mode 100644
index 0000000..6615b58
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/default.nix
@@ -0,0 +1,38 @@
+{ config, pkgs, lib, ... }:
+{
+  imports = [
+    ./rules/cpu_usage.nix
+    ./rules/disk_usage.nix
+    ./rules/host_down.nix
+    ./rules/inode_usage.nix
+    ./rules/ram_usage.nix
+  ];
+
+  # Standard vmalert service configuration
+  services.vmalert = {
+    enable = true;
+    settings = {
+      "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address
+      "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings
+    };
+    # 'rules' is now set by the mkMerge block above.
+  };
+
+  # Override the User and Group for the systemd service managed by the official vmalert module.
+  systemd.services.vmalert = {
+    serviceConfig = {
+      User = "victoriametrics";
+      Group = "victoriametrics";
+    };
+  };
+
+  # Ensure the user/group itself exists on the system.
+  users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
+    isSystemUser = true;
+    group = "victoriametrics"; # Primary group for the user
+    home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components
+  };
+  users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
+    # Ensures the group exists.
+  };
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
new file mode 100644
index 0000000..71b8dbc
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
@@ -0,0 +1,26 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  # This module contributes its rule group to a list that will be
+  # collected and processed by the main vmalert module.
+  services.vmalert.rules.groups = [
+    {
+      name = "CPUUsageAlerts";
+      # interval = "60s"; # Optional: group-level interval
+      rules = [ # This MUST be a list of rule attribute sets
+        {
+          alert = "HighCPUUsage";
+          expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90";
+          for = "5m";
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+          annotations = {
+            summary = "High CPU usage on {{ $labels.instance }}";
+            description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
new file mode 100644
index 0000000..65570fd
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
@@ -0,0 +1,27 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "DiskUsageAlerts";
+      rules = [
+        {
+          alert = "HighDiskUsage";
+          expr = ''
+            (
+              node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85
+          '';
+          for = "15m";
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+          annotations = {
+            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix
new file mode 100644
index 0000000..0960bc4
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/host_down.nix
@@ -0,0 +1,23 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "HostStatusAlerts";
+      rules = [
+        {
+          alert = "HostDown";
+          expr = "up == 0";
+          for = "2m";
+          labels = {
+            severity = "critical";
+            category = "availability";
+          };
+          annotations = {
+            summary = "Host {{ $labels.instance }} is down";
+            description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
new file mode 100644
index 0000000..2e2245e
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
@@ -0,0 +1,27 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "InodeUsageAlerts";
+      rules = [
+        {
+          alert = "HighInodeUsage";
+          expr = ''
+            (
+              node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80
+          '';
+          for = "30m";
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+          annotations = {
+            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
new file mode 100644
index 0000000..4116b05
--- /dev/null
+++ b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
@@ -0,0 +1,23 @@
+{ lib, pkgs, config, ... }: # Standard module arguments
+{
+  services.vmalert.rules.groups = [
+    {
+      name = "RAMUsageAlerts";
+      rules = [
+        {
+          alert = "HighRAMUsage";
+          expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90";
+          for = "10m";
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+          annotations = {
+            summary = "High RAM usage on {{ $labels.instance }}";
+            description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
+          };
+        }
+      ];
+    }
+  ];
+}

From 8b5fb0861d090e223736cda7a764052f96fd928e Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Sat, 31 May 2025 09:27:25 +0200
Subject: [PATCH 3/6] feat: restructure Grafana configuration, migrate alert
 rules to new format and add VictoriaMetrics datasource

---
 hosts/web-arm/configuration.nix               |  3 +-
 .../modules/grafana/alerting/cpu_usage.nix    | 66 ++++++++++++++
 .../modules/grafana/alerting/disk_usage.nix   | 85 +++++++++++++++++++
 .../modules/grafana/alerting/host_down.nix    | 62 ++++++++++++++
 .../modules/grafana/alerting/inode_usage.nix  | 71 ++++++++++++++++
 .../modules/grafana/alerting/ram_usage.nix    | 69 +++++++++++++++
 .../grafana/datasources/victoriametrics.nix   | 18 ++++
 .../{grafana.nix => grafana/default.nix}      | 12 +++
 hosts/web-arm/modules/vmalert/default.nix     | 38 ---------
 .../modules/vmalert/rules/cpu_usage.nix       | 26 ------
 .../modules/vmalert/rules/disk_usage.nix      | 27 ------
 .../modules/vmalert/rules/host_down.nix       | 23 -----
 .../modules/vmalert/rules/inode_usage.nix     | 27 ------
 .../modules/vmalert/rules/ram_usage.nix       | 23 -----
 14 files changed, 384 insertions(+), 166 deletions(-)
 create mode 100644 hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/disk_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/host_down.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/inode_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/alerting/ram_usage.nix
 create mode 100644 hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
 rename hosts/web-arm/modules/{grafana.nix => grafana/default.nix} (94%)
 delete mode 100644 hosts/web-arm/modules/vmalert/default.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/disk_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/host_down.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/inode_usage.nix
 delete mode 100644 hosts/web-arm/modules/vmalert/rules/ram_usage.nix

diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix
index 68a0e0c..54c74d9 100644
--- a/hosts/web-arm/configuration.nix
+++ b/hosts/web-arm/configuration.nix
@@ -14,10 +14,9 @@
     ./modules/nextcloud
     ./modules/rustdesk.nix
     ./modules/postgresql.nix
-    ./modules/grafana.nix
+    ./modules/grafana/default.nix
     ./modules/loki.nix
     ./modules/victoriametrics.nix
-    ./modules/vmalert/default.nix # Added vmalert module
     ./modules/updns.nix
 
     ./utils/modules/autoupgrade.nix
diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
new file mode 100644
index 0000000..515fabb
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
@@ -0,0 +1,66 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "CPUUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-cpu-usage-alert-uid";
+          title = "HighCPUUsage";
+          condition = "D"; # refId of the expression that decides whether the rule fires
+
+          data = [
+            # Query A: CPU usage as a percentage on a 0-100 scale
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+              model = {
+                # Calculate average CPU usage over 1m, grouped by instance and job
+                expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
+                legendFormat = "CPU usage on {{instance}} ({{job}})";
+                instant = false; # This is a range query
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 90"; # Alert if CPU usage from C is > 90%
+              };
+            }
+          ];
+
+          for = "5m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = { # C is already 0-100: format with printf (humanizePercentage expects a 0-1 ratio and adds its own %)
+            summary = "High CPU usage on {{ $labels.instance }}";
+            description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.'';
+          };
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
new file mode 100644
index 0000000..b30686b
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
@@ -0,0 +1,85 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      # orgId = 1; # Defaults to 1 for provisioned rules
+      name = "DiskUsageAlerts";      # Name of the rule group
+      folder = "System Alerts";       # The folder these rules belong to in Grafana UI
+      interval = "1m";              # How often to evaluate rules in this group
+
+      rules = [
+        {
+          uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
+          title = "HighDiskUsage";          # Name of the alert rule (was 'alert' in vmalert)
+
+          # Condition for the alert to fire. 'D' refers to the refId of the math expression below.
+          condition = "D";
+          # The time range is set per query (see relativeTimeRange on query A), not on the rule.
+
+          # Data queries and expressions
+          data = [
+            # Query A: Calculate disk usage percentage (0-100 scale)
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
+              queryType = "prometheus"; # Explicitly set, though often inferred
+              relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
+              model = {
+                expr = ''
+                  (
+                    node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+                  and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                '';
+                legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
+                instant = false; # For range queries, default is false
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 85"; # Check if the last value from each series in C is > 85
+              };
+            }
+          ];
+
+          for = "15m"; # Duration the condition must be met (same as vmalert)
+
+          # How to handle states where data is missing or query errors
+          noDataState = "NoData";   # Options: NoData, Alerting, OK
+          execErrState = "Error"; # Options: Error, Alerting, OK
+
+          annotations = {
+            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = ''
+              Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
+              (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
+              Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.
+            ''; # C is already 0-100, so printf is used; humanizePercentage expects a 0-1 ratio and adds its own %
+          };
+          labels = {
+            severity = "warning";
+            category = "capacity";
+            # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
+            # and labels from the rule group/folder.
+          };
+          # isPaused = false; # Default is not paused
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix
new file mode 100644
index 0000000..a2d938f
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@@ -0,0 +1,62 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "HostStatusAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "host-down-alert-uid";
+          title = "HostDown";
+          condition = "C"; # refId of the expression that decides whether the rule fires
+
+          data = [
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+              model = {
+                expr = ''up'';
+                legendFormat = "{{instance}} ({{job}})";
+                instant = false; # Range query over relativeTimeRange; expression B reduces it
+              };
+            }
+            { # Expression B: reduce query A to one value per series
+              refId = "B";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            { # Expression C: math condition evaluated on the reduced values from B
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$B == 0"; # Check if the last value from B is 0
+              };
+            }
+          ];
+
+          for = "2m";
+          noDataState = "Alerting"; # Unlike the usage rules in this directory, missing data is set to alert
+          execErrState = "Error";
+
+          annotations = {
+            summary = "Host {{ $labels.instance }} is down";
+            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          };
+          labels = {
+            severity = "critical";
+            category = "availability";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
new file mode 100644
index 0000000..8f67178
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
@@ -0,0 +1,71 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "InodeUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-inode-usage-alert-uid";
+          title = "HighInodeUsage";
+          condition = "D"; # refId of the expression that decides whether the rule fires
+
+          data = [
+            # Query A: Calculate inode usage percentage (0-100 scale)
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; };
+              model = {
+                expr = ''
+                  (
+                    node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+                  and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                  and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+                '';
+                legendFormat = "{{mountpoint}} on {{instance}}";
+                instant = false;
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 80"; # Alert if inode usage from C is > 80%
+              };
+            }
+          ];
+
+          for = "30m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = { # C is already 0-100: format with printf (humanizePercentage expects a 0-1 ratio and adds its own %)
+            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+            description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.'';
+          };
+          labels = {
+            severity = "warning";
+            category = "capacity";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
new file mode 100644
index 0000000..03dd931
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
@@ -0,0 +1,69 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "RAMUsageAlerts";
+      folder = "System Alerts";
+      interval = "1m";
+
+      rules = [
+        {
+          uid = "high-ram-usage-alert-uid";
+          title = "HighRAMUsage";
+          condition = "D"; # Condition is now D
+
+          data = [
+            # Query A: Calculate RAM usage percentage
+            {
+              refId = "A";
+              datasourceUid = "vm-datasource-uid";
+              queryType = "prometheus";
+              relativeTimeRange = { from = 60; to = 0; };
+              model = {
+                expr = ''
+                  (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
+                  and node_memory_MemAvailable_bytes
+                  and node_memory_MemTotal_bytes
+                '';
+                legendFormat = "RAM usage on {{instance}} ({{job}})";
+                instant = false;
+              };
+            }
+            # Expression C: Reduce Query A to its last value, preserving labels
+            {
+              refId = "C";
+              datasourceUid = "__expr__";
+              model = {
+                type = "reduce";
+                expression = "A"; # Input is Query A
+                reducer = "last"; # Get the last value of each series in A
+              };
+            }
+            # Expression D: Apply math condition to the reduced values from C
+            {
+              refId = "D";
+              datasourceUid = "__expr__";
+              model = {
+                type = "math";
+                expression = "$C > 90"; # Alert if RAM usage from C is > 90%
+              };
+            }
+          ];
+
+          for = "10m"; # Duration the condition must be met
+          noDataState = "NoData";
+          execErrState = "Error";
+
+          annotations = {
+            summary = "High RAM usage on {{ $labels.instance }}";
+            description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          };
+          labels = {
+            severity = "warning";
+            category = "performance";
+          };
+        }
+      ];
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
new file mode 100644
index 0000000..57ea78a
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/datasources/victoriametrics.nix
@@ -0,0 +1,18 @@
+{ lib, pkgs, config, ... }:
+{
+  services.grafana.provision.datasources.settings.datasources = [
+    {
+      name = "VictoriaMetrics";
+      uid = "vm-datasource-uid"; # Stable UID for referencing in alerts
+      type = "prometheus";
+      url = "http://localhost:8428"; # URL of VictoriaMetrics
+      access = "proxy"; # Grafana proxies requests
+      isDefault = true; # Optional: make this the default datasource
+      jsonData = {
+        # timeInterval = "30s"; # Optional: Scrape interval if different from Grafana's default
+        # httpMethod = "POST"; # Optional: if VictoriaMetrics prefers POST for queries
+      };
+      editable = false; # Recommended for provisioned datasources
+    }
+  ];
+}
\ No newline at end of file
diff --git a/hosts/web-arm/modules/grafana.nix b/hosts/web-arm/modules/grafana/default.nix
similarity index 94%
rename from hosts/web-arm/modules/grafana.nix
rename to hosts/web-arm/modules/grafana/default.nix
index 34fcd37..6d1394f 100644
--- a/hosts/web-arm/modules/grafana.nix
+++ b/hosts/web-arm/modules/grafana/default.nix
@@ -27,6 +27,16 @@ let
   };
 in
 {
+  imports = [
+    ./alerting/disk_usage.nix
+    ./alerting/cpu_usage.nix
+    ./alerting/host_down.nix
+    ./alerting/inode_usage.nix
+    ./alerting/ram_usage.nix
+    # ... other rule files can be added here ...
+    ./datasources/victoriametrics.nix
+  ];
+
   systemd.services.grafana.script = lib.mkBefore ''
     export GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET=$(cat /run/secrets/grafana-oauth-secret)
     export PUSHOVER_API_TOKEN=$(cat /run/secrets/pushover-api-token)
@@ -89,6 +99,7 @@ in
     };
     provision = {
       alerting = {
+        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged
         contactPoints = {
           settings = {
             apiVersion = 1; # As per Grafana provisioning API
@@ -158,6 +169,7 @@ in
           };
         };
       };
+      datasources.settings.datasources = lib.mkMerge []; # Allows datasources to be merged
     };
   };
 
diff --git a/hosts/web-arm/modules/vmalert/default.nix b/hosts/web-arm/modules/vmalert/default.nix
deleted file mode 100644
index 6615b58..0000000
--- a/hosts/web-arm/modules/vmalert/default.nix
+++ /dev/null
@@ -1,38 +0,0 @@
-{ config, pkgs, lib, ... }:
-{
-  imports = [
-    ./rules/cpu_usage.nix
-    ./rules/disk_usage.nix
-    ./rules/host_down.nix
-    ./rules/inode_usage.nix
-    ./rules/ram_usage.nix
-  ];
-
-  # Standard vmalert service configuration
-  services.vmalert = {
-    enable = true;
-    settings = {
-      "datasource.url" = "http://localhost:8428"; # VictoriaMetrics address
-      "notifier.url" = [ "http://localhost:3001/api/alertmanager/grafana/api/v2/alerts" ]; # Must be a list of strings
-    };
-    # 'rules' is now set by the mkMerge block above.
-  };
-
-  # Override the User and Group for the systemd service managed by the official vmalert module.
-  systemd.services.vmalert = {
-    serviceConfig = {
-      User = "victoriametrics";
-      Group = "victoriametrics";
-    };
-  };
-
-  # Ensure the user/group itself exists on the system.
-  users.users.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
-    isSystemUser = true;
-    group = "victoriametrics"; # Primary group for the user
-    home = "/var/lib/victoriametrics"; # Standard home for VictoriaMetrics components
-  };
-  users.groups.victoriametrics = lib.mkIf (config.services.victoriametrics.enable || config.services.vmalert.enable) {
-    # Ensures the group exists.
-  };
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix b/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
deleted file mode 100644
index 71b8dbc..0000000
--- a/hosts/web-arm/modules/vmalert/rules/cpu_usage.nix
+++ /dev/null
@@ -1,26 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  # This module contributes its rule group to a list that will be
-  # collected and processed by the main vmalert module.
-  services.vmalert.rules.groups = [
-    {
-      name = "CPUUsageAlerts";
-      # interval = "60s"; # Optional: group-level interval
-      rules = [ # This MUST be a list of rule attribute sets
-        {
-          alert = "HighCPUUsage";
-          expr = "(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))) * 100 > 90";
-          for = "5m";
-          labels = {
-            severity = "warning";
-            category = "performance";
-          };
-          annotations = {
-            summary = "High CPU usage on {{ $labels.instance }}";
-            description = "CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix b/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
deleted file mode 100644
index 65570fd..0000000
--- a/hosts/web-arm/modules/vmalert/rules/disk_usage.nix
+++ /dev/null
@@ -1,27 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "DiskUsageAlerts";
-      rules = [
-        {
-          alert = "HighDiskUsage";
-          expr = ''
-            (
-              node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-            ) / node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 85
-          '';
-          for = "15m";
-          labels = {
-            severity = "warning";
-            category = "capacity";
-          };
-          annotations = {
-            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = "Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/host_down.nix b/hosts/web-arm/modules/vmalert/rules/host_down.nix
deleted file mode 100644
index 0960bc4..0000000
--- a/hosts/web-arm/modules/vmalert/rules/host_down.nix
+++ /dev/null
@@ -1,23 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "HostStatusAlerts";
-      rules = [
-        {
-          alert = "HostDown";
-          expr = "up == 0";
-          for = "2m";
-          labels = {
-            severity = "critical";
-            category = "availability";
-          };
-          annotations = {
-            summary = "Host {{ $labels.instance }} is down";
-            description = "Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix b/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
deleted file mode 100644
index 2e2245e..0000000
--- a/hosts/web-arm/modules/vmalert/rules/inode_usage.nix
+++ /dev/null
@@ -1,27 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "InodeUsageAlerts";
-      rules = [
-        {
-          alert = "HighInodeUsage";
-          expr = ''
-            (
-              node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-            ) / node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} * 100 > 80
-          '';
-          for = "30m";
-          labels = {
-            severity = "warning";
-            category = "capacity";
-          };
-          annotations = {
-            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = "Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}
diff --git a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix b/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
deleted file mode 100644
index 4116b05..0000000
--- a/hosts/web-arm/modules/vmalert/rules/ram_usage.nix
+++ /dev/null
@@ -1,23 +0,0 @@
-{ lib, pkgs, config, ... }: # Standard module arguments
-{
-  services.vmalert.rules.groups = [
-    {
-      name = "RAMUsageAlerts";
-      rules = [
-        {
-          alert = "HighRAMUsage";
-          expr = "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90";
-          for = "10m";
-          labels = {
-            severity = "warning";
-            category = "performance";
-          };
-          annotations = {
-            summary = "High RAM usage on {{ $labels.instance }}";
-            description = "RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ $value | printf \"%.2f\" }}%.";
-          };
-        }
-      ];
-    }
-  ];
-}

From 35fa61ef34fe8e55849032cab6144cf5f716b904 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Sat, 31 May 2025 09:57:03 +0200
Subject: [PATCH 4/6] feat: refactor Grafana alerting rules into a consolidated
 system module and update individual alert files

---
 .../modules/grafana/alerting/cpu_usage.nix    | 100 ++++++-------
 .../modules/grafana/alerting/disk_usage.nix   | 137 ++++++++----------
 .../modules/grafana/alerting/host_down.nix    |  92 ++++++------
 .../modules/grafana/alerting/inode_usage.nix  | 110 +++++++-------
 .../modules/grafana/alerting/ram_usage.nix    | 106 +++++++-------
 .../grafana/alerting/system/default.nix       |  21 +++
 hosts/web-arm/modules/grafana/default.nix     |  14 +-
 7 files changed, 281 insertions(+), 299 deletions(-)
 create mode 100644 hosts/web-arm/modules/grafana/alerting/system/default.nix

diff --git a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
index 515fabb..9c09881 100644
--- a/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/cpu_usage.nix
@@ -1,66 +1,58 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "CPUUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-cpu-usage-alert-uid";
+      title = "HighCPUUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate CPU usage percentage
         {
-          uid = "high-cpu-usage-alert-uid";
-          title = "HighCPUUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate CPU usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                # Calculate average CPU usage over 1m, grouped by instance and job
-                expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
-                legendFormat = "CPU usage on {{instance}} ({{job}})";
-                instant = false; # This is a range query
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if CPU usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "5m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High CPU usage on {{ $labels.instance }}";
-            description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            # Calculate average CPU usage over 1m, grouped by instance and job
+            expr = ''(1 - avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100'';
+            legendFormat = "CPU usage on {{instance}} ({{job}})";
+            instant = false; # This is a range query
           };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if CPU usage from C is > 90%
           };
         }
       ];
+
+      for = "5m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # Query A is already scaled to 0-100, so the value is printed with printf; humanizePercentage expects a 0-1 ratio and appends its own "%"
+        summary = "High CPU usage on {{ $labels.instance }}";
+        description = ''CPU usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 5 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
index b30686b..020947b 100644
--- a/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/disk_usage.nix
@@ -1,85 +1,76 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      # orgId = 1; # Defaults to 1 for provisioned rules
-      name = "DiskUsageAlerts";      # Name of the rule group
-      folder = "System Alerts";       # The folder these rules belong to in Grafana UI
-      interval = "1m";              # How often to evaluate rules in this group
+      uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
+      title = "HighDiskUsage";          # Name of the alert rule (was 'alert' in vmalert)
 
-      rules = [
+      # Condition for the alert to fire. 'D' refers to the refId of the threshold expression.
+      condition = "D"; # Condition is now D
+      # Removed rule-level relativeTimeRange
+
+      # Data queries and expressions
+      data = [
+        # Query A: Calculate disk usage percentage
         {
-          uid = "high-disk-usage-alert-uid"; # Optional: provide a stable UID for the rule itself
-          title = "HighDiskUsage";          # Name of the alert rule (was 'alert' in vmalert)
-
-          # Condition for the alert to fire. 'C' refers to the refId of the threshold expression.
-          condition = "D"; # Condition is now D
-          # Removed rule-level relativeTimeRange
-
-          # Data queries and expressions
-          data = [
-            # Query A: Calculate disk usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
-              queryType = "prometheus"; # Explicitly set, though often inferred
-              relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
-                instant = false; # For range queries, default is false
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 85"; # Check if the last value from each series in C is > 85
-              };
-            }
-          ];
-          
-          for = "15m"; # Duration the condition must be met (same as vmalert)
-          
-          # How to handle states where data is missing or query errors
-          noDataState = "NoData";   # Options: NoData, Alerting, OK
-          execErrState = "Error"; # Options: Error, Alerting, OK
-
-          annotations = {
-            summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''
-              Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
-              (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
-              Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.
-            ''; # Using $values.C as it's the input to the math condition D
+          refId = "A";
+          datasourceUid = "vm-datasource-uid"; # UID of the VictoriaMetrics datasource
+          queryType = "prometheus"; # Explicitly set, though often inferred
+          relativeTimeRange = { from = 60; to = 0; }; # Query-level, integer seconds
+          model = {
+            expr = ''
+              (
+                node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_size_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_avail_bytes{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}"; # Example legend
+            instant = false; # For range queries, default is false
           };
-          labels = {
-            severity = "warning";
-            category = "capacity";
-            # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
-            # and labels from the rule group/folder.
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 85"; # Check if the last value from each series in C is > 85
           };
-          # isPaused = false; # Default is not paused
         }
       ];
+      
+      for = "15m"; # Duration the condition must be met (same as vmalert)
+      
+      # How to handle states where data is missing or query errors
+      noDataState = "NoData";   # Options: NoData, Alerting, OK
+      execErrState = "Error"; # Options: Error, Alerting, OK
+
+      annotations = { # Query A is already scaled to 0-100, so the value is printed with printf; humanizePercentage expects a 0-1 ratio and appends its own "%"
+        summary = "High disk usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''
+          Disk usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }}
+          (fstype: {{ $labels.fstype }}) has been above 85% for more than 15 minutes.
+          Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.
+        ''; # Using $values.C as it's the input to the math condition D
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+        # Grafana automatically adds labels from the query result (instance, mountpoint, etc.)
+        # and labels from the rule group/folder.
+      };
+      # isPaused = false; # Default is not paused
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/host_down.nix b/hosts/web-arm/modules/grafana/alerting/host_down.nix
index a2d938f..1910b23 100644
--- a/hosts/web-arm/modules/grafana/alerting/host_down.nix
+++ b/hosts/web-arm/modules/grafana/alerting/host_down.nix
@@ -1,62 +1,54 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "HostStatusAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "host-down-alert-uid";
+      title = "HostDown";
+      condition = "C";
 
-      rules = [
+      data = [
         {
-          uid = "host-down-alert-uid";
-          title = "HostDown";
-          condition = "C";
-
-          data = [
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
-              model = {
-                expr = ''up'';
-                legendFormat = "{{instance}} ({{job}})";
-                instant = false; # Changed from true, as relativeTimeRange is used
-              };
-            }
-            { # New Expression B: Reduce Query A
-              refId = "B";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            { # Modified Expression C: Math condition based on B
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$B == 0"; # Check if the last value from B is 0
-              };
-            }
-          ];
-
-          for = "2m";
-          noDataState = "Alerting";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "Host {{ $labels.instance }} is down";
-            description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; }; # Query over the last minute
+          model = {
+            expr = ''up'';
+            legendFormat = "{{instance}} ({{job}})";
+            instant = false; # Changed from true, as relativeTimeRange is used
           };
-          labels = {
-            severity = "critical";
-            category = "availability";
+        }
+        { # New Expression B: Reduce Query A
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        { # Modified Expression C: Math condition based on B
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B == 0"; # Check if the last value from B is 0
           };
         }
       ];
+
+      for = "2m"; # Duration the condition must be met
+      noDataState = "Alerting"; # State to enter when the query returns no data (options: NoData, Alerting, OK)
+      execErrState = "Error"; # State to enter when the query errors (options: Error, Alerting, OK)
+
+      annotations = {
+        summary = "Host {{ $labels.instance }} is down";
+        description = ''Host {{ $labels.instance }} (job: {{ $labels.job }}) has been down for more than 2 minutes.'';
+      };
+      labels = {
+        severity = "critical";
+        category = "availability";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
index 8f67178..ba73f30 100644
--- a/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/inode_usage.nix
@@ -1,71 +1,63 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "InodeUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-inode-usage-alert-uid";
+      title = "HighInodeUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate inode usage percentage
         {
-          uid = "high-inode-usage-alert-uid";
-          title = "HighInodeUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate inode usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (
-                    node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
-                  and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                  and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
-                '';
-                legendFormat = "{{mountpoint}} on {{instance}}";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 80"; # Alert if inode usage from C is > 80%
-              };
-            }
-          ];
-
-          for = "30m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
-            description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (
+                node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} - node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              ) / (node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""} > 0) * 100
+              and node_filesystem_files{fstype!~"tmpfs|rootfs",mountpoint!=""}
+              and node_filesystem_files_free{fstype!~"tmpfs|rootfs",mountpoint!=""}
+            '';
+            legendFormat = "{{mountpoint}} on {{instance}}";
+            instant = false;
           };
-          labels = {
-            severity = "warning";
-            category = "capacity";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 80"; # Alert if inode usage from C is > 80%
           };
         }
       ];
+
+      for = "30m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # Query A is already scaled to 0-100, so the value is printed with printf; humanizePercentage expects a 0-1 ratio and appends its own "%"
+        summary = "High inode usage on {{ $labels.instance }} at {{ $labels.mountpoint }}";
+        description = ''Inode usage on {{ $labels.instance }} for mount point {{ $labels.mountpoint }} (fstype: {{ $labels.fstype }}) has been above 80% for more than 30 minutes. Current value: {{ if $values.C }}{{ printf "%.2f" $values.C.Value }}{{ else }}N/A{{ end }}%.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "capacity";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
index 03dd931..14a2ea8 100644
--- a/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
+++ b/hosts/web-arm/modules/grafana/alerting/ram_usage.nix
@@ -1,69 +1,61 @@
 { lib, pkgs, config, ... }:
 {
-  services.grafana.provision.alerting.rules.settings.groups = [
+  grafanaAlertRuleDefinitions = [
     {
-      name = "RAMUsageAlerts";
-      folder = "System Alerts";
-      interval = "1m";
+      uid = "high-ram-usage-alert-uid";
+      title = "HighRAMUsage";
+      condition = "D"; # Condition is now D
 
-      rules = [
+      data = [
+        # Query A: Calculate RAM usage percentage
         {
-          uid = "high-ram-usage-alert-uid";
-          title = "HighRAMUsage";
-          condition = "D"; # Condition is now D
-
-          data = [
-            # Query A: Calculate RAM usage percentage
-            {
-              refId = "A";
-              datasourceUid = "vm-datasource-uid";
-              queryType = "prometheus";
-              relativeTimeRange = { from = 60; to = 0; };
-              model = {
-                expr = ''
-                  (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
-                  and node_memory_MemAvailable_bytes
-                  and node_memory_MemTotal_bytes
-                '';
-                legendFormat = "RAM usage on {{instance}} ({{job}})";
-                instant = false;
-              };
-            }
-            # Expression C: Reduce Query A to its last value, preserving labels
-            {
-              refId = "C";
-              datasourceUid = "__expr__";
-              model = {
-                type = "reduce";
-                expression = "A"; # Input is Query A
-                reducer = "last"; # Get the last value of each series in A
-              };
-            }
-            # Expression D: Apply math condition to the reduced values from C
-            {
-              refId = "D";
-              datasourceUid = "__expr__";
-              model = {
-                type = "math";
-                expression = "$C > 90"; # Alert if RAM usage from C is > 90%
-              };
-            }
-          ];
-
-          for = "10m"; # Duration the condition must be met
-          noDataState = "NoData";
-          execErrState = "Error";
-
-          annotations = {
-            summary = "High RAM usage on {{ $labels.instance }}";
-            description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ $values.C | humanizePercentage }}{{ else }}N/A{{ end }}%.'';
+          refId = "A";
+          datasourceUid = "vm-datasource-uid";
+          queryType = "prometheus";
+          relativeTimeRange = { from = 60; to = 0; };
+          model = {
+            expr = ''
+              (1 - node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes > 0)) * 100
+              and node_memory_MemAvailable_bytes
+              and node_memory_MemTotal_bytes
+            '';
+            legendFormat = "RAM usage on {{instance}} ({{job}})";
+            instant = false;
           };
-          labels = {
-            severity = "warning";
-            category = "performance";
+        }
+        # Expression C: Reduce Query A to its last value, preserving labels
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A"; # Input is Query A
+            reducer = "last"; # Get the last value of each series in A
+          };
+        }
+        # Expression D: Apply math condition to the reduced values from C
+        {
+          refId = "D";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$C > 90"; # Alert if RAM usage from C is > 90%
           };
         }
       ];
+
+      for = "10m"; # Duration the condition must be met
+      noDataState = "NoData";
+      execErrState = "Error";
+
+      annotations = { # Query A already yields 0-100, so the value is printed as-is (humanizePercentage expects a 0-1 ratio)
+        summary = "High RAM usage on {{ $labels.instance }}";
+        description = ''RAM usage on {{ $labels.instance }} (job: {{ $labels.job }}) has been above 90% for more than 10 minutes. Current value: {{ if $values.C }}{{ printf "%.1f" $values.C.Value }}%{{ else }}N/A{{ end }}.'';
+      };
+      labels = {
+        severity = "warning";
+        category = "performance";
+      };
     }
   ];
 }
diff --git a/hosts/web-arm/modules/grafana/alerting/system/default.nix b/hosts/web-arm/modules/grafana/alerting/system/default.nix
new file mode 100644
index 0000000..26db06d
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/system/default.nix
@@ -0,0 +1,21 @@
+{ lib, pkgs, config, ... }:
+let
+  # Import rule definitions from refactored alert files in the parent 'alerting' directory
+  cpuAlertRules = (import ../cpu_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  diskAlertRules = (import ../disk_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  hostDownAlertRules = (import ../host_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  inodeAlertRules = (import ../inode_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  ramAlertRules = (import ../ram_usage.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+
+  allSystemRules = cpuAlertRules ++ diskAlertRules ++ hostDownAlertRules ++ inodeAlertRules ++ ramAlertRules;
+in
+{
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "System Alerts"; # This is the Grafana alert group name
+      folder = "System Alerts"; # This is the Grafana folder name
+      interval = "1m"; 
+      rules = allSystemRules;
+    }
+  ];
+}
\ No newline at end of file
diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix
index 6d1394f..6f48794 100644
--- a/hosts/web-arm/modules/grafana/default.nix
+++ b/hosts/web-arm/modules/grafana/default.nix
@@ -28,11 +28,13 @@ let
 in
 {
   imports = [
-    ./alerting/disk_usage.nix
-    ./alerting/cpu_usage.nix
-    ./alerting/host_down.nix
-    ./alerting/inode_usage.nix
-    ./alerting/ram_usage.nix
+    # Individual alert files removed, now handled by alerting/system/default.nix
+    # ./alerting/disk_usage.nix
+    # ./alerting/cpu_usage.nix
+    # ./alerting/host_down.nix
+    # ./alerting/inode_usage.nix
+    # ./alerting/ram_usage.nix
+    ./alerting/system/default.nix # Added: Imports the consolidated system alerts module
     # ... other rule files can be added here ...
     ./datasources/victoriametrics.nix
   ];
@@ -99,7 +101,7 @@ in
     };
     provision = {
       alerting = {
-        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged
+        rules.settings.groups = lib.mkMerge []; # Allows rule groups to be merged (including the one from system/default.nix)
         contactPoints = {
           settings = {
             apiVersion = 1; # As per Grafana provisioning API

From d0c67baeb8b69e7b22c6d0e46112097738d1ff25 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Sat, 31 May 2025 11:35:17 +0200
Subject: [PATCH 5/6] feat: add Grafana online status monitoring module with
 Pushover notifications

---
 hosts/fw/configuration.nix           |   5 +-
 hosts/fw/modules/grafana-monitor.nix | 183 +++++++++++++++++++++++++++
 hosts/fw/secrets.yaml                |  27 ++--
 3 files changed, 198 insertions(+), 17 deletions(-)
 create mode 100644 hosts/fw/modules/grafana-monitor.nix

diff --git a/hosts/fw/configuration.nix b/hosts/fw/configuration.nix
index 5bd0338..fc47be8 100644
--- a/hosts/fw/configuration.nix
+++ b/hosts/fw/configuration.nix
@@ -65,8 +65,9 @@
     # setup network
     ./modules/setupnetwork.nix
     ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel
-
-
+    ./modules/grafana-monitor.nix # Grafana online status monitor
+ 
+ 
     ./hardware-configuration.nix
   ];
 
diff --git a/hosts/fw/modules/grafana-monitor.nix b/hosts/fw/modules/grafana-monitor.nix
new file mode 100644
index 0000000..b8effdb
--- /dev/null
+++ b/hosts/fw/modules/grafana-monitor.nix
@@ -0,0 +1,183 @@
+{ config, pkgs, lib, ... }:
+
+let
+  grafanaMonitorUser = "grafana-monitor";
+  grafanaMonitorGroup = "grafana-monitor";
+  stateDir = "/var/lib/${grafanaMonitorUser}";
+
+  # Monitoring script: probes Grafana's health endpoint and sends Pushover alerts on sustained outages and on recovery
+  monitorScript = pkgs.writeShellScriptBin "grafana-online-check" ''
+    #!${pkgs.bash}/bin/bash
+    set -euo pipefail
+
+    GRAFANA_URL="https://grafana.cloonar.com/api/health"
+    STATE_FILE="${stateDir}/status.env"
+    PUSHOVER_API_TOKEN_FILE="${config.sops.secrets."pushover-api-token".path}" # path taken from the sops-nix declaration in this module
+    PUSHOVER_USER_KEY_FILE="${config.sops.secrets."pushover-user-key".path}" # path taken from the sops-nix declaration in this module
+    MAX_FAILURES=5
+
+    # Ensure state directory exists (NixOS creates $HOME for the user, which is stateDir)
+    # The script runs as grafanaMonitorUser, so $HOME will be /var/lib/grafana-monitor
+    mkdir -p "''${HOME}"
+
+    # Load current state or initialize
+    CONSECUTIVE_FAILURES=0
+    ALERT_SENT="false"
+    LAST_KNOWN_STATUS="UP" # Assume UP initially if no state file
+
+    # Note: STATE_FILE uses $stateDir which is /var/lib/grafana-monitor.
+    # The script will run with HOME=/var/lib/grafana-monitor.
+    # So, using ''${HOME}/status.env or ''${STATE_FILE} should resolve to the same path.
+    # Let's stick to ''${STATE_FILE} for consistency with its definition.
+    if [[ -f "''${STATE_FILE}" ]]; then
+      source "''${STATE_FILE}"
+    fi
+
+    # Check secrets
+    if [[ ! -f "''${PUSHOVER_API_TOKEN_FILE}" ]] || [[ ! -r "''${PUSHOVER_API_TOKEN_FILE}" ]]; then
+      echo "Error: Pushover API token file (''${PUSHOVER_API_TOKEN_FILE}) not found or not readable." >&2
+      exit 1
+    fi
+    PUSHOVER_API_TOKEN=$(cat "''${PUSHOVER_API_TOKEN_FILE}")
+
+    if [[ ! -f "''${PUSHOVER_USER_KEY_FILE}" ]] || [[ ! -r "''${PUSHOVER_USER_KEY_FILE}" ]]; then
+      echo "Error: Pushover user key file (''${PUSHOVER_USER_KEY_FILE}) not found or not readable." >&2
+      exit 1
+    fi
+    PUSHOVER_USER_KEY=$(cat "''${PUSHOVER_USER_KEY_FILE}")
+
+    echo "Checking Grafana at ''${GRAFANA_URL}..."
+    ACTUAL_HTTP_CODE="000" # Default if curl doesn't provide one
+    CURL_ERROR_MESSAGE=""
+    CURL_STDERR_OUTPUT=$(mktemp)
+    # Ensure temp file is cleaned up on exit, error, or interrupt
+    trap 'rm -f "''${CURL_STDERR_OUTPUT}"' EXIT TERM INT HUP
+
+    # -L: follow redirects
+    # -sS: silent mode, but show errors
+    # --fail: curl exits with 22 on server errors (4xx, 5xx)
+    # --connect-timeout 5: max time to connect
+    # --max-time 10: max total time for operation
+    # --stderr: redirect stderr to a file to capture detailed errors
+    # -o /dev/null: discard response body
+    # --write-out "%{http_code}": output the HTTP status code
+    if ACTUAL_HTTP_CODE=$(${pkgs.curl}/bin/curl -L -sS --fail --connect-timeout 5 --max-time 10 \
+                            --stderr "''${CURL_STDERR_OUTPUT}" \
+                            -o /dev/null --write-out "%{http_code}" "''${GRAFANA_URL}"); then
+      # Curl exited with 0. With --fail, this means HTTP status was 2xx.
+      echo "Grafana is UP (HTTP ''${ACTUAL_HTTP_CODE})."
+      CURRENT_STATUS="UP"
+      if [[ "''${LAST_KNOWN_STATUS}" == "DOWN" && "''${ALERT_SENT}" == "true" ]]; then
+        echo "Grafana recovered. Sending recovery notification."
+        ${pkgs.curl}/bin/curl -sS -X POST \
+          -F "token=''${PUSHOVER_API_TOKEN}" \
+          -F "user=''${PUSHOVER_USER_KEY}" \
+          -F "message=Grafana at ''${GRAFANA_URL} is back online (HTTP ''${ACTUAL_HTTP_CODE})." \
+          -F "title=Grafana Recovered (fw)" \
+          -F "priority=0" \
+          https://api.pushover.net/1/messages.json
+        ALERT_SENT="false"
+      fi
+      CONSECUTIVE_FAILURES=0
+    else
+      # Curl exited with a non-zero status.
+      CURL_EXIT_CODE=$?
+      CURL_ERROR_MESSAGE=$(cat "''${CURL_STDERR_OUTPUT}" | tr -d '\n' | sed 's/"/\\"/g') # Read, remove newlines, escape quotes for JSON
+      
+      echo "Grafana check failed. Curl Exit Code: ''${CURL_EXIT_CODE}. HTTP Code reported: ''${ACTUAL_HTTP_CODE}."
+      echo "Curl Stderr: ''${CURL_ERROR_MESSAGE}"
+      CURRENT_STATUS="DOWN"
+      CONSECUTIVE_FAILURES=$(( ''${CONSECUTIVE_FAILURES} + 1 ))
+      echo "Consecutive failures: ''${CONSECUTIVE_FAILURES}"
+
+      if [[ ''${CONSECUTIVE_FAILURES} -ge ''${MAX_FAILURES} && "''${ALERT_SENT}" == "false" ]]; then
+        echo "Grafana has been offline for ''${CONSECUTIVE_FAILURES} checks (>= ''${MAX_FAILURES}). Sending alert."
+        PUSHOVER_TITLE="Grafana OFFLINE (fw)"
+        PUSHOVER_MSG="Grafana ''${GRAFANA_URL} offline for ''${MAX_FAILURES}+ min. HTTP:''${ACTUAL_HTTP_CODE}, CurlExit:''${CURL_EXIT_CODE}."
+        if [[ -n "''${CURL_ERROR_MESSAGE}" ]]; then
+            PUSHOVER_MSG+=" Err: ''${CURL_ERROR_MESSAGE}"
+        fi
+        # Truncate message if too long for Pushover (1024 chars)
+        PUSHOVER_MSG=$(echo "''${PUSHOVER_MSG}" | cut -c 1-1024)
+
+        ${pkgs.curl}/bin/curl -sS -X POST \
+          -F "token=''${PUSHOVER_API_TOKEN}" \
+          -F "user=''${PUSHOVER_USER_KEY}" \
+          -F "message=''${PUSHOVER_MSG}" \
+          -F "title=''${PUSHOVER_TITLE}" \
+          -F "priority=1" \
+          https://api.pushover.net/1/messages.json
+        ALERT_SENT="true"
+      fi
+    fi
+    # Temp file is removed by trap
+
+    # Save current state
+    echo "Saving state: CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}, ALERT_SENT=''${ALERT_SENT}, LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
+    (
+      echo "CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}"
+      echo "ALERT_SENT=''${ALERT_SENT}"
+      echo "LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
+    ) > "''${STATE_FILE}" # Using STATE_FILE which is ${stateDir}/status.env
+    chmod 600 "''${STATE_FILE}"
+
+    echo "Grafana check finished."
+  '';
+in
+{
+  # Module is now implicitly enabled when imported
+  config = {
+    users.users.${grafanaMonitorUser} = {
+      isSystemUser = true;
+      group = grafanaMonitorGroup;
+      home = stateDir; # Home directory for state
+      createHome = true; # NixOS will create this directory
+      description = "User for Grafana online monitoring service";
+    };
+    users.groups.${grafanaMonitorGroup} = {};
+
+    # Sops secrets for Pushover
+    sops.secrets."pushover-api-token" = {
+      owner = grafanaMonitorUser;
+      group = grafanaMonitorGroup;
+      mode = "0400"; # Read-only for the user
+    };
+    sops.secrets."pushover-user-key" = {
+      owner = grafanaMonitorUser;
+      group = grafanaMonitorGroup;
+      mode = "0400"; # Read-only for the user
+    };
+
+    environment.systemPackages = [
+      pkgs.curl
+      pkgs.coreutils # for mkdir, cat, echo, rm used in script (though bash builtins are often used)
+    ];
+
+    systemd.services.grafana-online-check = {
+      description = "Grafana Online Check Service";
+      # No wantedBy on purpose: grafana-online-check.timer is the only trigger, so boot and activation never run (or wait on) a probe.
+      after = [ "network-online.target" ]; # Order the probe after the network is reported online
+      requires = [ "network-online.target" ];
+
+      serviceConfig = {
+        Type = "oneshot";
+        User = grafanaMonitorUser;
+        Group = grafanaMonitorGroup;
+        ExecStart = "${monitorScript}/bin/grafana-online-check";
+        # The script keeps its state file in the service user's home directory (stateDir).
+        # That directory is created through `createHome = true` on the user definition,
+        # so no systemd StateDirectory= is configured here.
+      };
+    };
+
+    systemd.timers.grafana-online-check = {
+      description = "Timer to periodically check Grafana's online status";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnBootSec = "2min"; # Wait a bit after boot
+        OnUnitActiveSec = "1min"; # Run every 1 minute after the last run
+        Unit = "grafana-online-check.service";
+      };
+    };
+  };
+}
diff --git a/hosts/fw/secrets.yaml b/hosts/fw/secrets.yaml
index 1605aaa..a372ff3 100644
--- a/hosts/fw/secrets.yaml
+++ b/hosts/fw/secrets.yaml
@@ -1,18 +1,20 @@
+ai-mailer-imap-password: ENC[AES256_GCM,data:kMxDPUK9rk7mbel5JDT03m3Y2w==,iv:cbnkNIVRXd7OLqueSrfYRzfaW9TzI+FauuQD8lgYIy0=,tag:63W7seIgt5TPVFQc84semQ==,type:str]
+ai-mailer-openrouter-key: ENC[AES256_GCM,data:PCe8kt/M+7g087AKzYMY2H5WO4L+NGkHLsh47fMK36kz+Ju5kd/kpmM4GQcDbI3LgWm/P+T0/mv7kGGOL6KLmBFaFmGV/88cGw==,iv:ruVftGvnv+PX1Zd92tfOezpyaMbYrqCrexelyPUYFMc=,tag:z4JVUCfz/frehar6y+fOlQ==,type:str]
 borg-passphrase: ENC[AES256_GCM,data:jHb+yXK0RqNdVYtWiueztZFlHC/xQ6ZiAOUcLt6BxmZQewuL3mh4AZ+lQdmA/4EaaTTIhVMR3xFx5fU6b2CtNLiGb/0=,iv:IW09B1EE1OupMCOvv13MXRYiMsD4VmIfyYONUyrPX1c=,tag:3ankeLOaDJkwRUGCd72DuA==,type:str]
 borg-ssh-key: ENC[AES256_GCM,data:ir25XfzLBb/H/YWzxP501hCaLBB4jpiLW7WUcnvguzosT9QeOtBdJ0WB1IndEMtiEgQyE9kyGOJ3QJwzbQNkX6CG96Uzt2mKw8gw8ayUqC+B9zR8eIRYiDKOYs+YREVo7nA5pLLzIc/9jaRicDFMmw1Thmk7UUJKB1DNV49nU9K+nAfrCzk7ZQieY8oaasFD0cvNb4Ndj6f9PWSXkNBwKK52ig4hDeNBs1bdy8nDE8VqlwOo8H2DcYMzdMjKCZDBRccy8NofHEhakCW5OdliFyIHsLkcBHca3Bp46JN7wbo8avPPd9bXGuRiOSWYq50RcyZUovnB3g7Dk3swCyuiFztnStN63+g7ZnGFdYLYDYfuDSPN1W2HCkknmaoT910VNE8sEAMyfXk4tqJv4eW4qmFk2UwPlRCrsk9GtdRQ5wm8muNPHEZ8s2dGkn4WDcjy7SUpgF4UJJZV8iJe74W9BK1Ef+AWWNsNjYfZde3iw1+8Fz1u65u4seFWqQMok/noADpszbpk+YYRoM+5D/YVMx+KeDtoFqnZfULM/BqvAqdYYZtRzojndeNW6Ea4sxDE+XQ5b1OwGFlNAlnuS1fYYPvKojrKNgT9KMwbsvPijU5vFddY8Qpz2h6GKEv/OW87j5UeyDW4l32lvyawBuzczBfiFgCElggGSZHM5rjE4Deb06eQleTioZ79EDXTv5UsPQ6Bc1v5Wvnu8DvxJe4B10vxH70JIGIlmjwo0yhMkxDTN7BkAGQC0QAPhwtURDq+XVufQNjlTUjjH1Q1E4u0Vy19clMs8SStqFeMN02BfWZdS9mbueF5Ehc+8wTfAs43CQFublJ4wfG1PzEbqj9LZdimFe4hCnE2y6Gbf591shugVSAMA3UXQUuvFQmm69i9gz88YSYrkLlVStM+dtXCugZho72xgHtnI+5o19wuoZPRoxe47W0T2kJZZeomtqoAsSo5yr5JeYzYdaHYcK2fgRY0HWgWzOxnVEfX/gRPR3b20Tko6yp9lIDECkXVDQSxptxqIYk+VuETnD9YF2OpYeHZLGoo9OLdEHVZRcuy1S74aAOJGO9SAHLw3eukxG//AZlwcOYjOsYDVt3BjhYZEkYCLg8GkAqV/7bGsxT7pgckNEB2NRYQI9ckqEcEw9CdkYre67HwfPCvAble68VnRzgp+v5s0koVjTURF9FTxvVOXQEbvSpY828idyx6nOaAIHoqpIOFz4jsGE9L4FKamqnlnjzj2Ri/MboT9JQBj8bnIF/ej+dQGpfqZo7zqtu3d0B/9e0xuVTcqI9Bxlqn3D4108I8R37Ctr5OFKloeOZ8HHMsHcBUAzZC6/fWrOspru14YHW2YNj8nBxHve/P3oiTQ/nlXLcBGLoFfI+hOpofccQB8FnkKfTbLSRUGrGY6NJt9RCnZgm2+RUgel77XpsCsT/Q5ZGclBdyk8mSaqVjiNyHCbCV5tF/tWnuvf859S0tcmqbJ0FhIRAvwxFucmfi6FSPX5HEMdRbNV7szrHKSX60u7YA2DBBzv3c/+C2bxq70vhwFelqz7FqpVKwebbE4/a59lZpibzefCoji/TPDJB62/ox5NHHE5qenv7IPcEj3dEmdasbrApAw1UFsFlRCnlg4JIYley/AQx7OzUSImqkG8JWvSJ4JXijhsr9dPFR/cb0srUO88aFNh/ZUQhELZCVnzAsF81Y4w6LTGApMfUVN/yx9MqENGvObywzMls1UJphvzDZzvb+Ue6eqELogN1QcEI/WOirwVtJO6E7IevEtK4xxWsLfRHVjtbLc4QjCWuiyszAPTTttKJ+iC2h14Wj1XoiMpWRiVnj+jI9iWRen96P4glYEfuCYQS6vbGkNDEoZt/FnkLJDbLdjXatmhUoRpvExOtp26ULR/f1lwzLMJBt1qPvhuGur1ru2B1e8+AVte1Cfjmk+xrnxNwkTFLGe89Qjd77wPyQv9h0Y
rhZ6uDi2zLemhZs2LjW5ZvzV5P4thMDxkhezJHatPHAGa8OfclJOyrRTyW2azdz2A45MNzZtCQcnQdQxBXf+XRskLnhquZfgv66hFITjuF/HeI9cq4HJcrgaOcVj+tBdK1bTCyL2kqKkCpSCbh/Pv6FuAlDXgLjsWwZgOKz8gfTIfXMapPLDYVTbS/PPPABylZflN98FFyeFDHB3Fwn1a6qAJ0mC7+4sowVZ1DIAoflaHqNs5TXyb3KeZGgXj5ZQwhv1z6NySvOS6cHxx0PvkFo99T1NHztxCRERNvBdWSwsr32DTwEvZo5iNPy3lvKI5A+rXc7jlQkUbufbddtLw2iPtt29XyMDOysK010fXzzQRjaz4R8ZaDtHNjqPrynvqFPXRB0VSIrwXS2utU7bmD+0dGX26t9k5qRBi7Gm+iZNKGMnSRsm17bVk5o8q0tb1P1eGL9mexZJJvxolfXVFJJtR8m6vLmUX1LSht/JhoWFElrINl0hviwd1dehmTqdQqWz5/imjF+pVOasrt7XVZ+7T/rDpuwNl375qSZptM1pMUExJ3CvzigpnarXXQxEBYkf0haGvQwPWNVHe/bR/1VooSQkH/mGg1g+rcTqp4yB5hsFu1lNK4ph04WQOqaafg40HBv6e5cOjLkFdEtYNpjyd6sRS+WHk7zzFlfPVlzijq8f+oDH9ALRzNnL1Y2DrX53wx4dBBWvxE1Yhb6Kj6Er4ZDiRLLXo+wJOGCpnNTPJMVaYskZ+LN2e9nS2/ZwbsNBnPHxSqCc1oP4d3yXH0j90VKnWg79aIEOagRvTF/9F6SkkGL9zVuUnoVSPwq97etWWtjGoEORMGY7jkGOK+U391p7Z69Hrv2AejS1BoSDeGcxXasFvINpmc+Hl2c+zOlFBySu2zA39cVlcStUFICA5GCmE5Eum4ED9DXP6RAuicD7YE0qSKbMkfLxIWMCZ6wBcwVUjdt43SI/ZqdpDm3E1kTRg07dE0R091rtfzEiIwBM4xFPJBafOx0L/Do61YMOHGzi6wgIQO7P7wIslv62M8MD1KKa/eH0tE2vhG/GyEGtKkg3P9vZRJwioifyshS1hvrt5pLinuCaDYyqMAl8Ro0OOm8di7+mBvXib0nRLfW7wBGDA4ADTipizNWAmbspQQl89kH5gdxgXO5U+N/qc0zXbpB+qeHVkPIK1DmrJ8pHLOE8mOpLy7eHUsSku/WtTt/RP4pcDbBU/43MCbk7NXKu/LjKjkQBjAL49LxnYmhEU7X//jtwSPE3gdx0x+wRJxzlbehM6rpfDRV5WQGSFf7yjLc/Ga1KwsgVdAstJEzDdv2vWSsjNzfJvHVBLrQPIC9fggi3DeLiHTAryCUcLUhNj4xtZWhSS1qmx07E4VzfjDJLMOsLY0vlimgngZ3YYCjC3Sw0frfQH2SZvmbLd3XfBdud67ZaMUobcRhnKzQnilldyD1jWVWLdVTup4RVxT4GYek9nmYflzpWWmwbXatz9Sgcw==,iv:9E1uiPqM3Hh4KWtL8haxm6PRm2VPc+DggrA135FvfB8=,tag:QSOgzVH9IBMgZxJvUhvY2w==,type:str]
 ddclient: ENC[AES256_GCM,data:EaXjXS/bwL3S/Fr+rzQ7dXA1eIzeFpHH7H+SvoNhVSg=,iv:3BzjnJG5yT1W8ob2nm0oUlr+sSJ73W/ctl48xyxeeWM=,tag:TqKSwfxF0V1v5T8VT/qblw==,type:str]
+gitea-mailer-password: ENC[AES256_GCM,data:M4qCWNt1oQVJzxThIjocm2frwuVMyx+69TBpke25RwxJxEQnvHL1CM579OVroTm7+gGE/oOJqAwDIepfiDtyM1xm,iv:jayFZMbu3uDimS/rIKZSeoU0MsYwWp880iEMs1oQE4k=,tag:qGDncRkyuCWaELhcxUrqtQ==,type:str]
+gitea-runner: ENC[AES256_GCM,data:NYG3qRLiMjmfA+oHYBXBbxpuX2ZjB/VgvLaS7yr5kJeDN/NukB/B3OZcEfsUWgbBS5IsLENESngWTFmK4W3htN4lSqdg/g4UsUr20beNov+pbyPN05rkBYmSCZZFwZ1L9POEE4GF4LuuoNpDlWIw0mrA8oV8MoI4W5QS2IGranBTIQQaYXU5TEGYa4XMVo4oC75iuH6DIq1KD6OgFAfMhm/wlbP8CP/Iaw2K8CNPxktk93pm3OSmggf22Z4JPEnvV25sc9iBkxLkDk9FXYFys0g=,iv:UzL5ncVOC/loJwcFSG1QJHnzLp3il4Hf3qDwLWxrIlo=,tag:w0Zn/E+02KyAsPXZdOLrew==,type:str]
+gitea-runner-token: ENC[AES256_GCM,data:HpBjLS10w78ihbnAUrlCRGvwrXLBYKH5v/P7XggoUSWLoAazSVQArABxaK7PJas=,iv:q3Y6jV0gmug06O0EYqGVyIJ4AvMGr2ydwY17YKxo0Qw=,tag:Ws5HLbdaeYGGXzDZW/FX4w==,type:str]
+home-assistant-ldap: ENC[AES256_GCM,data:uZEPbSnkgQYSd8ev6FD8TRHWWr+vusadtMcvP7KKL2AZAV0h1hga5fODN6I5u0DNL9hq2pNM+FwU0E/svWLRww==,iv:IhmUgSu34NaAY+kUZehx40uymydUYYAyte1aGqQ33/8=,tag:BKFCJPr7Vz4EG78ry/ZD7g==,type:str]
+home-assistant-secrets.yaml: ENC[AES256_GCM,data:m7uOVo7hPk/RmqqRS6y7NKoMKsR9Bdi1ntatsZdDOAbJMjZmZL2FgPEHi/zF73zCfRfTOca3dwpulR3WXZ9Ic1sbUIggmusJMg4Gellw1CUhx7SbQN5nieAbPbB9GVxMuV4OakD1u7Swz8JggDT6IwojSnuD5omCRCyUH1wvKB+Re59q6EStderlm5MJNVFlVrbKVbLKLcw4yRgTh34BGnTTjcJmgSlQjO1ciu2B7YQmdl0Fw6d8AdbEzgB5TFG5ONc85UhJDE8Wlw==,iv:GCtpcVChN2UMWtfnWURozCfVj2YbRPqp/bH4Jjntybs=,tag:pcxP7gTBtXMNT5iyW5YXTw==,type:str]
+pushover-api-token: ENC[AES256_GCM,data:W2ILPksaNeDvbSlSJztu1vu23kQKLDRHYKoUIvyd,iv:RYFAN6AU+DALphpqpiifhOoEQ8++6DEgo2wETSwxBCg=,tag:pRfaNuz4564LvRuaLggatg==,type:str]
+pushover-user-key: ENC[AES256_GCM,data:mh3u3FAdFkGD1d4UKcTwLOsCB2vfhEADI5cd1aT4,iv:4bkR7ZNJwWAYBdu435SPZUovGsfb8qivuDOQdGkPd/U=,tag:5UO4vGt75CCFEM5jxTGkGg==,type:str]
 wrwks_vpn_key: ENC[AES256_GCM,data:gGipXC8JJO59b4KWMSo0+r761raQl7RzgBuUbXmPEKlZR21bs5XRAQalzDCFNtjcpNkXiGqAHCLkDTtjPagMsw==,iv:MH1EBJEOdQDEgm9E0F884fynhsH8KiS5QSc605XbASQ=,tag:FUM1eptHS0rpt6ILyQjGOg==,type:str]
 wg_cloonar_key: ENC[AES256_GCM,data:Dtp6I5J0jU5LLVwEFU4DFCpUngPRmFMebGXnk2oSwsKtsir/DtRBFG7ictM=,iv:1Abx/EAZRJrRQURljofzUYDgJpuREriX0nSrFbH5Npw=,tag:l4uFl9Uc+W0XeLVfLGmgZA==,type:str]
 wg_epicenter_works_key: ENC[AES256_GCM,data:LeLjfwfaz+loWyHYRgIMIPzHzlOnhl9tluKcQFgdes6r+deft1JfnUzDuF0=,iv:DKrc3I+U2hWDH8nnc8ZQeaVtA1eVXu7SXdTn1fxHoH4=,tag:V0PL0GrL2NEPVslAZa801A==,type:str]
 wg_epicenter_works_psk: ENC[AES256_GCM,data:Den3NDWdP013Or6/2Vll1igUahuRSNW4hu+nDa5vkr93bbveQTaWFT4TD4U=,iv:r3UsD3+3lUIP2X3Grti7wpXTQBXtu1/MdrycEmpZfsI=,tag:ghbAcxmjGVOe9jCZsmFzjA==,type:str]
 wg_ghetto_at_key: ENC[AES256_GCM,data:OIHmoy3SpIi9aefZnZ1PzpyHbEso18ceoTULf2eQkx1rJbaxC6PD1lma7eQ=,iv:u0eFjHHOBzPTmBvBEQsYY5flcBayiAQKd6e7RyiPwJI=,tag:731C9wvv8bA5fuuQq+weVQ==,type:str]
-gitea-mailer-password: ENC[AES256_GCM,data:M4qCWNt1oQVJzxThIjocm2frwuVMyx+69TBpke25RwxJxEQnvHL1CM579OVroTm7+gGE/oOJqAwDIepfiDtyM1xm,iv:jayFZMbu3uDimS/rIKZSeoU0MsYwWp880iEMs1oQE4k=,tag:qGDncRkyuCWaELhcxUrqtQ==,type:str]
-ai-mailer-imap-password: ENC[AES256_GCM,data:kMxDPUK9rk7mbel5JDT03m3Y2w==,iv:cbnkNIVRXd7OLqueSrfYRzfaW9TzI+FauuQD8lgYIy0=,tag:63W7seIgt5TPVFQc84semQ==,type:str]
-ai-mailer-openrouter-key: ENC[AES256_GCM,data:PCe8kt/M+7g087AKzYMY2H5WO4L+NGkHLsh47fMK36kz+Ju5kd/kpmM4GQcDbI3LgWm/P+T0/mv7kGGOL6KLmBFaFmGV/88cGw==,iv:ruVftGvnv+PX1Zd92tfOezpyaMbYrqCrexelyPUYFMc=,tag:z4JVUCfz/frehar6y+fOlQ==,type:str]
-gitea-runner: ENC[AES256_GCM,data:NYG3qRLiMjmfA+oHYBXBbxpuX2ZjB/VgvLaS7yr5kJeDN/NukB/B3OZcEfsUWgbBS5IsLENESngWTFmK4W3htN4lSqdg/g4UsUr20beNov+pbyPN05rkBYmSCZZFwZ1L9POEE4GF4LuuoNpDlWIw0mrA8oV8MoI4W5QS2IGranBTIQQaYXU5TEGYa4XMVo4oC75iuH6DIq1KD6OgFAfMhm/wlbP8CP/Iaw2K8CNPxktk93pm3OSmggf22Z4JPEnvV25sc9iBkxLkDk9FXYFys0g=,iv:UzL5ncVOC/loJwcFSG1QJHnzLp3il4Hf3qDwLWxrIlo=,tag:w0Zn/E+02KyAsPXZdOLrew==,type:str]
-gitea-runner-token: ENC[AES256_GCM,data:HpBjLS10w78ihbnAUrlCRGvwrXLBYKH5v/P7XggoUSWLoAazSVQArABxaK7PJas=,iv:q3Y6jV0gmug06O0EYqGVyIJ4AvMGr2ydwY17YKxo0Qw=,tag:Ws5HLbdaeYGGXzDZW/FX4w==,type:str]
-home-assistant-ldap: ENC[AES256_GCM,data:uZEPbSnkgQYSd8ev6FD8TRHWWr+vusadtMcvP7KKL2AZAV0h1hga5fODN6I5u0DNL9hq2pNM+FwU0E/svWLRww==,iv:IhmUgSu34NaAY+kUZehx40uymydUYYAyte1aGqQ33/8=,tag:BKFCJPr7Vz4EG78ry/ZD7g==,type:str]
-home-assistant-secrets.yaml: ENC[AES256_GCM,data:m7uOVo7hPk/RmqqRS6y7NKoMKsR9Bdi1ntatsZdDOAbJMjZmZL2FgPEHi/zF73zCfRfTOca3dwpulR3WXZ9Ic1sbUIggmusJMg4Gellw1CUhx7SbQN5nieAbPbB9GVxMuV4OakD1u7Swz8JggDT6IwojSnuD5omCRCyUH1wvKB+Re59q6EStderlm5MJNVFlVrbKVbLKLcw4yRgTh34BGnTTjcJmgSlQjO1ciu2B7YQmdl0Fw6d8AdbEzgB5TFG5ONc85UhJDE8Wlw==,iv:GCtpcVChN2UMWtfnWURozCfVj2YbRPqp/bH4Jjntybs=,tag:pcxP7gTBtXMNT5iyW5YXTw==,type:str]
 matrix-shared-secret: ENC[AES256_GCM,data:67imd3m6WBeGP/5Msmjy8B6sP983jMyWzRIzWgNVV5jZslX+GBJyEYzm3OTDs1iTZf4ScvuYheTH0QFPfw==,iv:7ElCpESWumbIHmmFaedcpkFm5M58ZT3vW9wb9e1Sbh4=,tag:wr4FIymtJBtCerVqae+Xlw==,type:str]
 palworld: ENC[AES256_GCM,data:rdqChPt4gSJHS1D60+HJ+4m5mg35JbC+pOmevK21Y95QyAIeyBLVGhRYlOaUcqdZM2e4atyTTSf6z4nHsm539ddCbW7J2DCdF5PQkrAGDmmdTVq+jyJAT8gTrbXXCglT1wvFYY5dbf2NKA4ASJIA8bdVNuwRZU0CtFiishzLuc9m8ZcGCNwQ/+xkMZgkUAHYRlEJAZyMpXR6KkFftiR05JRAFczD4N7GXPPe+vyvgXg7QBGtf20Qd4SGBUw0zI/SNTRmifHUuc4Z6+Fe9JHgvTc3uFcTMVnty0fEuL+a29liaVdAFq8BnqJfc5CNV401ZSUeMbG41lCn1cegP/WChs9J6HXNrhWDgiXa6ln++NoKcfOHIfZVbYOCoOxFR6+YWeBU2+sHmdwI9j5XQf5Ly2hmg12j0Ds2Cn8k4PG5aQP+HT2bedqyxwSt6fi97A0Osnh4ig7+DzYAjSNLewbYLzVdK39VdvB9hqLto+yFS3gAaeYOHwPwtqa+COI85c55lHiyKHlSwPhBqYaaiDu00lQTUzq9R5vz6F/l+T3bUjuna5RryUu8yhnk5DyK834KycTOg4ETcZTqro6prfiEBxc+Utsc9JvEtZgwFv6fsVLOu7nHxuiYuvseZ4YA8LlYdwPJboMPO2XsuhwWtT1uz/rh2orH7/vsXvzA/kF8NFemWBEMVLYA8byC5ze8doiGDYp4T5AAf10nJB1ceQ==,iv:gs78fxhvo9KlTaR5nzs12/LdgPChSFPHD2k4VQp3ARo=,tag:lpWBOi9xh2cWkS+71KD/UQ==,type:str]
 ark: ENC[AES256_GCM,data:YYGyzoVIKI9Ac1zGOr0BEpd3fgBsvp1hSwAvfO07/EQdg8ufMWUkNvqNHDKN62ZK5A1NnY3JTA1p4gyZ4ryQeAOsbwqU1GSk2YKHFyPeEnpLz/Ml82KMsv7XPGXuKRXZ4v3UcLu0R8k1Q0gQsMWo4FjCs3FF5mVtJG/YWxxbCYHoBLJ/di5p0DgjuFgJBQknYBpuLzr+yIoeqEyN7XcGYAJO53trEJuOOxLILULifkqISHjZ66i5F1fHW0iUdRbmeWV4aOAeOrsQqXYv,iv:gJwV5ip84zHqpU0l0uESfWWOtcgihMvEEdLaeI+twcU=,tag:sy8udVQsKxV/jOqwhJmWAg==,type:str]
@@ -21,10 +23,6 @@ knot-tsig-key: ENC[AES256_GCM,data:H2jEkRSVSIJl1dSolAXj9uUmzD6eEh9zPpoajZLxfuuFt
 mopidy-spotify: ENC[AES256_GCM,data:O3s6UvTP8z5KZPCq10GaaEQntWAEoxGFMnTkeUz9AfobrpsGZJcQgyazFX2u4DgAaIjNb34032MISotmuVQDJ14mi8xI5vC9w/Vf16v3TFu/dSKGZNb5ZPQwTUQ+iMJf7chgwOV9guThhutVJokb6pLxzt7fSht7,iv:j8+X1AmuWzIJdafzgrE7WBIlZ7coNNi0/Zn6JObR6rw=,tag:fiw6M2/6nfEPqEgV2YOWLg==,type:str]
 lms-spotify: ENC[AES256_GCM,data:gh5kx/MDSefNLbZsnovRc3rNWxp/RTrJ4A2WIs1QMi4JVGFj9SppdsErMXW4y/IFj/YxH1X7JtwvhptO/p3P2CFK0XL2I1vFVqPuj7LavDHJK7GXPAV6+x17ldvPXgym5NqHjzHi4gtj7U/bMJlz0NxrFsrrjMcY9nmNX2vVwKlINUFqWb1JRvQsJ8ujSutjJbGtAY/bVQI8OFtU29QGKw1CU3RH/bgXIzxGiLQsUd68w7N17oKYj8MiTpGVcovMCRKwwUbd9w==,iv:4aVy+r//s1Cs9q4GasR3vSAb8b/VB/8Mx5E1jWAUA+E=,tag:TgTSLLH1OG9ySi2tZ+hK1Q==,type:str]
 sops:
-    kms: []
-    gcp_kms: []
-    azure_kv: []
-    hc_vault: []
     age:
         - recipient: age14grjcxaq4h55yfnjxvnqhtswxhj9sfdcvyas4lwvpa8py27pjy2sv3g6v7
           enc: |
@@ -62,8 +60,7 @@ sops:
             WXJpUUxadERyYUExRFMzNzBXaUVET3cKG9ZwWy5YvTr/BAw/i+ZJos5trwRvaW5j
             eV/SHiEteZZtCuCVFAp3iolE/mJyu97nA2yFwWaLN86h+/xkOJsdqA==
             -----END AGE ENCRYPTED FILE-----
-    lastmodified: "2025-05-29T18:23:13Z"
-    mac: ENC[AES256_GCM,data:19U1KlPoC/hj8sGRjO3j/ONYcFvmUTul6qP6CaRE0BhJfpeaVYq5OvqdErVnw8UA/zBJ+zpSX/N13jcsx8QVqTljMha2fbx7iZxMbpVgzGZ+fhwICLri6PwT/sNLXKFrv8VZqNUYR5q+PWSlKCu8QQarDPvGR6qj4gm7VN7tVsI=,iv:udieJwN63LEeCRhZrLpMN6VCHBzAYt8BeJhbbLVxwCM=,tag:M6iYQb/b7vMoM+9e5is3hw==,type:str]
-    pgp: []
+    lastmodified: "2025-05-31T08:08:02Z"
+    mac: ENC[AES256_GCM,data:p6FHDa6Xfd66pH4zB8s6nhGGk2Ha2YTC/wUsCrqu+9M01VQ7qv9tha1MpKMj9TUxSPSxPOI++5zkNi5LJbs4Y4q0KH4yd9w/guMmJB2+d2YUwNCTofvmQp3wS1KtaRbaai6mAXZELaVEsRkmwUdkdApNbSZkTZgDc+CMH7OmHbs=,iv:w/kv2wRO6N4k1U7y8efS7LXhrpMxkZ9kTs3lFo23MA8=,tag:F4rZGG00AQZLfGU3djgW8Q==,type:str]
     unencrypted_suffix: _unencrypted
-    version: 3.9.4
+    version: 3.10.2

From 81f04c6c51ae6febcbe9e5673ca67d3627361a05 Mon Sep 17 00:00:00 2001
From: Dominik Polakovics <dominik.polakovics@cloonar.com>
Date: Sat, 31 May 2025 12:53:02 +0200
Subject: [PATCH 6/6] refactor: remove unused MAC address entry from dnsmasq
 configuration, update gitea-vm to include network settings, enhance
 grafana-monitor with internet connectivity check, and clean up web module
 imports

---
 hosts/fw/modules/dnsmasq.nix         |  1 -
 hosts/fw/modules/gitea-vm.nix        |  9 ++++++++-
 hosts/fw/modules/grafana-monitor.nix | 10 ++++++++++
 hosts/fw/modules/web/default.nix     |  1 -
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/hosts/fw/modules/dnsmasq.nix b/hosts/fw/modules/dnsmasq.nix
index 8110621..8be86a7 100644
--- a/hosts/fw/modules/dnsmasq.nix
+++ b/hosts/fw/modules/dnsmasq.nix
@@ -70,7 +70,6 @@
         "24:df:a7:b1:1b:74,${config.networkPrefix}.96.101,rmproplus-b1-1b-74"
         
         "1a:c4:04:6e:29:bd,${config.networkPrefix}.97.2,omada"
-        "02:00:00:00:00:03,${config.networkPrefix}.97.5,web-02"
         "02:00:00:00:00:04,${config.networkPrefix}.97.6,matrix"
         "ea:db:d4:c1:18:ba,${config.networkPrefix}.97.50,git"
         "c2:4f:64:dd:13:0c,${config.networkPrefix}.97.20,home-assistant"
diff --git a/hosts/fw/modules/gitea-vm.nix b/hosts/fw/modules/gitea-vm.nix
index b9c65b5..d202cd4 100644
--- a/hosts/fw/modules/gitea-vm.nix
+++ b/hosts/fw/modules/gitea-vm.nix
@@ -1,4 +1,4 @@
-{ lib, nixpkgs, pkgs, ... }: let
+{ config, lib, nixpkgs, pkgs, ... }: let
   # hostname = "git-02";
   # json = pkgs.formats.json { };
   runners = ["git-runner-1" "git-runner-2"];
@@ -38,6 +38,13 @@ in {
         ];
       };
 
+      systemd.network.networks."10-lan" = {
+        matchConfig.PermanentMACAddress = "02:00:00:00:00:0${toString idx}";
+        address = [ "${config.networkPrefix}.97.5${toString idx}/24" ];
+        gateway = [ "${config.networkPrefix}.97.1" ];
+        dns = [ "${config.networkPrefix}.97.1" ];
+      };
+
       networking.hostName = runner;
 
       virtualisation.podman.enable = true;
diff --git a/hosts/fw/modules/grafana-monitor.nix b/hosts/fw/modules/grafana-monitor.nix
index b8effdb..c99b9fc 100644
--- a/hosts/fw/modules/grafana-monitor.nix
+++ b/hosts/fw/modules/grafana-monitor.nix
@@ -46,6 +46,16 @@ let
     fi
     PUSHOVER_USER_KEY=$(cat "''${PUSHOVER_USER_KEY_FILE}")
 
+    # Internet connectivity check: if this host itself is offline, skip the Grafana probe without counting a failure
+    INTERNET_CHECK_URL="https://1.1.1.1" # Using a reliable IP to bypass potential DNS issues for the check itself
+    echo "Performing internet connectivity check to ''${INTERNET_CHECK_URL}..."
+    if ! ${pkgs.curl}/bin/curl --head --silent --fail --connect-timeout 3 --max-time 5 "''${INTERNET_CHECK_URL}" > /dev/null 2>&1; then
+      echo "Internet connectivity check failed. Cannot reach ''${INTERNET_CHECK_URL}. Skipping Grafana check and exiting successfully."
+      exit 0
+    else
+      echo "Internet connectivity check successful. Proceeding with Grafana check."
+    fi
+    echo "" # Add a blank line for readability before Grafana check logs
     echo "Checking Grafana at ''${GRAFANA_URL}..."
     ACTUAL_HTTP_CODE="000" # Default if curl doesn't provide one
     CURL_ERROR_MESSAGE=""
diff --git a/hosts/fw/modules/web/default.nix b/hosts/fw/modules/web/default.nix
index 14b06c5..3fcfad8 100644
--- a/hosts/fw/modules/web/default.nix
+++ b/hosts/fw/modules/web/default.nix
@@ -52,7 +52,6 @@ in {
           ../network-prefix.nix
           ../../utils/modules/sops.nix
           ../../utils/modules/lego/lego.nix
-          ../../modules/tinder-api.nix
           # ../../utils/modules/borgbackup.nix
 
           ./zammad.nix