From 537f144885fc8314a11058c69327332132b679c4 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Fri, 28 Nov 2025 23:50:24 +0100 Subject: [PATCH] feat: add smart alerting and noatime to disks --- hosts/nas/configuration.nix | 3 + hosts/nas/hardware-configuration.nix | 10 + hosts/nas/modules/disk-monitoring.nix | 192 +++++++++++ hosts/nas/modules/power-management.nix | 19 ++ .../grafana/alerting/storage/default.nix | 17 + .../grafana/alerting/storage/raid_alerts.nix | 102 ++++++ .../grafana/alerting/storage/smart_alerts.nix | 298 ++++++++++++++++++ hosts/web-arm/modules/grafana/default.nix | 1 + 8 files changed, 642 insertions(+) create mode 100644 hosts/nas/modules/disk-monitoring.nix create mode 100644 hosts/nas/modules/power-management.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/default.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix diff --git a/hosts/nas/configuration.nix b/hosts/nas/configuration.nix index d07c2e5..7cddbcb 100644 --- a/hosts/nas/configuration.nix +++ b/hosts/nas/configuration.nix @@ -9,9 +9,12 @@ in { "${impermanence}/nixos.nix" ./utils/bento.nix ./utils/modules/sops.nix + ./utils/modules/victoriametrics/default.nix ./modules/pyload.nix ./modules/jellyfin.nix + ./modules/power-management.nix + ./modules/disk-monitoring.nix ./hardware-configuration.nix ]; diff --git a/hosts/nas/hardware-configuration.nix b/hosts/nas/hardware-configuration.nix index a3c3edd..077f5e9 100644 --- a/hosts/nas/hardware-configuration.nix +++ b/hosts/nas/hardware-configuration.nix @@ -16,6 +16,14 @@ boot.kernelModules = [ "kvm-intel" ]; boot.extraModulePackages = [ ]; + # Power management kernel parameters + boot.kernelParams = [ + "intel_pstate=passive" # Better with powersave governor + "i915.enable_rc6=1" # GPU deep sleep states + "i915.enable_dc=2" # Display C-states (deepest) + "i915.enable_fbc=1" # Frame buffer compression + ]; + # RAID 1 array for data storage boot.swraid = { enable = true; @@ -78,11 +86,13 @@ fileSystems."/var/lib/downloads" = { device = "/dev/vg-data/lv-downloads"; fsType = "xfs"; + options = [ "noatime" ]; }; fileSystems."/var/lib/multimedia" = { device = "/dev/vg-data/lv-multimedia"; fsType = "xfs"; + options = [ "noatime" ]; }; # DHCP networking diff --git a/hosts/nas/modules/disk-monitoring.nix b/hosts/nas/modules/disk-monitoring.nix new file mode 100644 index 0000000..da42bcb --- /dev/null +++ b/hosts/nas/modules/disk-monitoring.nix @@ -0,0 +1,192 @@ +# Disk monitoring for NAS +# - S.M.A.R.T. metrics collection (respects disk spindown) +# - mdadm RAID array status +# - Exports metrics via node_exporter textfile collector +{ config, lib, pkgs, ... }: + +let + # Disk identifiers from hardware-configuration.nix + disks = [ + "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52TBSB" + "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52V9QX" + ]; + + textfileDir = "/var/lib/prometheus-node-exporter"; + + # Script to collect S.M.A.R.T. 
and mdadm metrics + collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" '' + set -euo pipefail + + TEXTFILE_DIR="${textfileDir}" + METRICS_FILE="$TEXTFILE_DIR/disk_health.prom" + TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp" + + mkdir -p "$TEXTFILE_DIR" + : > "$TEMP_FILE" + + # Timestamp of collection + echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE" + echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE" + echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE" + + echo "" >> "$TEMP_FILE" + echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE" + echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE" + + # S.M.A.R.T. metrics for each disk + for disk in ${lib.concatStringsSep " " disks}; do + if [[ ! -e "$disk" ]]; then + echo "Warning: Disk $disk not found, skipping" >&2 + continue + fi + + # Resolve symlink to get actual device + device=$(readlink -f "$disk") + short_name=$(basename "$device") + + # Extract serial from disk ID for labels + serial=$(basename "$disk" | sed 's/ata-ST18000NM000J-2TV103_//') + + # Check power state without waking disk + power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null | grep -oP '(standby|active/idle|active|idle)' | head -1 || echo "unknown") + + if [[ "$power_state" == "standby" ]]; then + # Disk is sleeping - don't wake it, report inactive + echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 0" >> "$TEMP_FILE" + echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2 + continue + fi + + # Disk is active - collect S.M.A.R.T. data + echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 1" >> "$TEMP_FILE" + + # Get S.M.A.R.T. health status + if ${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null | grep -q "PASSED"; then + health=1 + else + health=0 + fi + + # Get S.M.A.R.T. attributes + smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true) + + # Parse key attributes + # Format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE + + get_raw_value() { + local attr_id="$1" + echo "$smartctl_output" | awk -v id="$attr_id" '$1 == id { print $10 }' | head -1 + } + + reallocated=$(get_raw_value "5") + power_on_hours=$(get_raw_value "9") + temperature=$(get_raw_value "194") + reallocated_event=$(get_raw_value "196") + pending_sector=$(get_raw_value "197") + offline_uncorrectable=$(get_raw_value "198") + udma_crc_error=$(get_raw_value "199") + + # Output metrics + cat >> "$TEMP_FILE" << EOF + +# S.M.A.R.T. 
metrics for $short_name +smart_health_passed{device="$short_name",serial="$serial"} $health +EOF + + [[ -n "$reallocated" ]] && echo "smart_reallocated_sector_ct{device=\"$short_name\",serial=\"$serial\"} $reallocated" >> "$TEMP_FILE" + [[ -n "$power_on_hours" ]] && echo "smart_power_on_hours{device=\"$short_name\",serial=\"$serial\"} $power_on_hours" >> "$TEMP_FILE" + [[ -n "$temperature" ]] && echo "smart_temperature_celsius{device=\"$short_name\",serial=\"$serial\"} $temperature" >> "$TEMP_FILE" + [[ -n "$reallocated_event" ]] && echo "smart_reallocated_event_count{device=\"$short_name\",serial=\"$serial\"} $reallocated_event" >> "$TEMP_FILE" + [[ -n "$pending_sector" ]] && echo "smart_current_pending_sector{device=\"$short_name\",serial=\"$serial\"} $pending_sector" >> "$TEMP_FILE" + [[ -n "$offline_uncorrectable" ]] && echo "smart_offline_uncorrectable{device=\"$short_name\",serial=\"$serial\"} $offline_uncorrectable" >> "$TEMP_FILE" + [[ -n "$udma_crc_error" ]] && echo "smart_udma_crc_error_count{device=\"$short_name\",serial=\"$serial\"} $udma_crc_error" >> "$TEMP_FILE" + done + + # mdadm RAID array status (doesn't access disks) + echo "" >> "$TEMP_FILE" + echo "# HELP mdadm_array_state RAID array state (1=clean, 0=degraded/other)" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE" + echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE" + echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE" + + # Find RAID arrays + for md_device in /dev/md/*; do + [[ -e "$md_device" ]] || continue + + array_name=$(basename "$md_device") + + # Get array details + mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null || continue) + + # Parse state + state=$(echo "$mdadm_output" | grep "State :" | sed 's/.*State : //' | tr -d ' ') + if [[ "$state" == "clean" ]] || [[ "$state" == "active" ]]; then + state_value=1 + else + state_value=0 + fi + + # Parse device counts + total_devices=$(echo "$mdadm_output" | grep "Raid Devices" | awk '{print $4}') + active_devices=$(echo "$mdadm_output" | grep "Active Devices" | awk '{print $4}') + + echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE" + [[ -n "$total_devices" ]] && echo "mdadm_array_devices_total{array=\"$array_name\"} $total_devices" >> "$TEMP_FILE" + [[ -n "$active_devices" ]] && echo "mdadm_array_devices_active{array=\"$array_name\"} $active_devices" >> "$TEMP_FILE" + done + + # Atomically replace the metrics file + mv "$TEMP_FILE" "$METRICS_FILE" + + echo "Disk metrics collection complete" + ''; +in +{ + # Required packages + environment.systemPackages = with pkgs; [ + smartmontools + hdparm + mdadm + ]; + + # Node exporter with textfile collector + services.prometheus.exporters.node = { + enable = true; + enabledCollectors = [ + "textfile" + "systemd" + ]; + extraFlags = [ + "--collector.textfile.directory=${textfileDir}" + ]; + }; + + # Systemd service to collect metrics + systemd.services.disk-metrics = { + description = "Collect S.M.A.R.T. 
and RAID metrics"; + path = with pkgs; [ coreutils gawk gnugrep gnused ]; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${collectMetricsScript}"; + # Run as root to access disk devices + User = "root"; + }; + }; + + # Timer to run every 20 minutes (5min buffer for 15min spindown) + systemd.timers.disk-metrics = { + wantedBy = [ "timers.target" ]; + timerConfig = { + OnCalendar = "*:0/20"; # Every 20 minutes + RandomizedDelaySec = "1min"; + Persistent = true; + }; + }; + + # Ensure textfile directory exists and is persisted + systemd.tmpfiles.rules = [ + "d ${textfileDir} 0755 root root -" + ]; +} diff --git a/hosts/nas/modules/power-management.nix b/hosts/nas/modules/power-management.nix new file mode 100644 index 0000000..1d0b338 --- /dev/null +++ b/hosts/nas/modules/power-management.nix @@ -0,0 +1,19 @@ +# Power management for NAS +# - CPU powersave governor (scales up on demand for transcoding) +# - Disk spindown after 15 minutes idle +{ config, lib, pkgs, ... }: + +{ + # CPU Power Management - powersave scales up on demand for transcoding + powerManagement.cpuFreqGovernor = "powersave"; + + # Disk spindown - hdparm for Seagate 18TB drives + environment.systemPackages = [ pkgs.hdparm ]; + + services.udev.extraRules = '' + # Seagate 18TB NAS drives - APM 127 allows spindown, -S 180 = 15 min + ACTION=="add", KERNEL=="sd[a-z]", SUBSYSTEM=="block", \ + ATTRS{model}=="ST18000NM000J*", \ + RUN+="${pkgs.hdparm}/bin/hdparm -B 127 -S 180 /dev/%k" + ''; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/default.nix b/hosts/web-arm/modules/grafana/alerting/storage/default.nix new file mode 100644 index 0000000..8b63271 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/default.nix @@ -0,0 +1,17 @@ +{ lib, pkgs, config, ... }: +let + smartAlertRules = (import ./smart_alerts.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + raidAlertRules = (import ./raid_alerts.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + + allStorageRules = smartAlertRules ++ raidAlertRules; +in +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "Storage Alerts"; + folder = "Storage Alerts"; + interval = "5m"; # Check every 5 minutes (metrics collected every 20 min) + rules = allStorageRules; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix b/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix new file mode 100644 index 0000000..82ad73e --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix @@ -0,0 +1,102 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + # RAID array degraded - critical + { + uid = "raid-array-degraded-uid"; + title = "RaidArrayDegraded"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''mdadm_array_state == 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C == 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "RAID array {{ $labels.array }} is degraded"; + description = '' + RAID array {{ $labels.array }} on {{ $labels.instance }} is in state "{{ $labels.state }}". + The array is not in a healthy state. Check for failed disks immediately! 
+ ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + + # RAID missing devices - critical + { + uid = "raid-missing-devices-uid"; + title = "RaidMissingDevices"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''mdadm_array_devices_active < mdadm_array_devices_total''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "RAID array {{ $labels.array }} has missing devices"; + description = '' + RAID array {{ $labels.array }} on {{ $labels.instance }} has fewer active devices than expected. + A disk may have failed or been removed. Check array status immediately! + ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix b/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix new file mode 100644 index 0000000..dd36462 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix @@ -0,0 +1,298 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + # S.M.A.R.T. overall health failed - critical + { + uid = "smart-health-failed-uid"; + title = "DiskSmartHealthFailed"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_health_passed == 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C == 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check. + This indicates imminent disk failure. Replace the disk immediately! + ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + + # Reallocated sectors - warning (any count > 0 is concerning) + { + uid = "smart-reallocated-sectors-uid"; + title = "DiskReallocatedSectors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_reallocated_sector_ct > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Reallocated sectors detected on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors. + This indicates disk surface damage. Monitor closely and plan replacement. 
+ ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # Current pending sectors + { + uid = "smart-pending-sectors-uid"; + title = "DiskPendingSectors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_current_pending_sector > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Pending sectors detected on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors. + These sectors could not be read and may be reallocated. Monitor for increase. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # Offline uncorrectable errors + { + uid = "smart-offline-uncorrectable-uid"; + title = "DiskOfflineUncorrectable"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_offline_uncorrectable > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Offline uncorrectable errors on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors. + This indicates data integrity issues. Consider replacement. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # High temperature (Seagate enterprise: warning at 50C) + { + uid = "smart-high-temperature-uid"; + title = "DiskHighTemperature"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 600; to = 0; }; + model = { + expr = ''smart_temperature_celsius > 50''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "10m"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "High temperature on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C. + Check cooling and ventilation. 
+ ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # UDMA CRC errors (cable/connection issues) + { + uid = "smart-udma-crc-errors-uid"; + title = "DiskUDMACRCErrors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 86400; to = 0; }; + model = { + expr = ''increase(smart_udma_crc_error_count[24h]) > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "UDMA CRC errors on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors. + This typically indicates SATA cable or connection issues. Check cables. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix index 3ac5e10..881f802 100644 --- a/hosts/web-arm/modules/grafana/default.nix +++ b/hosts/web-arm/modules/grafana/default.nix @@ -31,6 +31,7 @@ in ./alerting/system/default.nix ./alerting/service/default.nix ./alerting/websites/default.nix + # ./alerting/storage/default.nix ./datasources/victoriametrics.nix ./datasources/loki.nix