feat: add smart alerting and noatime to disks

2025-11-28 23:50:24 +01:00
parent dbada3c509
commit 537f144885
8 changed files with 642 additions and 0 deletions
--- a/hosts/nas/configuration.nix
+++ b/hosts/nas/configuration.nix
@@ -9,9 +9,12 @@ in {
    "${impermanence}/nixos.nix"
    ./utils/bento.nix
    ./utils/modules/sops.nix
    ./utils/modules/victoriametrics/default.nix
    ./modules/pyload.nix
    ./modules/jellyfin.nix
    ./modules/power-management.nix
    ./modules/disk-monitoring.nix
    ./hardware-configuration.nix
  ];
--- a/hosts/nas/hardware-configuration.nix
+++ b/hosts/nas/hardware-configuration.nix
@@ -16,6 +16,14 @@
  boot.kernelModules = [ "kvm-intel" ];
  boot.extraModulePackages = [ ];
  # Power management kernel parameters
  # (appended to the kernel command line; each entry is "<module>.<param>=<value>"
  # or a bare core-kernel option).
  boot.kernelParams = [
    "intel_pstate=passive"    # intel_pstate in passive mode. NOTE(review): generic cpufreq governors apply in this mode - confirm the configured governor really scales up under load
    "i915.enable_rc6=1"       # GPU deep sleep states. NOTE(review): this parameter may no longer exist on current kernels - verify it is still accepted
    "i915.enable_dc=2"        # Display C-states (deepest)
    "i915.enable_fbc=1"       # Frame buffer compression
  ];
  # RAID 1 array for data storage
  boot.swraid = {
    enable = true;
@@ -78,11 +86,13 @@
  # Downloads volume (XFS on /dev/vg-data/lv-downloads).
  # noatime: do not update file access times on reads.
  # Presumably chosen so that pure reads cause no metadata writes on the
  # spun-down data disks - TODO confirm.
  fileSystems."/var/lib/downloads" = {
    device = "/dev/vg-data/lv-downloads";
    fsType = "xfs";
    options = [ "noatime" ];
  };
  # Multimedia volume (XFS on /dev/vg-data/lv-multimedia), same mount options
  # as the downloads volume above.
  fileSystems."/var/lib/multimedia" = {
    device = "/dev/vg-data/lv-multimedia";
    fsType = "xfs";
    options = [ "noatime" ];
  };
  # DHCP networking
--- a/hosts/nas/modules/disk-monitoring.nix
+++ b/hosts/nas/modules/disk-monitoring.nix
@@ -0,0 +1,192 @@
 # Disk monitoring for NAS
 # - S.M.A.R.T. metrics collection (respects disk spindown)
 # - mdadm RAID array status
 # - Exports metrics via node_exporter textfile collector
 { config, lib, pkgs, ... }:
 let
  # Disk identifiers from hardware-configuration.nix.
  # Stable /dev/disk/by-id paths are used instead of /dev/sdX; the collection
  # script resolves each one to its kernel device name at run time.
  disks = [
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52TBSB"
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52V9QX"
  ];
  # Directory shared between the collection script (which writes
  # disk_health.prom into it) and node_exporter's textfile collector
  # (which is pointed at it via --collector.textfile.directory).
  textfileDir = "/var/lib/prometheus-node-exporter";
  # Script to collect S.M.A.R.T. and mdadm metrics.
  #
  # Writes Prometheus text-format samples to a temp file in textfileDir and
  # renames it over disk_health.prom at the end, so node_exporter never sees a
  # partially written file.
  #
  # Emitted series:
  #   disk_metrics_last_update            unix time of this run
  #   smart_device_active{device,serial}  1 = spinning, 0 = standby/sleeping
  #   smart_*{device,serial}              only for disks that were spinning
  #   mdadm_array_*{array}                for every array under /dev/md/
  #
  # The script runs under "set -euo pipefail", so every external command whose
  # non-zero exit status is expected (smartctl status bits, mdadm on a stopped
  # array, a pattern that does not match) is captured explicitly instead of
  # being left inside a pipeline.
  collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
    set -euo pipefail

    TEXTFILE_DIR="${textfileDir}"
    METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
    TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

    mkdir -p "$TEXTFILE_DIR"
    : > "$TEMP_FILE"

    # Print the RAW_VALUE column of S.M.A.R.T. attribute id $1, taken from the
    # "smartctl -A" table passed as $2.
    # Table format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
    # Prints nothing when the attribute is absent or its raw value is not a
    # plain integer (e.g. "1234h+56m"): a non-numeric sample would make the
    # whole .prom file unparseable.
    get_raw_value() {
      awk -v id="$1" '$1 == id && $10 ~ /^[0-9]+$/ { print $10; exit }' <<< "$2"
    }

    # Append one sample "<metric $1>{device,serial} <value $2>" for the disk
    # currently being processed. Does nothing when the value is empty.
    emit_smart_metric() {
      if [[ -n "$2" ]]; then
        printf '%s{device="%s",serial="%s"} %s\n' "$1" "$short_name" "$serial" "$2" >> "$TEMP_FILE"
      fi
    }

    # Print the value of the "<field $1> : <value>" line from the
    # "mdadm --detail" output passed as $2, with all spaces removed.
    # Prints nothing when the field is missing.
    mdadm_field() {
      awk -F' : ' -v field="$1" '{ key = $1; sub(/^ +/, "", key) } key == field { gsub(/ /, "", $2); print $2; exit }' <<< "$2"
    }

    # Timestamp of collection
    echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE"
    echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE"
    echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE"
    echo "" >> "$TEMP_FILE"
    echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE"
    echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE"

    # S.M.A.R.T. metrics for each disk
    for disk in ${lib.escapeShellArgs disks}; do
      if [[ ! -e "$disk" ]]; then
        echo "Warning: Disk $disk not found, skipping" >&2
        continue
      fi

      # Resolve symlink to get actual device
      device=$(readlink -f "$disk")
      short_name=$(basename "$device")
      # by-id names end in "_<serial>"; keep only the part after the last "_"
      serial=$(basename "$disk" | sed 's/.*_//')

      # Check power state without waking disk. The output is captured first so
      # that hdparm's exit status cannot influence the decision.
      hdparm_output=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null || true)
      case "$hdparm_output" in
        *standby*|*sleeping*)
          # Disk is spun down - don't wake it, report inactive
          emit_smart_metric smart_device_active 0
          echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2
          continue
          ;;
      esac

      # Disk is active - collect S.M.A.R.T. data
      emit_smart_metric smart_device_active 1

      # Overall health verdict. smartctl encodes findings in its exit status
      # even when the verdict is PASSED, so only the text is evaluated; with
      # the call inside a pipeline, pipefail would turn such a status into a
      # false health=0.
      health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
      if [[ "$health_output" == *PASSED* ]]; then
        health=1
      else
        health=0
      fi

      # Attribute table (same exit-status caveat as above)
      smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

      echo "# S.M.A.R.T. metrics for $short_name" >> "$TEMP_FILE"
      emit_smart_metric smart_health_passed "$health"
      emit_smart_metric smart_reallocated_sector_ct "$(get_raw_value 5 "$smartctl_output")"
      emit_smart_metric smart_power_on_hours "$(get_raw_value 9 "$smartctl_output")"
      emit_smart_metric smart_temperature_celsius "$(get_raw_value 194 "$smartctl_output")"
      emit_smart_metric smart_reallocated_event_count "$(get_raw_value 196 "$smartctl_output")"
      emit_smart_metric smart_current_pending_sector "$(get_raw_value 197 "$smartctl_output")"
      emit_smart_metric smart_offline_uncorrectable "$(get_raw_value 198 "$smartctl_output")"
      emit_smart_metric smart_udma_crc_error_count "$(get_raw_value 199 "$smartctl_output")"
    done

    # mdadm RAID array status (doesn't access disks)
    echo "" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_state RAID array state (1=clean, 0=degraded/other)" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE"

    # Find RAID arrays
    for md_device in /dev/md/*; do
      # An unmatched glob stays literal; skip it
      [[ -e "$md_device" ]] || continue
      array_name=$(basename "$md_device")

      # Get array details. The "continue" has to run in this shell: inside
      # the command substitution it would only leave the subshell and the
      # loop body would carry on with empty output.
      if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
        echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
        continue
      fi

      # Parse state (e.g. "clean", "active", "clean,degraded")
      state=$(mdadm_field "State" "$mdadm_output")
      if [[ "$state" == "clean" || "$state" == "active" ]]; then
        state_value=1
      else
        state_value=0
      fi

      # Parse device counts
      total_devices=$(mdadm_field "Raid Devices" "$mdadm_output")
      active_devices=$(mdadm_field "Active Devices" "$mdadm_output")

      echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE"
      if [[ -n "$total_devices" ]]; then
        echo "mdadm_array_devices_total{array=\"$array_name\"} $total_devices" >> "$TEMP_FILE"
      fi
      if [[ -n "$active_devices" ]]; then
        echo "mdadm_array_devices_active{array=\"$array_name\"} $active_devices" >> "$TEMP_FILE"
      fi
    done

    # Atomically replace the metrics file
    mv "$TEMP_FILE" "$METRICS_FILE"
    echo "Disk metrics collection complete"
  '';
 in
 {
  # Command-line tools for manual inspection on the host
  environment.systemPackages = [
    pkgs.smartmontools
    pkgs.hdparm
    pkgs.mdadm
  ];

  # Node exporter with the textfile collector pointed at textfileDir
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "textfile" "systemd" ];
    extraFlags = [ "--collector.textfile.directory=${textfileDir}" ];
  };

  systemd = {
    # One-shot unit that runs the collection script
    services.disk-metrics = {
      description = "Collect S.M.A.R.T. and RAID metrics";
      # Generic tools the script calls by bare name
      path = [ pkgs.coreutils pkgs.gawk pkgs.gnugrep pkgs.gnused ];
      serviceConfig = {
        Type = "oneshot";
        ExecStart = "${collectMetricsScript}";
        # Root is needed to access the disk devices
        User = "root";
      };
    };

    # Fire the unit every 20 minutes (5min buffer for 15min spindown)
    timers.disk-metrics = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnCalendar = "*:0/20";  # minute 0, 20 and 40 of every hour
        RandomizedDelaySec = "1min";
        Persistent = true;
      };
    };

    # Create the textfile directory at boot if it is missing
    tmpfiles.rules = [
      "d ${textfileDir} 0755 root root -"
    ];
  };
 }
--- a/hosts/nas/modules/power-management.nix
+++ b/hosts/nas/modules/power-management.nix
@@ -0,0 +1,19 @@
 # Power management for NAS
 # - CPU schedutil governor (scales frequency with load, e.g. for transcoding)
 # - Disk spindown after 15 minutes idle
 { config, lib, pkgs, ... }:
 {
  # CPU power management.
  # The host boots with intel_pstate=passive (hardware-configuration.nix). In
  # passive mode the generic cpufreq governors are used, and the generic
  # "powersave" governor always requests the lowest frequency - it never
  # scales up. "schedutil" follows scheduler load, which is what on-demand
  # transcoding needs.
  powerManagement.cpuFreqGovernor = "schedutil";

  # Disk spindown - hdparm for Seagate 18TB drives
  environment.systemPackages = [ pkgs.hdparm ];

  # Applied by udev whenever a matching whole-disk device (sdX) appears:
  #   -B 127  highest APM level that still permits spin-down
  #   -S 180  standby timeout, 180 * 5 s = 15 min
  services.udev.extraRules = ''
    # Seagate 18TB NAS drives - APM 127 allows spindown, -S 180 = 15 min
    ACTION=="add", KERNEL=="sd[a-z]", SUBSYSTEM=="block", \
      ATTRS{model}=="ST18000NM000J*", \
      RUN+="${pkgs.hdparm}/bin/hdparm -B 127 -S 180 /dev/%k"
  '';
 }
--- a/hosts/web-arm/modules/grafana/alerting/storage/default.nix
+++ b/hosts/web-arm/modules/grafana/alerting/storage/default.nix
@@ -0,0 +1,17 @@
 { lib, pkgs, config, ... }:
 let
  # Import a rule file and return its list of Grafana alert rule definitions.
  rulesFrom = file: (import file { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
 in
 {
  # Single provisioning group holding every storage-related rule
  # (S.M.A.R.T. rules first, RAID rules after).
  services.grafana.provision.alerting.rules.settings.groups = [
    {
      name = "Storage Alerts";
      folder = "Storage Alerts";
      # Evaluate every 5 minutes (metrics collected every 20 min)
      interval = "5m";
      rules = rulesFrom ./smart_alerts.nix ++ rulesFrom ./raid_alerts.nix;
    }
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix
+++ b/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix
@@ -0,0 +1,102 @@
 { lib, pkgs, config, ... }:
 let
  # Build one critical Grafana alert rule for a RAID metric.
  #
  # Every rule is the same three-step pipeline:
  #   A  runs `expr` against the VictoriaMetrics datasource (last 5 minutes)
  #   C  reduces each returned series to its last sample
  #   D  evaluates `mathExpr` on that value; D is the alert condition
  #
  # Arguments:
  #   uid, title            rule identity
  #   expr                  query for step A; a filtering comparison that only
  #                         returns series while the problem exists
  #   mathExpr              condition on $C for step D
  #   summary, description  annotation texts
  #
  # Because `expr` returns no series at all while the array is healthy,
  # noDataState must be "OK". With "NoData" the rule would sit in the NoData
  # state whenever nothing is wrong.
  mkRaidRule = { uid, title, expr, mathExpr, summary, description }: {
    inherit uid title;
    condition = "D";
    data = [
      {
        refId = "A";
        datasourceUid = "vm-datasource-uid";
        relativeTimeRange = { from = 300; to = 0; };
        model = {
          inherit expr;
          instant = false;
        };
      }
      {
        refId = "C";
        datasourceUid = "__expr__";
        model = {
          type = "reduce";
          expression = "A";
          reducer = "last";
        };
      }
      {
        refId = "D";
        datasourceUid = "__expr__";
        model = {
          type = "math";
          expression = mathExpr;
        };
      }
    ];
    for = "0s";
    noDataState = "OK";
    execErrState = "Error";
    annotations = { inherit summary description; };
    labels = {
      severity = "critical";
      category = "storage";
    };
  };
 in
 {
  grafanaAlertRuleDefinitions = [
    # RAID array degraded - critical
    (mkRaidRule {
      uid = "raid-array-degraded-uid";
      title = "RaidArrayDegraded";
      expr = "mdadm_array_state == 0";
      # The filtered series carries the metric value itself, i.e. 0
      mathExpr = "$C == 0";
      summary = "RAID array {{ $labels.array }} is degraded";
      description = ''
        RAID array {{ $labels.array }} on {{ $labels.instance }} is in state "{{ $labels.state }}".
        The array is not in a healthy state. Check for failed disks immediately!
      '';
    })
    # RAID missing devices - critical
    (mkRaidRule {
      uid = "raid-missing-devices-uid";
      title = "RaidMissingDevices";
      expr = "mdadm_array_devices_active < mdadm_array_devices_total";
      # The filtered series carries the number of active devices
      mathExpr = "$C > 0";
      summary = "RAID array {{ $labels.array }} has missing devices";
      description = ''
        RAID array {{ $labels.array }} on {{ $labels.instance }} has fewer active devices than expected.
        A disk may have failed or been removed. Check array status immediately!
      '';
    })
  ];
 }
--- a/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix
+++ b/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix
@@ -0,0 +1,298 @@
 { lib, pkgs, config, ... }:
 let
  # Build one Grafana alert rule for a S.M.A.R.T. metric.
  #
  # Every rule is the same three-step pipeline:
  #   A  runs `expr` against the VictoriaMetrics datasource
  #   C  reduces each returned series to its last sample
  #   D  evaluates `mathExpr` on that value; D is the alert condition
  #
  # Arguments:
  #   uid, title            rule identity
  #   expr                  query for step A; a filtering comparison that only
  #                         returns series while the problem exists
  #   summary, description  annotation texts
  #   mathExpr              condition on $C for step D (default "$C > 0")
  #   range                 query look-back in seconds (default 300)
  #   pendingFor            how long D must hold before firing (default "0s")
  #   severity              severity label (default "warning")
  #
  # noDataState is "OK" on purpose: `expr` returns nothing while the disk is
  # healthy, and the smart_* series are not written at all while a disk is in
  # standby. With "NoData" the rules would sit in the NoData state during
  # normal operation.
  mkSmartRule =
    { uid
    , title
    , expr
    , summary
    , description
    , mathExpr ? "$C > 0"
    , range ? 300
    , pendingFor ? "0s"
    , severity ? "warning"
    }: {
      inherit uid title;
      condition = "D";
      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          relativeTimeRange = { from = range; to = 0; };
          model = {
            inherit expr;
            instant = false;
          };
        }
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = mathExpr;
          };
        }
      ];
      for = pendingFor;
      noDataState = "OK";
      execErrState = "Error";
      annotations = { inherit summary description; };
      labels = {
        inherit severity;
        category = "storage";
      };
    };
 in
 {
  grafanaAlertRuleDefinitions = [
    # S.M.A.R.T. overall health failed - critical
    (mkSmartRule {
      uid = "smart-health-failed-uid";
      title = "DiskSmartHealthFailed";
      expr = "smart_health_passed == 0";
      # The filtered series carries the metric value itself, i.e. 0
      mathExpr = "$C == 0";
      severity = "critical";
      summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check.
        This indicates imminent disk failure. Replace the disk immediately!
      '';
    })
    # Reallocated sectors - warning (any count > 0 is concerning)
    (mkSmartRule {
      uid = "smart-reallocated-sectors-uid";
      title = "DiskReallocatedSectors";
      expr = "smart_reallocated_sector_ct > 0";
      summary = "Reallocated sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors.
        This indicates disk surface damage. Monitor closely and plan replacement.
      '';
    })
    # Current pending sectors
    (mkSmartRule {
      uid = "smart-pending-sectors-uid";
      title = "DiskPendingSectors";
      expr = "smart_current_pending_sector > 0";
      summary = "Pending sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors.
        These sectors could not be read and may be reallocated. Monitor for increase.
      '';
    })
    # Offline uncorrectable errors
    (mkSmartRule {
      uid = "smart-offline-uncorrectable-uid";
      title = "DiskOfflineUncorrectable";
      expr = "smart_offline_uncorrectable > 0";
      summary = "Offline uncorrectable errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors.
        This indicates data integrity issues. Consider replacement.
      '';
    })
    # High temperature (Seagate enterprise: warning at 50C)
    (mkSmartRule {
      uid = "smart-high-temperature-uid";
      title = "DiskHighTemperature";
      expr = "smart_temperature_celsius > 50";
      range = 600;
      pendingFor = "10m";
      summary = "High temperature on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C.
        Check cooling and ventilation.
      '';
    })
    # UDMA CRC errors (cable/connection issues)
    (mkSmartRule {
      uid = "smart-udma-crc-errors-uid";
      title = "DiskUDMACRCErrors";
      expr = "increase(smart_udma_crc_error_count[24h]) > 0";
      range = 86400;
      summary = "UDMA CRC errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors.
        This typically indicates SATA cable or connection issues. Check cables.
      '';
    })
  ];
 }
--- a/hosts/web-arm/modules/grafana/default.nix
+++ b/hosts/web-arm/modules/grafana/default.nix
@@ -31,6 +31,7 @@ in
    ./alerting/system/default.nix
    ./alerting/service/default.nix
    ./alerting/websites/default.nix
    # ./alerting/storage/default.nix
    ./datasources/victoriametrics.nix
    ./datasources/loki.nix