From 537f144885fc8314a11058c69327332132b679c4 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Fri, 28 Nov 2025 23:50:24 +0100 Subject: [PATCH] feat: add smart alerting and noatime to disks --- hosts/nas/configuration.nix | 3 + hosts/nas/hardware-configuration.nix | 10 + hosts/nas/modules/disk-monitoring.nix | 192 +++++++++++ hosts/nas/modules/power-management.nix | 19 ++ .../grafana/alerting/storage/default.nix | 17 + .../grafana/alerting/storage/raid_alerts.nix | 102 ++++++ .../grafana/alerting/storage/smart_alerts.nix | 298 ++++++++++++++++++ hosts/web-arm/modules/grafana/default.nix | 1 + 8 files changed, 642 insertions(+) create mode 100644 hosts/nas/modules/disk-monitoring.nix create mode 100644 hosts/nas/modules/power-management.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/default.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix diff --git a/hosts/nas/configuration.nix b/hosts/nas/configuration.nix index d07c2e5..7cddbcb 100644 --- a/hosts/nas/configuration.nix +++ b/hosts/nas/configuration.nix @@ -9,9 +9,12 @@ in { "${impermanence}/nixos.nix" ./utils/bento.nix ./utils/modules/sops.nix + ./utils/modules/victoriametrics/default.nix ./modules/pyload.nix ./modules/jellyfin.nix + ./modules/power-management.nix + ./modules/disk-monitoring.nix ./hardware-configuration.nix ]; diff --git a/hosts/nas/hardware-configuration.nix b/hosts/nas/hardware-configuration.nix index a3c3edd..077f5e9 100644 --- a/hosts/nas/hardware-configuration.nix +++ b/hosts/nas/hardware-configuration.nix @@ -16,6 +16,14 @@ boot.kernelModules = [ "kvm-intel" ]; boot.extraModulePackages = [ ]; + # Power management kernel parameters + boot.kernelParams = [ + "intel_pstate=passive" # Better with powersave governor + "i915.enable_rc6=1" # GPU deep sleep states + "i915.enable_dc=2" # Display C-states (deepest) + "i915.enable_fbc=1" # Frame buffer compression + ]; + # RAID 1 array for data storage boot.swraid = { enable = true; @@ -78,11 +86,13 @@ fileSystems."/var/lib/downloads" = { device = "/dev/vg-data/lv-downloads"; fsType = "xfs"; + options = [ "noatime" ]; }; fileSystems."/var/lib/multimedia" = { device = "/dev/vg-data/lv-multimedia"; fsType = "xfs"; + options = [ "noatime" ]; }; # DHCP networking diff --git a/hosts/nas/modules/disk-monitoring.nix b/hosts/nas/modules/disk-monitoring.nix new file mode 100644 index 0000000..da42bcb --- /dev/null +++ b/hosts/nas/modules/disk-monitoring.nix @@ -0,0 +1,192 @@ +# Disk monitoring for NAS +# - S.M.A.R.T. metrics collection (respects disk spindown) +# - mdadm RAID array status +# - Exports metrics via node_exporter textfile collector +{ config, lib, pkgs, ... }: + +let + # Disk identifiers from hardware-configuration.nix + disks = [ + "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52TBSB" + "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52V9QX" + ]; + + textfileDir = "/var/lib/prometheus-node-exporter"; + + # Script to collect S.M.A.R.T. 
and mdadm metrics + collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" '' + set -euo pipefail + + TEXTFILE_DIR="${textfileDir}" + METRICS_FILE="$TEXTFILE_DIR/disk_health.prom" + TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp" + + mkdir -p "$TEXTFILE_DIR" + : > "$TEMP_FILE" + + # Timestamp of collection + echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE" + echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE" + echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE" + + echo "" >> "$TEMP_FILE" + echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE" + echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE" + + # S.M.A.R.T. metrics for each disk + for disk in ${lib.concatStringsSep " " disks}; do + if [[ ! -e "$disk" ]]; then + echo "Warning: Disk $disk not found, skipping" >&2 + continue + fi + + # Resolve symlink to get actual device + device=$(readlink -f "$disk") + short_name=$(basename "$device") + + # Extract serial from disk ID for labels + serial=$(basename "$disk" | sed 's/ata-ST18000NM000J-2TV103_//') + + # Check power state without waking disk + power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null | grep -oP '(standby|active/idle|active|idle)' | head -1 || echo "unknown") + + if [[ "$power_state" == "standby" ]]; then + # Disk is sleeping - don't wake it, report inactive + echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 0" >> "$TEMP_FILE" + echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2 + continue + fi + + # Disk is active - collect S.M.A.R.T. data + echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 1" >> "$TEMP_FILE" + + # Get S.M.A.R.T. health status + if ${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null | grep -q "PASSED"; then + health=1 + else + health=0 + fi + + # Get S.M.A.R.T. attributes + smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true) + + # Parse key attributes + # Format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE + + get_raw_value() { + local attr_id="$1" + echo "$smartctl_output" | awk -v id="$attr_id" '$1 == id { print $10 }' | head -1 + } + + reallocated=$(get_raw_value "5") + power_on_hours=$(get_raw_value "9") + temperature=$(get_raw_value "194") + reallocated_event=$(get_raw_value "196") + pending_sector=$(get_raw_value "197") + offline_uncorrectable=$(get_raw_value "198") + udma_crc_error=$(get_raw_value "199") + + # Output metrics + cat >> "$TEMP_FILE" << EOF + +# S.M.A.R.T. 
metrics for $short_name +smart_health_passed{device="$short_name",serial="$serial"} $health +EOF + + [[ -n "$reallocated" ]] && echo "smart_reallocated_sector_ct{device=\"$short_name\",serial=\"$serial\"} $reallocated" >> "$TEMP_FILE" + [[ -n "$power_on_hours" ]] && echo "smart_power_on_hours{device=\"$short_name\",serial=\"$serial\"} $power_on_hours" >> "$TEMP_FILE" + [[ -n "$temperature" ]] && echo "smart_temperature_celsius{device=\"$short_name\",serial=\"$serial\"} $temperature" >> "$TEMP_FILE" + [[ -n "$reallocated_event" ]] && echo "smart_reallocated_event_count{device=\"$short_name\",serial=\"$serial\"} $reallocated_event" >> "$TEMP_FILE" + [[ -n "$pending_sector" ]] && echo "smart_current_pending_sector{device=\"$short_name\",serial=\"$serial\"} $pending_sector" >> "$TEMP_FILE" + [[ -n "$offline_uncorrectable" ]] && echo "smart_offline_uncorrectable{device=\"$short_name\",serial=\"$serial\"} $offline_uncorrectable" >> "$TEMP_FILE" + [[ -n "$udma_crc_error" ]] && echo "smart_udma_crc_error_count{device=\"$short_name\",serial=\"$serial\"} $udma_crc_error" >> "$TEMP_FILE" + done + + # mdadm RAID array status (doesn't access disks) + echo "" >> "$TEMP_FILE" + echo "# HELP mdadm_array_state RAID array state (1=clean, 0=degraded/other)" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE" + echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE" + echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE" + echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE" + + # Find RAID arrays + for md_device in /dev/md/*; do + [[ -e "$md_device" ]] || continue + + array_name=$(basename "$md_device") + + # Get array details + mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null || continue) + + # Parse state + state=$(echo "$mdadm_output" | grep "State :" | sed 's/.*State : //' | tr -d ' ') + if [[ "$state" == "clean" ]] || [[ "$state" == "active" ]]; then + state_value=1 + else + state_value=0 + fi + + # Parse device counts + total_devices=$(echo "$mdadm_output" | grep "Raid Devices" | awk '{print $4}') + active_devices=$(echo "$mdadm_output" | grep "Active Devices" | awk '{print $4}') + + echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE" + [[ -n "$total_devices" ]] && echo "mdadm_array_devices_total{array=\"$array_name\"} $total_devices" >> "$TEMP_FILE" + [[ -n "$active_devices" ]] && echo "mdadm_array_devices_active{array=\"$array_name\"} $active_devices" >> "$TEMP_FILE" + done + + # Atomically replace the metrics file + mv "$TEMP_FILE" "$METRICS_FILE" + + echo "Disk metrics collection complete" + ''; +in +{ + # Required packages + environment.systemPackages = with pkgs; [ + smartmontools + hdparm + mdadm + ]; + + # Node exporter with textfile collector + services.prometheus.exporters.node = { + enable = true; + enabledCollectors = [ + "textfile" + "systemd" + ]; + extraFlags = [ + "--collector.textfile.directory=${textfileDir}" + ]; + }; + + # Systemd service to collect metrics + systemd.services.disk-metrics = { + description = "Collect S.M.A.R.T. 
and RAID metrics"; + path = with pkgs; [ coreutils gawk gnugrep gnused ]; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${collectMetricsScript}"; + # Run as root to access disk devices + User = "root"; + }; + }; + + # Timer to run every 20 minutes (5min buffer for 15min spindown) + systemd.timers.disk-metrics = { + wantedBy = [ "timers.target" ]; + timerConfig = { + OnCalendar = "*:0/20"; # Every 20 minutes + RandomizedDelaySec = "1min"; + Persistent = true; + }; + }; + + # Ensure textfile directory exists and is persisted + systemd.tmpfiles.rules = [ + "d ${textfileDir} 0755 root root -" + ]; +} diff --git a/hosts/nas/modules/power-management.nix b/hosts/nas/modules/power-management.nix new file mode 100644 index 0000000..1d0b338 --- /dev/null +++ b/hosts/nas/modules/power-management.nix @@ -0,0 +1,19 @@ +# Power management for NAS +# - CPU powersave governor (scales up on demand for transcoding) +# - Disk spindown after 15 minutes idle +{ config, lib, pkgs, ... }: + +{ + # CPU Power Management - powersave scales up on demand for transcoding + powerManagement.cpuFreqGovernor = "powersave"; + + # Disk spindown - hdparm for Seagate 18TB drives + environment.systemPackages = [ pkgs.hdparm ]; + + services.udev.extraRules = '' + # Seagate 18TB NAS drives - APM 127 allows spindown, -S 180 = 15 min + ACTION=="add", KERNEL=="sd[a-z]", SUBSYSTEM=="block", \ + ATTRS{model}=="ST18000NM000J*", \ + RUN+="${pkgs.hdparm}/bin/hdparm -B 127 -S 180 /dev/%k" + ''; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/default.nix b/hosts/web-arm/modules/grafana/alerting/storage/default.nix new file mode 100644 index 0000000..8b63271 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/default.nix @@ -0,0 +1,17 @@ +{ lib, pkgs, config, ... }: +let + smartAlertRules = (import ./smart_alerts.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + raidAlertRules = (import ./raid_alerts.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; + + allStorageRules = smartAlertRules ++ raidAlertRules; +in +{ + services.grafana.provision.alerting.rules.settings.groups = [ + { + name = "Storage Alerts"; + folder = "Storage Alerts"; + interval = "5m"; # Check every 5 minutes (metrics collected every 20 min) + rules = allStorageRules; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix b/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix new file mode 100644 index 0000000..82ad73e --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix @@ -0,0 +1,102 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + # RAID array degraded - critical + { + uid = "raid-array-degraded-uid"; + title = "RaidArrayDegraded"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''mdadm_array_state == 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C == 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "RAID array {{ $labels.array }} is degraded"; + description = '' + RAID array {{ $labels.array }} on {{ $labels.instance }} is in state "{{ $labels.state }}". + The array is not in a healthy state. Check for failed disks immediately! 
+ ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + + # RAID missing devices - critical + { + uid = "raid-missing-devices-uid"; + title = "RaidMissingDevices"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''mdadm_array_devices_active < mdadm_array_devices_total''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "RAID array {{ $labels.array }} has missing devices"; + description = '' + RAID array {{ $labels.array }} on {{ $labels.instance }} has fewer active devices than expected. + A disk may have failed or been removed. Check array status immediately! + ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix b/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix new file mode 100644 index 0000000..dd36462 --- /dev/null +++ b/hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix @@ -0,0 +1,298 @@ +{ lib, pkgs, config, ... }: +{ + grafanaAlertRuleDefinitions = [ + # S.M.A.R.T. overall health failed - critical + { + uid = "smart-health-failed-uid"; + title = "DiskSmartHealthFailed"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_health_passed == 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C == 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check. + This indicates imminent disk failure. Replace the disk immediately! + ''; + }; + labels = { + severity = "critical"; + category = "storage"; + }; + } + + # Reallocated sectors - warning (any count > 0 is concerning) + { + uid = "smart-reallocated-sectors-uid"; + title = "DiskReallocatedSectors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_reallocated_sector_ct > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Reallocated sectors detected on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors. + This indicates disk surface damage. Monitor closely and plan replacement. 
+ ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # Current pending sectors + { + uid = "smart-pending-sectors-uid"; + title = "DiskPendingSectors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_current_pending_sector > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Pending sectors detected on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors. + These sectors could not be read and may be reallocated. Monitor for increase. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # Offline uncorrectable errors + { + uid = "smart-offline-uncorrectable-uid"; + title = "DiskOfflineUncorrectable"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 300; to = 0; }; + model = { + expr = ''smart_offline_uncorrectable > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "Offline uncorrectable errors on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors. + This indicates data integrity issues. Consider replacement. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # High temperature (Seagate enterprise: warning at 50C) + { + uid = "smart-high-temperature-uid"; + title = "DiskHighTemperature"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 600; to = 0; }; + model = { + expr = ''smart_temperature_celsius > 50''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "10m"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "High temperature on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C. + Check cooling and ventilation. 
+ ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + + # UDMA CRC errors (cable/connection issues) + { + uid = "smart-udma-crc-errors-uid"; + title = "DiskUDMACRCErrors"; + condition = "D"; + data = [ + { + refId = "A"; + datasourceUid = "vm-datasource-uid"; + relativeTimeRange = { from = 86400; to = 0; }; + model = { + expr = ''increase(smart_udma_crc_error_count[24h]) > 0''; + instant = false; + }; + } + { + refId = "C"; + datasourceUid = "__expr__"; + model = { + type = "reduce"; + expression = "A"; + reducer = "last"; + }; + } + { + refId = "D"; + datasourceUid = "__expr__"; + model = { + type = "math"; + expression = "$C > 0"; + }; + } + ]; + for = "0s"; + noDataState = "NoData"; + execErrState = "Error"; + annotations = { + summary = "UDMA CRC errors on {{ $labels.device }}"; + description = '' + Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors. + This typically indicates SATA cable or connection issues. Check cables. + ''; + }; + labels = { + severity = "warning"; + category = "storage"; + }; + } + ]; +} diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix index 3ac5e10..881f802 100644 --- a/hosts/web-arm/modules/grafana/default.nix +++ b/hosts/web-arm/modules/grafana/default.nix @@ -31,6 +31,7 @@ in ./alerting/system/default.nix ./alerting/service/default.nix ./alerting/websites/default.nix + # ./alerting/storage/default.nix ./datasources/victoriametrics.nix ./datasources/loki.nix