# Disk monitoring for NAS
# - S.M.A.R.T. metrics collection (respects disk spindown)
# - mdadm RAID array status
# - Exports metrics via node_exporter textfile collector
{ config, lib, pkgs, ... }:

let
  # Disk identifiers from hardware-configuration.nix
  disks = [
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52TBSB"
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52V9QX"
    "/dev/disk/by-id/ata-TOSHIBA_MG10ACA20TE_8582A01SF4MJ"
    "/dev/disk/by-id/ata-TOSHIBA_MG10ACA20TE_75V2A0H3F4MJ"
    "/dev/disk/by-id/nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7"
    "/dev/disk/by-id/nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1M9Z0E7"
  ];

  # Directory scanned by the node_exporter textfile collector (*.prom files).
  textfileDir = "/var/lib/prometheus-node-exporter";

  # Script to collect S.M.A.R.T. and mdadm metrics.
  #
  # Writes all samples to a temp file and renames it over disk_health.prom
  # at the end, so the collector never sees a half-written file.
  #
  # NOTE: this is a Nix indented string. Bash variables are written as $var
  # (never with braces) so they are not mistaken for Nix interpolation, and
  # heredocs are avoided because their terminator would be indented.
  collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
    set -euo pipefail

    TEXTFILE_DIR="${textfileDir}"
    METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
    TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

    mkdir -p "$TEXTFILE_DIR"

    # Do not leave a partial temp file behind if the script aborts.
    # After the final mv the temp file is gone and rm -f is a no-op.
    trap 'rm -f "$TEMP_FILE"' EXIT
    : > "$TEMP_FILE"

    # Write the HELP/TYPE header for a gauge.
    #   $1 = metric name, $2 = help text
    declare_gauge() {
      printf '# HELP %s %s\n# TYPE %s gauge\n' "$1" "$2" "$1" >> "$TEMP_FILE"
    }

    # Print the RAW_VALUE column (10th field) of one attribute row.
    #   $1 = output of smartctl -A, $2 = attribute ID
    # Format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
    # Prints nothing when the attribute is absent (e.g. NVMe devices, which
    # do not report an ATA attribute table).
    get_raw_value() {
      awk -v id="$2" '$1 == id { print $10; exit }' <<< "$1"
    }

    # Append one per-disk sample labelled with the current $short_name/$serial.
    #   $1 = metric name, $2 = value
    # Only plain non-negative integers are written: a single malformed sample
    # (e.g. a raw value such as "12345h+30m+15.1s") would make the whole
    # .prom file unparseable. Empty values are skipped silently.
    emit_smart_metric() {
      local name="$1" value="$2"
      if [[ "$value" =~ ^[0-9]+$ ]]; then
        printf '%s{device="%s",serial="%s"} %s\n' "$name" "$short_name" "$serial" "$value" >> "$TEMP_FILE"
      elif [[ -n "$value" ]]; then
        echo "Warning: ignoring non-numeric $name value '$value' for $short_name" >&2
      fi
    }

    # Timestamp of collection
    declare_gauge disk_metrics_last_update "Unix timestamp of last metrics collection"
    printf 'disk_metrics_last_update %s\n' "$(date +%s)" >> "$TEMP_FILE"

    echo "" >> "$TEMP_FILE"
    # All headers are written before any sample so that a TYPE line never
    # follows samples of the same metric.
    declare_gauge smart_device_active "Whether the disk was active (1) or sleeping (0) when checked"
    declare_gauge smart_health_passed "S.M.A.R.T. overall health self-assessment (1=PASSED, 0=otherwise)"
    declare_gauge smart_reallocated_sector_ct "Raw value of S.M.A.R.T. attribute 5"
    declare_gauge smart_power_on_hours "Raw value of S.M.A.R.T. attribute 9"
    declare_gauge smart_temperature_celsius "Raw value of S.M.A.R.T. attribute 194"
    declare_gauge smart_reallocated_event_count "Raw value of S.M.A.R.T. attribute 196"
    declare_gauge smart_current_pending_sector "Raw value of S.M.A.R.T. attribute 197"
    declare_gauge smart_offline_uncorrectable "Raw value of S.M.A.R.T. attribute 198"
    declare_gauge smart_udma_crc_error_count "Raw value of S.M.A.R.T. attribute 199"

    # S.M.A.R.T. metrics for each disk
    for disk in ${lib.concatStringsSep " " disks}; do
      if [[ ! -e "$disk" ]]; then
        echo "Warning: Disk $disk not found, skipping" >&2
        continue
      fi

      # Resolve symlink to get actual device
      device=$(readlink -f "$disk")
      short_name=$(basename "$device")

      # Extract serial from disk ID for labels (part after last underscore)
      serial=$(basename "$disk" | sed 's/.*_//')

      # Check power state without waking disk. Falls back to "unknown" when
      # hdparm fails or prints no recognised state (under pipefail either
      # case makes the pipeline fail).
      power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null | grep -oP '(standby|sleeping|active/idle|active|idle)' | head -1 || echo "unknown")

      if [[ "$power_state" == "standby" || "$power_state" == "sleeping" ]]; then
        # Disk is spun down - don't wake it, report inactive
        emit_smart_metric smart_device_active 0
        echo "Disk $short_name is in $power_state, skipping S.M.A.R.T. collection" >&2
        continue
      fi

      # Disk is active - collect S.M.A.R.T. data
      emit_smart_metric smart_device_active 1

      # Get S.M.A.R.T. health status. The output is captured first and
      # smartctl's exit status is ignored: it is a bitmask that can be
      # non-zero even when the verdict is PASSED, which under pipefail
      # would make a direct "smartctl | grep" report a failure.
      health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
      if grep -q "PASSED" <<< "$health_output"; then
        health=1
      else
        health=0
      fi

      # Get S.M.A.R.T. attributes (best effort; empty output yields no samples)
      smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

      # Output metrics
      echo "# S.M.A.R.T. metrics for $short_name" >> "$TEMP_FILE"
      emit_smart_metric smart_health_passed "$health"
      emit_smart_metric smart_reallocated_sector_ct "$(get_raw_value "$smartctl_output" 5)"
      emit_smart_metric smart_power_on_hours "$(get_raw_value "$smartctl_output" 9)"
      emit_smart_metric smart_temperature_celsius "$(get_raw_value "$smartctl_output" 194)"
      emit_smart_metric smart_reallocated_event_count "$(get_raw_value "$smartctl_output" 196)"
      emit_smart_metric smart_current_pending_sector "$(get_raw_value "$smartctl_output" 197)"
      emit_smart_metric smart_offline_uncorrectable "$(get_raw_value "$smartctl_output" 198)"
      emit_smart_metric smart_udma_crc_error_count "$(get_raw_value "$smartctl_output" 199)"
    done

    # mdadm RAID array status (doesn't access disks)
    echo "" >> "$TEMP_FILE"
    declare_gauge mdadm_array_state "RAID array state (1=clean/active/resyncing, 0=degraded/other)"
    declare_gauge mdadm_array_devices_total "Total devices in RAID array"
    declare_gauge mdadm_array_devices_active "Active devices in RAID array"

    # Find RAID arrays. When the glob matches nothing it stays literal and
    # the existence check below skips it.
    for md_device in /dev/md/*; do
      [[ -e "$md_device" ]] || continue

      array_name=$(basename "$md_device")

      # Get array details. The failure check must be outside the command
      # substitution: a "continue" inside it would run in a subshell and
      # would not skip this loop iteration.
      if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
        echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
        continue
      fi

      # Parse state: text after "State :" with spaces removed, e.g.
      # "clean,degraded". awk exits 0 even when the line is missing, so a
      # missing field cannot abort the script under set -e / pipefail.
      state=$(awk '/^ *State :/ { sub(/^ *State : */, ""); gsub(/ /, ""); print; exit }' <<< "$mdadm_output")

      # Match whole comma-separated tokens so that "inactive" is not taken
      # for "active", and let any bad token win over clean/active so that
      # "clean,degraded" is reported as 0.
      case ",$state," in
        *,degraded,*|*,FAILED,*|*,inactive,*) state_value=0 ;;
        *,clean,*|*,active,*) state_value=1 ;;
        *) state_value=0 ;;
      esac

      # Parse device counts (4th field of e.g. "Raid Devices : 4")
      total_devices=$(awk '/Raid Devices/ { print $4; exit }' <<< "$mdadm_output")
      active_devices=$(awk '/Active Devices/ { print $4; exit }' <<< "$mdadm_output")

      printf 'mdadm_array_state{array="%s",state="%s"} %s\n' "$array_name" "$state" "$state_value" >> "$TEMP_FILE"
      if [[ "$total_devices" =~ ^[0-9]+$ ]]; then
        printf 'mdadm_array_devices_total{array="%s"} %s\n' "$array_name" "$total_devices" >> "$TEMP_FILE"
      fi
      if [[ "$active_devices" =~ ^[0-9]+$ ]]; then
        printf 'mdadm_array_devices_active{array="%s"} %s\n' "$array_name" "$active_devices" >> "$TEMP_FILE"
      fi
    done

    # Atomically replace the metrics file
    mv "$TEMP_FILE" "$METRICS_FILE"

    echo "Disk metrics collection complete"
  '';
in
{
  # Required packages
  environment.systemPackages = with pkgs; [
    smartmontools
    hdparm
    mdadm
  ];

  # Node exporter with textfile collector
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "textfile" ];
    extraFlags = [ "--collector.textfile.directory=${textfileDir}" ];
  };

  # Systemd service to collect metrics
  systemd.services.disk-metrics = {
    description = "Collect S.M.A.R.T. and RAID metrics";
    # Tools the script calls by bare name; hdparm, smartctl and mdadm are
    # referenced by absolute store path inside the script.
    path = with pkgs; [ coreutils gawk gnugrep gnused ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${collectMetricsScript}";
      # Run as root to access disk devices
      User = "root";
    };
  };

  # Timer to run every 20 minutes (5min buffer for 15min spindown)
  systemd.timers.disk-metrics = {
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "*:0/20"; # Every 20 minutes
      RandomizedDelaySec = "1min";
      Persistent = true;
    };
  };

  # Ensure textfile directory exists and is persisted
  systemd.tmpfiles.rules = [
    "d ${textfileDir} 0755 root root -"
  ];
}