# nixos/hosts/nas/modules/disk-monitoring.nix

# Disk monitoring for NAS
# - S.M.A.R.T. metrics collection (respects disk spindown)
# - mdadm RAID array status
# - Exports metrics via node_exporter textfile collector
{ config, lib, pkgs, ... }:

let
  # Monitored disks, addressed through their stable /dev/disk/by-id names
  # (identifiers taken from hardware-configuration.nix).
  disks = map (id: "/dev/disk/by-id/" + id) [
    "ata-ST18000NM000J-2TV103_ZR52TBSB"
    "ata-ST18000NM000J-2TV103_ZR52V9QX"
    "ata-TOSHIBA_MG10ACA20TE_8582A01SF4MJ"
    "ata-TOSHIBA_MG10ACA20TE_75V2A0H3F4MJ"
    "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7"
    "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1M9Z0E7"
  ];

  # Directory the metrics script writes into; it is also handed to
  # node_exporter as its textfile collector directory.
  textfileDir = "/var/lib/prometheus-node-exporter";

  # Script to collect S.M.A.R.T. and mdadm metrics.
  #
  # Writes Prometheus text-format samples to a temp file inside textfileDir and
  # renames it to disk_health.prom at the end, so readers never observe a
  # partially written file. Per disk it:
  #   - skips by-id paths that do not exist,
  #   - asks hdparm for the power state and, when the disk reports "standby",
  #     emits smart_device_active=0 and does not issue any smartctl command,
  #   - otherwise emits smart_device_active=1, smart_health_passed and the raw
  #     values of a fixed set of ATA attributes (5, 9, 194, 196, 197, 198, 199).
  # Per /dev/md/* array it emits the array state and device counts parsed from
  # `mdadm --detail`.
  #
  # The script runs under `set -euo pipefail`, so every pipeline whose failure
  # is expected (missing field, non-zero smartctl status) is written so that it
  # cannot abort the whole collection.
  collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
    set -euo pipefail

    TEXTFILE_DIR="${textfileDir}"
    METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
    TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

    mkdir -p "$TEXTFILE_DIR"
    # Remove the temp file if the script exits before the final rename.
    # After a successful mv the file is gone and rm -f is a no-op.
    trap 'rm -f "$TEMP_FILE"' EXIT
    : > "$TEMP_FILE"

    # emit_metric NAME LABELS VALUE
    # Appends "NAME{LABELS} VALUE" to the temp file, but only when VALUE is a
    # plain non-negative integer. Empty values are skipped silently (attribute
    # not reported by this disk); anything else is skipped with a warning,
    # because a single malformed sample makes the whole .prom file unparseable.
    emit_metric() {
      local name="$1" labels="$2" value="$3"
      if [[ "$value" =~ ^[0-9]+$ ]]; then
        printf '%s{%s} %s\n' "$name" "$labels" "$value" >> "$TEMP_FILE"
      elif [[ -n "$value" ]]; then
        echo "Warning: dropping non-numeric value '$value' for $name{$labels}" >&2
      fi
    }

    # get_raw_value ATTR_ID
    # Prints the RAW_VALUE column (10th field) of the first row in
    # $smartctl_output whose ID# column equals ATTR_ID; prints nothing when the
    # attribute is absent.
    # Row format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
    get_raw_value() {
      local attr_id="$1"
      echo "$smartctl_output" | awk -v id="$attr_id" '$1 == id { print $10; exit }'
    }

    # Timestamp of collection
    echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE"
    echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE"
    echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE"

    echo "" >> "$TEMP_FILE"
    echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE"
    echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE"

    # S.M.A.R.T. metrics for each disk
    for disk in ${lib.concatStringsSep " " disks}; do
      if [[ ! -e "$disk" ]]; then
        echo "Warning: Disk $disk not found, skipping" >&2
        continue
      fi

      # Resolve symlink to get actual device (needed for hdparm/smartctl)
      device=$(readlink -f "$disk")

      # Extract model+serial from disk-by-id path for stable labeling
      # ata-ST18000NM000J-2TV103_ZR52TBSB → ST18000NM000J-2TV103-ZR52TBSB
      # nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7 → KIOXIA-EXCERIA_PLUS_G3_SSD-7FJKS1MAZ0E7
      disk_id=$(basename "$disk")
      serial=$(echo "$disk_id" | sed 's/.*_//')
      model=$(echo "$disk_id" | sed 's/^[^-]*-//; s/_[^_]*$//')
      short_name="$model-$serial"
      labels="device=\"$short_name\",serial=\"$serial\""

      # Check power state without waking disk. When hdparm fails or prints no
      # recognised state the pipeline fails and the state becomes "unknown",
      # which is treated like an active disk below.
      power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null | grep -oP '(standby|active/idle|active|idle)' | head -1 || echo "unknown")

      if [[ "$power_state" == "standby" ]]; then
        # Disk is sleeping - don't wake it, report inactive
        echo "smart_device_active{$labels} 0" >> "$TEMP_FILE"
        echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2
        continue
      fi

      # Disk is active - collect S.M.A.R.T. data
      echo "smart_device_active{$labels} 1" >> "$TEMP_FILE"

      # Get S.M.A.R.T. health status. The output is captured first and matched
      # afterwards: smartctl's exit status is a bit mask that can be non-zero
      # even when the verdict is PASSED, and under pipefail piping it straight
      # into grep would then report the disk as failed.
      health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
      if [[ "$health_output" == *PASSED* ]]; then
        health=1
      else
        health=0
      fi

      # Get S.M.A.R.T. attributes (read by get_raw_value)
      smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

      # Output metrics
      echo "" >> "$TEMP_FILE"
      echo "# S.M.A.R.T. metrics for $short_name" >> "$TEMP_FILE"
      echo "smart_health_passed{$labels} $health" >> "$TEMP_FILE"

      emit_metric smart_reallocated_sector_ct "$labels" "$(get_raw_value "5")"
      emit_metric smart_power_on_hours "$labels" "$(get_raw_value "9")"
      emit_metric smart_temperature_celsius "$labels" "$(get_raw_value "194")"
      emit_metric smart_reallocated_event_count "$labels" "$(get_raw_value "196")"
      emit_metric smart_current_pending_sector "$labels" "$(get_raw_value "197")"
      emit_metric smart_offline_uncorrectable "$labels" "$(get_raw_value "198")"
      emit_metric smart_udma_crc_error_count "$labels" "$(get_raw_value "199")"
    done

    # mdadm RAID array status (doesn't access disks)
    echo "" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_state RAID array state (1=clean/active/resyncing, 0=degraded/other)" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE"

    # Find RAID arrays. When /dev/md/ has no entries the glob stays literal and
    # the -e test below skips it.
    for md_device in /dev/md/*; do
      [[ -e "$md_device" ]] || continue

      array_name=$(basename "$md_device")

      # Get array details. The skip has to happen in the loop's own shell: a
      # `continue` placed inside $(...) runs in a subshell and never reaches
      # this loop.
      if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
        echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
        continue
      fi

      # Parse state. sed -n .../p yields an empty string (not a failed
      # pipeline) when the line is missing.
      state=$(echo "$mdadm_output" | sed -n 's/.*State : //p' | tr -d ' ')

      # Unhealthy markers are tested first: "clean,degraded" contains "clean"
      # and "inactive" contains "active", so testing the healthy patterns
      # first would report those arrays as healthy.
      case "$state" in
        *degraded*|*FAILED*|*inactive*) state_value=0 ;;
        *clean*|*active*) state_value=1 ;;
        *) state_value=0 ;;
      esac

      # Parse device counts ("Raid Devices : N" / "Active Devices : N")
      total_devices=$(echo "$mdadm_output" | awk '/Raid Devices/ { print $4; exit }')
      active_devices=$(echo "$mdadm_output" | awk '/Active Devices/ { print $4; exit }')

      echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE"
      emit_metric mdadm_array_devices_total "array=\"$array_name\"" "$total_devices"
      emit_metric mdadm_array_devices_active "array=\"$array_name\"" "$active_devices"
    done

    # Atomically replace the metrics file
    mv "$TEMP_FILE" "$METRICS_FILE"

    echo "Disk metrics collection complete"
  '';
in
{
  # CLI tools installed system-wide: smartctl, hdparm and mdadm
  environment.systemPackages = [
    pkgs.smartmontools
    pkgs.hdparm
    pkgs.mdadm
  ];

  # Enable node_exporter and point its textfile collector at the directory
  # the disk-metrics script writes into.
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "textfile" ];
    extraFlags = [ ("--collector.textfile.directory=" + textfileDir) ];
  };

  # One-shot unit that runs the collection script once per activation
  systemd.services.disk-metrics = {
    description = "Collect S.M.A.R.T. and RAID metrics";
    # Generic tools the script calls by bare name; smartctl, hdparm and mdadm
    # are invoked through absolute store paths inside the script.
    path = [ pkgs.coreutils pkgs.gawk pkgs.gnugrep pkgs.gnused ];
    serviceConfig.Type = "oneshot";
    serviceConfig.ExecStart = "${collectMetricsScript}";
    # Run as root to access disk devices
    serviceConfig.User = "root";
  };

  # Trigger disk-metrics every 20 minutes (5min buffer for 15min spindown)
  systemd.timers.disk-metrics = {
    wantedBy = [ "timers.target" ];
    timerConfig.OnCalendar = "*:0/20"; # minutes 0, 20 and 40 of every hour
    timerConfig.RandomizedDelaySec = "1min";
    timerConfig.Persistent = true;
  };

  # Have systemd-tmpfiles create the textfile directory (root:root, 0755, no
  # age-based cleanup)
  systemd.tmpfiles.rules = [
    (lib.concatStringsSep " " [ "d" textfileDir "0755" "root" "root" "-" ])
  ];
}