Files
nixos/hosts/nas/modules/disk-monitoring.nix

200 lines
7.7 KiB
Nix

# Disk monitoring for NAS
# - S.M.A.R.T. metrics collection (respects disk spindown)
# - mdadm RAID array status
# - Exports metrics via node_exporter textfile collector
{ config, lib, pkgs, ... }:
let
# Disk identifiers from hardware-configuration.nix.
# Only the by-id names are listed; the shared /dev/disk/by-id/ prefix is
# prepended once so every entry ends up as a full device path.
disks = map (id: "/dev/disk/by-id/${id}") [
  "ata-ST18000NM000J-2TV103_ZR52TBSB"
  "ata-ST18000NM000J-2TV103_ZR52V9QX"
  "ata-TOSHIBA_MG10ACA20TE_8582A01SF4MJ"
  "ata-TOSHIBA_MG10ACA20TE_75V2A0H3F4MJ"
  "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7"
  "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1M9Z0E7"
];
# Directory shared by three consumers in this module: the collection script
# writes disk_health.prom into it, node_exporter is pointed at it via
# --collector.textfile.directory, and a tmpfiles rule creates it.
textfileDir = "/var/lib/prometheus-node-exporter";
# Script to collect S.M.A.R.T. and mdadm metrics.
#
# Writes Prometheus text-format samples to disk_health.prom in textfileDir.
# The file is built as disk_health.prom.tmp and renamed at the end, so a
# reader never sees a half-written file.
#
# Behaviour notes:
# - Disks reported by hdparm as "standby" or "sleeping" are not queried with
#   smartctl, so collection does not spin them up.
# - The script runs under `set -euo pipefail`; every external command whose
#   failure is expected (hdparm on NVMe, smartctl bitmask exit codes, mdadm on
#   a stale node) is captured explicitly so one bad device cannot abort the
#   whole run.
# - Only plain non-negative integers are written as sample values; anything
#   else is skipped with a warning rather than written into the file.
collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
  set -euo pipefail

  TEXTFILE_DIR="${textfileDir}"
  METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
  TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

  mkdir -p "$TEXTFILE_DIR"
  # Clean up the temp file if the script aborts; after the final mv the file
  # no longer exists and this is a no-op.
  trap 'rm -f "$TEMP_FILE"' EXIT
  : > "$TEMP_FILE"

  # emit_metric NAME LABELS VALUE
  # Append one sample line. Empty values are silently skipped (attribute not
  # reported by this disk); non-numeric values are skipped with a warning.
  emit_metric() {
    local name="$1" labels="$2" value="$3"
    if [[ "$value" =~ ^[0-9]+$ ]]; then
      printf '%s{%s} %s\n' "$name" "$labels" "$value" >> "$TEMP_FILE"
    elif [[ -n "$value" ]]; then
      echo "Warning: skipping non-numeric value for $name: $value" >&2
    fi
  }

  # get_raw_value ATTR_ID SMARTCTL_OUTPUT
  # Print the RAW_VALUE column (field 10) of the first attribute row whose ID
  # matches. Row format:
  # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
  # A here-string is used instead of a pipe so the early awk exit cannot
  # produce a SIGPIPE failure under pipefail.
  get_raw_value() {
    awk -v id="$1" '$1 == id { print $10; exit }' <<< "$2"
  }

  # Timestamp of collection
  echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE"
  echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE"
  echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE"
  echo "" >> "$TEMP_FILE"
  echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE"
  echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE"

  # S.M.A.R.T. metrics for each disk
  for disk in ${lib.concatStringsSep " " disks}; do
    if [[ ! -e "$disk" ]]; then
      echo "Warning: Disk $disk not found, skipping" >&2
      continue
    fi

    # Resolve symlink to get actual device (needed for hdparm/smartctl)
    device=$(readlink -f "$disk")

    # Extract model+serial from disk-by-id path for stable labeling
    # ata-ST18000NM000J-2TV103_ZR52TBSB -> ST18000NM000J-2TV103-ZR52TBSB
    # nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7 -> KIOXIA-EXCERIA_PLUS_G3_SSD-7FJKS1MAZ0E7
    disk_id=$(basename "$disk")
    serial=$(sed 's/.*_//' <<< "$disk_id")
    model=$(sed 's/^[^-]*-//; s/_[^_]*$//' <<< "$disk_id")
    short_name="$model-$serial"
    labels="device=\"$short_name\",serial=\"$serial\""

    # Check power state without waking disk. hdparm fails on devices that do
    # not support the query (e.g. NVMe); those are treated as active.
    hdparm_output=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null || true)
    if [[ "$hdparm_output" == *standby* || "$hdparm_output" == *sleeping* ]]; then
      # Disk is spun down - do not wake it, report inactive
      emit_metric smart_device_active "$labels" 0
      echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2
      continue
    fi

    # Disk is active - collect S.M.A.R.T. data
    emit_metric smart_device_active "$labels" 1

    # Health status. smartctl encodes findings in its exit status, so the
    # output is captured and inspected instead of trusting a pipeline status.
    health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
    if [[ "$health_output" == *PASSED* ]]; then
      health=1
    else
      health=0
    fi

    # Attribute table (empty for devices that do not report ATA attributes)
    smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

    echo "# S.M.A.R.T. metrics for $short_name" >> "$TEMP_FILE"
    emit_metric smart_health_passed "$labels" "$health"
    emit_metric smart_reallocated_sector_ct "$labels" "$(get_raw_value 5 "$smartctl_output")"
    emit_metric smart_power_on_hours "$labels" "$(get_raw_value 9 "$smartctl_output")"
    emit_metric smart_temperature_celsius "$labels" "$(get_raw_value 194 "$smartctl_output")"
    emit_metric smart_reallocated_event_count "$labels" "$(get_raw_value 196 "$smartctl_output")"
    emit_metric smart_current_pending_sector "$labels" "$(get_raw_value 197 "$smartctl_output")"
    emit_metric smart_offline_uncorrectable "$labels" "$(get_raw_value 198 "$smartctl_output")"
    emit_metric smart_udma_crc_error_count "$labels" "$(get_raw_value 199 "$smartctl_output")"
  done

  # mdadm RAID array status (does not access disks)
  echo "" >> "$TEMP_FILE"
  echo "# HELP mdadm_array_state RAID array state (1=clean/active/resyncing, 0=degraded/other)" >> "$TEMP_FILE"
  echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE"
  echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE"
  echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE"
  echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE"
  echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE"

  # Find RAID arrays
  for md_device in /dev/md/*; do
    # An unmatched glob stays literal; skip it
    [[ -e "$md_device" ]] || continue
    array_name=$(basename "$md_device")

    # Get array details. The continue must run in this shell, not inside the
    # command substitution, to actually skip the array.
    if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
      echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
      continue
    fi

    # Parse state, e.g. "clean, degraded" becomes "clean,degraded"
    state=$(awk '/State :/ { sub(/.*State : /, ""); gsub(/ /, ""); print; exit }' <<< "$mdadm_output")
    if [[ -z "$state" ]]; then
      state="unknown"
    fi

    # Unhealthy markers are tested first: "clean,degraded" contains "clean"
    # and "inactive" contains "active", so the healthy patterns alone would
    # report those arrays as 1.
    case "$state" in
      *degraded*|*inactive*|*FAILED*) state_value=0 ;;
      *clean*|*active*) state_value=1 ;;
      *) state_value=0 ;;
    esac

    # Parse device counts (fourth field of "Raid Devices : N" / "Active Devices : N")
    total_devices=$(awk '/Raid Devices/ { print $4; exit }' <<< "$mdadm_output")
    active_devices=$(awk '/Active Devices/ { print $4; exit }' <<< "$mdadm_output")

    emit_metric mdadm_array_state "array=\"$array_name\",state=\"$state\"" "$state_value"
    emit_metric mdadm_array_devices_total "array=\"$array_name\"" "$total_devices"
    emit_metric mdadm_array_devices_active "array=\"$array_name\"" "$active_devices"
  done

  # Atomically replace the metrics file
  mv "$TEMP_FILE" "$METRICS_FILE"
  echo "Disk metrics collection complete"
'';
in
{
# Disk tooling installed system-wide. The collection script itself calls
# these tools by absolute store path, so this list is presumably for manual
# diagnostics from a shell - confirm before removing.
environment.systemPackages = [
  pkgs.smartmontools
  pkgs.hdparm
  pkgs.mdadm
];
# Node exporter: enable it, turn on the textfile collector, and point that
# collector at the directory the disk-metrics script writes into.
services.prometheus.exporters.node.enable = true;
services.prometheus.exporters.node.enabledCollectors = [ "textfile" ];
services.prometheus.exporters.node.extraFlags = [
  "--collector.textfile.directory=${textfileDir}"
];
# One-shot unit that runs the collection script once per activation.
systemd.services.disk-metrics = {
  description = "Collect S.M.A.R.T. and RAID metrics";
  # Generic helpers the script invokes by bare name; hdparm, smartctl and
  # mdadm are referenced by store path inside the script instead.
  path = [
    pkgs.coreutils
    pkgs.gawk
    pkgs.gnugrep
    pkgs.gnused
  ];
  serviceConfig.Type = "oneshot";
  serviceConfig.ExecStart = "${collectMetricsScript}";
  # Root is needed to access the disk devices
  serviceConfig.User = "root";
};
# Fires the disk-metrics unit every 20 minutes, leaving a 5 minute buffer
# over the 15 minute disk spindown.
systemd.timers.disk-metrics = {
  wantedBy = [ "timers.target" ];
  timerConfig.OnCalendar = "*:0/20"; # minutes 0, 20 and 40 of every hour
  timerConfig.RandomizedDelaySec = "1min";
  timerConfig.Persistent = true;
};
# Ensure textfile directory exists and is persisted.
# Rule fields: type "d" (create the directory if missing), path, mode 0755,
# owner root, group root, and "-" for the age field (no age-based cleanup).
systemd.tmpfiles.rules = [
"d ${textfileDir} 0755 root root -"
];
}