200 lines
7.7 KiB
Nix
200 lines
7.7 KiB
Nix
# Disk monitoring for NAS
|
|
# - S.M.A.R.T. metrics collection (respects disk spindown)
|
|
# - mdadm RAID array status
|
|
# - Exports metrics via node_exporter textfile collector
|
|
{ config, lib, pkgs, ... }:
|
|
|
|
let
|
|
# Monitored drives (four ata-* and two nvme-* entries), addressed through
# their stable /dev/disk/by-id names from hardware-configuration.nix.
disks = map (id: "/dev/disk/by-id/${id}") [
  "ata-ST18000NM000J-2TV103_ZR52TBSB"
  "ata-ST18000NM000J-2TV103_ZR52V9QX"
  "ata-TOSHIBA_MG10ACA20TE_8582A01SF4MJ"
  "ata-TOSHIBA_MG10ACA20TE_75V2A0H3F4MJ"
  "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7"
  "nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1M9Z0E7"
];
|
|
|
|
# Directory shared by the collector script (which writes disk_health.prom
# into it) and node_exporter's textfile collector (which is pointed at it).
textfileDir = "/var/lib/prometheus-node-exporter";
|
|
|
|
# Script that collects S.M.A.R.T. and mdadm metrics and publishes them as
# disk_health.prom in textfileDir for node_exporter's textfile collector.
#
# The metrics are staged in a temporary file and moved into place at the end,
# so a reader never sees a partially written file.
#
# NOTE: this is a Nix indented string. Inside it, "$" followed by "{" starts a
# Nix interpolation, so the Bash code below deliberately uses only the plain
# "$var" form, and never two consecutive single quotes.
collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
  set -euo pipefail

  TEXTFILE_DIR="${textfileDir}"
  METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
  TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

  mkdir -p "$TEXTFILE_DIR"

  # Remove the staging file if the run aborts; after the final mv this is a no-op.
  trap 'rm -f -- "$TEMP_FILE"' EXIT
  : > "$TEMP_FILE"

  # Append one sample line to the staging file, but only when the value is a
  # plain non-negative integer. Missing or oddly formatted values are skipped
  # instead of producing a line the textfile parser would reject.
  #   $1 - metric name
  #   $2 - label list without the surrounding braces
  #   $3 - value
  emit_sample() {
    local metric="$1" labels="$2" value="$3"
    if [[ "$value" =~ ^[0-9]+$ ]]; then
      printf '%s{%s} %s\n' "$metric" "$labels" "$value" >> "$TEMP_FILE"
    fi
  }

  # Print the leading digits of the RAW_VALUE column for one attribute ID,
  # read from the global $smartctl_output. Prints nothing useful if the
  # attribute is absent. Raw values may carry a suffix, e.g. "35 (Min/Max ...)"
  # or "1234h+56m"; only the leading integer is kept.
  # Format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
  #   $1 - attribute ID
  get_raw_value() {
    local attr_id="$1"
    printf '%s\n' "$smartctl_output" |
      awk -v id="$attr_id" '$1 == id && !seen { v = $10; sub(/[^0-9].*/, "", v); print v; seen = 1 }'
  }

  # Timestamp of collection, plus the header for the per-disk activity gauge
  {
    echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection"
    echo "# TYPE disk_metrics_last_update gauge"
    echo "disk_metrics_last_update $(date +%s)"
    echo ""
    echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked"
    echo "# TYPE smart_device_active gauge"
  } >> "$TEMP_FILE"

  # S.M.A.R.T. metrics for each disk
  for disk in ${lib.concatStringsSep " " disks}; do
    if [[ ! -e "$disk" ]]; then
      echo "Warning: Disk $disk not found, skipping" >&2
      continue
    fi

    # Resolve symlink to get actual device (needed for hdparm/smartctl)
    device=$(readlink -f "$disk")

    # Extract model+serial from disk-by-id path for stable labeling
    # ata-ST18000NM000J-2TV103_ZR52TBSB → ST18000NM000J-2TV103-ZR52TBSB
    # nvme-KIOXIA-EXCERIA_PLUS_G3_SSD_7FJKS1MAZ0E7 → KIOXIA-EXCERIA_PLUS_G3_SSD-7FJKS1MAZ0E7
    disk_id=$(basename "$disk")
    serial=$(printf '%s\n' "$disk_id" | sed 's/.*_//')
    model=$(printf '%s\n' "$disk_id" | sed 's/^[^-]*-//; s/_[^_]*$//')
    short_name="$model-$serial"
    labels="device=\"$short_name\",serial=\"$serial\""

    # Check power state without waking disk. If hdparm fails or prints no
    # recognised state, the state is "unknown" and the disk is treated as active.
    power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null |
      grep -oP '(standby|sleeping|active/idle|active|idle)' | head -1) || power_state="unknown"
    [[ -n "$power_state" ]] || power_state="unknown"

    case "$power_state" in
      standby|sleeping)
        # Disk is spun down - don't wake it, report inactive
        printf 'smart_device_active{%s} 0\n' "$labels" >> "$TEMP_FILE"
        echo "Disk $short_name is in $power_state, skipping S.M.A.R.T. collection" >&2
        continue
        ;;
    esac

    # Disk is active - collect S.M.A.R.T. data
    printf 'smart_device_active{%s} 1\n' "$labels" >> "$TEMP_FILE"

    # Get S.M.A.R.T. health status. smartctl's exit status is a bitmask that
    # can be non-zero even when the assessment is PASSED, so under pipefail the
    # verdict must come from the captured text, not from a pipeline status.
    health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
    if [[ "$health_output" == *PASSED* ]]; then
      health=1
    else
      health=0
    fi

    # Get S.M.A.R.T. attributes (read by get_raw_value)
    smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

    # Output metrics
    {
      echo ""
      echo "# S.M.A.R.T. metrics for $short_name"
      echo "smart_health_passed{$labels} $health"
    } >> "$TEMP_FILE"

    emit_sample smart_reallocated_sector_ct "$labels" "$(get_raw_value 5)"
    emit_sample smart_power_on_hours "$labels" "$(get_raw_value 9)"
    emit_sample smart_temperature_celsius "$labels" "$(get_raw_value 194)"
    emit_sample smart_reallocated_event_count "$labels" "$(get_raw_value 196)"
    emit_sample smart_current_pending_sector "$labels" "$(get_raw_value 197)"
    emit_sample smart_offline_uncorrectable "$labels" "$(get_raw_value 198)"
    emit_sample smart_udma_crc_error_count "$labels" "$(get_raw_value 199)"
  done

  # mdadm RAID array status (doesn't access disks)
  {
    echo ""
    echo "# HELP mdadm_array_state RAID array state (1=clean/active/resyncing, 0=degraded/other)"
    echo "# TYPE mdadm_array_state gauge"
    echo "# HELP mdadm_array_devices_total Total devices in RAID array"
    echo "# TYPE mdadm_array_devices_total gauge"
    echo "# HELP mdadm_array_devices_active Active devices in RAID array"
    echo "# TYPE mdadm_array_devices_active gauge"
  } >> "$TEMP_FILE"

  # Find RAID arrays
  for md_device in /dev/md/*; do
    # An unmatched glob stays literal; skip it
    [[ -e "$md_device" ]] || continue

    array_name=$(basename "$md_device")

    # Get array details. The skip has to happen here in the loop's own shell:
    # a "continue" inside the command substitution would run in a subshell and
    # leave the loop running with empty output.
    if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
      echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
      continue
    fi

    # Parse state, e.g. "clean, degraded" becomes "clean,degraded".
    # sed -n exits 0 even without a match, so a missing line cannot abort the run.
    state=$(printf '%s\n' "$mdadm_output" | sed -n 's/^[[:space:]]*State : *//p' | head -1 | tr -d ' ')

    # Match whole comma-separated tokens, and check the unhealthy ones first, so
    # "clean,degraded" and "inactive" are not mistaken for clean/active.
    case ",$state," in
      *,degraded,*|*,FAILED,*|*,inactive,*) state_value=0 ;;
      *,clean,*|*,active,*) state_value=1 ;;
      *) state_value=0 ;;
    esac

    # Parse device counts
    total_devices=$(printf '%s\n' "$mdadm_output" | awk '/Raid Devices :/ { print $4 }')
    active_devices=$(printf '%s\n' "$mdadm_output" | awk '/Active Devices :/ { print $4 }')

    printf 'mdadm_array_state{array="%s",state="%s"} %s\n' "$array_name" "$state" "$state_value" >> "$TEMP_FILE"
    emit_sample mdadm_array_devices_total "array=\"$array_name\"" "$total_devices"
    emit_sample mdadm_array_devices_active "array=\"$array_name\"" "$active_devices"
  done

  # Atomically replace the metrics file
  mv "$TEMP_FILE" "$METRICS_FILE"

  echo "Disk metrics collection complete"
'';
|
|
in
|
|
{
|
|
# Disk tooling referenced by the collector script, also installed system-wide
environment.systemPackages = [
  pkgs.smartmontools
  pkgs.hdparm
  pkgs.mdadm
];
|
|
|
|
# Node exporter: enable the textfile collector and point it at textfileDir
services.prometheus.exporters.node.enable = true;
services.prometheus.exporters.node.enabledCollectors = [ "textfile" ];
services.prometheus.exporters.node.extraFlags = [
  "--collector.textfile.directory=${textfileDir}"
];
|
|
|
|
# Oneshot unit that runs the metrics collector script once per activation
systemd.services.disk-metrics = {
  description = "Collect S.M.A.R.T. and RAID metrics";
  # Tools the script calls by bare name
  path = [ pkgs.coreutils pkgs.gawk pkgs.gnugrep pkgs.gnused ];
  serviceConfig.Type = "oneshot";
  serviceConfig.ExecStart = "${collectMetricsScript}";
  # Root is needed to access the disk devices
  serviceConfig.User = "root";
};
|
|
|
|
# Trigger the collector every 20 minutes (5min buffer for 15min spindown)
systemd.timers.disk-metrics = {
  wantedBy = [ "timers.target" ];
  timerConfig.OnCalendar = "*:0/20"; # minutes 0, 20 and 40 of every hour
  timerConfig.RandomizedDelaySec = "1min";
  timerConfig.Persistent = true;
};
|
|
|
|
# Have systemd-tmpfiles create the textfile directory (mode 0755, root:root)
systemd.tmpfiles.rules = [ "d ${textfileDir} 0755 root root -" ];
|
|
}
|