feat: add smart alerting and noatime to disks

This commit is contained in:
2025-11-28 23:50:24 +01:00
parent dbada3c509
commit 537f144885
8 changed files with 642 additions and 0 deletions

View File

@@ -9,9 +9,12 @@ in {
"${impermanence}/nixos.nix" "${impermanence}/nixos.nix"
./utils/bento.nix ./utils/bento.nix
./utils/modules/sops.nix ./utils/modules/sops.nix
./utils/modules/victoriametrics/default.nix
./modules/pyload.nix ./modules/pyload.nix
./modules/jellyfin.nix ./modules/jellyfin.nix
./modules/power-management.nix
./modules/disk-monitoring.nix
./hardware-configuration.nix ./hardware-configuration.nix
]; ];

View File

@@ -16,6 +16,14 @@
boot.kernelModules = [ "kvm-intel" ]; boot.kernelModules = [ "kvm-intel" ];
boot.extraModulePackages = [ ]; boot.extraModulePackages = [ ];
# Power management kernel parameters
boot.kernelParams = [
"intel_pstate=passive" # Better with powersave governor
"i915.enable_rc6=1" # GPU deep sleep states
"i915.enable_dc=2" # Display C-states (deepest)
"i915.enable_fbc=1" # Frame buffer compression
];
# RAID 1 array for data storage # RAID 1 array for data storage
boot.swraid = { boot.swraid = {
enable = true; enable = true;
@@ -78,11 +86,13 @@
fileSystems."/var/lib/downloads" = { fileSystems."/var/lib/downloads" = {
device = "/dev/vg-data/lv-downloads"; device = "/dev/vg-data/lv-downloads";
fsType = "xfs"; fsType = "xfs";
options = [ "noatime" ];
}; };
fileSystems."/var/lib/multimedia" = { fileSystems."/var/lib/multimedia" = {
device = "/dev/vg-data/lv-multimedia"; device = "/dev/vg-data/lv-multimedia";
fsType = "xfs"; fsType = "xfs";
options = [ "noatime" ];
}; };
# DHCP networking # DHCP networking

View File

@@ -0,0 +1,192 @@
# Disk monitoring for NAS
# - S.M.A.R.T. metrics collection (respects disk spindown)
# - mdadm RAID array status
# - Exports metrics via node_exporter textfile collector
{ config, lib, pkgs, ... }:

let
  # Disk identifiers from hardware-configuration.nix
  disks = [
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52TBSB"
    "/dev/disk/by-id/ata-ST18000NM000J-2TV103_ZR52V9QX"
  ];

  # Directory handed to node_exporter via --collector.textfile.directory below.
  textfileDir = "/var/lib/prometheus-node-exporter";

  # Script to collect S.M.A.R.T. and mdadm metrics.
  # Everything is written to a temp file first and renamed into place at the
  # end, so a scrape never sees a half-written metrics file.
  collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
    set -euo pipefail

    TEXTFILE_DIR="${textfileDir}"
    METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
    TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

    # get_raw_value ATTR_ID
    # Print the RAW_VALUE column of one attribute from the smartctl -A table
    # held in $smartctl_output (empty output if the attribute is absent).
    # Format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
    get_raw_value() {
      local attr_id="$1"
      echo "$smartctl_output" | awk -v id="$attr_id" '$1 == id { print $10 }' | head -1
    }

    # emit_metric NAME LABELS VALUE
    # Append one sample to the temp file, but only if VALUE is a plain
    # non-negative integer. An empty VALUE is skipped silently; anything else
    # is skipped with a warning, because a single malformed sample would make
    # the whole .prom file unparsable for the textfile collector.
    emit_metric() {
      local name="$1" labels="$2" value="$3"
      if [[ "$value" =~ ^[0-9]+$ ]]; then
        echo "$name{$labels} $value" >> "$TEMP_FILE"
      elif [[ -n "$value" ]]; then
        echo "Warning: ignoring non-numeric value \"$value\" for $name{$labels}" >&2
      fi
    }

    mkdir -p "$TEXTFILE_DIR"
    : > "$TEMP_FILE"

    # Timestamp of collection
    echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection" >> "$TEMP_FILE"
    echo "# TYPE disk_metrics_last_update gauge" >> "$TEMP_FILE"
    echo "disk_metrics_last_update $(date +%s)" >> "$TEMP_FILE"
    echo "" >> "$TEMP_FILE"

    echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked" >> "$TEMP_FILE"
    echo "# TYPE smart_device_active gauge" >> "$TEMP_FILE"

    # S.M.A.R.T. metrics for each disk
    for disk in ${lib.concatStringsSep " " disks}; do
      if [[ ! -e "$disk" ]]; then
        echo "Warning: Disk $disk not found, skipping" >&2
        continue
      fi

      # Resolve symlink to get actual device
      device=$(readlink -f "$disk")
      short_name=$(basename "$device")

      # Serial for labels: the part of the by-id name after the last underscore
      # (ata-<model>_<serial>), so the model string is not hard-coded here.
      serial=$(basename "$disk" | sed 's/^.*_//')
      labels="device=\"$short_name\",serial=\"$serial\""

      # Check power state without waking disk
      power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null | grep -oP '(standby|active/idle|active|idle)' | head -1 || echo "unknown")

      if [[ "$power_state" == "standby" ]]; then
        # Disk is sleeping - don't wake it, report inactive
        echo "smart_device_active{$labels} 0" >> "$TEMP_FILE"
        echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2
        continue
      fi

      # Disk is active - collect S.M.A.R.T. data
      echo "smart_device_active{$labels} 1" >> "$TEMP_FILE"

      # Get S.M.A.R.T. health status
      if ${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null | grep -q "PASSED"; then
        health=1
      else
        health=0
      fi

      # Get S.M.A.R.T. attributes (best effort: a smartctl failure leaves the
      # table empty and the per-attribute samples are simply not emitted)
      smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

      # Output metrics
      echo "# S.M.A.R.T. metrics for $short_name" >> "$TEMP_FILE"
      echo "smart_health_passed{$labels} $health" >> "$TEMP_FILE"
      emit_metric smart_reallocated_sector_ct   "$labels" "$(get_raw_value 5)"
      emit_metric smart_power_on_hours          "$labels" "$(get_raw_value 9)"
      emit_metric smart_temperature_celsius     "$labels" "$(get_raw_value 194)"
      emit_metric smart_reallocated_event_count "$labels" "$(get_raw_value 196)"
      emit_metric smart_current_pending_sector  "$labels" "$(get_raw_value 197)"
      emit_metric smart_offline_uncorrectable   "$labels" "$(get_raw_value 198)"
      emit_metric smart_udma_crc_error_count    "$labels" "$(get_raw_value 199)"
    done

    # mdadm RAID array status (doesn't access disks)
    echo "" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_state RAID array state (1=clean, 0=degraded/other)" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_state gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_total Total devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_total gauge" >> "$TEMP_FILE"
    echo "# HELP mdadm_array_devices_active Active devices in RAID array" >> "$TEMP_FILE"
    echo "# TYPE mdadm_array_devices_active gauge" >> "$TEMP_FILE"

    # Find RAID arrays
    for md_device in /dev/md/*; do
      # An unmatched glob stays literal; this check skips it
      [[ -e "$md_device" ]] || continue
      array_name=$(basename "$md_device")

      # Get array details. The failure check must live outside the command
      # substitution: a "continue" inside it would run in the subshell and
      # could not skip this loop iteration.
      if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
        echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
        continue
      fi

      # Parse state
      state=$(echo "$mdadm_output" | grep "State :" | sed 's/.*State : //' | tr -d ' ')
      if [[ "$state" == "clean" ]] || [[ "$state" == "active" ]]; then
        state_value=1
      else
        state_value=0
      fi

      # Parse device counts
      total_devices=$(echo "$mdadm_output" | grep "Raid Devices" | awk '{print $4}')
      active_devices=$(echo "$mdadm_output" | grep "Active Devices" | awk '{print $4}')

      echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE"
      emit_metric mdadm_array_devices_total  "array=\"$array_name\"" "$total_devices"
      emit_metric mdadm_array_devices_active "array=\"$array_name\"" "$active_devices"
    done

    # Atomically replace the metrics file
    mv "$TEMP_FILE" "$METRICS_FILE"
    echo "Disk metrics collection complete"
  '';
in
{
  # Required packages
  environment.systemPackages = with pkgs; [
    smartmontools
    hdparm
    mdadm
  ];

  # Node exporter with textfile collector
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [
      "textfile"
      "systemd"
    ];
    extraFlags = [
      "--collector.textfile.directory=${textfileDir}"
    ];
  };

  # Systemd service to collect metrics
  systemd.services.disk-metrics = {
    description = "Collect S.M.A.R.T. and RAID metrics";
    # hdparm, smartctl and mdadm are called by absolute store path in the
    # script; only the generic text/file utilities come from PATH.
    path = with pkgs; [ coreutils gawk gnugrep gnused ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${collectMetricsScript}";
      # Run as root to access disk devices
      User = "root";
    };
  };

  # Timer to run every 20 minutes (5min buffer for 15min spindown)
  systemd.timers.disk-metrics = {
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "*:0/20"; # Every 20 minutes
      RandomizedDelaySec = "1min";
      Persistent = true;
    };
  };

  # Ensure the textfile directory exists.
  # NOTE(review): this rule only creates the directory; nothing in this module
  # declares it as persistent state - confirm whether that is needed.
  systemd.tmpfiles.rules = [
    "d ${textfileDir} 0755 root root -"
  ];
}

View File

@@ -0,0 +1,19 @@
# Power management for NAS
# - CPU schedutil governor (scales frequency with load, e.g. for transcoding)
# - Disk spindown after 15 minutes idle
{ config, lib, pkgs, ... }:
{
  # CPU power management.
  # This host boots with intel_pstate=passive (see boot.kernelParams), which
  # hands frequency selection to the generic cpufreq governors. The generic
  # "powersave" governor statically requests the lowest frequency and never
  # scales up; "schedutil" follows scheduler load, which is the intended
  # "low power when idle, fast when transcoding" behaviour.
  # NOTE(review): if intel_pstate=passive is ever dropped, schedutil is not
  # offered by the active-mode driver - switch back to "powersave" then.
  powerManagement.cpuFreqGovernor = "schedutil";

  # Disk spindown - hdparm for Seagate 18TB drives
  environment.systemPackages = [ pkgs.hdparm ];

  services.udev.extraRules = ''
    # Seagate 18TB NAS drives - APM 127 allows spindown, -S 180 = 15 min
    ACTION=="add", KERNEL=="sd[a-z]", SUBSYSTEM=="block", \
      ATTRS{model}=="ST18000NM000J*", \
      RUN+="${pkgs.hdparm}/bin/hdparm -B 127 -S 180 /dev/%k"
  '';
}

View File

@@ -0,0 +1,17 @@
{ lib, pkgs, config, ... }:
let
  # Rule files, in the order their rules appear in the group. Each file is a
  # function returning an attrset with a `grafanaAlertRuleDefinitions` list.
  ruleFiles = [
    ./smart_alerts.nix
    ./raid_alerts.nix
  ];

  # Import one rule file with this module's arguments and pull out its rules.
  rulesFrom = file: (import file { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;

  # Single provisioning group holding every storage rule.
  storageGroup = {
    name = "Storage Alerts";
    folder = "Storage Alerts";
    interval = "5m"; # Check every 5 minutes (metrics collected every 20 min)
    rules = builtins.concatMap rulesFrom ruleFiles;
  };
in
{
  services.grafana.provision.alerting.rules.settings.groups = [ storageGroup ];
}

View File

@@ -0,0 +1,102 @@
# Grafana alert rule definitions for mdadm RAID arrays.
#
# Each rule is a three-node pipeline: A = range query against the metrics
# datasource, C = reduce A to its last sample per series, D = math condition
# on C (the rule's `condition`).
#
# Query A deliberately returns the raw value for every array and the threshold
# lives in node D. If A filtered in PromQL (e.g. `metric == 0`), a healthy
# array would return no series at all and the rule would sit in the NoData
# state (noDataState = "NoData") instead of Normal. With unfiltered queries,
# NoData means the metrics are really missing.
{ lib, pkgs, config, ... }:
let
  # Node A: range query over the last 5 minutes.
  metricsQuery = expr: {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = { from = 300; to = 0; };
    model = {
      inherit expr;
      instant = false;
    };
  };

  # Node C: collapse each series of A to its most recent sample.
  reduceLast = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      expression = "A";
      reducer = "last";
    };
  };

  # Node D: the firing condition, evaluated on the reduced value $C.
  mathCondition = expression: {
    refId = "D";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      inherit expression;
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    # RAID array degraded - critical
    {
      uid = "raid-array-degraded-uid";
      title = "RaidArrayDegraded";
      condition = "D";
      data = [
        # mdadm_array_state is 1 for clean/active and 0 for anything else
        (metricsQuery ''mdadm_array_state'')
        reduceLast
        (mathCondition "$C == 0")
      ];
      for = "0s";
      noDataState = "NoData";
      execErrState = "Error";
      annotations = {
        summary = "RAID array {{ $labels.array }} is degraded";
        description = ''
          RAID array {{ $labels.array }} on {{ $labels.instance }} is in state "{{ $labels.state }}".
          The array is not in a healthy state. Check for failed disks immediately!
        '';
      };
      labels = {
        severity = "critical";
        category = "storage";
      };
    }

    # RAID missing devices - critical
    {
      uid = "raid-missing-devices-uid";
      title = "RaidMissingDevices";
      condition = "D";
      data = [
        # Number of missing devices; unlike comparing on the active count,
        # this also fires when zero devices are active.
        (metricsQuery ''mdadm_array_devices_total - mdadm_array_devices_active'')
        reduceLast
        (mathCondition "$C > 0")
      ];
      for = "0s";
      noDataState = "NoData";
      execErrState = "Error";
      annotations = {
        summary = "RAID array {{ $labels.array }} has missing devices";
        description = ''
          RAID array {{ $labels.array }} on {{ $labels.instance }} has fewer active devices than expected.
          A disk may have failed or been removed. Check array status immediately!
        '';
      };
      labels = {
        severity = "critical";
        category = "storage";
      };
    }
  ];
}

View File

@@ -0,0 +1,298 @@
# Grafana alert rule definitions for S.M.A.R.T. disk health.
#
# Every rule is the same three-node pipeline: A = range query against the
# metrics datasource, C = reduce A to its last sample per series, D = math
# condition on C (the rule's `condition`).
#
# noDataState is "OK" on purpose. Query A filters in PromQL, so a healthy disk
# returns no series, and the S.M.A.R.T. samples also disappear while a disk is
# in standby. With noDataState = "NoData" both everyday situations would put
# the rule into the NoData state instead of Normal.
{ lib, pkgs, config, ... }:
let
  # Build one alert rule.
  #   uid, title   - rule identity
  #   expr         - PromQL for node A (returns series only for affected disks)
  #   condition    - math expression for node D, written in terms of $C
  #   window       - look-back of query A in seconds
  #   pendingFor   - how long D must hold before the rule fires
  #   severity     - value of the `severity` label
  #   summary, description - annotation texts
  mkSmartRule =
    { uid
    , title
    , expr
    , condition
    , summary
    , description
    , severity ? "warning"
    , window ? 300
    , pendingFor ? "0s"
    }:
    {
      inherit uid title;
      condition = "D";
      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          relativeTimeRange = { from = window; to = 0; };
          model = {
            inherit expr;
            instant = false;
          };
        }
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = condition;
          };
        }
      ];
      for = pendingFor;
      noDataState = "OK";
      execErrState = "Error";
      annotations = {
        inherit summary description;
      };
      labels = {
        inherit severity;
        category = "storage";
      };
    };
in
{
  grafanaAlertRuleDefinitions = [
    # S.M.A.R.T. overall health failed - critical
    (mkSmartRule {
      uid = "smart-health-failed-uid";
      title = "DiskSmartHealthFailed";
      expr = ''smart_health_passed == 0'';
      # The filtered series carries the value 0, hence "== 0" here
      condition = "$C == 0";
      severity = "critical";
      summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check.
        This indicates imminent disk failure. Replace the disk immediately!
      '';
    })

    # Reallocated sectors - warning (any count > 0 is concerning)
    (mkSmartRule {
      uid = "smart-reallocated-sectors-uid";
      title = "DiskReallocatedSectors";
      expr = ''smart_reallocated_sector_ct > 0'';
      condition = "$C > 0";
      summary = "Reallocated sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors.
        This indicates disk surface damage. Monitor closely and plan replacement.
      '';
    })

    # Current pending sectors
    (mkSmartRule {
      uid = "smart-pending-sectors-uid";
      title = "DiskPendingSectors";
      expr = ''smart_current_pending_sector > 0'';
      condition = "$C > 0";
      summary = "Pending sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors.
        These sectors could not be read and may be reallocated. Monitor for increase.
      '';
    })

    # Offline uncorrectable errors
    (mkSmartRule {
      uid = "smart-offline-uncorrectable-uid";
      title = "DiskOfflineUncorrectable";
      expr = ''smart_offline_uncorrectable > 0'';
      condition = "$C > 0";
      summary = "Offline uncorrectable errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors.
        This indicates data integrity issues. Consider replacement.
      '';
    })

    # High temperature (Seagate enterprise: warning at 50C)
    (mkSmartRule {
      uid = "smart-high-temperature-uid";
      title = "DiskHighTemperature";
      expr = ''smart_temperature_celsius > 50'';
      condition = "$C > 0";
      window = 600;
      pendingFor = "10m";
      summary = "High temperature on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C.
        Check cooling and ventilation.
      '';
    })

    # UDMA CRC errors (cable/connection issues)
    (mkSmartRule {
      uid = "smart-udma-crc-errors-uid";
      title = "DiskUDMACRCErrors";
      expr = ''increase(smart_udma_crc_error_count[24h]) > 0'';
      condition = "$C > 0";
      window = 86400;
      summary = "UDMA CRC errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors.
        This typically indicates SATA cable or connection issues. Check cables.
      '';
    })
  ];
}

View File

@@ -31,6 +31,7 @@ in
./alerting/system/default.nix ./alerting/system/default.nix
./alerting/service/default.nix ./alerting/service/default.nix
./alerting/websites/default.nix ./alerting/websites/default.nix
# ./alerting/storage/default.nix
./datasources/victoriametrics.nix ./datasources/victoriametrics.nix
./datasources/loki.nix ./datasources/loki.nix