feat: add smart alerting and noatime to disks
This commit is contained in:
@@ -9,9 +9,12 @@ in {
|
|||||||
"${impermanence}/nixos.nix"
|
"${impermanence}/nixos.nix"
|
||||||
./utils/bento.nix
|
./utils/bento.nix
|
||||||
./utils/modules/sops.nix
|
./utils/modules/sops.nix
|
||||||
|
./utils/modules/victoriametrics/default.nix
|
||||||
|
|
||||||
./modules/pyload.nix
|
./modules/pyload.nix
|
||||||
./modules/jellyfin.nix
|
./modules/jellyfin.nix
|
||||||
|
./modules/power-management.nix
|
||||||
|
./modules/disk-monitoring.nix
|
||||||
|
|
||||||
./hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -16,6 +16,14 @@
|
|||||||
boot.kernelModules = [ "kvm-intel" ];
|
boot.kernelModules = [ "kvm-intel" ];
|
||||||
boot.extraModulePackages = [ ];
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
# Power management kernel parameters
|
||||||
|
boot.kernelParams = [
|
||||||
|
"intel_pstate=passive" # Better with powersave governor
|
||||||
|
"i915.enable_rc6=1" # GPU deep sleep states
|
||||||
|
"i915.enable_dc=2" # Display C-states (deepest)
|
||||||
|
"i915.enable_fbc=1" # Frame buffer compression
|
||||||
|
];
|
||||||
|
|
||||||
# RAID 1 array for data storage
|
# RAID 1 array for data storage
|
||||||
boot.swraid = {
|
boot.swraid = {
|
||||||
enable = true;
|
enable = true;
|
||||||
@@ -78,11 +86,13 @@
|
|||||||
fileSystems."/var/lib/downloads" = {
|
fileSystems."/var/lib/downloads" = {
|
||||||
device = "/dev/vg-data/lv-downloads";
|
device = "/dev/vg-data/lv-downloads";
|
||||||
fsType = "xfs";
|
fsType = "xfs";
|
||||||
|
options = [ "noatime" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
fileSystems."/var/lib/multimedia" = {
|
fileSystems."/var/lib/multimedia" = {
|
||||||
device = "/dev/vg-data/lv-multimedia";
|
device = "/dev/vg-data/lv-multimedia";
|
||||||
fsType = "xfs";
|
fsType = "xfs";
|
||||||
|
options = [ "noatime" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# DHCP networking
|
# DHCP networking
|
||||||
|
|||||||
192
hosts/nas/modules/disk-monitoring.nix
Normal file
192
hosts/nas/modules/disk-monitoring.nix
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# Disk monitoring for NAS
|
||||||
|
# - S.M.A.R.T. metrics collection (respects disk spindown)
|
||||||
|
# - mdadm RAID array status
|
||||||
|
# - Exports metrics via node_exporter textfile collector
|
||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
|
||||||
|
let
|
||||||
|
# Disk identifiers from hardware-configuration.nix, expressed as stable
# /dev/disk/by-id paths so they do not depend on sdX enumeration order.
disks = map (id: "/dev/disk/by-id/${id}") [
  "ata-ST18000NM000J-2TV103_ZR52TBSB"
  "ata-ST18000NM000J-2TV103_ZR52V9QX"
];

# Directory handed to node_exporter's textfile collector (see extraFlags below)
# and written by the collection script.
textfileDir = "/var/lib/prometheus-node-exporter";
|
||||||
|
|
||||||
|
# Script to collect S.M.A.R.T. and mdadm metrics.
#
# Writes Prometheus text-format samples to "<textfileDir>/disk_health.prom".
# The file is built under a ".tmp" name and renamed at the end, so a reader
# never sees a half-written file. Disks that hdparm reports as "standby" are
# not queried with smartctl (only smart_device_active=0 is emitted for them).
collectMetricsScript = pkgs.writeShellScript "collect-disk-metrics" ''
  set -euo pipefail

  TEXTFILE_DIR="${textfileDir}"
  METRICS_FILE="$TEXTFILE_DIR/disk_health.prom"
  TEMP_FILE="$TEXTFILE_DIR/disk_health.prom.tmp"

  mkdir -p "$TEXTFILE_DIR"
  : > "$TEMP_FILE"
  # If the script aborts before the final rename, do not leave the partial
  # file behind. After a successful mv the file no longer exists, so this
  # is a no-op on the normal path.
  trap 'rm -f -- "$TEMP_FILE"' EXIT

  # Append "<metric>{device,serial} <raw value>" for one S.M.A.R.T. attribute.
  #   $1 - metric name
  #   $2 - attribute ID (first column of the smartctl -A table)
  # Reads $smartctl_output, $short_name and $serial from the calling loop.
  # Table format: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
  # Only plain integers are written: a single malformed sample would make the
  # whole textfile unparsable for the exporter.
  emit_smart_attr() {
    local metric="$1" attr_id="$2" raw
    raw=$(awk -v id="$attr_id" '$1 == id { print $10; exit }' <<< "$smartctl_output")
    if [[ "$raw" =~ ^[0-9]+$ ]]; then
      printf '%s{device="%s",serial="%s"} %s\n' "$metric" "$short_name" "$serial" "$raw" >> "$TEMP_FILE"
    fi
  }

  {
    # Timestamp of collection
    echo "# HELP disk_metrics_last_update Unix timestamp of last metrics collection"
    echo "# TYPE disk_metrics_last_update gauge"
    echo "disk_metrics_last_update $(date +%s)"

    echo ""
    echo "# HELP smart_device_active Whether the disk was active (1) or sleeping (0) when checked"
    echo "# TYPE smart_device_active gauge"
  } >> "$TEMP_FILE"

  # S.M.A.R.T. metrics for each disk
  for disk in ${lib.concatStringsSep " " disks}; do
    if [[ ! -e "$disk" ]]; then
      echo "Warning: Disk $disk not found, skipping" >&2
      continue
    fi

    # Resolve the by-id symlink to the kernel device node (e.g. /dev/sda).
    device=$(readlink -f "$disk")
    short_name=$(basename "$device")

    # Serial label = everything after the last underscore of the by-id name.
    serial=$(basename "$disk" | sed 's/^.*_//')

    # Check power state without waking the disk. hdparm failures are tolerated
    # and simply treated as "not in standby".
    hdparm_output=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null || true)

    if [[ "$hdparm_output" == *standby* ]]; then
      # Disk is sleeping - don't wake it, report inactive
      echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 0" >> "$TEMP_FILE"
      echo "Disk $short_name is in standby, skipping S.M.A.R.T. collection" >&2
      continue
    fi

    # Disk is active - collect S.M.A.R.T. data
    echo "smart_device_active{device=\"$short_name\",serial=\"$serial\"} 1" >> "$TEMP_FILE"

    # Overall health. The output is captured first instead of piping into
    # "grep -q": under pipefail an early grep exit could turn a PASSED result
    # into a failure. smartctl's exit status is deliberately ignored.
    health_output=$(${pkgs.smartmontools}/bin/smartctl -H "$device" 2>/dev/null || true)
    if [[ "$health_output" == *PASSED* ]]; then
      health=1
    else
      health=0
    fi

    # Attribute table; failures leave it empty, which just yields no samples.
    smartctl_output=$(${pkgs.smartmontools}/bin/smartctl -A "$device" 2>/dev/null || true)

    {
      echo ""
      echo "# S.M.A.R.T. metrics for $short_name"
      echo "smart_health_passed{device=\"$short_name\",serial=\"$serial\"} $health"
    } >> "$TEMP_FILE"

    emit_smart_attr smart_reallocated_sector_ct   5
    emit_smart_attr smart_power_on_hours          9
    emit_smart_attr smart_temperature_celsius     194
    emit_smart_attr smart_reallocated_event_count 196
    emit_smart_attr smart_current_pending_sector  197
    emit_smart_attr smart_offline_uncorrectable   198
    emit_smart_attr smart_udma_crc_error_count    199
  done

  # mdadm RAID array status (doesn't access disks)
  {
    echo ""
    echo "# HELP mdadm_array_state RAID array state (1=clean, 0=degraded/other)"
    echo "# TYPE mdadm_array_state gauge"
    echo "# HELP mdadm_array_devices_total Total devices in RAID array"
    echo "# TYPE mdadm_array_devices_total gauge"
    echo "# HELP mdadm_array_devices_active Active devices in RAID array"
    echo "# TYPE mdadm_array_devices_active gauge"
  } >> "$TEMP_FILE"

  # Find RAID arrays
  for md_device in /dev/md/*; do
    # Unmatched glob stays literal; skip it.
    [[ -e "$md_device" ]] || continue

    array_name=$(basename "$md_device")

    # The skip must happen in the loop's own shell: a "continue" placed inside
    # the command substitution would only end the subshell.
    if ! mdadm_output=$(${pkgs.mdadm}/bin/mdadm --detail "$md_device" 2>/dev/null); then
      echo "Warning: mdadm --detail failed for $md_device, skipping" >&2
      continue
    fi

    # "State : clean, degraded" -> "clean,degraded". awk (unlike a grep
    # pipeline) exits 0 when the line is missing, so set -e is not tripped.
    state=$(awk -F' : ' '/State :/ { gsub(/ /, "", $2); print $2; exit }' <<< "$mdadm_output")

    # Healthy = clean/active, also while a consistency check is running.
    # Every other (compound or unknown) state is reported as 0.
    case "$state" in
      clean|active|clean,checking|active,checking) state_value=1 ;;
      *) state_value=0 ;;
    esac

    # Parse device counts ("Raid Devices : 2" -> 4th whitespace field).
    total_devices=$(awk '/Raid Devices/ { print $4; exit }' <<< "$mdadm_output")
    active_devices=$(awk '/Active Devices/ { print $4; exit }' <<< "$mdadm_output")

    echo "mdadm_array_state{array=\"$array_name\",state=\"$state\"} $state_value" >> "$TEMP_FILE"
    if [[ "$total_devices" =~ ^[0-9]+$ ]]; then
      echo "mdadm_array_devices_total{array=\"$array_name\"} $total_devices" >> "$TEMP_FILE"
    fi
    if [[ "$active_devices" =~ ^[0-9]+$ ]]; then
      echo "mdadm_array_devices_active{array=\"$array_name\"} $active_devices" >> "$TEMP_FILE"
    fi
  done

  # Atomically replace the metrics file
  mv "$TEMP_FILE" "$METRICS_FILE"

  echo "Disk metrics collection complete"
'';
|
||||||
|
in
|
||||||
|
{
  # Tools used by the collection script, also installed system-wide.
  environment.systemPackages = [
    pkgs.smartmontools
    pkgs.hdparm
    pkgs.mdadm
  ];

  # node_exporter serves whatever *.prom files the script drops into
  # textfileDir, alongside the systemd collector.
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "textfile" "systemd" ];
    extraFlags = [ "--collector.textfile.directory=${textfileDir}" ];
  };

  systemd = {
    # One-shot unit that runs the collection script.
    services.disk-metrics = {
      description = "Collect S.M.A.R.T. and RAID metrics";
      path = [ pkgs.coreutils pkgs.gawk pkgs.gnugrep pkgs.gnused ];
      serviceConfig = {
        Type = "oneshot";
        ExecStart = "${collectMetricsScript}";
        # Root is needed to talk to the raw disk devices.
        User = "root";
      };
    };

    # Fires at minutes 0, 20 and 40 of every hour, with up to one minute of
    # jitter; the 20 minute period leaves a 5 minute buffer over the 15 minute
    # disk spindown.
    timers.disk-metrics = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnCalendar = "*:0/20"; # Every 20 minutes
        RandomizedDelaySec = "1min";
        Persistent = true;
      };
    };

    # Create the textfile directory (root-owned, world-readable).
    tmpfiles.rules = [
      "d ${textfileDir} 0755 root root -"
    ];
  };
}
|
||||||
19
hosts/nas/modules/power-management.nix
Normal file
19
hosts/nas/modules/power-management.nix
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# Power management for NAS
# - CPU schedutil governor (scales frequency with load, e.g. for transcoding)
# - Disk spindown after 15 minutes idle
{ config, lib, pkgs, ... }:

{
  # CPU power management.
  # The host boots with intel_pstate=passive, where the generic cpufreq
  # governors are in charge. The generic "powersave" governor always requests
  # the lowest frequency and never scales up, so use "schedutil", which follows
  # scheduler load. If passive mode is dropped again, revisit this: active-mode
  # intel_pstate only offers "powersave" and "performance".
  powerManagement.cpuFreqGovernor = "schedutil";

  # Disk spindown - hdparm for Seagate 18TB drives
  environment.systemPackages = [ pkgs.hdparm ];

  # Applied when a matching disk appears: -B 127 is the highest APM level that
  # still permits spindown, -S 180 is a standby timeout of 180 * 5 s = 15 min.
  services.udev.extraRules = ''
    # Seagate 18TB NAS drives - APM 127 allows spindown, -S 180 = 15 min
    ACTION=="add", KERNEL=="sd[a-z]", SUBSYSTEM=="block", \
    ATTRS{model}=="ST18000NM000J*", \
    RUN+="${pkgs.hdparm}/bin/hdparm -B 127 -S 180 /dev/%k"
  '';
}
|
||||||
17
hosts/web-arm/modules/grafana/alerting/storage/default.nix
Normal file
17
hosts/web-arm/modules/grafana/alerting/storage/default.nix
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
{ lib, pkgs, config, ... }:
let
  # Import one rule file with the module arguments and pull out its rule list.
  loadRules = file:
    (import file { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;

  # S.M.A.R.T. rules first, then RAID rules.
  allStorageRules = lib.concatMap loadRules [
    ./smart_alerts.nix
    ./raid_alerts.nix
  ];
in
{
  # Single provisioned Grafana rule group holding every storage rule.
  services.grafana.provision.alerting.rules.settings.groups = [
    {
      name = "Storage Alerts";
      folder = "Storage Alerts";
      interval = "5m"; # Check every 5 minutes (metrics collected every 20 min)
      rules = allStorageRules;
    }
  ];
}
|
||||||
102
hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix
Normal file
102
hosts/web-arm/modules/grafana/alerting/storage/raid_alerts.nix
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
{ lib, pkgs, config, ... }:
let
  # Build one critical RAID rule with the A (query) -> C (reduce last) ->
  # D (math threshold) pipeline.
  #
  #   expr      - query returning a value for every array, healthy or not.
  #               Do not filter here (e.g. "metric == 0"): a filtered query
  #               returns no series while everything is fine, which puts the
  #               rule into NoData instead of Normal.
  #   alertWhen - math expression over $C that is true when the rule fires.
  mkRaidRule = { uid, title, expr, alertWhen, summary, description }: {
    inherit uid title;
    condition = "D";
    data = [
      {
        refId = "A";
        datasourceUid = "vm-datasource-uid";
        relativeTimeRange = { from = 300; to = 0; };
        model = {
          inherit expr;
          instant = false;
        };
      }
      {
        refId = "C";
        datasourceUid = "__expr__";
        model = {
          type = "reduce";
          expression = "A";
          reducer = "last";
        };
      }
      {
        refId = "D";
        datasourceUid = "__expr__";
        model = {
          type = "math";
          expression = alertWhen;
        };
      }
    ];
    for = "0s";
    # With unfiltered queries, missing data really means the metrics stopped
    # arriving, so keep surfacing it.
    noDataState = "NoData";
    execErrState = "Error";
    annotations = { inherit summary description; };
    labels = {
      severity = "critical";
      category = "storage";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    # RAID array degraded - critical
    (mkRaidRule {
      uid = "raid-array-degraded-uid";
      title = "RaidArrayDegraded";
      expr = ''mdadm_array_state'';
      alertWhen = "$C == 0";
      summary = "RAID array {{ $labels.array }} is degraded";
      description = ''
        RAID array {{ $labels.array }} on {{ $labels.instance }} is in state "{{ $labels.state }}".
        The array is not in a healthy state. Check for failed disks immediately!
      '';
    })

    # RAID missing devices - critical
    # Evaluates the number of missing devices (total - active), so the rule
    # also fires when the active count has dropped to zero.
    (mkRaidRule {
      uid = "raid-missing-devices-uid";
      title = "RaidMissingDevices";
      expr = ''mdadm_array_devices_total - mdadm_array_devices_active'';
      alertWhen = "$C > 0";
      summary = "RAID array {{ $labels.array }} has missing devices";
      description = ''
        RAID array {{ $labels.array }} on {{ $labels.instance }} has fewer active devices than expected.
        A disk may have failed or been removed. Check array status immediately!
      '';
    })
  ];
}
|
||||||
298
hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix
Normal file
298
hosts/web-arm/modules/grafana/alerting/storage/smart_alerts.nix
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
{ lib, pkgs, config, ... }:
let
  # Build one S.M.A.R.T. rule with the A (query) -> C (reduce last) ->
  # D (math threshold) pipeline.
  #
  #   expr       - query returning the raw metric for every disk. The
  #                threshold belongs in alertWhen, not here: a filtered query
  #                ("metric > 0") returns no series for healthy disks, so the
  #                rule would sit in NoData instead of Normal.
  #   alertWhen  - math expression over $C that is true when the rule fires.
  #   lookback   - query window in seconds (default 300).
  #   pendingFor - how long the condition must hold before firing.
  #   severity   - value of the "severity" label.
  mkSmartRule =
    { uid
    , title
    , expr
    , alertWhen
    , summary
    , description
    , severity ? "warning"
    , lookback ? 300
    , pendingFor ? "0s"
    }: {
      inherit uid title;
      condition = "D";
      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          relativeTimeRange = { from = lookback; to = 0; };
          model = {
            inherit expr;
            instant = false;
          };
        }
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = alertWhen;
          };
        }
      ];
      for = pendingFor;
      # The collector emits no S.M.A.R.T. samples for a disk in standby, so
      # "no data" is the normal state of a sleeping disk and must not alert.
      # NOTE(review): a dead collector is therefore not caught by these rules;
      # consider a separate staleness rule on disk_metrics_last_update.
      noDataState = "OK";
      execErrState = "Error";
      annotations = { inherit summary description; };
      labels = {
        inherit severity;
        category = "storage";
      };
    };
in
{
  grafanaAlertRuleDefinitions = [
    # S.M.A.R.T. overall health failed - critical
    (mkSmartRule {
      uid = "smart-health-failed-uid";
      title = "DiskSmartHealthFailed";
      expr = ''smart_health_passed'';
      alertWhen = "$C == 0";
      severity = "critical";
      summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check.
        This indicates imminent disk failure. Replace the disk immediately!
      '';
    })

    # Reallocated sectors - warning (any count > 0 is concerning)
    (mkSmartRule {
      uid = "smart-reallocated-sectors-uid";
      title = "DiskReallocatedSectors";
      expr = ''smart_reallocated_sector_ct'';
      alertWhen = "$C > 0";
      summary = "Reallocated sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors.
        This indicates disk surface damage. Monitor closely and plan replacement.
      '';
    })

    # Current pending sectors
    (mkSmartRule {
      uid = "smart-pending-sectors-uid";
      title = "DiskPendingSectors";
      expr = ''smart_current_pending_sector'';
      alertWhen = "$C > 0";
      summary = "Pending sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors.
        These sectors could not be read and may be reallocated. Monitor for increase.
      '';
    })

    # Offline uncorrectable errors
    (mkSmartRule {
      uid = "smart-offline-uncorrectable-uid";
      title = "DiskOfflineUncorrectable";
      expr = ''smart_offline_uncorrectable'';
      alertWhen = "$C > 0";
      summary = "Offline uncorrectable errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors.
        This indicates data integrity issues. Consider replacement.
      '';
    })

    # High temperature (Seagate enterprise: warning at 50C)
    (mkSmartRule {
      uid = "smart-high-temperature-uid";
      title = "DiskHighTemperature";
      expr = ''smart_temperature_celsius'';
      alertWhen = "$C > 50";
      lookback = 600;
      pendingFor = "10m";
      summary = "High temperature on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C.
        Check cooling and ventilation.
      '';
    })

    # UDMA CRC errors (cable/connection issues)
    # Alerts on growth of the counter over the last 24h, not on its absolute value.
    (mkSmartRule {
      uid = "smart-udma-crc-errors-uid";
      title = "DiskUDMACRCErrors";
      expr = ''increase(smart_udma_crc_error_count[24h])'';
      alertWhen = "$C > 0";
      lookback = 86400;
      summary = "UDMA CRC errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors.
        This typically indicates SATA cable or connection issues. Check cables.
      '';
    })
  ];
}
|
||||||
@@ -31,6 +31,7 @@ in
|
|||||||
./alerting/system/default.nix
|
./alerting/system/default.nix
|
||||||
./alerting/service/default.nix
|
./alerting/service/default.nix
|
||||||
./alerting/websites/default.nix
|
./alerting/websites/default.nix
|
||||||
|
# ./alerting/storage/default.nix
|
||||||
|
|
||||||
./datasources/victoriametrics.nix
|
./datasources/victoriametrics.nix
|
||||||
./datasources/loki.nix
|
./datasources/loki.nix
|
||||||
|
|||||||
Reference in New Issue
Block a user