# Grafana alert rule definitions for S.M.A.R.T. disk-health metrics.
#
# Evaluates to an attribute set with a single attribute,
# `grafanaAlertRuleDefinitions`: a list of rule attrsets, one per monitored
# S.M.A.R.T. signal. The module arguments are accepted but not used here.
{ lib, pkgs, config, ... }:

let
  # Build one alert rule from the handful of fields that differ per signal.
  #
  # Every rule has the same three-stage pipeline:
  #   A - range query against the metrics datasource; `expr` already filters
  #       to the offending series, so healthy disks produce no series at all
  #   C - reduce A to its last value
  #   D - math expression on $C; this is the rule's condition
  #
  # Arguments:
  #   uid, title    rule identity
  #   expr          query text for stage A
  #   summary,
  #   description   annotation texts
  #   mathExpr      stage D expression              (default "$C > 0")
  #   window        look-back of stage A in seconds (default 300)
  #   pendingFor    value of the rule's `for` field (default "0s")
  #   severity      value of the `severity` label   (default "warning")
  mkSmartRule =
    { uid
    , title
    , expr
    , summary
    , description
    , mathExpr ? "$C > 0"
    , window ? 300
    , pendingFor ? "0s"
    , severity ? "warning"
    }:
    {
      inherit uid title;
      condition = "D";
      data = [
        {
          refId = "A";
          datasourceUid = "vm-datasource-uid";
          relativeTimeRange = {
            from = window;
            to = 0;
          };
          model = {
            inherit expr;
            instant = false;
          };
        }
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        {
          refId = "D";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = mathExpr;
          };
        }
      ];
      for = pendingFor;
      # Stage A is a filtering comparison, so an empty result is the normal
      # "nothing is wrong" case. Treat it as OK rather than NoData; otherwise
      # every rule would sit in the no-data state while all disks are healthy.
      noDataState = "OK";
      execErrState = "Error";
      annotations = {
        inherit summary description;
      };
      labels = {
        inherit severity;
        category = "storage";
      };
    };
in
{
  grafanaAlertRuleDefinitions = [
    # S.M.A.R.T. overall health failed - critical
    (mkSmartRule {
      uid = "smart-health-failed-uid";
      title = "DiskSmartHealthFailed";
      expr = "smart_health_passed == 0";
      mathExpr = "$C == 0";
      severity = "critical";
      summary = "S.M.A.R.T. health check FAILED on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has failed its S.M.A.R.T. health check. This indicates imminent disk failure. Replace the disk immediately!
      '';
    })

    # Reallocated sectors - warning (any count > 0 is concerning)
    (mkSmartRule {
      uid = "smart-reallocated-sectors-uid";
      title = "DiskReallocatedSectors";
      expr = "smart_reallocated_sector_ct > 0";
      summary = "Reallocated sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has reallocated sectors. This indicates disk surface damage. Monitor closely and plan replacement.
      '';
    })

    # Current pending sectors
    (mkSmartRule {
      uid = "smart-pending-sectors-uid";
      title = "DiskPendingSectors";
      expr = "smart_current_pending_sector > 0";
      summary = "Pending sectors detected on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has pending sectors. These sectors could not be read and may be reallocated. Monitor for increase.
      '';
    })

    # Offline uncorrectable errors
    (mkSmartRule {
      uid = "smart-offline-uncorrectable-uid";
      title = "DiskOfflineUncorrectable";
      expr = "smart_offline_uncorrectable > 0";
      summary = "Offline uncorrectable errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has offline uncorrectable errors. This indicates data integrity issues. Consider replacement.
      '';
    })

    # High temperature (Seagate enterprise: warning at 50C)
    (mkSmartRule {
      uid = "smart-high-temperature-uid";
      title = "DiskHighTemperature";
      expr = "smart_temperature_celsius > 50";
      window = 600;
      pendingFor = "10m";
      summary = "High temperature on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} temperature exceeds 50°C. Check cooling and ventilation.
      '';
    })

    # UDMA CRC errors (cable/connection issues)
    (mkSmartRule {
      uid = "smart-udma-crc-errors-uid";
      title = "DiskUDMACRCErrors";
      expr = "increase(smart_udma_crc_error_count[24h]) > 0";
      # Look-back matches the 24h increase() range used in the query.
      window = 86400;
      summary = "UDMA CRC errors on {{ $labels.device }}";
      description = ''
        Disk {{ $labels.device }} ({{ $labels.serial }}) on {{ $labels.instance }} has new CRC errors. This typically indicates SATA cable or connection issues. Check cables.
      '';
    })
  ];
}