feat: add disks to monitoring

This commit is contained in:
2025-12-05 21:57:58 +01:00
parent ada9db7942
commit 8e0e5c0d16
5 changed files with 518 additions and 32 deletions

View File

@@ -0,0 +1,463 @@
{ lib, pkgs }:
let
datasourceUid = "vm-datasource-uid";
# Helper to create a panel with common defaults
mkPanel = { id, title, type, gridPos, targets, options ? { }, fieldConfig ? { }, ... }@args:
{
inherit id title type gridPos targets;
datasource = { uid = datasourceUid; type = "prometheus"; };
options = options;
fieldConfig = {
defaults = fieldConfig.defaults or { };
overrides = fieldConfig.overrides or [ ];
};
} // (builtins.removeAttrs args [ "id" "title" "type" "gridPos" "targets" "options" "fieldConfig" ]);
# Dashboard definition
dashboard = {
uid = "smart-disk-health";
title = "S.M.A.R.T Disk Health";
description = "S.M.A.R.T metrics and RAID array status";
tags = [ "disk" "smart" "storage" "nas" ];
timezone = "browser";
editable = false;
refresh = "5m";
schemaVersion = 39;
version = 1;
# Variables
templating.list = [
{
name = "host";
label = "Host";
type = "query";
datasource = { uid = datasourceUid; type = "prometheus"; };
query = "label_values(smart_health_passed, instance)";
regex = "";
sort = 1;
refresh = 1;
includeAll = true;
multi = false;
current = { selected = true; text = "All"; value = "$__all"; };
}
];
# Panels
panels = [
# === OVERVIEW ROW ===
{
id = 1;
type = "row";
title = "Overview";
collapsed = false;
gridPos = { x = 0; y = 0; w = 24; h = 1; };
panels = [ ];
}
# Alert Status - Shows firing disk alerts
{
id = 5;
title = "Alert Status";
type = "alertlist";
gridPos = { x = 0; y = 1; w = 6; h = 5; };
options = {
alertInstanceLabelFilter = "";
alertName = "Disk";
dashboardAlerts = false;
groupBy = [ ];
groupMode = "default";
maxItems = 20;
sortOrder = 1;
stateFilter = {
"error" = true;
firing = true;
noData = false;
normal = false;
pending = false;
};
viewMode = "list";
};
}
# Health Status - Stat panel
(mkPanel {
id = 2;
title = "Disk Health Status";
type = "stat";
gridPos = { x = 6; y = 1; w = 6; h = 5; };
targets = [{
expr = ''smart_health_passed{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "auto";
colorMode = "background";
graphMode = "none";
};
fieldConfig = {
defaults = {
mappings = [
{ type = "value"; options."1" = { text = "PASSED"; color = "green"; index = 0; }; }
{ type = "value"; options."0" = { text = "FAILED"; color = "red"; index = 1; }; }
];
thresholds = {
mode = "absolute";
steps = [
{ color = "red"; value = null; }
{ color = "green"; value = 1; }
];
};
};
};
})
# Temperature Gauge
(mkPanel {
id = 3;
title = "Disk Temperatures";
type = "gauge";
gridPos = { x = 12; y = 1; w = 6; h = 8; };
targets = [{
expr = ''smart_temperature_celsius{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "auto";
showThresholdLabels = false;
showThresholdMarkers = true;
};
fieldConfig = {
defaults = {
unit = "celsius";
min = 0;
max = 70;
thresholds = {
mode = "absolute";
steps = [
{ color = "green"; value = null; }
{ color = "yellow"; value = 45; }
{ color = "red"; value = 55; }
];
};
};
};
})
# RAID Status - Stat panel
(mkPanel {
id = 4;
title = "RAID Array Status";
type = "stat";
gridPos = { x = 18; y = 1; w = 6; h = 8; };
targets = [{
expr = ''mdadm_array_state{instance=~"$host"}'';
legendFormat = "{{array}}";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "auto";
colorMode = "background";
graphMode = "none";
};
fieldConfig = {
defaults = {
mappings = [
{ type = "value"; options."1" = { text = "Healthy"; color = "green"; index = 0; }; }
{ type = "value"; options."0" = { text = "Degraded"; color = "red"; index = 1; }; }
];
thresholds = {
mode = "absolute";
steps = [
{ color = "red"; value = null; }
{ color = "green"; value = 1; }
];
};
};
};
})
# Sector Health Table - Promoted to overview for visibility
(mkPanel {
id = 13;
title = "Sector Health";
type = "table";
gridPos = { x = 0; y = 6; w = 12; h = 4; };
targets = [
{
expr = ''smart_reallocated_sector_ct{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
format = "table";
instant = true;
}
{
expr = ''smart_current_pending_sector{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "B";
format = "table";
instant = true;
}
{
expr = ''smart_offline_uncorrectable{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "C";
format = "table";
instant = true;
}
];
options = {
showHeader = true;
cellHeight = "sm";
};
transformations = [
{ id = "merge"; options = { }; }
{
id = "organize";
options = {
excludeByName = { Time = true; __name__ = true; instance = true; job = true; serial = true; };
renameByName = {
device = "Device";
"Value #A" = "Reallocated Sectors";
"Value #B" = "Pending Sectors";
"Value #C" = "Offline Uncorrectable";
};
};
}
];
fieldConfig = {
defaults = {
thresholds = {
mode = "absolute";
steps = [
{ color = "green"; value = null; }
{ color = "yellow"; value = 1; }
{ color = "red"; value = 10; }
];
};
custom = { displayMode = "color-background-solid"; };
};
};
})
# === DETAILED METRICS ROW ===
{
id = 10;
type = "row";
title = "Detailed Metrics";
collapsed = false;
gridPos = { x = 0; y = 10; w = 24; h = 1; };
panels = [ ];
}
# Temperature Time Series
(mkPanel {
id = 11;
title = "Temperature Over Time";
type = "timeseries";
gridPos = { x = 0; y = 11; w = 12; h = 8; };
targets = [{
expr = ''smart_temperature_celsius{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
legend = { displayMode = "list"; placement = "bottom"; showLegend = true; };
tooltip = { mode = "multi"; sort = "desc"; };
};
fieldConfig = {
defaults = {
unit = "celsius";
custom = {
drawStyle = "line";
lineInterpolation = "smooth";
fillOpacity = 10;
pointSize = 5;
showPoints = "auto";
};
thresholds = {
mode = "absolute";
steps = [
{ color = "green"; value = null; }
{ color = "yellow"; value = 45; }
{ color = "red"; value = 55; }
];
};
};
};
})
# Power On Hours
(mkPanel {
id = 12;
title = "Power On Hours";
type = "stat";
gridPos = { x = 12; y = 11; w = 12; h = 8; };
targets = [{
expr = ''smart_power_on_hours{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "value_and_name";
colorMode = "none";
graphMode = "none";
};
fieldConfig = {
defaults = {
unit = "h";
};
};
})
# === RAID DETAILS ROW ===
{
id = 20;
type = "row";
title = "RAID Details";
collapsed = false;
gridPos = { x = 0; y = 19; w = 24; h = 1; };
panels = [ ];
}
# RAID Devices
(mkPanel {
id = 21;
title = "RAID Array Devices";
type = "stat";
gridPos = { x = 0; y = 20; w = 12; h = 4; };
targets = [
{
expr = ''mdadm_array_devices_active{instance=~"$host"}'';
legendFormat = "{{array}} Active";
refId = "A";
}
{
expr = ''mdadm_array_devices_total{instance=~"$host"}'';
legendFormat = "{{array}} Total";
refId = "B";
}
];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "value_and_name";
colorMode = "value";
graphMode = "none";
};
fieldConfig = {
defaults = {
unit = "short";
};
};
})
# UDMA CRC Errors
(mkPanel {
id = 22;
title = "UDMA CRC Errors";
type = "timeseries";
gridPos = { x = 12; y = 20; w = 12; h = 4; };
targets = [{
expr = ''smart_udma_crc_error_count{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
legend = { displayMode = "list"; placement = "bottom"; showLegend = true; };
tooltip = { mode = "multi"; sort = "desc"; };
};
fieldConfig = {
defaults = {
unit = "short";
custom = {
drawStyle = "line";
lineInterpolation = "stepAfter";
fillOpacity = 0;
pointSize = 5;
showPoints = "auto";
};
};
};
})
# Last Update Timestamp
(mkPanel {
id = 30;
title = "Last Metrics Update";
type = "stat";
gridPos = { x = 0; y = 24; w = 6; h = 5; };
targets = [{
expr = ''time() - disk_metrics_last_update{instance=~"$host"}'';
legendFormat = "Age";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "value";
colorMode = "value";
graphMode = "none";
};
fieldConfig = {
defaults = {
unit = "s";
thresholds = {
mode = "absolute";
steps = [
{ color = "green"; value = null; }
{ color = "yellow"; value = 1800; }
{ color = "red"; value = 3600; }
];
};
};
};
})
# Device Activity Status
(mkPanel {
id = 31;
title = "Device Activity";
type = "stat";
gridPos = { x = 6; y = 24; w = 18; h = 5; };
targets = [{
expr = ''smart_device_active{instance=~"$host"}'';
legendFormat = "{{device}}";
refId = "A";
}];
options = {
reduceOptions = { values = false; calcs = [ "lastNotNull" ]; fields = ""; };
orientation = "horizontal";
textMode = "auto";
colorMode = "background";
graphMode = "none";
};
fieldConfig = {
defaults = {
mappings = [
{ type = "value"; options."1" = { text = "Active"; color = "green"; index = 0; }; }
{ type = "value"; options."0" = { text = "Standby"; color = "blue"; index = 1; }; }
];
thresholds = {
mode = "absolute";
steps = [
{ color = "blue"; value = null; }
{ color = "green"; value = 1; }
];
};
};
};
})
];
};
in
pkgs.writeText "smart-dashboard.json" (builtins.toJSON dashboard)