feat: add disks to monitoring
This commit is contained in:
463
hosts/web-arm/modules/grafana/dashboards/smart-dashboard.nix
Normal file
463
hosts/web-arm/modules/grafana/dashboards/smart-dashboard.nix
Normal file
@@ -0,0 +1,463 @@
|
||||
{ lib, pkgs }:
|
||||
let
|
||||
# UID of the Grafana datasource that mkPanel and the "host" template variable
# attach to their queries (both declare it with type "prometheus").
# NOTE(review): the "vm-" prefix presumably refers to a VictoriaMetrics backend
# provisioned elsewhere — confirm the UID matches the datasource provisioning.
datasourceUid = "vm-datasource-uid";
|
||||
|
||||
# mkPanel: expand a compact panel description into a full panel attrset.
#
# Required attributes: id, title, type, gridPos, targets.
# Optional attributes:
#   options     - panel options, defaults to { }
#   fieldConfig - may carry `defaults` and/or `overrides`; whichever is
#                 missing is filled in with { } / [ ] respectively
# Every panel gets the shared datasource (datasourceUid, type "prometheus").
# Any other attribute passed in (e.g. `transformations`) is copied onto the
# result unchanged and, being merged last, wins over the generated keys.
mkPanel = { id, title, type, gridPos, targets, options ? { }, fieldConfig ? { }, ... }@args:
  let
    # Keys that are consumed explicitly below and must not be passed through.
    handledKeys = [ "id" "title" "type" "gridPos" "targets" "options" "fieldConfig" ];

    # Whatever the caller supplied beyond the handled keys.
    passthrough = builtins.removeAttrs args handledKeys;

    base = {
      inherit id title type gridPos targets options;
      datasource = { type = "prometheus"; uid = datasourceUid; };
      fieldConfig = {
        defaults = fieldConfig.defaults or { };
        overrides = fieldConfig.overrides or [ ];
      };
    };
  in
  base // passthrough;
|
||||
|
||||
# Dashboard definition
|
||||
dashboard = {
|
||||
# Identity and descriptive metadata
uid = "smart-disk-health";
version = 1;
schemaVersion = 39;
title = "S.M.A.R.T Disk Health";
description = "S.M.A.R.T metrics and RAID array status";
tags = [ "disk" "smart" "storage" "nas" ];

# Viewing behaviour: browser-local time, read-only, reload every 5 minutes
timezone = "browser";
editable = false;
refresh = "5m";

# Variables
# A single "host" variable populated from the `instance` label of
# smart_health_passed; the panel queries filter on it via instance=~"$host".
templating = {
  list = [
    {
      type = "query";
      name = "host";
      label = "Host";
      datasource = { type = "prometheus"; uid = datasourceUid; };
      query = "label_values(smart_health_passed, instance)";
      regex = "";
      refresh = 1;
      sort = 1;

      # Single-select with an "All" entry, which is also the initial value.
      multi = false;
      includeAll = true;
      current = {
        selected = true;
        text = "All";
        value = "$__all";
      };
    }
  ];
};
|
||||
|
||||
# Panels
|
||||
panels = [
|
||||
# === OVERVIEW ROW ===
{
  type = "row";
  id = 1;
  title = "Overview";
  collapsed = false;
  panels = [ ];
  gridPos = { h = 1; w = 24; x = 0; y = 0; };
}

# Alert Status - lists alerts whose name matches "Disk" and that are
# currently firing or in error; all other states are filtered out.
{
  type = "alertlist";
  id = 5;
  title = "Alert Status";
  gridPos = { h = 5; w = 6; x = 0; y = 1; };
  options = {
    alertName = "Disk";
    alertInstanceLabelFilter = "";
    dashboardAlerts = false;
    viewMode = "list";
    groupMode = "default";
    groupBy = [ ];
    sortOrder = 1;
    maxItems = 20;
    stateFilter = {
      firing = true;
      "error" = true;
      pending = false;
      noData = false;
      normal = false;
    };
  };
}

# Health Status - one coloured tile per device; 1 maps to PASSED, 0 to FAILED
(mkPanel {
  type = "stat";
  id = 2;
  title = "Disk Health Status";
  gridPos = { h = 5; w = 6; x = 6; y = 1; };
  targets = [
    {
      refId = "A";
      expr = ''smart_health_passed{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    colorMode = "background";
    graphMode = "none";
    textMode = "auto";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults = {
    mappings = [
      { type = "value"; options."1" = { index = 0; text = "PASSED"; color = "green"; }; }
      { type = "value"; options."0" = { index = 1; text = "FAILED"; color = "red"; }; }
    ];
    # Base colour is red; values of 1 and above turn green.
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "red"; }
        { value = 1; color = "green"; }
      ];
    };
  };
})

# Temperature Gauge - 0..70 scale, yellow from 45, red from 55
(mkPanel {
  type = "gauge";
  id = 3;
  title = "Disk Temperatures";
  gridPos = { h = 8; w = 6; x = 12; y = 1; };
  targets = [
    {
      refId = "A";
      expr = ''smart_temperature_celsius{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    orientation = "auto";
    showThresholdMarkers = true;
    showThresholdLabels = false;
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults = {
    unit = "celsius";
    min = 0;
    max = 70;
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "green"; }
        { value = 45; color = "yellow"; }
        { value = 55; color = "red"; }
      ];
    };
  };
})

# RAID Status - one coloured tile per array; 1 maps to Healthy, 0 to Degraded
(mkPanel {
  type = "stat";
  id = 4;
  title = "RAID Array Status";
  gridPos = { h = 8; w = 6; x = 18; y = 1; };
  targets = [
    {
      refId = "A";
      expr = ''mdadm_array_state{instance=~"$host"}'';
      legendFormat = "{{array}}";
    }
  ];
  options = {
    colorMode = "background";
    graphMode = "none";
    textMode = "auto";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults = {
    mappings = [
      { type = "value"; options."1" = { index = 0; text = "Healthy"; color = "green"; }; }
      { type = "value"; options."0" = { index = 1; text = "Degraded"; color = "red"; }; }
    ];
    # Base colour is red; values of 1 and above turn green.
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "red"; }
        { value = 1; color = "green"; }
      ];
    };
  };
})
|
||||
|
||||
# Sector Health Table - Promoted to overview for visibility
#
# Three instant, table-formatted queries (refIds A, B, C) are combined by the
# "merge" transformation; "organize" then hides bookkeeping columns and renames
# the value columns to readable headers.
(mkPanel {
  id = 13;
  title = "Sector Health";
  type = "table";
  gridPos = { x = 0; y = 6; w = 12; h = 4; };
  targets = [
    {
      expr = ''smart_reallocated_sector_ct{instance=~"$host"}'';
      legendFormat = "{{device}}";
      refId = "A";
      format = "table";
      instant = true;
    }
    {
      expr = ''smart_current_pending_sector{instance=~"$host"}'';
      legendFormat = "{{device}}";
      refId = "B";
      format = "table";
      instant = true;
    }
    {
      expr = ''smart_offline_uncorrectable{instance=~"$host"}'';
      legendFormat = "{{device}}";
      refId = "C";
      format = "table";
      instant = true;
    }
  ];
  options = {
    showHeader = true;
    cellHeight = "sm";
  };
  # Passed through mkPanel untouched (it is not one of mkPanel's named arguments).
  transformations = [
    { id = "merge"; options = { }; }
    {
      id = "organize";
      options = {
        excludeByName = { Time = true; __name__ = true; instance = true; job = true; serial = true; };
        renameByName = {
          device = "Device";
          "Value #A" = "Reallocated Sectors";
          "Value #B" = "Pending Sectors";
          "Value #C" = "Offline Uncorrectable";
        };
      };
    }
  ];
  fieldConfig = {
    defaults = {
      # Green at 0, yellow from 1, red from 10.
      thresholds = {
        mode = "absolute";
        steps = [
          { color = "green"; value = null; }
          { color = "yellow"; value = 1; }
          { color = "red"; value = 10; }
        ];
      };
      # Solid threshold-coloured cell background, written in the `cellOptions`
      # form instead of the legacy `displayMode = "color-background-solid"`.
      # NOTE(review): this dashboard declares schemaVersion 39, so Grafana's
      # dashboard-level displayMode -> cellOptions migration is presumably not
      # applied on load — confirm against the deployed Grafana version.
      custom = {
        cellOptions = { type = "color-background"; mode = "basic"; };
      };
    };
  };
})
|
||||
|
||||
# === DETAILED METRICS ROW ===
{
  type = "row";
  id = 10;
  title = "Detailed Metrics";
  collapsed = false;
  panels = [ ];
  gridPos = { h = 1; w = 24; x = 0; y = 10; };
}

# Temperature Time Series - same metric and thresholds as the overview gauge,
# drawn as a smoothed line per device
(mkPanel {
  type = "timeseries";
  id = 11;
  title = "Temperature Over Time";
  gridPos = { h = 8; w = 12; x = 0; y = 11; };
  targets = [
    {
      refId = "A";
      expr = ''smart_temperature_celsius{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    tooltip = { mode = "multi"; sort = "desc"; };
    legend = {
      showLegend = true;
      displayMode = "list";
      placement = "bottom";
    };
  };
  fieldConfig.defaults = {
    unit = "celsius";
    custom = {
      drawStyle = "line";
      lineInterpolation = "smooth";
      fillOpacity = 10;
      showPoints = "auto";
      pointSize = 5;
    };
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "green"; }
        { value = 45; color = "yellow"; }
        { value = 55; color = "red"; }
      ];
    };
  };
})

# Power On Hours - latest value per device, shown with its name and no colouring
(mkPanel {
  type = "stat";
  id = 12;
  title = "Power On Hours";
  gridPos = { h = 8; w = 12; x = 12; y = 11; };
  targets = [
    {
      refId = "A";
      expr = ''smart_power_on_hours{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    colorMode = "none";
    graphMode = "none";
    textMode = "value_and_name";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults.unit = "h";
})
|
||||
|
||||
# === RAID DETAILS ROW ===
{
  type = "row";
  id = 20;
  title = "RAID Details";
  collapsed = false;
  panels = [ ];
  gridPos = { h = 1; w = 24; x = 0; y = 19; };
}

# RAID Devices - active and total device counts side by side for each array
(mkPanel {
  type = "stat";
  id = 21;
  title = "RAID Array Devices";
  gridPos = { h = 4; w = 12; x = 0; y = 20; };
  targets = [
    {
      refId = "A";
      expr = ''mdadm_array_devices_active{instance=~"$host"}'';
      legendFormat = "{{array}} Active";
    }
    {
      refId = "B";
      expr = ''mdadm_array_devices_total{instance=~"$host"}'';
      legendFormat = "{{array}} Total";
    }
  ];
  options = {
    colorMode = "value";
    graphMode = "none";
    textMode = "value_and_name";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults.unit = "short";
})

# UDMA CRC Errors - drawn as an unfilled step line per device
(mkPanel {
  type = "timeseries";
  id = 22;
  title = "UDMA CRC Errors";
  gridPos = { h = 4; w = 12; x = 12; y = 20; };
  targets = [
    {
      refId = "A";
      expr = ''smart_udma_crc_error_count{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    tooltip = { mode = "multi"; sort = "desc"; };
    legend = {
      showLegend = true;
      displayMode = "list";
      placement = "bottom";
    };
  };
  fieldConfig.defaults = {
    unit = "short";
    custom = {
      drawStyle = "line";
      lineInterpolation = "stepAfter";
      fillOpacity = 0;
      showPoints = "auto";
      pointSize = 5;
    };
  };
})
|
||||
|
||||
# Last Update Timestamp - seconds elapsed since disk_metrics_last_update;
# turns yellow at 1800 s and red at 3600 s
(mkPanel {
  type = "stat";
  id = 30;
  title = "Last Metrics Update";
  gridPos = { h = 5; w = 6; x = 0; y = 24; };
  targets = [
    {
      refId = "A";
      expr = ''time() - disk_metrics_last_update{instance=~"$host"}'';
      legendFormat = "Age";
    }
  ];
  options = {
    colorMode = "value";
    graphMode = "none";
    textMode = "value";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults = {
    unit = "s";
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "green"; }
        { value = 1800; color = "yellow"; }
        { value = 3600; color = "red"; }
      ];
    };
  };
})

# Device Activity Status - one tile per device; 1 maps to Active, 0 to Standby
(mkPanel {
  type = "stat";
  id = 31;
  title = "Device Activity";
  gridPos = { h = 5; w = 18; x = 6; y = 24; };
  targets = [
    {
      refId = "A";
      expr = ''smart_device_active{instance=~"$host"}'';
      legendFormat = "{{device}}";
    }
  ];
  options = {
    colorMode = "background";
    graphMode = "none";
    textMode = "auto";
    orientation = "horizontal";
    reduceOptions = {
      calcs = [ "lastNotNull" ];
      fields = "";
      values = false;
    };
  };
  fieldConfig.defaults = {
    mappings = [
      { type = "value"; options."1" = { index = 0; text = "Active"; color = "green"; }; }
      { type = "value"; options."0" = { index = 1; text = "Standby"; color = "blue"; }; }
    ];
    # Base colour is blue (standby is not an error); 1 and above turn green.
    thresholds = {
      mode = "absolute";
      steps = [
        { value = null; color = "blue"; }
        { value = 1; color = "green"; }
      ];
    };
  };
})
|
||||
];
|
||||
};
|
||||
in
|
||||
# Result of this file: the `dashboard` attrset serialised with builtins.toJSON
# and handed to pkgs.writeText under the name "smart-dashboard.json".
pkgs.writeText "smart-dashboard.json" (builtins.toJSON dashboard)
|
||||
Reference in New Issue
Block a user