# Nix · 91 lines · 3.1 KiB
{ lib, pkgs, config, ... }:
|
|
let
|
|
# Services to watch. Every entry in this list becomes one alert rule.
#   name     - display name used in the rule title, annotations and UID
#   service  - systemd unit name to match (may be a regex pattern)
#   instance - node exporter to query, written as hostname:9100
monitoredServices =
  let
    # Positional shorthand for a single list entry.
    unit = name: service: instance: { inherit name service instance; };
  in
  [
    (unit "AI-Mailer" "ai-mailer.service" "fw:9100")
    (unit "Postfix" "postfix.service" "mail:9100")
    (unit "Dovecot" "dovecot.service" "mail:9100")
    (unit "OpenLDAP" "openldap.service" "mail:9100")
    (unit "Gitea" "container@git.service" "fw:9100")
    (unit "Gitea Runner" "microvm@git-runner-1.service" "fw:9100")
    (unit "WireGuard" "wireguard-wg_cloonar.service" "mail:9100")
    (unit "MySQL" "mysql.service" "amzebs-01:9100")
    (unit "Nginx" "nginx.service" "amzebs-01:9100")
    (unit "PHP-FPM" "phpfpm-.*\\.service" "amzebs-01:9100")
  ];
|
|
|
|
# Host portion of an instance string: everything before the first ":"
# (e.g. "fw:9100" -> "fw"). A string without ":" is returned unchanged.
getHost = instance:
  let
    fields = lib.splitString ":" instance;
  in
  builtins.head fields;
|
|
|
|
# Derive an alert rule UID from a service's display name.
#
# The name is lower-cased, every character outside [a-z0-9_-] is replaced by
# "-", and "-down-uid" is appended (e.g. "Gitea Runner" ->
# "gitea-runner-down-uid"). Names made only of ASCII letters, digits, spaces,
# "@", ".", "-" and "_" yield exactly the UID they did before; any other
# character (such as "/" or "(") is now sanitised as well instead of being
# passed through into the UID.
#
# NOTE(review): the UID depends on the name alone, so two entries sharing a
# name on different hosts would collide. Grafana is also understood to limit
# UIDs to 40 characters of [A-Za-z0-9_-] - confirm against the deployed version.
mkUid = name:
  let
    lowered = lib.toLower name;
    # builtins.split returns the text between matches as strings and each
    # match as a list, so every non-string element marks one replaced character.
    pieces = builtins.split "[^a-z0-9_-]" lowered;
    slug = lib.concatMapStrings (piece: if builtins.isString piece then piece else "-") pieces;
  in
  "${slug}-down-uid";
|
|
|
|
# Decide whether a service pattern has to be matched as a regular expression.
#
# Returns true when the string contains a backslash or any regex
# metacharacter other than ".". A bare "." cannot count as a marker because
# every unit name contains one (".service"). The original check only looked
# for ".*" and backslash, so patterns such as "foo-(a|b).service" or
# "phpfpm-.+" were treated as literal names.
#
# NOTE(review): the added markers are assumed never to occur in a literal
# systemd unit name (see systemd.unit(5) for the allowed character set).
# Backslash can occur in escaped unit names and is still treated as a regex
# marker here, exactly as before.
isRegex = svc:
  let
    regexMarkers = [ "\\" "*" "+" "?" "|" "(" ")" "[" "]" "{" "}" "^" "$" ];
  in
  lib.any (marker: lib.hasInfix marker svc) regexMarkers;
|
|
|
|
# PromQL expression for one monitored service entry.
#
# Selects node_systemd_unit_state in state "active" for the entry's unit and
# instance. The unit name is compared with "=~" when isRegex reports a
# pattern and with "=" otherwise. "OR on() vector(0)" is appended so the
# expression still produces a 0 sample when the selector matches nothing.
mkExpr = svc:
  let
    matchOp = if isRegex svc.service then "=~" else "=";
    selector = builtins.concatStringsSep ", " [
      "state=\"active\""
      "name${matchOp}\"${svc.service}\""
      "instance=\"${svc.instance}\""
    ];
  in
  "node_systemd_unit_state{${selector}} OR on() vector(0)";
|
|
|
|
# Build one alert rule definition from a monitoredServices entry
# ({ name, service, instance }).
#
# The rule evaluates three chained steps and uses the last one as condition:
#   A - runs the expression from mkExpr against datasource "vm-datasource-uid"
#   B - reduces A with the "last" reducer
#   C - math expression "$B < 1"
# Missing data and execution errors are both mapped to "Alerting".
mkServiceAlert = svc:
  let
    host = getHost svc.instance;

    # Step A: the datasource query.
    queryStep = {
      refId = "A";
      datasourceUid = "vm-datasource-uid";
      # Window of 300..0 relative to evaluation time (presumably seconds - confirm).
      relativeTimeRange.from = 300;
      relativeTimeRange.to = 0;
      model = {
        refId = "A";
        expr = mkExpr svc;
        editorMode = "code";
        range = true;
        hide = false;
        legendFormat = "__auto";
        intervalMs = 1000;
        maxDataPoints = 43200;
      };
    };

    # Steps B and C are both evaluated by the "__expr__" datasource.
    exprStep = refId: model: {
      inherit refId model;
      datasourceUid = "__expr__";
    };

    reduceStep = exprStep "B" {
      type = "reduce";
      expression = "A";
      reducer = "last";
    };

    thresholdStep = exprStep "C" {
      type = "math";
      expression = "$B < 1";
    };
  in
  {
    uid = mkUid svc.name;
    title = "${svc.name} Service Down on ${host}";

    data = [ queryStep reduceStep thresholdStep ];
    condition = "C";
    for = "5m";

    noDataState = "Alerting";
    execErrState = "Alerting";

    labels = {
      severity = "critical";
      inherit host;
    };

    annotations = {
      summary = "${svc.name} Service Down";
      description = "${svc.name} service is down on ${host}";
    };
  };
|
|
in
{
  # One alert rule per monitoredServices entry, in list order.
  grafanaAlertRuleDefinitions = builtins.map mkServiceAlert monitoredServices;
}
|