feat: add cleanup for grafana alerting rules

This commit is contained in:
2025-11-01 11:09:05 +01:00
parent 819bfc1531
commit db25b2bfbb
3 changed files with 102 additions and 2 deletions

View File

@@ -0,0 +1,99 @@
# NixOS module that prunes stale provisioned Grafana alert rules from the
# database before the Grafana service starts.
{ lib, pkgs, config, ... }:
let
# Shorthand for the Grafana service options read below
# (`enable` and `provision.alerting.rules.settings`).
cfg = config.services.grafana;
# Gather the `uid` of every rule in the given alert rule groups,
# dropping duplicates. Each group must carry a `rules` list and each
# rule a `uid` attribute.
extractRuleUids = groups:
  let
    # UIDs of the rules belonging to a single group
    uidsOf = group: map (rule: rule.uid) group.rules;
  in
  lib.unique (lib.concatMap uidsOf groups);
# Alert rule groups declared via Grafana's provisioning settings; falls
# back to an empty list when the path is absent (e.g. settings unset).
allGroups =
  lib.attrByPath
    [ "provision" "alerting" "rules" "settings" "groups" ]
    [ ]
    cfg;
# UIDs of all rules the current configuration defines
expectedUids = extractRuleUids allGroups;
# Store file holding the expected rule UIDs as JSON:
#   { "expected_uids": [ ... ] }
# It is read by the cleanup script at service start.
cleanupManifest =
  pkgs.writeText "grafana-alert-cleanup-manifest.json"
    (builtins.toJSON { expected_uids = expectedUids; });
# Cleanup script using PostgreSQL.
#
# Compares the UIDs in the generated manifest with the rows of the
# `alert_rule` table marked `updated_by = 'service'` and deletes every row
# whose UID is no longer part of the configuration.
#
# Behaviour notes:
#  * Best effort: if the database cannot be queried, a warning is printed
#    and the script exits 0.
#  * A failed DELETE is reported on stderr and the loop continues.
#  * writeShellScriptBin already emits the shebang, so none is written here.
#  * `''${` inside the script is the Nix escape for a literal bash `${`.
cleanupScript = pkgs.writeShellScriptBin "grafana-alert-cleanup" ''
  set -euo pipefail

  MANIFEST="${cleanupManifest}"
  DB_NAME="grafana"
  PSQL="${pkgs.postgresql}/bin/psql"

  echo "=== Grafana Alert Rule Cleanup (PostgreSQL) ==="
  echo "Loading expected UIDs from manifest..."
  EXPECTED_UIDS=$(${pkgs.jq}/bin/jq -r '.expected_uids[]' "$MANIFEST")
  # Count non-empty lines so that an empty list is reported as 0
  # (wc -l would report 1 for an empty string).
  EXPECTED_COUNT=$(grep -c . <<< "$EXPECTED_UIDS" || true)
  echo "Expected UIDs count: $EXPECTED_COUNT"

  echo "Querying database for current provisioned alert rules..."
  # Query database for all provisioned rule UIDs. A failing query is not
  # treated as fatal, but it is reported instead of being mistaken for
  # an empty table.
  if ! CURRENT_UIDS=$("$PSQL" -h /run/postgresql -d "$DB_NAME" -t -A -c \
    "SELECT uid FROM alert_rule WHERE updated_by = 'service';"); then
    echo "Warning: could not query alert_rule; skipping cleanup." >&2
    exit 0
  fi

  if [[ -z "$CURRENT_UIDS" ]]; then
    echo "No provisioned rules found in database."
    exit 0
  fi

  CURRENT_COUNT=$(grep -c . <<< "$CURRENT_UIDS" || true)
  echo "Current provisioned UIDs count: $CURRENT_COUNT"

  # Find orphaned UIDs (in database but not in expected list)
  ORPHANED_UIDS=()
  while IFS= read -r uid; do
    [[ -n "$uid" ]] || continue
    # -F matches the UID literally instead of as a regex. Feeding grep via
    # a here-string avoids a pipeline, so pipefail cannot turn an early
    # grep exit into a false "not found".
    if ! grep -Fxq -- "$uid" <<< "$EXPECTED_UIDS"; then
      ORPHANED_UIDS+=("$uid")
    fi
  done <<< "$CURRENT_UIDS"

  ORPHAN_COUNT=''${#ORPHANED_UIDS[@]}
  if [[ $ORPHAN_COUNT -eq 0 ]]; then
    echo "No orphaned alert rules found. All rules match configuration."
    exit 0
  fi

  echo "Found $ORPHAN_COUNT orphaned rule(s)"

  # Delete orphaned rules
  for uid in "''${ORPHANED_UIDS[@]}"; do
    echo "Deleting orphaned rule: $uid"
    # The UID is handed to psql as a variable and quoted by psql itself
    # via :'uid'. Variable interpolation is not performed for -c, so the
    # statement is sent on stdin. ON_ERROR_STOP makes an SQL error show
    # up in the exit status. Testing the command directly in the if keeps
    # set -e from aborting the script on a failed delete.
    if "$PSQL" -h /run/postgresql -d "$DB_NAME" -q \
      -v ON_ERROR_STOP=1 -v uid="$uid" >/dev/null 2>&1 \
      <<< "DELETE FROM alert_rule WHERE uid = :'uid' AND updated_by = 'service';"; then
      echo " Deleted $uid"
    else
      echo " Failed to delete $uid" >&2
    fi
  done

  echo "=== Cleanup Complete ==="
'';
in
{
  # Everything below applies only while the Grafana service is enabled.
  config = lib.mkIf cfg.enable {
    systemd.services.grafana = {
      serviceConfig = {
        # Pre-start hook for the Grafana unit: announces the cleanup and
        # then invokes the cleanup script.
        ExecStartPre = pkgs.writeShellScript "grafana-alert-cleanup-pre" ''
          echo "Running Grafana alert rule cleanup..."
          ${cleanupScript}/bin/grafana-alert-cleanup
        '';
      };
    };
  };
}

View File

@@ -11,8 +11,7 @@ let
httpsDomains = lib.map (d: "https://${d}") filteredDomains; httpsDomains = lib.map (d: "https://${d}") filteredDomains;
websiteAlertRules = lib.map (target: websiteAlertRules = lib.map (target:
let let
domain = lib.replaceStrings ["://" "." "-" "/" ] ["-" "-" "_" "_"] target + "-down-alert"; uid = "website-" + (builtins.replaceStrings ["https://" "http://" "." "/"] ["" "" "-" "-"] target);
uid = builtins.hashString "sha1" domain;
in { in {
uid = uid; uid = uid;
title = "Website " + target + " Down"; title = "Website " + target + " Down";

View File

@@ -34,6 +34,8 @@ in
./datasources/victoriametrics.nix ./datasources/victoriametrics.nix
./datasources/loki.nix ./datasources/loki.nix
./alert-cleanup.nix
]; ];
systemd.services.grafana.script = lib.mkBefore '' systemd.services.grafana.script = lib.mkBefore ''