Files
nixos/hosts/web-arm/modules/grafana/alert-cleanup.nix

100 lines
2.8 KiB
Nix

{ config, lib, pkgs, ... }:
let
# Shorthand for the Grafana service options read throughout this module.
cfg = config.services.grafana;
# Given a list of alert rule groups, return the de-duplicated list of the
# `uid` attribute of every rule in every group.
extractRuleUids = groups:
  lib.pipe groups [
    (map (group: map (rule: rule.uid) group.rules))
    lib.flatten
    lib.unique
  ];
# Rule groups declared under provision.alerting.rules.settings; an empty list
# is used when that attribute path cannot be resolved.
allGroups = cfg.provision.alerting.rules.settings.groups or [];
# UIDs of every alert rule found in the configured groups.
expectedUids = extractRuleUids allGroups;
# Store file holding the expected UIDs as JSON, in the shape
# { "expected_uids": [ ... ] }.
cleanupManifest = pkgs.writeText "grafana-alert-cleanup-manifest.json" (
  builtins.toJSON { expected_uids = expectedUids; }
);
# Cleanup script: compares the rule UIDs stored in Grafana's PostgreSQL
# database (rows with updated_by = 'service') against the UIDs listed in the
# manifest, and deletes the rows whose UID is no longer expected.
# Deletion is best-effort: a failed DELETE is reported on stderr and the
# script carries on with the remaining UIDs.
cleanupScript = pkgs.writeShellScriptBin "grafana-alert-cleanup" ''
  # writeShellScriptBin already emits the shebang, so none is repeated here.
  set -euo pipefail

  MANIFEST="${cleanupManifest}"
  DB_NAME="grafana"

  echo "=== Grafana Alert Rule Cleanup (PostgreSQL) ==="
  echo "Loading expected UIDs from manifest..."
  EXPECTED_UIDS=$(${pkgs.jq}/bin/jq -r '.expected_uids[]' "$MANIFEST")
  # Let jq report the length: piping an empty list through wc -l prints 1.
  EXPECTED_COUNT=$(${pkgs.jq}/bin/jq -r '.expected_uids | length' "$MANIFEST")
  echo "Expected UIDs count: $EXPECTED_COUNT"

  echo "Querying database for current provisioned alert rules..."
  # Query database for all provisioned rule UIDs. A failing query is
  # deliberately treated like an empty result so the script exits cleanly.
  CURRENT_UIDS=$(${pkgs.postgresql}/bin/psql -h /run/postgresql -d "$DB_NAME" -t -A -c \
    "SELECT uid FROM alert_rule WHERE updated_by = 'service';" || echo "")
  if [[ -z "$CURRENT_UIDS" ]]; then
    echo "No provisioned rules found in database."
    exit 0
  fi
  CURRENT_COUNT=$(wc -l <<< "$CURRENT_UIDS")
  echo "Current provisioned UIDs count: $CURRENT_COUNT"

  # Find orphaned UIDs (in database but not in expected list)
  ORPHANED_UIDS=()
  while IFS= read -r uid; do
    [[ -n "$uid" ]] || continue
    # -F matches the UID literally instead of as a regex. Feeding grep from a
    # here-string (not a pipe) avoids a spurious SIGPIPE failure under
    # pipefail when grep -q exits on the first match.
    if ! grep -qxF -- "$uid" <<< "$EXPECTED_UIDS"; then
      ORPHANED_UIDS+=("$uid")
    fi
  done <<< "$CURRENT_UIDS"
  ORPHAN_COUNT=''${#ORPHANED_UIDS[@]}

  if [[ $ORPHAN_COUNT -eq 0 ]]; then
    echo "No orphaned alert rules found. All rules match configuration."
    exit 0
  fi
  echo "Found $ORPHAN_COUNT orphaned rule(s)"

  # Delete orphaned rules
  for uid in "''${ORPHANED_UIDS[@]}"; do
    echo "Deleting orphaned rule: $uid"
    # psql is tested directly by the if: with set -e, a separate check of the
    # exit status on the next line would never run after a failure.
    # NOTE(review): the UID is interpolated into the SQL text; this assumes
    # UIDs never contain a single quote - confirm against Grafana's UID rules.
    if ${pkgs.postgresql}/bin/psql -h /run/postgresql -d "$DB_NAME" -c \
      "DELETE FROM alert_rule WHERE uid = '$uid' AND updated_by = 'service';" >/dev/null 2>&1; then
      echo " Deleted $uid"
    else
      echo " Failed to delete $uid" >&2
    fi
  done
  echo "=== Cleanup Complete ==="
'';
in
{
  # Only active when Grafana is enabled: registers a wrapper script as the
  # grafana unit's ExecStartPre, which announces and then runs the cleanup.
  config = lib.mkIf cfg.enable {
    systemd.services.grafana.serviceConfig = {
      ExecStartPre = pkgs.writeShellScript "grafana-alert-cleanup-pre" ''
        echo "Running Grafana alert rule cleanup..."
        ${cleanupScript}/bin/grafana-alert-cleanup
      '';
    };
  };
}