feat: add Grafana online status monitoring module with Pushover notifications
This commit is contained in:
183
hosts/fw/modules/grafana-monitor.nix
Normal file
183
hosts/fw/modules/grafana-monitor.nix
Normal file
@@ -0,0 +1,183 @@
|
||||
{ config, pkgs, lib, ... }:
|
||||
|
||||
let
|
||||
# Dedicated account and group the monitoring service runs as.
grafanaMonitorUser = "grafana-monitor";
grafanaMonitorGroup = "grafana-monitor";
# Directory holding the persisted check state; it doubles as the account's home.
stateDir = "/var/lib/" + grafanaMonitorUser;
|
||||
|
||||
# Health-check script executed by the grafana-online-check systemd service.
#
# Each run probes the Grafana health endpoint once, loads and updates a small
# state file in stateDir, sends a Pushover alert once MAX_FAILURES consecutive
# checks have failed, and sends a recovery notification when Grafana answers
# again after an alert.
#
# Notes for maintainers:
#  * writeShellScriptBin emits the bash shebang itself, so none is written here.
#  * Inside the script, bash parameter expansions are escaped with a leading
#    pair of single quotes; unescaped dollar-brace forms are Nix interpolation.
monitorScript = pkgs.writeShellScriptBin "grafana-online-check" ''
  set -euo pipefail
  # Temp files and the state file must never be readable by other users.
  umask 077

  GRAFANA_URL="https://grafana.cloonar.com/api/health"
  STATE_DIR="${stateDir}"
  STATE_FILE="''${STATE_DIR}/status.env"
  PUSHOVER_API_TOKEN_FILE="/run/secrets/pushover-api-token"
  PUSHOVER_USER_KEY_FILE="/run/secrets/pushover-user-key"
  PUSHOVER_URL="https://api.pushover.net/1/messages.json"
  MAX_FAILURES=5

  # send_pushover TITLE MESSAGE PRIORITY
  # Posts one notification and returns curl's exit status. --fail turns an
  # HTTP error from Pushover into a non-zero status so callers can tell
  # whether the message was accepted. --form-string sends every value
  # literally; plain -F would give special meaning to a leading @ or < and to
  # ";type=" inside the (partly external) message text.
  send_pushover() {
    ${pkgs.curl}/bin/curl -sS --fail --connect-timeout 5 --max-time 15 \
      --form-string "token=''${PUSHOVER_API_TOKEN}" \
      --form-string "user=''${PUSHOVER_USER_KEY}" \
      --form-string "title=$1" \
      --form-string "message=$2" \
      --form-string "priority=$3" \
      "''${PUSHOVER_URL}"
  }

  # Create the directory that actually holds STATE_FILE instead of relying on
  # HOME pointing at it.
  mkdir -p "''${STATE_DIR}"

  # Defaults used when no state file exists yet.
  CONSECUTIVE_FAILURES=0
  ALERT_SENT="false"
  LAST_KNOWN_STATUS="UP"

  if [[ -f "''${STATE_FILE}" ]]; then
    source "''${STATE_FILE}"
  fi

  # Normalise values loaded from disk so a damaged state file cannot break
  # the arithmetic or the comparisons below.
  if ! [[ "''${CONSECUTIVE_FAILURES}" =~ ^[0-9]+$ ]]; then
    CONSECUTIVE_FAILURES=0
  fi
  if [[ "''${ALERT_SENT}" != "true" ]]; then
    ALERT_SENT="false"
  fi

  # Check secrets
  if [[ ! -f "''${PUSHOVER_API_TOKEN_FILE}" || ! -r "''${PUSHOVER_API_TOKEN_FILE}" ]]; then
    echo "Error: Pushover API token file (''${PUSHOVER_API_TOKEN_FILE}) not found or not readable." >&2
    exit 1
  fi
  PUSHOVER_API_TOKEN=$(< "''${PUSHOVER_API_TOKEN_FILE}")

  if [[ ! -f "''${PUSHOVER_USER_KEY_FILE}" || ! -r "''${PUSHOVER_USER_KEY_FILE}" ]]; then
    echo "Error: Pushover user key file (''${PUSHOVER_USER_KEY_FILE}) not found or not readable." >&2
    exit 1
  fi
  PUSHOVER_USER_KEY=$(< "''${PUSHOVER_USER_KEY_FILE}")

  echo "Checking Grafana at ''${GRAFANA_URL} (previous status: ''${LAST_KNOWN_STATUS})..."
  ACTUAL_HTTP_CODE="000" # Default if curl doesn't provide one
  CURL_ERROR_MESSAGE=""
  STATE_TMP=""
  CURL_STDERR_OUTPUT=$(mktemp)

  # Remove temp files on every exit path. The signal traps call exit so the
  # script really terminates (a handler that only cleaned up would let it
  # keep running after the signal), and the EXIT trap then does the cleanup.
  cleanup() {
    rm -f "''${CURL_STDERR_OUTPUT}"
    if [[ -n "''${STATE_TMP}" ]]; then
      rm -f "''${STATE_TMP}"
    fi
  }
  trap cleanup EXIT
  trap 'exit 129' HUP
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # -L: follow redirects
  # -sS: silent mode, but show errors
  # --fail: non-zero exit on HTTP error responses
  # --connect-timeout 5 / --max-time 10: bound connect and total time
  # --stderr: capture curl's diagnostics in a file
  # -o /dev/null: discard response body
  # --write-out "%{http_code}": print only the HTTP status code
  if ACTUAL_HTTP_CODE=$(${pkgs.curl}/bin/curl -L -sS --fail --connect-timeout 5 --max-time 10 \
      --stderr "''${CURL_STDERR_OUTPUT}" \
      -o /dev/null --write-out "%{http_code}" "''${GRAFANA_URL}"); then
    # curl exited 0, which with --fail means a successful HTTP response.
    echo "Grafana is UP (HTTP ''${ACTUAL_HTTP_CODE})."
    CURRENT_STATUS="UP"
    CONSECUTIVE_FAILURES=0

    # An alert is still outstanding: tell the user Grafana is back. The flag
    # is only cleared once Pushover accepted the message, so a failed
    # delivery is retried on the next successful check.
    if [[ "''${ALERT_SENT}" == "true" ]]; then
      echo "Grafana recovered. Sending recovery notification."
      if send_pushover "Grafana Recovered (fw)" \
          "Grafana at ''${GRAFANA_URL} is back online (HTTP ''${ACTUAL_HTTP_CODE})." \
          "0"; then
        echo
        ALERT_SENT="false"
      else
        echo "Warning: recovery notification could not be delivered; will retry on the next successful check." >&2
      fi
    fi
  else
    # Must be the first statement of this branch: $? still holds curl's status.
    CURL_EXIT_CODE=$?
    # Flatten curl's diagnostics to a single line. The text is sent as a
    # multipart form value, so no JSON escaping is applied.
    CURL_ERROR_MESSAGE=$(tr -d '\n' < "''${CURL_STDERR_OUTPUT}")

    echo "Grafana check failed. Curl Exit Code: ''${CURL_EXIT_CODE}. HTTP Code reported: ''${ACTUAL_HTTP_CODE}."
    echo "Curl Stderr: ''${CURL_ERROR_MESSAGE}"
    CURRENT_STATUS="DOWN"
    CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
    echo "Consecutive failures: ''${CONSECUTIVE_FAILURES}"

    if [[ ''${CONSECUTIVE_FAILURES} -ge ''${MAX_FAILURES} && "''${ALERT_SENT}" == "false" ]]; then
      echo "Grafana has been offline for ''${CONSECUTIVE_FAILURES} checks (>= ''${MAX_FAILURES}). Sending alert."
      PUSHOVER_TITLE="Grafana OFFLINE (fw)"
      PUSHOVER_MSG="Grafana ''${GRAFANA_URL} offline for ''${MAX_FAILURES}+ min. HTTP:''${ACTUAL_HTTP_CODE}, CurlExit:''${CURL_EXIT_CODE}."
      if [[ -n "''${CURL_ERROR_MESSAGE}" ]]; then
        PUSHOVER_MSG+=" Err: ''${CURL_ERROR_MESSAGE}"
      fi
      # Truncate message if too long for Pushover (1024 chars)
      PUSHOVER_MSG="''${PUSHOVER_MSG:0:1024}"

      # Only remember the alert as sent when Pushover accepted it; otherwise
      # the next failing check tries again.
      if send_pushover "''${PUSHOVER_TITLE}" "''${PUSHOVER_MSG}" "1"; then
        echo
        ALERT_SENT="true"
      else
        echo "Warning: alert could not be delivered; will retry on the next failed check." >&2
      fi
    fi
  fi

  # Save current state. Write to a temp file in the same directory and rename
  # it into place so an interrupted run never leaves a half-written file to
  # be sourced by the next run.
  echo "Saving state: CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}, ALERT_SENT=''${ALERT_SENT}, LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
  STATE_TMP=$(mktemp "''${STATE_FILE}.XXXXXX")
  {
    echo "CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}"
    echo "ALERT_SENT=''${ALERT_SENT}"
    echo "LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
  } > "''${STATE_TMP}"
  chmod 600 "''${STATE_TMP}"
  mv -f "''${STATE_TMP}" "''${STATE_FILE}"

  echo "Grafana check finished."
'';
|
||||
in
|
||||
{
|
||||
# Module is now implicitly enabled when imported
|
||||
config = {
|
||||
# System account and matching group for the monitoring service. The home
# directory is the state directory and is created together with the account.
users = {
  groups.${grafanaMonitorGroup} = { };
  users.${grafanaMonitorUser} = {
    description = "User for Grafana online monitoring service";
    isSystemUser = true;
    group = grafanaMonitorGroup;
    home = stateDir;
    createHome = true;
  };
};
|
||||
|
||||
# Sops-managed Pushover credentials. Both secrets get the same ownership and
# permissions: owned by the monitoring account and read-only for that owner.
sops.secrets = lib.genAttrs [ "pushover-api-token" "pushover-user-key" ] (_secretName: {
  owner = grafanaMonitorUser;
  group = grafanaMonitorGroup;
  mode = "0400";
});
|
||||
|
||||
# Adds curl and coreutils to the system-wide package set. The monitoring
# script calls curl through its absolute store path; coreutils supplies the
# small utilities it uses (mkdir, cat, rm, ...).
environment.systemPackages = with pkgs; [
  curl
  coreutils
];
|
||||
|
||||
# One-shot unit that performs a single Grafana health check per activation.
systemd.services.grafana-online-check = {
  description = "Grafana Online Check Service";

  # Intentionally no wantedBy: the grafana-online-check timer is the only
  # thing that starts this unit. Hooking a oneshot into multi-user.target
  # adds an extra run at boot, outside the timer's schedule, and lets a
  # failing check fail the boot/activation transaction.
  #
  # Pull in and order after network-online.target so the probe does not run
  # before the network is up. wants (not requires) keeps the check runnable
  # even if that target itself reports a failure.
  wants = [ "network-online.target" ];
  after = [ "network-online.target" ];

  serviceConfig = {
    Type = "oneshot";
    # Run unprivileged; the account's home directory (stateDir) is where the
    # script keeps its state file.
    User = grafanaMonitorUser;
    Group = grafanaMonitorGroup;
    ExecStart = "${monitorScript}/bin/grafana-online-check";
  };
};
|
||||
|
||||
# Timer driving the health check: first run two minutes after boot, then one
# minute after each activation of the service.
systemd.timers.grafana-online-check = {
  wantedBy = [ "timers.target" ];
  description = "Timer to periodically check Grafana's online status";
  timerConfig.Unit = "grafana-online-check.service";
  timerConfig.OnBootSec = "2min";
  timerConfig.OnUnitActiveSec = "1min";
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user