# File: nixos/hosts/fw/modules/grafana-monitor.nix
# (Nix module; the source listing reported 194 lines, 8.2 KiB.)

{ config, pkgs, lib, ... }:
let
# Account and group under which the monitoring service runs.
grafanaMonitorUser = "grafana-monitor";
grafanaMonitorGroup = "grafana-monitor";
# Per-user state directory; it doubles as the account's home directory below.
stateDir = "/var/lib/" + grafanaMonitorUser;
# Health-check script executed by grafana-online-check.service.
#
# Each run:
#   1. loads the counters persisted in ${stateDir}/status.env (if present),
#   2. reads the Pushover credentials from /run/secrets,
#   3. probes general internet connectivity and exits 0 without touching the
#      state when the probe fails (an outage on this side is not a Grafana
#      failure),
#   4. requests the Grafana health endpoint,
#   5. sends a Pushover alert once MAX_FAILURES consecutive checks have failed,
#      and a recovery message when an alerted outage ends,
#   6. persists the updated counters.
#
# Exit status: 0 on a completed run (whether Grafana is up or down) or when the
# connectivity probe fails; non-zero when a secret is missing/unreadable or a
# command fails under `set -e`.
monitorScript = pkgs.writeShellScriptBin "grafana-online-check" ''
  # No interpreter line here: writeShellScriptBin emits one itself.
  set -euo pipefail

  CURL="${pkgs.curl}/bin/curl"
  GRAFANA_URL="https://grafana.cloonar.com/api/health"
  STATE_FILE="${stateDir}/status.env"
  PUSHOVER_API_TOKEN_FILE="/run/secrets/pushover-api-token"
  PUSHOVER_USER_KEY_FILE="/run/secrets/pushover-user-key"
  PUSHOVER_API_URL="https://api.pushover.net/1/messages.json"
  # An IP literal keeps the connectivity probe independent of DNS.
  INTERNET_CHECK_URL="https://1.1.1.1"
  MAX_FAILURES=5

  # Print the content of a secret file, or report on stderr and fail.
  #   $1: human-readable name used in the error message
  #   $2: path of the secret file
  read_secret() {
    local label="$1" path="$2"
    if [[ ! -f "''${path}" || ! -r "''${path}" ]]; then
      echo "Error: ''${label} file (''${path}) not found or not readable." >&2
      return 1
    fi
    cat "''${path}"
  }

  # Send one Pushover notification.
  #   $1: title   $2: message   $3: priority
  # --form-string passes every value literally; plain -F would let curl
  # interpret ";", "@" and "<" inside the text (for example in curl error
  # output that ends up in the message).
  # A transport failure makes curl exit non-zero, which aborts the script under
  # "set -e" before the state is saved, so the same notification is attempted
  # again on the next run.
  send_pushover() {
    local title="$1" message="$2" priority="$3"
    "''${CURL}" -sS -X POST \
      --form-string "token=''${PUSHOVER_API_TOKEN}" \
      --form-string "user=''${PUSHOVER_USER_KEY}" \
      --form-string "message=''${message}" \
      --form-string "title=''${title}" \
      --form-string "priority=''${priority}" \
      "''${PUSHOVER_API_URL}"
  }

  # The state file lives in the home directory of the service user.
  mkdir -p "''${HOME}"

  # Defaults used when no state has been saved yet.
  CONSECUTIVE_FAILURES=0
  ALERT_SENT="false"
  LAST_KNOWN_STATUS="UP"
  if [[ -f "''${STATE_FILE}" ]]; then
    source "''${STATE_FILE}"
  fi
  # Guard against a damaged state file: a non-numeric counter would break the
  # arithmetic below on every run, and an unknown flag value would match
  # neither the alert nor the recovery condition.
  if [[ ! "''${CONSECUTIVE_FAILURES}" =~ ^[0-9]+$ ]]; then
    echo "Warning: invalid CONSECUTIVE_FAILURES in ''${STATE_FILE}; resetting to 0." >&2
    CONSECUTIVE_FAILURES=0
  fi
  if [[ "''${ALERT_SENT}" != "true" ]]; then
    ALERT_SENT="false"
  fi

  PUSHOVER_API_TOKEN=$(read_secret "Pushover API token" "''${PUSHOVER_API_TOKEN_FILE}")
  PUSHOVER_USER_KEY=$(read_secret "Pushover user key" "''${PUSHOVER_USER_KEY_FILE}")

  echo "Performing internet connectivity check to ''${INTERNET_CHECK_URL}..."
  if ! "''${CURL}" --head --silent --fail --connect-timeout 3 --max-time 5 "''${INTERNET_CHECK_URL}" > /dev/null 2>&1; then
    echo "Internet connectivity check failed. Cannot reach ''${INTERNET_CHECK_URL}. Skipping Grafana check and exiting successfully."
    exit 0
  fi
  echo "Internet connectivity check successful. Proceeding with Grafana check."

  echo "" # Blank line for readability before the Grafana check logs
  echo "Checking Grafana at ''${GRAFANA_URL}..."

  ACTUAL_HTTP_CODE="000" # Placeholder until curl reports a status code
  CURL_ERROR_MESSAGE=""
  CURL_STDERR_OUTPUT=$(mktemp)
  # Only EXIT is trapped. A handler on TERM/INT/HUP that merely removes the file
  # would swallow the signal and let the script carry on as if Grafana were
  # down; bash runs the EXIT trap when it terminates on those signals anyway.
  trap 'rm -f "''${CURL_STDERR_OUTPUT}"' EXIT

  # -L follows redirects; -sS is quiet but still reports errors; --fail turns
  # HTTP errors into a non-zero exit; stderr goes to the temp file; the body is
  # discarded and only the status code is written to stdout.
  if ACTUAL_HTTP_CODE=$("''${CURL}" -L -sS --fail --connect-timeout 5 --max-time 10 \
      --stderr "''${CURL_STDERR_OUTPUT}" \
      -o /dev/null --write-out "%{http_code}" "''${GRAFANA_URL}"); then
    echo "Grafana is UP (HTTP ''${ACTUAL_HTTP_CODE})."
    CURRENT_STATUS="UP"
    # Announce recovery only for an outage that was actually alerted.
    if [[ "''${LAST_KNOWN_STATUS}" == "DOWN" && "''${ALERT_SENT}" == "true" ]]; then
      echo "Grafana recovered. Sending recovery notification."
      send_pushover "Grafana Recovered (fw)" \
        "Grafana at ''${GRAFANA_URL} is back online (HTTP ''${ACTUAL_HTTP_CODE})." \
        0
      ALERT_SENT="false"
    fi
    CONSECUTIVE_FAILURES=0
  else
    # Must stay the first statement of this branch: $? still holds the exit
    # status of the curl command substitution here.
    CURL_EXIT_CODE=$?
    # Flatten the captured stderr to one line. No quote escaping is applied:
    # the text is sent as a literal form field, not embedded in JSON.
    CURL_ERROR_MESSAGE=$(tr -d '\n' < "''${CURL_STDERR_OUTPUT}")
    echo "Grafana check failed. Curl Exit Code: ''${CURL_EXIT_CODE}. HTTP Code reported: ''${ACTUAL_HTTP_CODE}."
    echo "Curl Stderr: ''${CURL_ERROR_MESSAGE}"
    CURRENT_STATUS="DOWN"
    # 10# forces base 10 so a zero-padded counter is not read as octal.
    CONSECUTIVE_FAILURES=$(( 10#''${CONSECUTIVE_FAILURES} + 1 ))
    echo "Consecutive failures: ''${CONSECUTIVE_FAILURES}"
    if [[ ''${CONSECUTIVE_FAILURES} -ge ''${MAX_FAILURES} && "''${ALERT_SENT}" == "false" ]]; then
      echo "Grafana has been offline for ''${CONSECUTIVE_FAILURES} checks (>= ''${MAX_FAILURES}). Sending alert."
      PUSHOVER_MSG="Grafana ''${GRAFANA_URL} offline for ''${MAX_FAILURES}+ min. HTTP:''${ACTUAL_HTTP_CODE}, CurlExit:''${CURL_EXIT_CODE}."
      if [[ -n "''${CURL_ERROR_MESSAGE}" ]]; then
        PUSHOVER_MSG+=" Err: ''${CURL_ERROR_MESSAGE}"
      fi
      # Keep the message within the 1024-character limit used by this script.
      PUSHOVER_MSG="''${PUSHOVER_MSG:0:1024}"
      send_pushover "Grafana OFFLINE (fw)" "''${PUSHOVER_MSG}" 1
      ALERT_SENT="true"
    fi
  fi

  # Persist the state. It is written to a temporary file and renamed so that a
  # run interrupted mid-write cannot leave a truncated state file behind.
  echo "Saving state: CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}, ALERT_SENT=''${ALERT_SENT}, LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
  STATE_TMP="''${STATE_FILE}.tmp"
  umask 077
  {
    echo "CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}"
    echo "ALERT_SENT=''${ALERT_SENT}"
    echo "LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
  } > "''${STATE_TMP}"
  # Covers a leftover temp file that was created with wider permissions.
  chmod 600 "''${STATE_TMP}"
  mv -f "''${STATE_TMP}" "''${STATE_FILE}"
  echo "Grafana check finished."
'';
in
{
# Module is now implicitly enabled when imported
config = {
# Dedicated system account and group for the monitoring service. The account's
# home directory is the state directory, and NixOS is asked to create it.
users = {
  groups.${grafanaMonitorGroup} = { };
  users.${grafanaMonitorUser} = {
    description = "User for Grafana online monitoring service";
    isSystemUser = true;
    group = grafanaMonitorGroup;
    home = stateDir;
    createHome = true;
  };
};
# Pushover credentials managed by sops. Both secrets share the same ownership
# and are readable only by the monitoring user (mode 0400).
sops.secrets =
  let
    monitorOnly = {
      owner = grafanaMonitorUser;
      group = grafanaMonitorGroup;
      mode = "0400";
    };
  in
  {
    "pushover-api-token" = monitorOnly;
    "pushover-user-key" = monitorOnly;
  };
# Adds curl and coreutils to the system-wide package list.
# The monitoring script itself invokes curl through its absolute store path.
# NOTE(review): whether the service unit picks up tools from this list on its
# PATH is not established in this module - confirm before relying on it.
environment.systemPackages = with pkgs; [
  curl
  coreutils
];
# Oneshot unit that performs a single Grafana health check as the dedicated
# monitoring user. It is started exclusively by the timer of the same name.
systemd.services.grafana-online-check = {
  description = "Grafana Online Check Service";
  # Deliberately no `wantedBy`: pulling a oneshot into multi-user.target would
  # run the check synchronously during boot in addition to the timer-driven
  # runs, i.e. outside the intended cadence.
  #
  # Order after network-online.target, but only *want* it: with `requires`, a
  # failed network-online.target would prevent the monitor from ever starting.
  # The script performs its own connectivity probe, so a weak dependency is
  # sufficient.
  after = [ "network-online.target" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot";
    User = grafanaMonitorUser;
    Group = grafanaMonitorGroup;
    ExecStart = "${monitorScript}/bin/grafana-online-check";
    # State is written to the user's home directory (stateDir), which the user
    # definition in this module asks NixOS to create; systemd's
    # StateDirectory= mechanism is not used.
  };
};
# Schedules grafana-online-check.service: first run two minutes after boot,
# then one minute after each activation of the service.
systemd.timers."grafana-online-check" = {
  wantedBy = [ "timers.target" ];
  description = "Timer to periodically check Grafana's online status";
  timerConfig = {
    Unit = "grafana-online-check.service";
    OnBootSec = "2min";
    OnUnitActiveSec = "1min";
  };
};
};
}