{ config, pkgs, lib, ... }:

# NixOS module that probes the Grafana health endpoint once a minute and sends
# Pushover notifications when Grafana has been unreachable for several
# consecutive checks, and again once it has recovered.
#
# The module is enabled implicitly by importing it. It declares:
#   * a dedicated system user/group whose home directory holds the check state,
#   * two sops-nix secrets (Pushover API token and user key),
#   * a oneshot service running the check script, and
#   * a timer that triggers the service.
let
  grafanaMonitorUser = "grafana-monitor";
  grafanaMonitorGroup = "grafana-monitor";
  stateDir = "/var/lib/${grafanaMonitorUser}";

  # Take the decrypted secret locations from the sops-nix option tree instead
  # of hard-coding /run/secrets/..., so the script follows any path override.
  pushoverApiTokenFile = config.sops.secrets."pushover-api-token".path;
  pushoverUserKeyFile = config.sops.secrets."pushover-user-key".path;

  curl = "${pkgs.curl}/bin/curl";

  # The check script. writeShellScriptBin already emits the shebang line, so
  # the script body must not contain one of its own.
  monitorScript = pkgs.writeShellScriptBin "grafana-online-check" ''
    set -euo pipefail
    # Everything this script creates (state file, temp files) is private.
    umask 077

    GRAFANA_URL="https://grafana.cloonar.com/api/health"
    STATE_DIR="${stateDir}"
    STATE_FILE="''${STATE_DIR}/status.env"
    PUSHOVER_API_TOKEN_FILE="${pushoverApiTokenFile}"
    PUSHOVER_USER_KEY_FILE="${pushoverUserKeyFile}"
    MAX_FAILURES=5

    # Set to 1 when a notification could not be delivered, so the unit is
    # marked failed AFTER the state file has been written.
    NOTIFY_FAILED=0

    # read_secret LABEL PATH
    # Prints the content of the secret file, or reports an error and fails.
    read_secret() {
      local label="$1" path="$2"
      if [[ ! -f "$path" || ! -r "$path" ]]; then
        echo "Error: Pushover $label file ($path) not found or not readable." >&2
        return 1
      fi
      cat "$path"
    }

    # send_pushover TITLE MESSAGE PRIORITY
    # Returns non-zero when the message was not accepted by Pushover.
    #   --fail         turns HTTP 4xx/5xx answers into a non-zero exit status
    #   --form-string  sends values literally; plain -F would interpret a
    #                  leading @ or < and any ;type= suffix inside the value
    send_pushover() {
      local title="$1" message="$2" priority="$3"
      ${curl} -sS --fail --connect-timeout 5 --max-time 15 \
        --form-string "token=''${PUSHOVER_API_TOKEN}" \
        --form-string "user=''${PUSHOVER_USER_KEY}" \
        --form-string "message=''${message}" \
        --form-string "title=''${title}" \
        --form-string "priority=''${priority}" \
        https://api.pushover.net/1/messages.json
    }

    # The state directory is the home of the service user and is normally
    # created by NixOS; creating it here keeps the script self-sufficient and
    # avoids relying on HOME being exported (set -u would abort otherwise).
    mkdir -p "''${STATE_DIR}"

    # Load the previous state, or start from defaults on the first run.
    CONSECUTIVE_FAILURES=0
    ALERT_SENT="false"
    LAST_KNOWN_STATUS="UP" # informational; kept in the state file
    if [[ -f "''${STATE_FILE}" ]]; then
      source "''${STATE_FILE}"
    fi

    # A failing command substitution aborts the script because of set -e.
    PUSHOVER_API_TOKEN=$(read_secret "API token" "''${PUSHOVER_API_TOKEN_FILE}")
    PUSHOVER_USER_KEY=$(read_secret "user key" "''${PUSHOVER_USER_KEY_FILE}")

    # Internet connectivity check. A literal IP is used so that a DNS outage
    # on this host is not mistaken for a Grafana outage.
    INTERNET_CHECK_URL="https://1.1.1.1"
    echo "Performing internet connectivity check to ''${INTERNET_CHECK_URL}..."
    if ! ${curl} --head --silent --fail --connect-timeout 3 --max-time 5 \
        "''${INTERNET_CHECK_URL}" > /dev/null 2>&1; then
      echo "Internet connectivity check failed. Cannot reach ''${INTERNET_CHECK_URL}. Skipping Grafana check and exiting successfully."
      exit 0
    else
      echo "Internet connectivity check successful. Proceeding with Grafana check."
    fi

    echo ""
    echo "Checking Grafana at ''${GRAFANA_URL}..."

    ACTUAL_HTTP_CODE="000" # value curl reports when no HTTP response was received
    CURL_ERROR_MESSAGE=""
    CURL_STDERR_OUTPUT=$(mktemp)

    # Remove the temp file on every exit path. Signals are converted into a
    # regular exit so that the EXIT trap runs and the script actually stops.
    cleanup() {
      rm -f "''${CURL_STDERR_OUTPUT}"
    }
    trap cleanup EXIT
    trap 'exit 1' TERM INT HUP

    # -L                follow redirects
    # -sS               silent, but still print errors
    # --fail            exit status 22 on HTTP 4xx/5xx
    # --stderr FILE     capture the error text for the alert message
    # -o /dev/null      discard the response body
    # --write-out       print only the HTTP status code on stdout
    if ACTUAL_HTTP_CODE=$(${curl} -L -sS --fail --connect-timeout 5 --max-time 10 \
        --stderr "''${CURL_STDERR_OUTPUT}" \
        -o /dev/null --write-out "%{http_code}" "''${GRAFANA_URL}"); then
      # Exit status 0 together with --fail means a successful HTTP response.
      echo "Grafana is UP (HTTP ''${ACTUAL_HTTP_CODE})."
      CURRENT_STATUS="UP"

      # ALERT_SENT is only ever set while Grafana is DOWN, so it alone tells
      # us that a recovery message is still owed. It is cleared only after
      # successful delivery, which makes a failed delivery retry next run.
      if [[ "''${ALERT_SENT}" == "true" ]]; then
        echo "Grafana recovered. Sending recovery notification."
        if send_pushover "Grafana Recovered (fw)" \
            "Grafana at ''${GRAFANA_URL} is back online (HTTP ''${ACTUAL_HTTP_CODE})." \
            0; then
          ALERT_SENT="false"
        else
          echo "Warning: recovery notification could not be delivered; retrying on the next run." >&2
          NOTIFY_FAILED=1
        fi
      fi
      CONSECUTIVE_FAILURES=0
    else
      # In the else branch $? still holds the exit status of the curl call.
      CURL_EXIT_CODE=$?
      # Flatten the error text to one line. No escaping is applied because
      # the text is sent as a form field, not embedded in JSON.
      CURL_ERROR_MESSAGE=$(tr '\n' ' ' < "''${CURL_STDERR_OUTPUT}")
      echo "Grafana check failed. Curl Exit Code: ''${CURL_EXIT_CODE}. HTTP Code reported: ''${ACTUAL_HTTP_CODE}."
      echo "Curl Stderr: ''${CURL_ERROR_MESSAGE}"
      CURRENT_STATUS="DOWN"
      CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
      echo "Consecutive failures: ''${CONSECUTIVE_FAILURES}"

      if [[ ''${CONSECUTIVE_FAILURES} -ge ''${MAX_FAILURES} && "''${ALERT_SENT}" == "false" ]]; then
        echo "Grafana has been offline for ''${CONSECUTIVE_FAILURES} checks (>= ''${MAX_FAILURES}). Sending alert."
        PUSHOVER_TITLE="Grafana OFFLINE (fw)"
        PUSHOVER_MSG="Grafana ''${GRAFANA_URL} offline for ''${MAX_FAILURES}+ min. HTTP:''${ACTUAL_HTTP_CODE}, CurlExit:''${CURL_EXIT_CODE}."
        if [[ -n "''${CURL_ERROR_MESSAGE}" ]]; then
          PUSHOVER_MSG+=" Err: ''${CURL_ERROR_MESSAGE}"
        fi
        # Pushover rejects messages longer than 1024 characters.
        PUSHOVER_MSG="''${PUSHOVER_MSG:0:1024}"

        # Mark the alert as sent only when Pushover accepted it; otherwise
        # the next run sees ALERT_SENT=false and tries again.
        if send_pushover "''${PUSHOVER_TITLE}" "''${PUSHOVER_MSG}" 1; then
          ALERT_SENT="true"
        else
          echo "Warning: offline alert could not be delivered; retrying on the next run." >&2
          NOTIFY_FAILED=1
        fi
      fi
    fi

    # Persist the state atomically: write a sibling temp file, then rename it
    # over the real one so a crash never leaves a half-written state file.
    echo "Saving state: CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}, ALERT_SENT=''${ALERT_SENT}, LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
    {
      echo "CONSECUTIVE_FAILURES=''${CONSECUTIVE_FAILURES}"
      echo "ALERT_SENT=''${ALERT_SENT}"
      echo "LAST_KNOWN_STATUS=''${CURRENT_STATUS}"
    } > "''${STATE_FILE}.tmp"
    chmod 600 "''${STATE_FILE}.tmp"
    mv -f "''${STATE_FILE}.tmp" "''${STATE_FILE}"

    echo "Grafana check finished."

    # Surface notification problems in the unit status, after saving state.
    if [[ "''${NOTIFY_FAILED}" -ne 0 ]]; then
      exit 1
    fi
  '';
in
{
  # Module is implicitly enabled when imported.
  config = {
    users.users.${grafanaMonitorUser} = {
      isSystemUser = true;
      group = grafanaMonitorGroup;
      home = stateDir; # the home directory doubles as the state directory
      createHome = true; # NixOS creates this directory
      description = "User for Grafana online monitoring service";
    };
    users.groups.${grafanaMonitorGroup} = { };

    # Sops secrets for Pushover, readable only by the service user.
    sops.secrets."pushover-api-token" = {
      owner = grafanaMonitorUser;
      group = grafanaMonitorGroup;
      mode = "0400";
    };
    sops.secrets."pushover-user-key" = {
      owner = grafanaMonitorUser;
      group = grafanaMonitorGroup;
      mode = "0400";
    };

    # Kept for interactive use on the host. The service does not depend on
    # these: a unit's PATH comes from its own `path` option, not from here.
    environment.systemPackages = [
      pkgs.curl
      pkgs.coreutils
    ];

    systemd.services.grafana-online-check = {
      description = "Grafana Online Check Service";
      # Deliberately no wantedBy: the unit is started only by the timer below.
      # Starting it from multi-user.target would run the check at boot and on
      # every configuration switch, bypassing the timer's OnBootSec delay.
      after = [ "network-online.target" ];
      requires = [ "network-online.target" ];
      # mkdir, cat, mktemp, rm, tr, chmod and mv used by the script.
      path = [ pkgs.coreutils ];
      serviceConfig = {
        Type = "oneshot";
        User = grafanaMonitorUser;
        Group = grafanaMonitorGroup;
        ExecStart = "${monitorScript}/bin/grafana-online-check";
        # State is written to the user's home directory (stateDir), which is
        # created through `createHome = true` above rather than through
        # systemd's StateDirectory=.
      };
    };

    systemd.timers.grafana-online-check = {
      description = "Timer to periodically check Grafana's online status";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnBootSec = "2min"; # first run: wait a bit after boot
        OnUnitActiveSec = "1min"; # then one minute after each activation
        Unit = "grafana-online-check.service";
      };
    };
  };
}