#!/bin/bash
#
# Health check monitoring script for DocFast.
# Runs every 5 minutes via cron.
#
# Behaviour:
#   - Requests HEALTH_URL; a check passes only on HTTP 200 with a body that
#     contains "status":"ok".
#   - Consecutive failures are counted in STATE_FILE.
#   - After FAILURE_THRESHOLD consecutive failures DOWN_MARKER is created; it
#     is removed again when a later check passes.
#   - Events are appended to LOG_FILE, which is trimmed to the last
#     MAX_LOG_LINES lines.
#
# The script always exits 0; results are reported only through the log,
# the state file and the marker file.
#
# NOTE(review): STATE_FILE and DOWN_MARKER use fixed names in /tmp. The paths
# are kept as-is because other tooling may read them, but on a multi-user
# host a dedicated directory (e.g. under /run) would be safer - confirm.

# Abort on use of an unset variable. `set -e` / `pipefail` are deliberately
# not enabled: every step here is best-effort and failures are handled inline.
set -u

readonly HEALTH_URL="https://docfast.dev/health"
readonly LOG_FILE="/var/log/docfast-healthcheck.log"
readonly DOWN_MARKER="/tmp/docfast-down"
readonly STATE_FILE="/tmp/docfast-healthcheck-state"
readonly MAX_LOG_LINES=1000
readonly FAILURE_THRESHOLD=2      # consecutive failures before marking down
readonly CURL_CONNECT_TIMEOUT=10  # seconds allowed for connecting
readonly CURL_MAX_TIME=30         # seconds allowed for the whole request

#######################################
# Trim LOG_FILE to its last MAX_LOG_LINES lines once it grows beyond that.
# Globals:   LOG_FILE (read/written), MAX_LOG_LINES (read)
# Arguments: none
# Returns:   0 when nothing had to be done; otherwise the status of the
#            final mv/rm.
#######################################
rotate_log() {
  [[ -f "$LOG_FILE" ]] || return 0

  local line_count
  line_count=$(wc -l < "$LOG_FILE") || return 0

  if (( line_count > MAX_LOG_LINES )); then
    local tmp_file="${LOG_FILE}.tmp"
    # Only replace the log when the trimmed copy was written successfully,
    # so a failed tail (e.g. disk full) cannot destroy the existing log.
    if tail -n "$MAX_LOG_LINES" "$LOG_FILE" > "$tmp_file"; then
      mv -- "$tmp_file" "$LOG_FILE"
    else
      rm -f -- "$tmp_file"
    fi
  fi
}

#######################################
# Append a timestamped message to LOG_FILE, then rotate the log if needed.
# Globals:   LOG_FILE (written)
# Arguments: $1 - message text
#######################################
log_message() {
  local msg=$1
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$msg" >> "$LOG_FILE"
  rotate_log
}

#######################################
# Perform one health check against HEALTH_URL.
# Globals:   HEALTH_URL, CURL_CONNECT_TIMEOUT, CURL_MAX_TIME (read)
# Arguments: none
# Returns:   0 if the response is HTTP 200 and the body contains
#            "status":"ok" (whitespace around the colon is tolerated),
#            1 otherwise (including curl errors and timeouts).
#######################################
check_health() {
  local response http_code body
  local ok_pattern='"status"[[:space:]]*:[[:space:]]*"ok"'

  # -w appends the status code on its own final line after the body.
  # The timeouts keep a stalled connection from hanging the cron job.
  # stderr is discarded on purpose: a failed request is reported via the
  # return status, not via curl's diagnostics.
  response=$(curl -s \
    --connect-timeout "$CURL_CONNECT_TIMEOUT" \
    --max-time "$CURL_MAX_TIME" \
    -w '\n%{http_code}' \
    "$HEALTH_URL" 2>/dev/null)

  http_code=${response##*$'\n'}  # text after the last newline
  body=${response%$'\n'*}        # everything before the last newline

  if [[ "$http_code" == "200" && "$body" =~ $ok_pattern ]]; then
    return 0
  fi
  return 1
}

#######################################
# Print the current consecutive-failure count.
# A missing, empty or non-numeric state file is reported as 0.
# Globals:   STATE_FILE (read)
# Outputs:   the count, as a base-10 integer, on stdout
#######################################
get_failure_count() {
  local count=""
  if [[ -r "$STATE_FILE" ]]; then
    # read returns non-zero when the file has no trailing newline but still
    # fills the variable, so its status is ignored.
    IFS= read -r count < "$STATE_FILE" || true
  fi

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value such as "08" is not parsed as octal.
    printf '%d\n' "$((10#$count))"
  else
    printf '0\n'
  fi
}

#######################################
# Increase the stored consecutive-failure count by one.
# Globals:   STATE_FILE (read/written)
#######################################
increment_failure() {
  local count
  count=$(get_failure_count)
  printf '%d\n' "$((count + 1))" > "$STATE_FILE"
}

#######################################
# Reset the stored consecutive-failure count to zero.
# Globals:   STATE_FILE (written)
#######################################
reset_failure() {
  printf '0\n' > "$STATE_FILE"
}

#######################################
# Run one monitoring cycle: check health, update the failure counter,
# and create or remove the downtime marker as required.
# Globals:   STATE_FILE, DOWN_MARKER, FAILURE_THRESHOLD (read)
#######################################
main() {
  local failure_count

  # Initialize the state file if it doesn't exist.
  [[ -f "$STATE_FILE" ]] || reset_failure

  if check_health; then
    # Health check passed.
    failure_count=$(get_failure_count)
    if (( failure_count > 0 )); then
      log_message "✓ Service recovered after $failure_count failure(s)"
      if [[ -f "$DOWN_MARKER" ]]; then
        rm -f -- "$DOWN_MARKER"
        log_message "Removed downtime marker"
      fi
    fi
    reset_failure
  else
    # Health check failed.
    increment_failure
    failure_count=$(get_failure_count)
    log_message "✗ Health check failed (attempt $failure_count)"

    # After FAILURE_THRESHOLD consecutive failures, mark the service as
    # down, but only once per outage.
    if (( failure_count >= FAILURE_THRESHOLD )) && [[ ! -f "$DOWN_MARKER" ]]; then
      touch -- "$DOWN_MARKER"
      log_message "⚠ DOWNTIME DETECTED - Marker file created"
    fi
  fi
}

main "$@"

exit 0