fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout
All checks were successful
Build & Deploy to Staging / Build & Deploy to Staging (push) Successful in 9m25s
Promote to Production / Deploy to Production (push) Successful in 1m36s

- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Health check has 3s timeout and returns 503 on DB failure
- All DB operations in keys, verification, usage use retry logic

Fixes BUG-075: PgBouncer failover causes permanent pod failures
This commit is contained in:
OpenClaw Deployer 2026-02-18 14:08:29 +00:00
parent 97744897f0
commit 8d88a9c235
5 changed files with 149 additions and 43 deletions

View file

@ -3,6 +3,21 @@ import pg from "pg";
import logger from "./logger.js";
const { Pool } = pg;
// Error codes (as found on `err.code`) that mark a failure as transient and
// therefore worth retrying rather than surfacing immediately.
const TRANSIENT_ERRORS = new Set<string>([
  // Non-SQLSTATE codes: socket-level failures and connection loss
  "ECONNRESET",
  "ECONNREFUSED",
  "EPIPE",
  "ETIMEDOUT",
  "CONNECTION_LOST",

  // SQLSTATE 57Pxx: server is shutting down or not yet accepting connections
  "57P01", // admin_shutdown
  "57P02", // crash_shutdown
  "57P03", // cannot_connect_now

  // SQLSTATE 08xxx: connection could not be established or was lost
  "08006", // connection_failure
  "08003", // connection_does_not_exist
  "08001", // sqlclient_unable_to_establish_sqlconnection
]);
const pool = new Pool({
host: process.env.DATABASE_HOST || "172.17.0.1",
port: parseInt(process.env.DATABASE_PORT || "5432", 10),
@ -11,14 +26,97 @@ const pool = new Pool({
password: process.env.DATABASE_PASSWORD || "docfast",
max: 10,
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 5000, // Don't wait forever for a connection
allowExitOnIdle: false,
keepAlive: true, // TCP keepalive to detect dead connections
keepAliveInitialDelayMillis: 10000, // Start keepalive probes after 10s idle
});
// Log errors emitted by the pool itself (as opposed to errors returned from a
// query or connect call). Each error is logged exactly once.
pool.on("error", (err) => {
  logger.error({ err }, "Unexpected PostgreSQL pool error — connection will be removed from pool");
});
// Lower-case substrings of error messages that indicate a transient failure
// even when the error carries no recognised `code`.
const TRANSIENT_MESSAGE_FRAGMENTS = [
  "no available server", // PgBouncer specific
  "connection terminated",
  "connection refused",
  "server closed the connection",
  "timeout expired",
] as const;

/**
 * Determine if an error is transient (PgBouncer failover, network blip).
 *
 * An error counts as transient when its `code` is listed in
 * `TRANSIENT_ERRORS`, or when its `message` contains (case-insensitively) one
 * of the known transient fragments.
 *
 * This function never throws: `code` and `message` are only inspected when
 * they are strings, so an oddly shaped thrown value cannot mask the original
 * failure with a TypeError while it is being classified.
 *
 * @param err - The value caught from a failed database operation.
 * @returns `true` if the operation is worth retrying, `false` otherwise
 *   (including for `null`, `undefined` and other falsy values).
 */
export function isTransientError(err: any): boolean {
  if (!err) return false;

  const code: unknown = err.code;
  if (typeof code === "string" && TRANSIENT_ERRORS.has(code)) return true;

  const rawMessage: unknown = err.message;
  if (typeof rawMessage !== "string") return false;

  const msg = rawMessage.toLowerCase();
  return TRANSIENT_MESSAGE_FRAGMENTS.some((fragment) => msg.includes(fragment));
}
/**
 * Execute a query on the shared pool with automatic retry on transient errors.
 *
 * The query is attempted once and then retried up to `maxRetries` more times
 * for as long as `isTransientError` classifies the failure as transient. The
 * wait before each retry doubles (1s, 2s, 4s, ...) and is capped at 5s.
 *
 * NOTE: the statement is re-sent verbatim on every retry, whatever it is. If
 * a connection drops after the server has already applied a write, a retry
 * can apply it again — only pass statements that are safe to repeat.
 *
 * @param queryText - SQL text handed to `pool.query`.
 * @param params - Optional positional parameters for the query.
 * @param maxRetries - Retries allowed after the first attempt (default 3).
 *   Fractions are rounded down; negative or NaN values are treated as 0, so
 *   the query always runs at least once.
 * @returns The result of the first successful attempt.
 * @throws The original error, unchanged, when it is not transient or when the
 *   retry budget is exhausted.
 */
export async function queryWithRetry(
  queryText: string,
  params?: any[],
  maxRetries = 3
): Promise<pg.QueryResult> {
  // Normalise the budget so the loop below always makes at least one attempt
  // and never ends by throwing an unset error.
  const retryLimit = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries));

  for (let attempt = 0; ; attempt++) {
    try {
      return await pool.query(queryText, params);
    } catch (err: any) {
      if (!isTransientError(err) || attempt >= retryLimit) {
        throw err;
      }
      const delayMs = Math.min(1000 * Math.pow(2, attempt), 5000); // 1s, 2s, 4s (capped at 5s)
      logger.warn(
        { err: err.message, code: err.code, attempt: attempt + 1, maxRetries: retryLimit, delayMs },
        "Transient DB error, retrying..."
      );
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
/**
 * Check out a client from the shared pool, retrying on transient errors.
 * Intended for operations that need a dedicated client (transactions).
 *
 * `pool.connect()` is attempted once and then retried up to `maxRetries` more
 * times for as long as `isTransientError` classifies the failure as
 * transient. The wait before each retry doubles (1s, 2s, 4s, ...) and is
 * capped at 5s.
 *
 * The caller owns the returned client; node-postgres expects it to be handed
 * back with `client.release()` once the work is done.
 *
 * @param maxRetries - Retries allowed after the first attempt (default 3).
 *   Fractions are rounded down; negative or NaN values are treated as 0, so
 *   a connection is always attempted at least once.
 * @returns A connected pool client.
 * @throws The original error, unchanged, when it is not transient or when the
 *   retry budget is exhausted.
 */
export async function connectWithRetry(maxRetries = 3): Promise<pg.PoolClient> {
  // Normalise the budget so the loop below always makes at least one attempt
  // and never ends by throwing an unset error.
  const retryLimit = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries));

  for (let attempt = 0; ; attempt++) {
    try {
      return await pool.connect();
    } catch (err: any) {
      if (!isTransientError(err) || attempt >= retryLimit) {
        throw err;
      }
      const delayMs = Math.min(1000 * Math.pow(2, attempt), 5000); // 1s, 2s, 4s (capped at 5s)
      logger.warn(
        { err: err.message, code: err.code, attempt: attempt + 1, maxRetries: retryLimit, delayMs },
        "Transient DB connect error, retrying..."
      );
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
export async function initDatabase(): Promise<void> {
const client = await pool.connect();
const client = await connectWithRetry();
try {
await client.query(`
CREATE TABLE IF NOT EXISTS api_keys (