fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout
- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Health check has 3s timeout and returns 503 on DB failure
- All DB operations in keys, verification, usage use retry logic

Fixes BUG-075: PgBouncer failover causes permanent pod failures
This commit is contained in:
parent
97744897f0
commit
8d88a9c235
5 changed files with 149 additions and 43 deletions
|
|
@ -3,6 +3,21 @@ import pg from "pg";
|
|||
import logger from "./logger.js";
|
||||
const { Pool } = pg;
|
||||
|
||||
// Error codes from PgBouncer / PostgreSQL (and the socket layer) that warrant a retry.
// Read-only: the set is only ever queried, never mutated.
const TRANSIENT_ERRORS: ReadonlySet<string> = new Set([
  // Socket / driver level codes
  "ECONNRESET",
  "ECONNREFUSED",
  "EPIPE",
  "ETIMEDOUT",
  "CONNECTION_LOST",
  // PostgreSQL SQLSTATE codes
  "57P01", // admin_shutdown
  "57P02", // crash_shutdown
  "57P03", // cannot_connect_now
  "08006", // connection_failure
  "08003", // connection_does_not_exist
  "08001", // sqlclient_unable_to_establish_sqlconnection
]);
|
||||
|
||||
const pool = new Pool({
|
||||
host: process.env.DATABASE_HOST || "172.17.0.1",
|
||||
port: parseInt(process.env.DATABASE_PORT || "5432", 10),
|
||||
|
|
@ -11,14 +26,97 @@ const pool = new Pool({
|
|||
password: process.env.DATABASE_PASSWORD || "docfast",
|
||||
max: 10,
|
||||
idleTimeoutMillis: 30000,
|
||||
connectionTimeoutMillis: 5000, // Don't wait forever for a connection
|
||||
allowExitOnIdle: false,
|
||||
keepAlive: true, // TCP keepalive to detect dead connections
|
||||
keepAliveInitialDelayMillis: 10000, // Start keepalive probes after 10s idle
|
||||
});
|
||||
|
||||
// Log errors emitted by the pool itself. Only one log call is kept: the
// earlier, shorter message was superseded by the more descriptive one below,
// and logging both would report every pool error twice.
pool.on("error", (err) => {
  logger.error({ err }, "Unexpected PostgreSQL pool error — connection will be removed from pool");
});
|
||||
|
||||
/**
 * Determine if an error is transient (PgBouncer failover, network blip).
 *
 * An error is treated as transient when its `code` is one of the entries in
 * `TRANSIENT_ERRORS`, or when its lower-cased `message` contains one of a
 * small list of known phrases.
 *
 * @param err - The caught value. May be anything, including `null`,
 *   `undefined`, or an object whose `code` / `message` are not strings.
 * @returns `true` when the error looks transient, otherwise `false`.
 */
export function isTransientError(err: unknown): boolean {
  if (!err) return false;

  const { code, message } = err as { code?: unknown; message?: unknown };

  // Only string fields are inspected, so that a malformed error object cannot
  // make the classification itself throw and hide the original failure.
  const codeText = typeof code === "string" ? code : "";
  const msg = typeof message === "string" ? message.toLowerCase() : "";

  if (TRANSIENT_ERRORS.has(codeText)) return true;

  if (msg.includes("no available server")) return true; // PgBouncer specific
  if (msg.includes("connection terminated")) return true;
  if (msg.includes("connection refused")) return true;
  if (msg.includes("server closed the connection")) return true;
  if (msg.includes("timeout expired")) return true;

  return false;
}
|
||||
|
||||
/**
 * Execute a query with automatic retry on transient errors.
 *
 * Runs `pool.query(queryText, params)`. When it rejects with an error that
 * `isTransientError` accepts, waits and tries again, up to `maxRetries`
 * additional times. The wait doubles each time: 1s, 2s, 4s, capped at 5s.
 *
 * Note that the same statement is sent again on every retry, so callers
 * should only pass statements that are safe to execute more than once.
 *
 * @param queryText - SQL text passed to `pool.query`.
 * @param params - Optional positional parameters passed to `pool.query`.
 * @param maxRetries - Number of retries after the first attempt (default 3).
 *   Negative or NaN values are treated as 0, so one attempt is always made.
 * @returns The result of the first successful `pool.query` call.
 * @throws The last error from `pool.query` when it is not transient or the
 *   retry budget is exhausted.
 */
export async function queryWithRetry(
  queryText: string,
  params?: any[],
  maxRetries = 3
): Promise<pg.QueryResult> {
  // Normalise the budget so the query is always attempted at least once;
  // otherwise a negative or NaN value would skip the loop and throw `undefined`.
  const retries = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries));

  for (let attempt = 0; ; attempt++) {
    try {
      return await pool.query(queryText, params);
    } catch (err: unknown) {
      if (attempt >= retries || !isTransientError(err)) {
        throw err;
      }

      const delayMs = Math.min(1000 * 2 ** attempt, 5000); // 1s, 2s, 4s (capped at 5s)
      // `err` is truthy here because isTransientError() accepted it.
      const { message, code } = err as { message?: unknown; code?: unknown };
      logger.warn(
        { err: message, code, attempt: attempt + 1, maxRetries: retries, delayMs },
        "Transient DB error, retrying..."
      );
      await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
|
||||
|
||||
/**
 * Connect with retry — for operations that need a client (transactions).
 *
 * Calls `pool.connect()`. When it rejects with an error that
 * `isTransientError` accepts, waits and tries again, up to `maxRetries`
 * additional times. The wait doubles each time: 1s, 2s, 4s, capped at 5s.
 *
 * @param maxRetries - Number of retries after the first attempt (default 3).
 *   Negative or NaN values are treated as 0, so one attempt is always made.
 * @returns The client from the first successful `pool.connect()` call. As with
 *   `pool.connect()` itself, the caller is expected to release it when done.
 * @throws The last error from `pool.connect()` when it is not transient or the
 *   retry budget is exhausted.
 */
export async function connectWithRetry(maxRetries = 3): Promise<pg.PoolClient> {
  // Normalise the budget so a connection is always attempted at least once;
  // otherwise a negative or NaN value would skip the loop and throw `undefined`.
  const retries = Number.isNaN(maxRetries) ? 0 : Math.max(0, Math.floor(maxRetries));

  for (let attempt = 0; ; attempt++) {
    try {
      return await pool.connect();
    } catch (err: unknown) {
      if (attempt >= retries || !isTransientError(err)) {
        throw err;
      }

      const delayMs = Math.min(1000 * 2 ** attempt, 5000); // 1s, 2s, 4s (capped at 5s)
      // `err` is truthy here because isTransientError() accepted it.
      const { message, code } = err as { message?: unknown; code?: unknown };
      logger.warn(
        { err: message, code, attempt: attempt + 1, maxRetries: retries, delayMs },
        "Transient DB connect error, retrying..."
      );
      await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
|
||||
|
||||
export async function initDatabase(): Promise<void> {
|
||||
const client = await pool.connect();
|
||||
const client = await connectWithRetry();
|
||||
try {
|
||||
await client.query(`
|
||||
CREATE TABLE IF NOT EXISTS api_keys (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue