fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout
All checks were successful
Build & Deploy to Staging / Build & Deploy to Staging (push) Successful in 9m25s
Promote to Production / Deploy to Production (push) Successful in 1m36s

- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Health check has 3s timeout and returns 503 on DB failure
- All DB operations in keys, verification, usage use retry logic

Fixes BUG-075: PgBouncer failover causes permanent pod failures
This commit is contained in:
OpenClaw Deployer 2026-02-18 14:08:29 +00:00
parent 97744897f0
commit 8d88a9c235
5 changed files with 149 additions and 43 deletions

View file

@@ -1,6 +1,7 @@
import { randomBytes, randomInt, timingSafeEqual } from "crypto";
import logger from "./logger.js";
import pool from "./db.js";
import { queryWithRetry, connectWithRetry } from "./db.js";
export interface Verification {
email: string;
@@ -24,7 +25,7 @@ const MAX_ATTEMPTS = 3;
export async function createVerification(email: string, apiKey: string): Promise<Verification> {
// Check for existing unexpired, unverified
const existing = await pool.query(
const existing = await queryWithRetry(
"SELECT * FROM verifications WHERE email = $1 AND verified_at IS NULL AND created_at > NOW() - INTERVAL '24 hours' LIMIT 1",
[email]
);
@@ -34,11 +35,11 @@ export async function createVerification(email: string, apiKey: string): Promise
}
// Remove old unverified
await pool.query("DELETE FROM verifications WHERE email = $1 AND verified_at IS NULL", [email]);
await queryWithRetry("DELETE FROM verifications WHERE email = $1 AND verified_at IS NULL", [email]);
const token = randomBytes(32).toString("hex");
const now = new Date().toISOString();
await pool.query(
await queryWithRetry(
"INSERT INTO verifications (email, token, api_key, created_at) VALUES ($1, $2, $3, $4)",
[email, token, apiKey, now]
);
@@ -56,7 +57,7 @@ export function verifyToken(token: string): { status: "ok"; verification: Verifi
let verificationsCache: Verification[] = [];
export async function loadVerifications(): Promise<void> {
const result = await pool.query("SELECT * FROM verifications");
const result = await queryWithRetry("SELECT * FROM verifications");
verificationsCache = result.rows.map((r) => ({
email: r.email,
token: r.token,
@@ -85,12 +86,12 @@ function verifyTokenSync(token: string): { status: "ok"; verification: Verificat
if (age > TOKEN_EXPIRY_MS) return { status: "expired" };
v.verifiedAt = new Date().toISOString();
// Update DB async
pool.query("UPDATE verifications SET verified_at = $1 WHERE token = $2", [v.verifiedAt, token]).catch((err) => logger.error({ err }, "Failed to update verification"));
queryWithRetry("UPDATE verifications SET verified_at = $1 WHERE token = $2", [v.verifiedAt, token]).catch((err) => logger.error({ err }, "Failed to update verification"));
return { status: "ok", verification: v };
}
export async function createPendingVerification(email: string): Promise<PendingVerification> {
await pool.query("DELETE FROM pending_verifications WHERE email = $1", [email]);
await queryWithRetry("DELETE FROM pending_verifications WHERE email = $1", [email]);
const now = new Date();
const pending: PendingVerification = {
@@ -101,7 +102,7 @@ export async function createPendingVerification(email: string): Promise<PendingV
attempts: 0,
};
await pool.query(
await queryWithRetry(
"INSERT INTO pending_verifications (email, code, created_at, expires_at, attempts) VALUES ($1, $2, $3, $4, $5)",
[pending.email, pending.code, pending.createdAt, pending.expiresAt, pending.attempts]
);
@@ -110,34 +111,34 @@ export async function createPendingVerification(email: string): Promise<PendingV
/**
 * Checks a submitted verification code against the pending verification
 * stored for an email address.
 *
 * The email is trimmed and lower-cased before it is used as the lookup key.
 *
 * NOTE(review): this is a diff view with the +/- markers stripped; each
 * `pool.query(...)` line appears to be the removed version of the
 * `queryWithRetry(...)` line that follows it.
 *
 * @param email - Address whose pending verification should be checked.
 * @param code - Code supplied by the caller.
 * @returns An object whose `status` is:
 *   - `"invalid"` when no pending row exists or the code does not match;
 *   - `"expired"` when the current time is past the row's `expires_at`
 *     (the row is deleted);
 *   - `"max_attempts"` when the stored `attempts` is already at or above
 *     `MAX_ATTEMPTS` (the row is deleted);
 *   - `"ok"` when the code matches (the row is deleted).
 */
export async function verifyCode(email: string, code: string): Promise<{ status: "ok" | "invalid" | "expired" | "max_attempts" }> {
const cleanEmail = email.trim().toLowerCase();
const result = await pool.query("SELECT * FROM pending_verifications WHERE email = $1", [cleanEmail]);
const result = await queryWithRetry("SELECT * FROM pending_verifications WHERE email = $1", [cleanEmail]);
const pending = result.rows[0];
if (!pending) return { status: "invalid" };
// Expired entries are removed as soon as they are detected.
if (new Date() > new Date(pending.expires_at)) {
await pool.query("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
await queryWithRetry("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
return { status: "expired" };
}
// The limit is checked against the attempts value read above, before this call's own increment.
if (pending.attempts >= MAX_ATTEMPTS) {
await pool.query("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
await queryWithRetry("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
return { status: "max_attempts" };
}
// The attempt is recorded before the comparison, so every guess is counted.
// NOTE(review): this UPDATE is not idempotent. If queryWithRetry re-sends a
// statement the server had already applied, the counter could be bumped
// twice for one guess — confirm against queryWithRetry's retry conditions
// (not shown in this view).
await pool.query("UPDATE pending_verifications SET attempts = attempts + 1 WHERE email = $1", [cleanEmail]);
await queryWithRetry("UPDATE pending_verifications SET attempts = attempts + 1 WHERE email = $1", [cleanEmail]);
// Lengths are compared first because timingSafeEqual requires buffers of equal byte length.
const a = Buffer.from(pending.code, "utf8"); const b = Buffer.from(code, "utf8"); const codeMatch = a.length === b.length && timingSafeEqual(a, b);
if (!codeMatch) {
return { status: "invalid" };
}
// On success the pending row is deleted.
await pool.query("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
await queryWithRetry("DELETE FROM pending_verifications WHERE email = $1", [cleanEmail]);
return { status: "ok" };
}
export async function isEmailVerified(email: string): Promise<boolean> {
const result = await pool.query(
const result = await queryWithRetry(
"SELECT 1 FROM verifications WHERE email = $1 AND verified_at IS NOT NULL LIMIT 1",
[email]
);
@@ -145,7 +146,7 @@ export async function isEmailVerified(email: string): Promise<boolean> {
}
export async function getVerifiedApiKey(email: string): Promise<string | null> {
const result = await pool.query(
const result = await queryWithRetry(
"SELECT api_key FROM verifications WHERE email = $1 AND verified_at IS NOT NULL LIMIT 1",
[email]
);