fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout
- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Health check now has a 3s timeout and returns 503 on DB failure
- All DB operations in keys, verification, and usage now use the retry logic

Fixes BUG-075: PgBouncer failover causes permanent pod failures
This commit is contained in:
parent
97744897f0
commit
8d88a9c235
5 changed files with 149 additions and 43 deletions
|
|
@ -8,29 +8,34 @@ const { version: APP_VERSION } = require("../../package.json");
|
|||
|
||||
export const healthRouter = Router();
|
||||
|
||||
const HEALTH_CHECK_TIMEOUT_MS = 3000;
|
||||
|
||||
healthRouter.get("/", async (_req, res) => {
|
||||
const poolStats = getPoolStats();
|
||||
let databaseStatus: any;
|
||||
let overallStatus = "ok";
|
||||
let httpStatus = 200;
|
||||
|
||||
// Check database connectivity
|
||||
// Check database connectivity with a timeout
|
||||
try {
|
||||
const client = await pool.connect();
|
||||
try {
|
||||
const result = await client.query('SELECT version()');
|
||||
const version = result.rows[0]?.version || 'Unknown';
|
||||
// Extract just the PostgreSQL version number (e.g., "PostgreSQL 15.4")
|
||||
const versionMatch = version.match(/PostgreSQL ([\d.]+)/);
|
||||
const shortVersion = versionMatch ? `PostgreSQL ${versionMatch[1]}` : 'PostgreSQL';
|
||||
|
||||
databaseStatus = {
|
||||
status: "ok",
|
||||
version: shortVersion
|
||||
};
|
||||
} finally {
|
||||
client.release();
|
||||
}
|
||||
const dbCheck = async () => {
|
||||
const client = await pool.connect();
|
||||
try {
|
||||
const result = await client.query('SELECT version()');
|
||||
const version = result.rows[0]?.version || 'Unknown';
|
||||
const versionMatch = version.match(/PostgreSQL ([\d.]+)/);
|
||||
const shortVersion = versionMatch ? `PostgreSQL ${versionMatch[1]}` : 'PostgreSQL';
|
||||
return { status: "ok", version: shortVersion };
|
||||
} finally {
|
||||
client.release();
|
||||
}
|
||||
};
|
||||
|
||||
const timeout = new Promise<never>((_resolve, reject) =>
|
||||
setTimeout(() => reject(new Error("Database health check timed out")), HEALTH_CHECK_TIMEOUT_MS)
|
||||
);
|
||||
|
||||
databaseStatus = await Promise.race([dbCheck(), timeout]);
|
||||
} catch (error: any) {
|
||||
databaseStatus = {
|
||||
status: "error",
|
||||
|
|
@ -56,4 +61,4 @@ healthRouter.get("/", async (_req, res) => {
|
|||
};
|
||||
|
||||
res.status(httpStatus).json(response);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue