fix: destroy dead pool connections on transient errors (proper failover)

- queryWithRetry now uses explicit client checkout; on a transient error it calls client.release(true) to DESTROY the dead connection instead of returning it to the pool. Fresh connections are created on retry.
- connectWithRetry validates connections with SELECT 1 before returning them.
- Health check destroys bad connections on failure.
- Reduced idleTimeoutMillis from 30s to 10s for faster stale-connection eviction.
- Fixes BUG-075: the pool kept reusing dead TCP sockets after a PgBouncer pod restart.
2026-02-18 14:28:47 +00:00 · 2026-02-18 14:28:47 +00:00 · 95ca10175f
commit 95ca10175f
parent 8d88a9c235
2 changed files with 52 additions and 13 deletions
--- a/src/routes/health.ts
+++ b/src/routes/health.ts
@ -16,18 +16,23 @@ healthRouter.get("/", async (_req, res) => {
  let overallStatus = "ok";
  let httpStatus = 200;

-  // Check database connectivity with a timeout
+  // Check database connectivity with a real query and timeout
  try {
    const dbCheck = async () => {
      const client = await pool.connect();
      try {
+        // Use SELECT 1 as a lightweight liveness probe
+        await client.query('SELECT 1');
        const result = await client.query('SELECT version()');
        const version = result.rows[0]?.version || 'Unknown';
        const versionMatch = version.match(/PostgreSQL ([\d.]+)/);
        const shortVersion = versionMatch ? `PostgreSQL ${versionMatch[1]}` : 'PostgreSQL';
-        return { status: "ok", version: shortVersion };
-      } finally {
        client.release();
+        return { status: "ok", version: shortVersion };
+      } catch (queryErr) {
+        // Destroy the bad connection so it doesn't go back to the pool
+        try { client.release(true); } catch (_) {}
+        throw queryErr;
      }
    };