fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout

- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Health check has 3s timeout and returns 503 on DB failure
- All DB operations in keys, verification, usage use retry logic

Fixes BUG-075: PgBouncer failover causes permanent pod failures
2026-02-18 14:08:29 +00:00 · 2026-02-18 14:08:29 +00:00 · 8d88a9c235
commit 8d88a9c235
parent 97744897f0
5 changed files with 149 additions and 43 deletions
--- a/src/middleware/usage.ts
+++ b/src/middleware/usage.ts
@@ -1,6 +1,7 @@
 import { isProKey } from "../services/keys.js";
 import logger from "../services/logger.js";
 import pool from "../services/db.js";
+import { queryWithRetry, connectWithRetry } from "../services/db.js";

 const FREE_TIER_LIMIT = 100;
 const PRO_TIER_LIMIT = 5000;
@@ -22,7 +23,7 @@ function getMonthKey(): string {

 export async function loadUsageData(): Promise<void> {
  try {
-    const result = await pool.query("SELECT key, count, month_key FROM usage");
+    const result = await queryWithRetry("SELECT key, count, month_key FROM usage");
    usage = new Map();
    for (const row of result.rows) {
      usage.set(row.key, { count: row.count, monthKey: row.month_key });
@@ -40,7 +41,7 @@ async function flushDirtyEntries(): Promise<void> {

  const keysToFlush = [...dirtyKeys];
  
-  const client = await pool.connect();
+  const client = await connectWithRetry();
  try {
    await client.query("BEGIN");
    for (const key of keysToFlush) {