fix: database connection resilience — retry on transient errors, TCP keepalive, health check timeout
- Enable TCP keepalive on pg.Pool to detect dead connections
- Add connectionTimeoutMillis (5s) to prevent hanging on stale connections
- Add queryWithRetry() with exponential backoff for transient DB errors
- Add connectWithRetry() for transaction-based operations
- Detect PgBouncer "no available server" and other transient errors
- Give the health check a 3s timeout and return 503 on DB failure
- Use retry logic for all DB operations in keys, verification, and usage

Fixes BUG-075: PgBouncer failover causes permanent pod failures
This commit is contained in:
parent
97744897f0
commit
8d88a9c235
5 changed files with 149 additions and 43 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import { isProKey } from "../services/keys.js";
|
||||
import logger from "../services/logger.js";
|
||||
import pool from "../services/db.js";
|
||||
import { queryWithRetry, connectWithRetry } from "../services/db.js";
|
||||
|
||||
const FREE_TIER_LIMIT = 100;
|
||||
const PRO_TIER_LIMIT = 5000;
|
||||
|
|
@ -22,7 +23,7 @@ function getMonthKey(): string {
|
|||
|
||||
export async function loadUsageData(): Promise<void> {
|
||||
try {
|
||||
const result = await pool.query("SELECT key, count, month_key FROM usage");
|
||||
const result = await queryWithRetry("SELECT key, count, month_key FROM usage");
|
||||
usage = new Map();
|
||||
for (const row of result.rows) {
|
||||
usage.set(row.key, { count: row.count, monthKey: row.month_key });
|
||||
|
|
@ -40,7 +41,7 @@ async function flushDirtyEntries(): Promise<void> {
|
|||
|
||||
const keysToFlush = [...dirtyKeys];
|
||||
|
||||
const client = await pool.connect();
|
||||
const client = await connectWithRetry();
|
||||
try {
|
||||
await client.query("BEGIN");
|
||||
for (const key of keysToFlush) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue