Skip to content

Commit dbbf780

Browse files
committed
fix: add retry backoff to escrow monitor to prevent OOM crashes
The monitor retried failed escrow settlements and recurring series creation on every 15-second cycle, with no backoff and no maximum retry limit. When settlements failed (e.g. insufficient SOL for rent) or series creation hit rate limits, error objects accumulated in memory until the process hit the V8 heap limit (~507 MB) and crashed.

Changes:
- Add in-memory retry tracking with exponential backoff (1/2/4/8/16 min)
- Max 5 retries before skipping until process restart
- Permanent errors (insufficient funds/lamports) skip retries immediately
- Cap retry map at 500 entries to prevent unbounded growth
- Log warnings for permanently failed items needing manual intervention

Pre-existing test failures in login/page.test.tsx (useSearchParams mock) are unrelated to this change.
1 parent 07eefc0 commit dbbf780

File tree

1 file changed

+86
-11
lines changed

1 file changed

+86
-11
lines changed

src/lib/payments/monitor-escrow.ts

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,49 @@
44

55
import { checkBalance, processPayment, type Payment } from './monitor-balance';
66

7+
// ── Retry tracking (in-memory) ──
// Prevents infinite retry loops that leak memory and cause OOM crashes.
// Each failed ID gets exponential backoff; after MAX_RETRIES failures it is
// skipped until the process restarts (Railway restart clears the map).
//
// Entries are also read and written directly by the settlement/series loops in
// this file (they bump `count` past MAX_RETRIES to suppress repeat logging and
// store `nextRetryAt: Infinity` for permanent errors), so the entry shape and
// the `retryState` name must stay stable.

/** Number of recorded failures after which an ID is no longer retried. */
const MAX_RETRIES = 5;

/** Delay after the first failure; doubles on each further failure (1, 2, 4, 8, 16 min). */
const BACKOFF_BASE_MS = 60_000;

/** Upper bound on tracked IDs so the tracker itself cannot grow without limit. */
const MAX_TRACKED_RETRY_ENTRIES = 500;

/** Failure bookkeeping keyed by a caller-chosen retry key (e.g. `settle:<id>`). */
const retryState = new Map<string, { count: number; nextRetryAt: number; lastError: string }>();

/**
 * Decide whether work identified by `id` may be attempted right now.
 *
 * @param id - Retry key of the item.
 * @returns `true` when the ID has no recorded failures, or when it still has
 *   retry budget and its backoff window has elapsed; `false` otherwise.
 */
function shouldRetry(id: string): boolean {
  const state = retryState.get(id);
  if (!state) return true;
  if (state.count >= MAX_RETRIES) return false;
  return Date.now() >= state.nextRetryAt;
}

/**
 * Remove a single entry to make room, never touching `protectedId`.
 *
 * Entries that still have retry budget are dropped first (oldest first):
 * forgetting one of those only resets its backoff. Exhausted / permanently
 * failed entries are dropped only when nothing else is left, because
 * forgetting them would put the item straight back into the retry loop.
 *
 * @returns `true` if an entry was removed.
 */
function evictOneEntry(protectedId: string): boolean {
  let exhaustedFallback: string | undefined;
  for (const [key, entry] of retryState) {
    if (key === protectedId) continue;
    if (entry.count < MAX_RETRIES) {
      retryState.delete(key);
      return true;
    }
    if (exhaustedFallback === undefined) exhaustedFallback = key;
  }
  // Compare against undefined explicitly: an empty-string key is still a valid key.
  if (exhaustedFallback !== undefined) {
    retryState.delete(exhaustedFallback);
    return true;
  }
  return false;
}

/**
 * Record a failed attempt for `id` and schedule its next retry.
 *
 * The delay is `BACKOFF_BASE_MS * 2^(failures - 1)`. After recording, the map
 * is trimmed back to at most `MAX_TRACKED_RETRY_ENTRIES` entries.
 *
 * @param id - Retry key of the item that failed.
 * @param error - Error text to remember; stored as given (callers truncate it).
 */
function recordFailure(id: string, error: string): void {
  const state = retryState.get(id) ?? { count: 0, nextRetryAt: 0, lastError: '' };
  state.count++;
  state.lastError = error;
  state.nextRetryAt = Date.now() + BACKOFF_BASE_MS * 2 ** (state.count - 1);

  // Map.set on an existing key keeps its original position; delete first so
  // iteration order reflects the most recent failure, not the first one.
  retryState.delete(id);
  retryState.set(id, state);

  // Loop rather than evict once: entries written directly into the map by
  // other code paths can leave it more than one entry over the cap.
  while (retryState.size > MAX_TRACKED_RETRY_ENTRIES) {
    if (!evictOneEntry(id)) break;
  }
}

/**
 * Forget all failure history for `id` after a successful attempt.
 *
 * @param id - Retry key of the item that succeeded.
 */
function recordSuccess(id: string): void {
  retryState.delete(id);
}
37+
38+
// Error fragments that indicate a failure needing manual intervention.
// Retrying cannot fix these, so matching errors are never retried.
// All fragments are lowercase because matching is done on lowercased text.
const PERMANENT_ERRORS = [
  'insufficient funds for rent',
  'insufficient lamports',
  'account not found',
];

/**
 * Report whether an error message describes a permanent failure.
 *
 * The match is a case-insensitive substring search against each fragment in
 * `PERMANENT_ERRORS`.
 *
 * @param errorText - Raw error text to inspect.
 * @returns `true` if any known permanent-error fragment occurs in the text.
 */
function isPermanentError(errorText: string): boolean {
  const haystack = errorText.toLowerCase();
  for (const fragment of PERMANENT_ERRORS) {
    if (haystack.includes(fragment)) {
      return true;
    }
  }
  return false;
}
49+
750
// Escrow Monitoring
851
// ────────────────────────────────────────────────────────────
952

@@ -50,7 +93,6 @@ export async function runEscrowCycle(supabase: any, now: Date): Promise<EscrowSt
5093
for (const escrow of pendingEscrows) {
5194
stats.checked++;
5295
try {
53-
// Check if expired
5496
if (new Date(escrow.expires_at) < now) {
5597
await supabase
5698
.from('escrows')
@@ -68,13 +110,11 @@ export async function runEscrowCycle(supabase: any, now: Date): Promise<EscrowSt
68110
continue;
69111
}
70112

71-
// Check balance on-chain using existing checkBalance function
72113
const balanceResult = await checkBalance(escrow.escrow_address, escrow.chain);
73114
const balance = balanceResult.balance;
74115
const tolerance = escrow.amount * 0.01;
75116

76117
if (balance >= escrow.amount - tolerance) {
77-
// Mark as funded
78118
await supabase
79119
.from('escrows')
80120
.update({
@@ -136,7 +176,6 @@ export async function runEscrowCycle(supabase: any, now: Date): Promise<EscrowSt
136176
continue;
137177
}
138178

139-
// Mark as refunded so step 3 picks it up for settlement
140179
await supabase
141180
.from('escrows')
142181
.update({ status: 'refunded' })
@@ -199,6 +238,7 @@ export async function runEscrowCycle(supabase: any, now: Date): Promise<EscrowSt
199238

200239
/**
201240
* Process escrow settlement via internal API calls
241+
* Now with retry tracking + exponential backoff to prevent OOM from infinite loops
202242
*/
203243
async function processEscrowSettlement(escrows: Escrow[], action: 'release' | 'refund'): Promise<{ settled: number; errors: number }> {
204244
const stats = { settled: 0, errors: 0 };
@@ -212,6 +252,17 @@ async function processEscrowSettlement(escrows: Escrow[], action: 'release' | 'r
212252
}
213253

214254
for (const escrow of escrows) {
255+
const retryKey = `settle:${escrow.id}`;
256+
257+
if (!shouldRetry(retryKey)) {
258+
const state = retryState.get(retryKey);
259+
if (state && state.count === MAX_RETRIES) {
260+
console.warn(`[Monitor] Escrow ${escrow.id} settlement skipped — max retries exceeded (${state.lastError}). Needs manual intervention.`);
261+
state.count++; // stop re-logging
262+
}
263+
continue;
264+
}
265+
215266
try {
216267
const body = action === 'refund' ? JSON.stringify({ action: 'refund' }) : undefined;
217268
const settleResponse = await fetch(`${appUrl}/api/escrow/${escrow.id}/settle`, {
@@ -225,14 +276,24 @@ async function processEscrowSettlement(escrows: Escrow[], action: 'release' | 'r
225276

226277
if (settleResponse.ok) {
227278
console.log(`[Monitor] Settlement triggered for escrow ${escrow.id} (${action})`);
279+
recordSuccess(retryKey);
228280
stats.settled++;
229281
} else {
230282
const errorText = await settleResponse.text();
231283
console.error(`[Monitor] Settlement failed for escrow ${escrow.id}: ${settleResponse.status} - ${errorText}`);
284+
285+
if (isPermanentError(errorText)) {
286+
console.warn(`[Monitor] Escrow ${escrow.id}: PERMANENT error — ${errorText.slice(0, 120)}. Will not retry. Top up wallet or cancel escrow.`);
287+
retryState.set(retryKey, { count: MAX_RETRIES + 1, nextRetryAt: Infinity, lastError: errorText.slice(0, 200) });
288+
} else {
289+
recordFailure(retryKey, errorText.slice(0, 200));
290+
}
232291
stats.errors++;
233292
}
234-
} catch (settleError) {
235-
console.error(`[Monitor] Error settling escrow ${escrow.id}:`, settleError);
293+
} catch (settleError: any) {
294+
const msg = settleError?.message || String(settleError);
295+
console.error(`[Monitor] Error settling escrow ${escrow.id}:`, msg);
296+
recordFailure(retryKey, msg.slice(0, 200));
236297
stats.errors++;
237298
}
238299
}
@@ -294,12 +355,22 @@ export async function runRecurringEscrowCycle(supabase: any, now: Date): Promise
294355
console.log(`[Monitor] Processing ${dueSeries.length} due recurring escrow series`);
295356

296357
for (const series of dueSeries) {
358+
const retryKey = `series:${series.id}`;
359+
360+
if (!shouldRetry(retryKey)) {
361+
const state = retryState.get(retryKey);
362+
if (state && state.count === MAX_RETRIES) {
363+
console.warn(`[Monitor] Series ${series.id} skipped — max retries exceeded (${state.lastError})`);
364+
state.count++;
365+
}
366+
continue;
367+
}
368+
297369
stats.processed++;
298370
try {
299371
let childCreated = false;
300372

301373
if (series.payment_method === 'crypto') {
302-
// Create crypto escrow via internal API
303374
const res = await fetch(`${appUrl}/api/escrow`, {
304375
method: 'POST',
305376
headers: {
@@ -320,21 +391,21 @@ export async function runRecurringEscrowCycle(supabase: any, now: Date): Promise
320391

321392
if (res.ok) {
322393
const escrow = await res.json();
323-
// Link series_id
324394
await supabase
325395
.from('escrows')
326396
.update({ series_id: series.id })
327397
.eq('id', escrow.id);
328398
childCreated = true;
399+
recordSuccess(retryKey);
329400
console.log(`[Monitor] Created crypto escrow ${escrow.id} for series ${series.id}`);
330401
} else {
331402
const errText = await res.text();
332403
console.error(`[Monitor] Failed to create crypto escrow for series ${series.id}: ${errText}`);
404+
recordFailure(retryKey, errText.slice(0, 200));
333405
stats.errors++;
334406
continue;
335407
}
336408
} else if (series.payment_method === 'card') {
337-
// Create Stripe escrow via internal API
338409
const res = await fetch(`${appUrl}/api/stripe/payments/create`, {
339410
method: 'POST',
340411
headers: {
@@ -353,10 +424,12 @@ export async function runRecurringEscrowCycle(supabase: any, now: Date): Promise
353424

354425
if (res.ok) {
355426
childCreated = true;
427+
recordSuccess(retryKey);
356428
console.log(`[Monitor] Created card payment for series ${series.id}`);
357429
} else {
358430
const errText = await res.text();
359431
console.error(`[Monitor] Failed to create card escrow for series ${series.id}: ${errText}`);
432+
recordFailure(retryKey, errText.slice(0, 200));
360433
stats.errors++;
361434
continue;
362435
}
@@ -383,8 +456,10 @@ export async function runRecurringEscrowCycle(supabase: any, now: Date): Promise
383456
console.log(`[Monitor] Series ${series.id} completed (${newPeriodsCompleted}/${series.max_periods})`);
384457
}
385458
}
386-
} catch (seriesError) {
387-
console.error(`[Monitor] Error processing series ${series.id}:`, seriesError);
459+
} catch (seriesError: any) {
460+
const msg = seriesError?.message || String(seriesError);
461+
console.error(`[Monitor] Error processing series ${series.id}:`, msg);
462+
recordFailure(retryKey, msg.slice(0, 200));
388463
stats.errors++;
389464
}
390465
}

0 commit comments

Comments
 (0)