Skip to content

Commit 4a1e9ae

Browse files
committed
feat: add retry mechanism with exponential backoff
- Track retryCount and lastErrorTime in state for crash recovery
- Implement exponential backoff with jitter (max 5 min wait)
- Skip failed tasks after maxRetries exceeded
- Add alert() method to logger for critical errors
- Reset retry count on successful phase completion
- Add comprehensive tests for backoff calculation and retry tracking

Signed-off-by: leocavalcante <[email protected]>
1 parent 544d565 commit 4a1e9ae

File tree

6 files changed

+261
-6
lines changed

6 files changed

+261
-6
lines changed

src/logger.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,16 @@ export class Logger {
116116
this.writeToAlerts(formatted)
117117
}
118118

119+
/**
 * Log a critical alert - always written to alerts file and shown prominently
 */
alert(message: string): void {
  const line = `[ALERT] ${message}`
  // Surface prominently on stderr in bold red so it stands out in the console.
  console.error(`${ANSI.red}${ANSI.bold}${line}${ANSI.reset}`)
  // Persist to the regular log buffer as well as the dedicated alerts file.
  this.writeToBuffer(this.formatForFile(line))
  this.writeToAlerts(line)
}
128+
119129
/**
120130
* Log only if verbose mode enabled
121131
*/

src/loop.ts

Lines changed: 106 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { Builder } from "./builder.ts"
1212
import { extractEvaluationReason, isComplete, parseEvaluation } from "./evaluator.ts"
1313
import {
1414
ensureDirectories,
15+
getISOTimestamp,
1516
getTimestampForFilename,
1617
initializePaths,
1718
readFileOrNull,
@@ -87,14 +88,48 @@ export async function runLoop(config: Config): Promise<void> {
8788
await runEvaluationPhase(state, builder, paths, logger, config)
8889
break
8990
}
91+
92+
// Success - reset retry count
93+
state.retryCount = 0
94+
state.lastErrorTime = undefined
9095
} catch (err) {
91-
logger.logError(`Error in ${state.phase} phase: ${err}`)
96+
// Track the error
97+
state.retryCount++
98+
state.lastErrorTime = getISOTimestamp()
99+
100+
logger.logError(
101+
`Error in ${state.phase} phase (attempt ${state.retryCount}/${config.maxRetries}): ${err}`,
102+
)
92103

93-
// Retry with backoff
94-
if (!shutdownRequested) {
95-
const backoffMs = config.backoffBase * 1000
96-
logger.say(`Retrying in ${config.backoffBase} seconds...`)
97-
await sleep(backoffMs)
104+
// Check if we've exceeded max retries
105+
if (state.retryCount >= config.maxRetries) {
106+
logger.logError(`Max retries (${config.maxRetries}) exceeded for ${state.phase} phase`)
107+
logger.alert(`CRITICAL: ${state.phase} phase failed after ${config.maxRetries} attempts`)
108+
109+
// Reset retry count and move to next phase or skip task
110+
state.retryCount = 0
111+
state.lastErrorTime = undefined
112+
113+
if (state.phase === "build") {
114+
// Skip the failed task and continue
115+
logger.warn("Skipping failed task and continuing...")
116+
await skipCurrentTask(state, paths, logger)
117+
} else if (state.phase === "plan") {
118+
// Clear any stuck idea and retry planning
119+
state.currentIdeaPath = undefined
120+
state.currentIdeaFilename = undefined
121+
logger.warn("Clearing idea state and retrying plan phase...")
122+
}
123+
// For evaluation, just retry - it will eventually succeed or the user will intervene
124+
} else {
125+
// Retry with exponential backoff
126+
const backoffMs = calculateBackoff(state.retryCount, config.backoffBase)
127+
const backoffSec = Math.round(backoffMs / 1000)
128+
logger.say(`Retrying in ${backoffSec} seconds...`)
129+
130+
if (!shutdownRequested) {
131+
await sleep(backoffMs)
132+
}
98133
}
99134
}
100135

@@ -460,6 +495,71 @@ export function sleep(ms: number): Promise<void> {
460495
return new Promise((resolve) => setTimeout(resolve, ms))
461496
}
462497

498+
/**
499+
* Calculate exponential backoff with jitter.
500+
* @param retryCount - Current retry attempt (1-based)
501+
* @param baseSeconds - Base delay in seconds
502+
* @returns Delay in milliseconds
503+
*/
504+
export function calculateBackoff(retryCount: number, baseSeconds: number): number {
505+
// Exponential backoff: base * 2^(retry-1) with max of 5 minutes
506+
const exponentialDelay = baseSeconds * 2 ** (retryCount - 1)
507+
const cappedDelay = Math.min(exponentialDelay, 300) // Max 5 minutes
508+
509+
// Add jitter (up to 20% randomness) to prevent thundering herd
510+
const jitter = cappedDelay * 0.2 * Math.random()
511+
512+
return Math.round((cappedDelay + jitter) * 1000)
513+
}
514+
515+
/**
516+
* Skip the current task in build phase when it has failed too many times.
517+
* Marks the task as completed (with a note) and moves to the next task.
518+
*/
519+
async function skipCurrentTask(state: RuntimeState, paths: Paths, logger: Logger): Promise<void> {
520+
const planContent = await readFileOrNull(paths.currentPlan)
521+
if (!planContent) return
522+
523+
const uncompletedTasks = getUncompletedTasks(planContent)
524+
525+
if (uncompletedTasks.length === 0) {
526+
state.phase = "evaluation"
527+
return
528+
}
529+
530+
const currentTask = uncompletedTasks[0]
531+
if (!currentTask) {
532+
state.phase = "evaluation"
533+
return
534+
}
535+
536+
// Mark task as completed (even though it failed - to allow progress)
537+
const updatedPlan = markTaskComplete(planContent, currentTask.lineNumber)
538+
539+
// Add a note about the skipped task
540+
const noteComment = `<!-- SKIPPED: Task failed after max retries -->`
541+
const planWithNote = updatedPlan.replace(
542+
new RegExp(`(- \\[x\\] ${escapeRegExp(currentTask.description)})`),
543+
`$1 ${noteComment}`,
544+
)
545+
546+
await writeFile(paths.currentPlan, planWithNote)
547+
logger.warn(`Skipped failed task: ${currentTask.description}`)
548+
549+
// Check if this was the last task
550+
const remainingTasks = getUncompletedTasks(planWithNote)
551+
if (remainingTasks.length === 0) {
552+
state.phase = "evaluation"
553+
}
554+
}
555+
556+
/**
557+
* Escape special regex characters in a string.
558+
*/
559+
function escapeRegExp(str: string): string {
560+
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
561+
}
562+
463563
/**
464564
* Check if shutdown has been requested.
465565
* Exported for testing and external shutdown checks.

src/state.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ const DEFAULT_STATE: State = {
1313
lastUpdate: "",
1414
currentIdeaPath: undefined,
1515
currentIdeaFilename: undefined,
16+
retryCount: 0,
17+
lastErrorTime: undefined,
1618
}
1719

1820
/**
@@ -57,6 +59,14 @@ export async function loadState(stateFile: string): Promise<RuntimeState> {
5759
)
5860
}
5961

62+
// Validate retryCount is a non-negative number
63+
if (
64+
parsed.retryCount !== undefined &&
65+
(typeof parsed.retryCount !== "number" || parsed.retryCount < 0)
66+
) {
67+
parsed.retryCount = DEFAULT_STATE.retryCount
68+
}
69+
6070
// Merge with defaults to handle missing fields
6171
const state: State = {
6272
cycle: parsed.cycle ?? DEFAULT_STATE.cycle,
@@ -66,6 +76,8 @@ export async function loadState(stateFile: string): Promise<RuntimeState> {
6676
lastUpdate: parsed.lastUpdate ?? DEFAULT_STATE.lastUpdate,
6777
currentIdeaPath: parsed.currentIdeaPath,
6878
currentIdeaFilename: parsed.currentIdeaFilename,
79+
retryCount: parsed.retryCount ?? DEFAULT_STATE.retryCount,
80+
lastErrorTime: parsed.lastErrorTime,
6981
}
7082

7183
return toRuntimeState(state)
@@ -100,6 +112,8 @@ export async function saveState(stateFile: string, state: RuntimeState): Promise
100112
lastUpdate: getISOTimestamp(),
101113
currentIdeaPath: state.currentIdeaPath,
102114
currentIdeaFilename: state.currentIdeaFilename,
115+
retryCount: state.retryCount,
116+
lastErrorTime: state.lastErrorTime,
103117
}
104118

105119
const content = JSON.stringify(persistedState, null, 2)
@@ -156,5 +170,7 @@ export function newCycleState(currentCycle: number): Partial<RuntimeState> {
156170
currentTaskDesc: "",
157171
currentIdeaPath: undefined,
158172
currentIdeaFilename: undefined,
173+
retryCount: 0,
174+
lastErrorTime: undefined,
159175
}
160176
}

src/types.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ export interface State {
4949
currentIdeaPath?: string
5050
/** Filename of the idea currently being processed (for display) */
5151
currentIdeaFilename?: string
52+
/** Number of consecutive failures in current phase */
53+
retryCount: number
54+
/** ISO timestamp of last error (for backoff calculation) */
55+
lastErrorTime?: string
5256
}
5357

5458
/** Runtime state with additional non-persisted fields */

tests/loop.test.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { join } from "node:path"
88
import { Logger } from "../src/logger.ts"
99
import {
1010
archivePlan,
11+
calculateBackoff,
1112
isShutdownRequested,
1213
logStartupInfo,
1314
requestShutdown,
@@ -44,6 +45,9 @@ function createTestConfig(overrides?: Partial<Config>): Config {
4445
backoffBase: 10,
4546
logRetention: 30,
4647
taskPauseSeconds: 2,
48+
autoCommit: true,
49+
autoPush: true,
50+
commitSignoff: false,
4751
...overrides,
4852
}
4953
}
@@ -117,6 +121,55 @@ describe("loop", () => {
117121
})
118122
})
119123

124+
describe("calculateBackoff", () => {
125+
test("first retry uses base delay", () => {
126+
const delay = calculateBackoff(1, 10)
127+
// Base is 10 seconds = 10000ms, with up to 20% jitter
128+
expect(delay).toBeGreaterThanOrEqual(10000)
129+
expect(delay).toBeLessThanOrEqual(12000)
130+
})
131+
132+
test("second retry doubles delay", () => {
133+
const delay = calculateBackoff(2, 10)
134+
// 10 * 2^1 = 20 seconds, with jitter
135+
expect(delay).toBeGreaterThanOrEqual(20000)
136+
expect(delay).toBeLessThanOrEqual(24000)
137+
})
138+
139+
test("third retry quadruples delay", () => {
140+
const delay = calculateBackoff(3, 10)
141+
// 10 * 2^2 = 40 seconds, with jitter
142+
expect(delay).toBeGreaterThanOrEqual(40000)
143+
expect(delay).toBeLessThanOrEqual(48000)
144+
})
145+
146+
test("caps at 5 minutes maximum", () => {
147+
const delay = calculateBackoff(10, 10)
148+
// Should cap at 300 seconds (5 min), with jitter
149+
expect(delay).toBeGreaterThanOrEqual(300000)
150+
expect(delay).toBeLessThanOrEqual(360000)
151+
})
152+
153+
test("works with different base values", () => {
154+
const delay = calculateBackoff(1, 5)
155+
// Base is 5 seconds = 5000ms, with jitter
156+
expect(delay).toBeGreaterThanOrEqual(5000)
157+
expect(delay).toBeLessThanOrEqual(6000)
158+
})
159+
160+
test("returns milliseconds", () => {
161+
const delay = calculateBackoff(1, 1)
162+
// 1 second base = ~1000-1200ms with jitter
163+
expect(delay).toBeGreaterThanOrEqual(1000)
164+
expect(delay).toBeLessThanOrEqual(1200)
165+
})
166+
167+
test("is non-negative", () => {
168+
const delay = calculateBackoff(1, 0)
169+
expect(delay).toBeGreaterThanOrEqual(0)
170+
})
171+
})
172+
120173
describe("isShutdownRequested", () => {
121174
test("returns false initially", () => {
122175
resetShutdownFlags()

tests/state.test.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,5 +149,77 @@ describe("state", () => {
149149
expect(newState.taskIndex).toBe(0)
150150
expect(newState.totalTasks).toBe(0)
151151
})
152+
153+
test("resets retry count", () => {
154+
const newState = newCycleState(5)
155+
156+
expect(newState.retryCount).toBe(0)
157+
expect(newState.lastErrorTime).toBeUndefined()
158+
})
159+
})
160+
161+
describe("retryCount tracking", () => {
162+
test("loadState returns default retryCount of 0", async () => {
163+
const stateFile = join(TEST_DIR, "nonexistent.json")
164+
const state = await loadState(stateFile)
165+
166+
expect(state.retryCount).toBe(0)
167+
})
168+
169+
test("loadState preserves retryCount from file", async () => {
170+
const stateFile = join(TEST_DIR, "retry-state.json")
171+
const savedState = {
172+
cycle: 1,
173+
phase: "build",
174+
taskIndex: 0,
175+
retryCount: 2,
176+
lastErrorTime: "2024-01-01T00:00:00Z",
177+
}
178+
await Bun.write(stateFile, JSON.stringify(savedState))
179+
180+
const state = await loadState(stateFile)
181+
182+
expect(state.retryCount).toBe(2)
183+
expect(state.lastErrorTime).toBe("2024-01-01T00:00:00Z")
184+
})
185+
186+
test("saveState persists retryCount", async () => {
187+
const stateFile = join(TEST_DIR, "save-retry.json")
188+
const state = {
189+
cycle: 1,
190+
phase: "build" as const,
191+
taskIndex: 0,
192+
totalTasks: 3,
193+
currentTaskNum: 1,
194+
currentTaskDesc: "Test",
195+
lastUpdate: "",
196+
retryCount: 3,
197+
lastErrorTime: "2024-01-01T12:00:00Z",
198+
}
199+
200+
await saveState(stateFile, state)
201+
202+
const content = await Bun.file(stateFile).text()
203+
const saved = JSON.parse(content)
204+
205+
expect(saved.retryCount).toBe(3)
206+
expect(saved.lastErrorTime).toBe("2024-01-01T12:00:00Z")
207+
})
208+
209+
test("invalid retryCount defaults to 0", async () => {
210+
const stateFile = join(TEST_DIR, "invalid-retry.json")
211+
await Bun.write(stateFile, JSON.stringify({ cycle: 1, retryCount: -1 }))
212+
213+
const state = await loadState(stateFile)
214+
215+
expect(state.retryCount).toBe(0)
216+
})
217+
218+
test("resetState clears retryCount", () => {
219+
const state = resetState()
220+
221+
expect(state.retryCount).toBe(0)
222+
expect(state.lastErrorTime).toBeUndefined()
223+
})
152224
})
153225
})

0 commit comments

Comments (0)