Skip to content

Commit 7cbe08d

Browse files
committed
fix: retry with exponential backoff on empty model responses
The agent was crashing silently when models (Gemini, Ollama) returned empty responses after tool execution. It now retries up to 5 times with exponential backoff (1s, 2s, 4s, 8s, 16s) and surfaces the actual API error (e.g. a 503 or rate-limit error) through a new 'status' event type that updates the UI spinner during retries.
1 parent 8873aa7 commit 7cbe08d

File tree

10 files changed

+513
-79
lines changed

10 files changed

+513
-79
lines changed

src/agents/planning.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ export async function createPlanningAgent(additionalSubAgents: LlmAgent[] = []):
133133
try {
134134
initializedMcpAgent = await createMcpAgent();
135135
} catch (error) {
136+
// TODO(stability): MCP fallback uses placeholder agent with no tools — user won't know delegation will fail
136137
agentLogger.warn({ error }, 'MCP agent creation failed, using placeholder');
137138
initializedMcpAgent = mcpAgent;
138139
}

src/agents/runner.ts

Lines changed: 113 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -72,109 +72,148 @@ export async function* runAgent(
7272
runner: InMemoryRunner,
7373
sessionId?: string
7474
): AsyncGenerator<AgentStreamChunk, void, unknown> {
75-
const activeRunner = runner;
7675
const sid = sessionId ?? crypto.randomUUID();
7776

78-
// Try to get existing session, or create a new one
79-
let session = await activeRunner.sessionService.getSession({
77+
let session = await runner.sessionService.getSession({
8078
appName: APP_NAME,
8179
userId: 'default_user',
8280
sessionId: sid,
8381
});
8482

8583
if (!session) {
86-
session = await activeRunner.sessionService.createSession({
84+
session = await runner.sessionService.createSession({
8785
appName: APP_NAME,
8886
userId: 'default_user',
8987
sessionId: sid,
9088
});
9189
}
9290

93-
// Create user message
9491
const userMessage = createUserContent(input);
9592

96-
// Run the agent and stream responses
93+
const MAX_RETRIES = 5;
94+
const BASE_DELAY_MS = 1000;
95+
let attempt = 0;
96+
let gotFinalContent = false;
97+
let lastErrorCode: string | undefined;
98+
let lastErrorMessage: string | undefined;
99+
100+
function retryReason(): string {
101+
return lastErrorMessage ?? lastErrorCode ?? 'empty response';
102+
}
103+
97104
try {
98-
let eventIndex = 0;
99-
for await (const event of activeRunner.runAsync({
100-
userId: 'default_user',
101-
sessionId: sid,
102-
newMessage: userMessage,
103-
})) {
104-
eventIndex++;
105-
agentLogger.debug(`[Runner] ===== EVENT #${eventIndex} =====`);
106-
agentLogger.debug(
107-
`[Runner] Event ID: ${event.id}, from ${event.author}, parts: ${event.content?.parts?.length ?? 0}, role: ${event.content?.role}, isFinal: ${isFinalResponse(event)}, transferToAgent: ${event.actions?.transferToAgent ?? 'none'}`
108-
);
109-
110-
// Highlight if this event has a transfer action
111-
if (event.actions?.transferToAgent) {
112-
agentLogger.debug(
113-
`[Runner] *** TRANSFER DETECTED: ${event.author} -> ${event.actions.transferToAgent} ***`
105+
while (attempt <= MAX_RETRIES && !gotFinalContent) {
106+
if (attempt > 0) {
107+
const delayMs = BASE_DELAY_MS * Math.pow(2, attempt - 1); // 1s, 2s, 4s, 8s, 16s
108+
const reason = retryReason();
109+
agentLogger.info(
110+
`[Runner] Retrying in ${delayMs}ms (attempt ${attempt + 1}/${MAX_RETRIES + 1}): ${reason}`
114111
);
112+
yield {
113+
type: 'status',
114+
content: `Retrying (${attempt}/${MAX_RETRIES}): ${reason}`,
115+
};
116+
await new Promise((resolve) => setTimeout(resolve, delayMs));
115117
}
116118

117-
// Log what type of parts this event has
118-
const partTypes =
119-
event.content?.parts
120-
?.map((p) => {
121-
if ('text' in p && p.text) return 'text';
122-
if ('functionCall' in p && p.functionCall) return `functionCall:${p.functionCall.name}`;
123-
if ('functionResponse' in p && p.functionResponse)
124-
return `functionResponse:${(p.functionResponse as { name?: string }).name}`;
125-
return `unknown(${Object.keys(p).join(',')})`;
126-
})
127-
.join(', ') ?? 'no parts';
128-
agentLogger.debug(`[Runner] Event parts: ${partTypes}`);
129-
130-
// If this is a function response, log details
131-
if (event.content?.parts?.some((p) => 'functionResponse' in p)) {
132-
agentLogger.debug('[Runner] Function response event detected!');
133-
}
119+
const message =
120+
attempt === 0 ? userMessage : createUserContent('Please continue with your response.');
121+
122+
let eventIndex = 0;
123+
for await (const event of runner.runAsync({
124+
userId: 'default_user',
125+
sessionId: sid,
126+
newMessage: message,
127+
})) {
128+
eventIndex++;
129+
agentLogger.debug(`[Runner] ===== EVENT #${eventIndex} (attempt ${attempt + 1}) =====`);
130+
agentLogger.debug(
131+
`[Runner] Event ID: ${event.id}, from ${event.author}, parts: ${event.content?.parts?.length ?? 0}, role: ${event.content?.role}, isFinal: ${isFinalResponse(event)}, transferToAgent: ${event.actions?.transferToAgent ?? 'none'}, errorCode: ${(event as any).errorCode ?? 'none'}, errorMessage: ${(event as any).errorMessage ?? 'none'}`
132+
);
134133

135-
// Extract text content from event
136-
if (event.content?.parts) {
137-
for (const part of event.content.parts) {
138-
if (part.text) {
139-
yield { type: 'text', content: part.text };
140-
}
141-
// Yield tool calls
142-
if ('functionCall' in part && part.functionCall?.name) {
143-
agentLogger.debug(
144-
`[Runner] Tool call: ${part.functionCall.name}, args: ${JSON.stringify(part.functionCall.args)}`
145-
);
146-
yield {
147-
type: 'tool_call',
148-
toolCall: {
149-
function: {
150-
name: part.functionCall.name,
151-
arguments: part.functionCall.args as Record<string, unknown>,
134+
if (event.actions?.transferToAgent) {
135+
agentLogger.debug(
136+
`[Runner] *** TRANSFER DETECTED: ${event.author} -> ${event.actions.transferToAgent} ***`
137+
);
138+
}
139+
140+
const partTypes =
141+
event.content?.parts
142+
?.map((p) => {
143+
if ('text' in p && p.text) return 'text';
144+
if ('functionCall' in p && p.functionCall)
145+
return `functionCall:${p.functionCall.name}`;
146+
if ('functionResponse' in p && p.functionResponse)
147+
return `functionResponse:${(p.functionResponse as { name?: string }).name}`;
148+
return `unknown(${Object.keys(p).join(',')})`;
149+
})
150+
.join(', ') ?? 'no parts';
151+
agentLogger.debug(`[Runner] Event parts: ${partTypes}`);
152+
153+
if (event.content?.parts?.some((p) => 'functionResponse' in p)) {
154+
agentLogger.debug('[Runner] Function response event detected!');
155+
}
156+
157+
if (event.content?.parts) {
158+
for (const part of event.content.parts) {
159+
if (part.text) {
160+
yield { type: 'text', content: part.text };
161+
}
162+
if ('functionCall' in part && part.functionCall?.name) {
163+
agentLogger.debug(
164+
`[Runner] Tool call: ${part.functionCall.name}, args: ${JSON.stringify(part.functionCall.args)}`
165+
);
166+
yield {
167+
type: 'tool_call',
168+
toolCall: {
169+
function: {
170+
name: part.functionCall.name,
171+
arguments: part.functionCall.args as Record<string, unknown>,
172+
},
152173
},
153-
},
154-
};
174+
};
175+
}
155176
}
156177
}
157-
}
158178

159-
// Check for final response
160-
// Note: ADK may yield empty "auth" events that appear final but aren't meaningful
161-
// Skip these and continue to the next event
162-
if (isFinalResponse(event)) {
163-
const hasContent =
164-
(event.content?.parts?.length ?? 0) > 0 || event.actions?.transferToAgent;
165-
if (hasContent) {
166-
agentLogger.debug(`[Runner] Final response received from ${event.author}`);
167-
yield { type: 'done' };
168-
return;
179+
// ADK may yield empty "auth" events that appear final but aren't meaningful.
180+
// Skip these and retry.
181+
if (isFinalResponse(event)) {
182+
const hasContent =
183+
(event.content?.parts?.length ?? 0) > 0 || event.actions?.transferToAgent;
184+
185+
if (hasContent) {
186+
agentLogger.debug(`[Runner] Final response received from ${event.author}`);
187+
gotFinalContent = true;
188+
yield { type: 'done' };
189+
return;
190+
}
191+
192+
lastErrorCode = (event as any).errorCode?.toString();
193+
lastErrorMessage = (event as any).errorMessage;
194+
agentLogger.warn(
195+
`[Runner] Empty final event from ${event.author} ` +
196+
`(attempt ${attempt + 1}/${MAX_RETRIES + 1}). ` +
197+
`errorCode: ${lastErrorCode ?? 'none'}, errorMessage: ${lastErrorMessage ?? 'none'}`
198+
);
199+
break;
169200
}
170-
agentLogger.warn(
171-
`[Runner] Empty final event from ${event.author} — model may have failed silently (auth error? invalid model name?). Event: ${JSON.stringify({ id: event.id, role: event.content?.role, parts: event.content?.parts?.length ?? 0, actions: event.actions })}`
172-
);
201+
}
202+
203+
if (!gotFinalContent) {
204+
attempt++;
173205
}
174206
}
175207

176-
agentLogger.debug('[Runner] Loop completed without final response');
177-
yield { type: 'done' };
208+
if (!gotFinalContent) {
209+
const reason = retryReason();
210+
agentLogger.error(`[Runner] All ${MAX_RETRIES + 1} attempts exhausted — ${reason}`);
211+
yield {
212+
type: 'text',
213+
content: `The model failed after ${MAX_RETRIES + 1} attempts: ${reason}`,
214+
};
215+
yield { type: 'done' };
216+
}
178217
} catch (error) {
179218
agentLogger.error({ error }, '[Runner] Error during agent execution');
180219
yield {

src/agents/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export type AdkAfterModelCallback = (params: {
3333
* Backwards-compatible stream chunk type for server API responses
3434
*/
3535
export interface AgentStreamChunk {
36-
type: 'text' | 'tool_call' | 'tool_result' | 'transfer' | 'done';
36+
type: 'text' | 'tool_call' | 'tool_result' | 'transfer' | 'status' | 'done';
3737
content?: string;
3838
toolCall?: {
3939
function: {

src/llm/ollama-adk.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ export class OllamaLlm extends BaseLlm {
101101

102102
if (stream) {
103103
// Streaming mode
104+
// TODO(stability): Ollama client call has no timeout — will hang if Ollama is unresponsive
104105
const response = await this.client.chat({
105106
model: this.actualModel,
106107
messages,
@@ -113,6 +114,7 @@ export class OllamaLlm extends BaseLlm {
113114
}
114115
} else {
115116
// Non-streaming mode
117+
// TODO(stability): Ollama client call has no timeout — will hang if Ollama is unresponsive
116118
const response = await this.client.chat({
117119
model: this.actualModel,
118120
messages,

src/sandbox/pyodide-engine.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ export class PythonSandbox {
4848
agentLogger.info('Sandbox: Pyodide initialized');
4949
} catch (error) {
5050
agentLogger.warn({ error }, 'Sandbox: Pyodide not available');
51-
this.initialized = true; // Mark as initialized to prevent retry
51+
// TODO(stability): Setting initialized = true on failure prevents any future retry
52+
this.initialized = true;
5253
}
5354
}
5455

src/tools/brave-search.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export async function braveSearch(query: string): Promise<string> {
1919
url.searchParams.set('count', '10');
2020

2121
try {
22+
// TODO(stability): fetch() has no timeout/AbortSignal — long requests will hang indefinitely
2223
const response = await fetch(url.toString(), {
2324
headers: {
2425
Accept: 'application/json',

src/tools/web-reader.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ const MAX_CONTENT_LENGTH = 10000;
1111

1212
export async function readWebpage(url: string): Promise<string> {
1313
try {
14+
// TODO(stability): fetch() has no timeout/AbortSignal — long requests will hang indefinitely
1415
const response = await fetch(url, {
1516
headers: {
1617
'User-Agent': 'Mozilla/5.0 (compatible; Solenoid/2.0; +https://github.com/solenoid)',

src/ui/app.tsx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { Box, useApp, useInput } from 'ink';
21
/**
32
* Main App Component
43
*
@@ -10,6 +9,7 @@ import { Box, useApp, useInput } from 'ink';
109
* - ink: React-based terminal UI framework
1110
* - React Suspense: Handles loading state during agent initialization
1211
*/
12+
import { Box, useApp, useInput } from 'ink';
1313
import { Suspense, useEffect, useState } from 'react';
1414
import { loadSettings } from '../config/index.js';
1515
import { uiLogger } from '../utils/logger.js';
@@ -237,6 +237,12 @@ function AppContent() {
237237
}
238238
break;
239239

240+
case 'status':
241+
if (event.content) {
242+
setStatus(event.content);
243+
}
244+
break;
245+
240246
case 'error':
241247
setMessages((prev) =>
242248
prev.map((msg) =>

src/ui/hooks/useAgent.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
import type { InMemoryRunner } from '@google/adk';
21
/**
32
* useAgent Hook
43
*
54
* Provides direct ADK integration for the Ink UI using React 18 Suspense.
65
* Uses a resource pattern to suspend until MCP tools are loaded.
76
*/
7+
import type { InMemoryRunner } from '@google/adk';
88
import { useCallback, useRef } from 'react';
99
import { createAdkAgentHierarchy, runAgent } from '../../agents/index.js';
1010

1111
export interface AgentEvent {
12-
type: 'text' | 'tool_start' | 'tool_args' | 'tool_end' | 'transfer' | 'done' | 'error';
12+
type: 'text' | 'tool_start' | 'tool_args' | 'tool_end' | 'transfer' | 'status' | 'done' | 'error';
1313
content?: string;
1414
toolCallId?: string;
1515
toolName?: string;
@@ -95,6 +95,11 @@ export function useAgent() {
9595
yield { type: 'transfer', transferTo: chunk.transferTo };
9696
}
9797
break;
98+
case 'status':
99+
if (chunk.content) {
100+
yield { type: 'status', content: chunk.content };
101+
}
102+
break;
98103
case 'done':
99104
yield { type: 'done' };
100105
break;

0 commit comments

Comments
 (0)