fix: retry with exponential backoff on empty model responses

elicollinson · elicollinson · commit 7cbe08dfbfbe · 2026-02-11T20:36:50.000-05:00
The agent was failing silently when models (Gemini, Ollama) returned
empty responses after tool execution. It now retries up to 5 times with
exponential backoff (1s, 2s, 4s, 8s, 16s) and surfaces the actual API
error (e.g. a 503 Service Unavailable or a rate-limit error) through a
new 'status' event type that updates the UI spinner during retries. If
every attempt fails, the last error is reported to the user as text.
diff --git a/src/agents/planning.ts b/src/agents/planning.ts
@@ -133,6 +133,7 @@ export async function createPlanningAgent(additionalSubAgents: LlmAgent[] = []):
   try {
     initializedMcpAgent = await createMcpAgent();
   } catch (error) {
+    // TODO(stability): MCP fallback uses placeholder agent with no tools — user won't know delegation will fail
     agentLogger.warn({ error }, 'MCP agent creation failed, using placeholder');
     initializedMcpAgent = mcpAgent;
   }
diff --git a/src/agents/runner.ts b/src/agents/runner.ts
@@ -72,109 +72,148 @@ export async function* runAgent(
   runner: InMemoryRunner,
   sessionId?: string
 ): AsyncGenerator<AgentStreamChunk, void, unknown> {
-  const activeRunner = runner;
   const sid = sessionId ?? crypto.randomUUID();
 
-  // Try to get existing session, or create a new one
-  let session = await activeRunner.sessionService.getSession({
+  let session = await runner.sessionService.getSession({
     appName: APP_NAME,
     userId: 'default_user',
     sessionId: sid,
   });
 
   if (!session) {
-    session = await activeRunner.sessionService.createSession({
+    session = await runner.sessionService.createSession({
       appName: APP_NAME,
       userId: 'default_user',
       sessionId: sid,
     });
   }
 
-  // Create user message
   const userMessage = createUserContent(input);
 
-  // Run the agent and stream responses
+  const MAX_RETRIES = 5;
+  const BASE_DELAY_MS = 1000;
+  let attempt = 0;
+  let gotFinalContent = false;
+  let lastErrorCode: string | undefined;
+  let lastErrorMessage: string | undefined;
+
+  function retryReason(): string {
+    return lastErrorMessage ?? lastErrorCode ?? 'empty response';
+  }
+
   try {
-    let eventIndex = 0;
-    for await (const event of activeRunner.runAsync({
-      userId: 'default_user',
-      sessionId: sid,
-      newMessage: userMessage,
-    })) {
-      eventIndex++;
-      agentLogger.debug(`[Runner] ===== EVENT #${eventIndex} =====`);
-      agentLogger.debug(
-        `[Runner] Event ID: ${event.id}, from ${event.author}, parts: ${event.content?.parts?.length ?? 0}, role: ${event.content?.role}, isFinal: ${isFinalResponse(event)}, transferToAgent: ${event.actions?.transferToAgent ?? 'none'}`
-      );
-
-      // Highlight if this event has a transfer action
-      if (event.actions?.transferToAgent) {
-        agentLogger.debug(
-          `[Runner] *** TRANSFER DETECTED: ${event.author} -> ${event.actions.transferToAgent} ***`
+    while (attempt <= MAX_RETRIES && !gotFinalContent) {
+      if (attempt > 0) {
+        const delayMs = BASE_DELAY_MS * Math.pow(2, attempt - 1); // 1s, 2s, 4s, 8s, 16s
+        const reason = retryReason();
+        agentLogger.info(
+          `[Runner] Retrying in ${delayMs}ms (attempt ${attempt + 1}/${MAX_RETRIES + 1}): ${reason}`
         );
+        yield {
+          type: 'status',
+          content: `Retrying (${attempt}/${MAX_RETRIES}): ${reason}`,
+        };
+        await new Promise((resolve) => setTimeout(resolve, delayMs));
       }
 
-      // Log what type of parts this event has
-      const partTypes =
-        event.content?.parts
-          ?.map((p) => {
-            if ('text' in p && p.text) return 'text';
-            if ('functionCall' in p && p.functionCall) return `functionCall:${p.functionCall.name}`;
-            if ('functionResponse' in p && p.functionResponse)
-              return `functionResponse:${(p.functionResponse as { name?: string }).name}`;
-            return `unknown(${Object.keys(p).join(',')})`;
-          })
-          .join(', ') ?? 'no parts';
-      agentLogger.debug(`[Runner] Event parts: ${partTypes}`);
-
-      // If this is a function response, log details
-      if (event.content?.parts?.some((p) => 'functionResponse' in p)) {
-        agentLogger.debug('[Runner] Function response event detected!');
-      }
+      const message =
+        attempt === 0 ? userMessage : createUserContent('Please continue with your response.');
+
+      let eventIndex = 0;
+      for await (const event of runner.runAsync({
+        userId: 'default_user',
+        sessionId: sid,
+        newMessage: message,
+      })) {
+        eventIndex++;
+        agentLogger.debug(`[Runner] ===== EVENT #${eventIndex} (attempt ${attempt + 1}) =====`);
+        agentLogger.debug(
+          `[Runner] Event ID: ${event.id}, from ${event.author}, parts: ${event.content?.parts?.length ?? 0}, role: ${event.content?.role}, isFinal: ${isFinalResponse(event)}, transferToAgent: ${event.actions?.transferToAgent ?? 'none'}, errorCode: ${(event as any).errorCode ?? 'none'}, errorMessage: ${(event as any).errorMessage ?? 'none'}`
+        );
 
-      // Extract text content from event
-      if (event.content?.parts) {
-        for (const part of event.content.parts) {
-          if (part.text) {
-            yield { type: 'text', content: part.text };
-          }
-          // Yield tool calls
-          if ('functionCall' in part && part.functionCall?.name) {
-            agentLogger.debug(
-              `[Runner] Tool call: ${part.functionCall.name}, args: ${JSON.stringify(part.functionCall.args)}`
-            );
-            yield {
-              type: 'tool_call',
-              toolCall: {
-                function: {
-                  name: part.functionCall.name,
-                  arguments: part.functionCall.args as Record<string, unknown>,
+        if (event.actions?.transferToAgent) {
+          agentLogger.debug(
+            `[Runner] *** TRANSFER DETECTED: ${event.author} -> ${event.actions.transferToAgent} ***`
+          );
+        }
+
+        const partTypes =
+          event.content?.parts
+            ?.map((p) => {
+              if ('text' in p && p.text) return 'text';
+              if ('functionCall' in p && p.functionCall)
+                return `functionCall:${p.functionCall.name}`;
+              if ('functionResponse' in p && p.functionResponse)
+                return `functionResponse:${(p.functionResponse as { name?: string }).name}`;
+              return `unknown(${Object.keys(p).join(',')})`;
+            })
+            .join(', ') ?? 'no parts';
+        agentLogger.debug(`[Runner] Event parts: ${partTypes}`);
+
+        if (event.content?.parts?.some((p) => 'functionResponse' in p)) {
+          agentLogger.debug('[Runner] Function response event detected!');
+        }
+
+        if (event.content?.parts) {
+          for (const part of event.content.parts) {
+            if (part.text) {
+              yield { type: 'text', content: part.text };
+            }
+            if ('functionCall' in part && part.functionCall?.name) {
+              agentLogger.debug(
+                `[Runner] Tool call: ${part.functionCall.name}, args: ${JSON.stringify(part.functionCall.args)}`
+              );
+              yield {
+                type: 'tool_call',
+                toolCall: {
+                  function: {
+                    name: part.functionCall.name,
+                    arguments: part.functionCall.args as Record<string, unknown>,
+                  },
                 },
-              },
-            };
+              };
+            }
           }
         }
-      }
 
-      // Check for final response
-      // Note: ADK may yield empty "auth" events that appear final but aren't meaningful
-      // Skip these and continue to the next event
-      if (isFinalResponse(event)) {
-        const hasContent =
-          (event.content?.parts?.length ?? 0) > 0 || event.actions?.transferToAgent;
-        if (hasContent) {
-          agentLogger.debug(`[Runner] Final response received from ${event.author}`);
-          yield { type: 'done' };
-          return;
+        // ADK may yield empty "auth" events that appear final but aren't meaningful.
+        // Treat one as a failed attempt: stop reading this run and retry with backoff.
+        if (isFinalResponse(event)) {
+          const hasContent =
+            (event.content?.parts?.length ?? 0) > 0 || event.actions?.transferToAgent;
+
+          if (hasContent) {
+            agentLogger.debug(`[Runner] Final response received from ${event.author}`);
+            gotFinalContent = true;
+            yield { type: 'done' };
+            return;
+          }
+
+          lastErrorCode = (event as any).errorCode?.toString();
+          lastErrorMessage = (event as any).errorMessage;
+          agentLogger.warn(
+            `[Runner] Empty final event from ${event.author} ` +
+              `(attempt ${attempt + 1}/${MAX_RETRIES + 1}). ` +
+              `errorCode: ${lastErrorCode ?? 'none'}, errorMessage: ${lastErrorMessage ?? 'none'}`
+          );
+          break;
         }
-        agentLogger.warn(
-          `[Runner] Empty final event from ${event.author} — model may have failed silently (auth error? invalid model name?). Event: ${JSON.stringify({ id: event.id, role: event.content?.role, parts: event.content?.parts?.length ?? 0, actions: event.actions })}`
-        );
+      }
+
+      if (!gotFinalContent) {
+        attempt++;
       }
     }
 
-    agentLogger.debug('[Runner] Loop completed without final response');
-    yield { type: 'done' };
+    if (!gotFinalContent) {
+      const reason = retryReason();
+      agentLogger.error(`[Runner] All ${MAX_RETRIES + 1} attempts exhausted — ${reason}`);
+      yield {
+        type: 'text',
+        content: `The model failed after ${MAX_RETRIES + 1} attempts: ${reason}`,
+      };
+      yield { type: 'done' };
+    }
   } catch (error) {
     agentLogger.error({ error }, '[Runner] Error during agent execution');
     yield {
diff --git a/src/agents/types.ts b/src/agents/types.ts
@@ -33,7 +33,7 @@ export type AdkAfterModelCallback = (params: {
  * Backwards-compatible stream chunk type for server API responses
  */
 export interface AgentStreamChunk {
-  type: 'text' | 'tool_call' | 'tool_result' | 'transfer' | 'done';
+  type: 'text' | 'tool_call' | 'tool_result' | 'transfer' | 'status' | 'done';
   content?: string;
   toolCall?: {
     function: {
diff --git a/src/llm/ollama-adk.ts b/src/llm/ollama-adk.ts
@@ -101,6 +101,7 @@ export class OllamaLlm extends BaseLlm {
 
       if (stream) {
         // Streaming mode
+        // TODO(stability): Ollama client call has no timeout — will hang if Ollama is unresponsive
         const response = await this.client.chat({
           model: this.actualModel,
           messages,
@@ -113,6 +114,7 @@ export class OllamaLlm extends BaseLlm {
         }
       } else {
         // Non-streaming mode
+        // TODO(stability): Ollama client call has no timeout — will hang if Ollama is unresponsive
         const response = await this.client.chat({
           model: this.actualModel,
           messages,
diff --git a/src/sandbox/pyodide-engine.ts b/src/sandbox/pyodide-engine.ts
@@ -48,7 +48,8 @@ export class PythonSandbox {
       agentLogger.info('Sandbox: Pyodide initialized');
     } catch (error) {
       agentLogger.warn({ error }, 'Sandbox: Pyodide not available');
-      this.initialized = true; // Mark as initialized to prevent retry
+      // TODO(stability): Setting initialized = true on failure prevents any future retry
+      this.initialized = true;
     }
   }
 
diff --git a/src/tools/brave-search.ts b/src/tools/brave-search.ts
@@ -19,6 +19,7 @@ export async function braveSearch(query: string): Promise<string> {
   url.searchParams.set('count', '10');
 
   try {
+    // TODO(stability): fetch() has no timeout/AbortSignal — long requests will hang indefinitely
     const response = await fetch(url.toString(), {
       headers: {
         Accept: 'application/json',
diff --git a/src/tools/web-reader.ts b/src/tools/web-reader.ts
@@ -11,6 +11,7 @@ const MAX_CONTENT_LENGTH = 10000;
 
 export async function readWebpage(url: string): Promise<string> {
   try {
+    // TODO(stability): fetch() has no timeout/AbortSignal — long requests will hang indefinitely
     const response = await fetch(url, {
       headers: {
         'User-Agent': 'Mozilla/5.0 (compatible; Solenoid/2.0; +https://github.com/solenoid)',
diff --git a/src/ui/app.tsx b/src/ui/app.tsx
@@ -1,4 +1,3 @@
-import { Box, useApp, useInput } from 'ink';
 /**
  * Main App Component
  *
@@ -10,6 +9,7 @@ import { Box, useApp, useInput } from 'ink';
  * - ink: React-based terminal UI framework
  * - React Suspense: Handles loading state during agent initialization
  */
+import { Box, useApp, useInput } from 'ink';
 import { Suspense, useEffect, useState } from 'react';
 import { loadSettings } from '../config/index.js';
 import { uiLogger } from '../utils/logger.js';
@@ -237,6 +237,12 @@ function AppContent() {
             }
             break;
 
+          case 'status':
+            if (event.content) {
+              setStatus(event.content);
+            }
+            break;
+
           case 'error':
             setMessages((prev) =>
               prev.map((msg) =>
diff --git a/src/ui/hooks/useAgent.ts b/src/ui/hooks/useAgent.ts
@@ -1,15 +1,15 @@
-import type { InMemoryRunner } from '@google/adk';
 /**
  * useAgent Hook
  *
  * Provides direct ADK integration for the Ink UI using React 18 Suspense.
  * Uses a resource pattern to suspend until MCP tools are loaded.
  */
+import type { InMemoryRunner } from '@google/adk';
 import { useCallback, useRef } from 'react';
 import { createAdkAgentHierarchy, runAgent } from '../../agents/index.js';
 
 export interface AgentEvent {
-  type: 'text' | 'tool_start' | 'tool_args' | 'tool_end' | 'transfer' | 'done' | 'error';
+  type: 'text' | 'tool_start' | 'tool_args' | 'tool_end' | 'transfer' | 'status' | 'done' | 'error';
   content?: string;
   toolCallId?: string;
   toolName?: string;
@@ -95,6 +95,11 @@ export function useAgent() {
                 yield { type: 'transfer', transferTo: chunk.transferTo };
               }
               break;
+            case 'status':
+              if (chunk.content) {
+                yield { type: 'status', content: chunk.content };
+              }
+              break;
             case 'done':
               yield { type: 'done' };
               break;
diff --git a/tests/unit/runner-retry.test.ts b/tests/unit/runner-retry.test.ts

Original file line number	Diff line number	Diff line change
`@@ -133,6 +133,7 @@ export async function createPlanningAgent(additionalSubAgents: LlmAgent[] = []):`
`133`	`133`	`try {`
`134`	`134`	`initializedMcpAgent = await createMcpAgent();`
`135`	`135`	`} catch (error) {`
	`136`	`+ // TODO(stability): MCP fallback uses placeholder agent with no tools — user won't know delegation will fail`
`136`	`137`	`agentLogger.warn({ error }, 'MCP agent creation failed, using placeholder');`
`137`	`138`	`initializedMcpAgent = mcpAgent;`
`138`	`139`	`}`
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,8 @@ export class PythonSandbox {`
`48`	`48`	`agentLogger.info('Sandbox: Pyodide initialized');`
`49`	`49`	`} catch (error) {`
`50`	`50`	`agentLogger.warn({ error }, 'Sandbox: Pyodide not available');`
`51`		`- this.initialized = true; // Mark as initialized to prevent retry`
	`51`	`+ // TODO(stability): Setting initialized = true on failure prevents any future retry`
	`52`	`+ this.initialized = true;`
`52`	`53`	`}`
`53`	`54`	`}`
`54`	`55`