fix: recover from agent transfer errors with retry and error context

elicollinson · elicollinson · commit 569267a2338f · 2026-02-12T07:58:42.000-05:00
Move the try-catch inside the runner's retry loop so that exceptions (e.g.
ADK's JSON.parse failures during agent transfers) trigger a retry instead
of killing the session. Add a serializeError helper for non-standard error
objects, yield transfer chunks to the UI, and send error context in retry
messages so the planning agent can choose a fallback agent. Add transfer
discipline (a shared TRANSFER_BACK_INSTRUCTION block) to all sub-agent
prompts and orchestration guidance to the planning agent prompt.
diff --git a/src/agents/chart-generator.ts b/src/agents/chart-generator.ts
@@ -25,6 +25,7 @@ import { z } from 'zod';
 import { parseChartArgs } from '../charts/index.js';
 import { getAdkModelName, getAgentPrompt, loadSettings } from '../config/index.js';
 import { saveMemoriesOnFinalResponse } from '../memory/callbacks.js';
+import { TRANSFER_BACK_INSTRUCTION } from './types.js';
 
 const DEFAULT_INSTRUCTION = `You are a Chart Generator Agent specializing in terminal-based data visualizations.
 
@@ -107,7 +108,8 @@ Use these color names: "red", "green", "blue", "yellow", "cyan", "magenta", "whi
 - Choose the most appropriate chart type for the data
 - Use descriptive titles that explain what the chart shows
 - Use colors to distinguish different data series or categories
-- Keep labels concise but meaningful`;
+- Keep labels concise but meaningful
+${TRANSFER_BACK_INSTRUCTION}`;
 
 /**
  * Google ADK FunctionTool for chart rendering.
diff --git a/src/agents/code-executor.ts b/src/agents/code-executor.ts
@@ -21,6 +21,7 @@ import { getAdkModelName, getAgentPrompt, loadSettings } from '../config/index.j
 import { saveMemoriesOnFinalResponse } from '../memory/callbacks.js';
 import { executeCodeAdkTool } from '../tools/adk-tools.js';
 import { executeCode } from '../tools/code-execution.js';
+import { TRANSFER_BACK_INSTRUCTION } from './types.js';
 
 // Re-export executeCode for backward compatibility
 export { executeCode };
@@ -67,7 +68,8 @@ Python standard library including:
 ### CONSTRAINTS
 - NEVER execute code that could be harmful
 - NEVER attempt file system operations outside the sandbox
-- ALWAYS use print() to output results`;
+- ALWAYS use print() to output results
+${TRANSFER_BACK_INSTRUCTION}`;
 
 // Load settings with fallback
 let settings: AppSettings | null;
diff --git a/src/agents/generic.ts b/src/agents/generic.ts
@@ -18,6 +18,7 @@ import { LlmAgent } from '@google/adk';
 import type { AppSettings } from '../config/index.js';
 import { getAdkModelName, getAgentPrompt, loadSettings } from '../config/index.js';
 import { saveMemoriesOnFinalResponse } from '../memory/callbacks.js';
+import { TRANSFER_BACK_INSTRUCTION } from './types.js';
 
 const DEFAULT_INSTRUCTION = `You are the Generic Executor Agent, handling knowledge tasks.
 
@@ -41,8 +42,8 @@ You handle general-purpose tasks. You are the "knowledge worker" for text-based
 
 ### CONSTRAINTS
 - ALWAYS provide helpful, accurate responses
-- ALWAYS transfer your result to your parent agent upon completion
-- If asked to do something outside your capabilities, clearly state what agent should be used instead`;
+- If asked to do something outside your capabilities, clearly state what agent should be used instead
+${TRANSFER_BACK_INSTRUCTION}`;
 
 // Load settings with fallback
 let settings: AppSettings | null;
diff --git a/src/agents/mcp.ts b/src/agents/mcp.ts
@@ -21,6 +21,7 @@ import { getAdkModelName, getAgentPrompt, loadSettings } from '../config/index.j
 import { getMcpManager } from '../mcp/index.js';
 import { saveMemoriesOnFinalResponse } from '../memory/callbacks.js';
 import { createMcpAdkTools } from '../tools/mcp-adk-adapter.js';
+import { TRANSFER_BACK_INSTRUCTION } from './types.js';
 
 const DEFAULT_INSTRUCTION = `You are an MCP tools specialist. You MUST use the tools provided to you.
 
@@ -49,7 +50,10 @@ After calling tools and getting results, format your response as:
 [Summarize what you found from the tool calls]
 
 ## Status
-Success / Partial / Could Not Complete`;
+Success / Partial / Could Not Complete
+
+### CONSTRAINTS
+${TRANSFER_BACK_INSTRUCTION}`;
 
 // Load settings with fallback
 let settings: AppSettings | null;
diff --git a/src/agents/planning.ts b/src/agents/planning.ts
@@ -67,14 +67,21 @@ PLAN:
 2. [Task] → [agent_name]
 \`\`\`
 
-**STEP 2: EXECUTE ONE STEP AT A TIME**
+**STEP 2: EXECUTE**
 - Delegate to the agent for step 1
 - Wait for response
 - Check if successful
+- For sequential multi-agent tasks, you can instruct an agent to transfer directly to the next agent:
+  Example: "Research this topic, then transfer to code_executor_agent to analyze the data"
+- For simpler tasks, just delegate and the agent will return results to you.
 
 **STEP 3: HANDLE FAILURES IMMEDIATELY**
 If an agent returns error or no useful result:
 → IMMEDIATELY try the fallback agent. Do NOT retry the same agent.
+If you receive a message about a previous error:
+- Analyze the error to understand what failed
+- Choose a different agent or approach
+- Do NOT retry the same agent that failed
 
 **STEP 4: SYNTHESIZE AND RETURN**
 When all steps complete, combine results and transfer to parent.
@@ -91,7 +98,8 @@ When the user request is missing details:
 - NEVER delegate without stating which step you're on
 - NEVER retry a failed agent—use the fallback instead
 - NEVER call tools directly—you have no tools
-- ALWAYS transfer final result to parent agent when done`;
+- ALWAYS transfer final result to parent agent when done
+- Sub-agents transfer back to you by default. You can chain agents by telling a sub-agent to transfer to another specific agent upon completion.`;
 
 // Load settings with fallback
 let settings: AppSettings | null;
diff --git a/src/agents/research.ts b/src/agents/research.ts
@@ -18,6 +18,7 @@ import type { AppSettings } from '../config/index.js';
 import { getAdkModelName, getAgentPrompt, loadSettings } from '../config/index.js';
 import { saveMemoriesOnFinalResponse } from '../memory/callbacks.js';
 import { braveSearchAdkTool, readWebpageAdkTool } from '../tools/adk-tools.js';
+import { TRANSFER_BACK_INSTRUCTION } from './types.js';
 
 const DEFAULT_INSTRUCTION = `You are the Research Specialist, an expert in gathering comprehensive information from the web.
 
@@ -65,7 +66,8 @@ Structure your research report as:
 - NEVER fabricate information or URLs.
 - NEVER present speculation as fact.
 - ALWAYS cite sources for factual claims.
-- Maximum 5 page reads per research task.`;
+- Maximum 5 page reads per research task.
+${TRANSFER_BACK_INSTRUCTION}`;
 
 // Load settings with fallback
 let settings: AppSettings | null;
diff --git a/src/agents/runner.ts b/src/agents/runner.ts
@@ -24,6 +24,29 @@ import type { AgentStreamChunk } from './types.js';
 
 const APP_NAME = 'Solenoid';
 
+/**
+ * Serialize any error value into a readable string.
+ * ADK sometimes throws non-standard error objects (e.g. `{}`) that lose
+ * information with naive stringification.
+ */
+function serializeError(error: unknown): string {
+  if (error instanceof Error) return error.message || error.constructor.name;
+  if (typeof error === 'string') return error;
+  try {
+    const str = String(error);
+    if (str !== '[object Object]') return str;
+  } catch {
+    // fall through
+  }
+  try {
+    const json = JSON.stringify(error);
+    if (json && json !== '{}') return json;
+  } catch {
+    // fall through
+  }
+  return 'Unknown error (non-serializable)';
+}
+
 /**
  * Debug: Log the agent hierarchy
  */
@@ -101,24 +124,34 @@ export async function* runAgent(
     return lastErrorMessage ?? lastErrorCode ?? 'empty response';
   }
 
-  try {
-    while (attempt <= MAX_RETRIES && !gotFinalContent) {
-      if (attempt > 0) {
-        const delayMs = BASE_DELAY_MS * Math.pow(2, attempt - 1); // 1s, 2s, 4s, 8s, 16s
-        const reason = retryReason();
-        agentLogger.info(
-          `[Runner] Retrying in ${delayMs}ms (attempt ${attempt + 1}/${MAX_RETRIES + 1}): ${reason}`
-        );
-        yield {
-          type: 'status',
-          content: `Retrying (${attempt}/${MAX_RETRIES}): ${reason}`,
-        };
-        await new Promise((resolve) => setTimeout(resolve, delayMs));
-      }
+  while (attempt <= MAX_RETRIES && !gotFinalContent) {
+    if (attempt > 0) {
+      const delayMs = BASE_DELAY_MS * Math.pow(2, attempt - 1); // 1s, 2s, 4s, 8s, 16s
+      const reason = retryReason();
+      agentLogger.info(
+        `[Runner] Retrying in ${delayMs}ms (attempt ${attempt + 1}/${MAX_RETRIES + 1}): ${reason}`
+      );
+      yield {
+        type: 'status',
+        content: `Retrying (${attempt}/${MAX_RETRIES}): ${reason}`,
+      };
+      await new Promise((resolve) => setTimeout(resolve, delayMs));
+    }
 
-      const message =
-        attempt === 0 ? userMessage : createUserContent('Please continue with your response.');
+    let message: Content;
+    if (attempt === 0) {
+      message = userMessage;
+    } else if (lastErrorMessage && !lastErrorCode) {
+      // Exception occurred (no ADK error code) — give the model error context
+      message = createUserContent(
+        `The previous attempt encountered an error: ${lastErrorMessage}. Please try an alternative approach or a different agent.`
+      );
+    } else {
+      // Empty response or ADK error code — nudge the model to continue
+      message = createUserContent('Please continue with your response.');
+    }
 
+    try {
       let eventIndex = 0;
       for await (const event of runner.runAsync({
         userId: 'default_user',
@@ -132,9 +165,10 @@ export async function* runAgent(
         );
 
         if (event.actions?.transferToAgent) {
-          agentLogger.debug(
-            `[Runner] *** TRANSFER DETECTED: ${event.author} -> ${event.actions.transferToAgent} ***`
+          agentLogger.info(
+            `[Runner] *** TRANSFER: ${event.author} -> ${event.actions.transferToAgent} ***`
           );
+          yield { type: 'transfer', transferTo: event.actions.transferToAgent };
         }
 
         const partTypes =
@@ -199,26 +233,27 @@ export async function* runAgent(
           break;
         }
       }
-
-      if (!gotFinalContent) {
-        attempt++;
-      }
+    } catch (error) {
+      const serialized = serializeError(error);
+      lastErrorMessage = serialized;
+      lastErrorCode = undefined;
+      agentLogger.error(
+        { errorType: error?.constructor?.name, attempt: attempt + 1, message: serialized },
+        '[Runner] Exception during runAsync — will retry'
+      );
     }
 
     if (!gotFinalContent) {
-      const reason = retryReason();
-      agentLogger.error(`[Runner] All ${MAX_RETRIES + 1} attempts exhausted — ${reason}`);
-      yield {
-        type: 'text',
-        content: `The model failed after ${MAX_RETRIES + 1} attempts: ${reason}`,
-      };
-      yield { type: 'done' };
+      attempt++;
     }
-  } catch (error) {
-    agentLogger.error({ error }, '[Runner] Error during agent execution');
+  }
+
+  if (!gotFinalContent) {
+    const reason = retryReason();
+    agentLogger.error(`[Runner] All ${MAX_RETRIES + 1} attempts exhausted — ${reason}`);
     yield {
       type: 'text',
-      content: `Error: ${error instanceof Error ? error.message : String(error)}`,
+      content: `The model failed after ${MAX_RETRIES + 1} attempts: ${reason}`,
     };
     yield { type: 'done' };
   }
diff --git a/src/agents/types.ts b/src/agents/types.ts
@@ -69,3 +69,12 @@ export interface SessionState {
 export interface AgentRunner {
   run(input: string, sessionId?: string): AsyncGenerator<AgentStreamChunk, void, unknown>;
 }
+
+/**
+ * Shared instruction block appended to every sub-agent's CONSTRAINTS section.
+ * Ensures sub-agents transfer results back to the planning agent by default,
+ * while allowing explicit chaining overrides from the planner.
+ */
+export const TRANSFER_BACK_INSTRUCTION = `- When your task is complete, transfer back to planning_agent with your results.
+- EXCEPTION: If the planning agent explicitly told you to transfer to a specific agent next, follow that instruction.
+- Do NOT independently decide to transfer to sibling agents — let the planning agent orchestrate the sequence.`;
diff --git a/tests/unit/runner-retry.test.ts b/tests/unit/runner-retry.test.ts