fix(core): extract pi-cli tool calls from streaming events (#782)

christso · claude · web-flow · commit 6d8f63158683 · 2026-03-26T19:49:58.000+11:00
* fix(core): extract pi-cli tool calls from streaming events for skill-trigger

Pi CLI emits tool_execution_start/end events in JSONL output, but the
provider only extracted tool calls from message content arrays. This
caused the skill-trigger evaluator to miss pi's skill file reads.

Now extractMessages() also scans for tool_execution_start/end events
and injects reconstructed tool calls into assistant messages. Also
handles tool_call (snake_case) content type variant.

Closes #780

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(core): avoid mutating readonly Message in injectEventToolCalls

Replace target message with a new object instead of casting to bypass
readonly constraint, per code review feedback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(evals): restore skill-trigger assertion for agent-plugin-review eval

Re-adds the skill-trigger assertion that was removed as a workaround
for #780. Now that pi-cli tool call extraction is fixed, the evaluator
can detect when pi loads the agent-plugin-review skill.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(evals): configure pi-cli target with model and remove workers: 1

Pi-cli target needs subprovider/model/api_key to produce meaningful
output. Without them, pi uses its default which returns empty responses.

Also removes workers: 1 from agent-plugin-review eval since all test
cases are read-only reviews that can safely run in parallel.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* style: fix formatting in package.json files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
@@ -13,6 +13,8 @@ targets:
   - name: pi-cli
     provider: pi-cli
     subprovider: openrouter
+    model: openai/gpt-5.1-codex
+    api_key: ${{ OPENROUTER_API_KEY }}
     grader_target: gemini-flash
 
   - name: pi-coding-agent
diff --git a/apps/cli/package.json b/apps/cli/package.json
@@ -14,10 +14,7 @@
   "bin": {
     "agentv": "./dist/cli.js"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "bun src/cli.ts",
     "build": "tsup && bun run copy-readme",
diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml
@@ -3,7 +3,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch
 execution:
   targets:
     - pi-cli
-  workers: 1
 
 workspace:
   template: ./workspace-template
@@ -20,6 +19,8 @@ tests:
       Review the deploy-auto plugin in this repo for completeness.
       Check that every skill has a corresponding eval file.
     assertions:
+      - type: skill-trigger
+        skill: agent-plugin-review
       - type: contains
         value: deploy-rollback
       - type: rubrics
diff --git a/packages/core/package.json b/packages/core/package.json
@@ -38,10 +38,7 @@
     "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts",
     "generate:schema": "bun scripts/generate-eval-schema.ts"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "@agentclientprotocol/sdk": "^0.14.1",
     "@agentv/eval": "workspace:*",
diff --git a/packages/core/src/evaluation/providers/pi-cli.ts b/packages/core/src/evaluation/providers/pi-cli.ts
@@ -539,6 +539,10 @@ function summarizePiEvent(event: unknown): string | undefined {
       }
       return `message_update: ${eventType}`;
     }
+    case 'tool_execution_start':
+      return `tool_start: ${record.toolName}`;
+    case 'tool_execution_end':
+      return `tool_end: ${record.toolName}`;
     default:
       return type;
   }
@@ -580,29 +584,119 @@ function parsePiJsonl(output: string): unknown[] {
 }
 
 function extractMessages(events: unknown[]): readonly Message[] {
+  let messages: Message[] | undefined;
+
   for (let i = events.length - 1; i >= 0; i--) {
     const event = events[i];
     if (!event || typeof event !== 'object') continue;
     const record = event as Record<string, unknown>;
     if (record.type !== 'agent_end') continue;
 
-    const messages = record.messages;
-    if (!Array.isArray(messages)) continue;
+    const msgs = record.messages;
+    if (!Array.isArray(msgs)) continue;
 
-    return messages.map(convertPiMessage).filter((m): m is Message => m !== undefined);
+    messages = msgs.map(convertPiMessage).filter((m): m is Message => m !== undefined);
+    break;
   }
 
-  const output: Message[] = [];
+  if (!messages) {
+    messages = [];
+    for (const event of events) {
+      if (!event || typeof event !== 'object') continue;
+      const record = event as Record<string, unknown>;
+      if (record.type === 'turn_end') {
+        const converted = convertPiMessage(record.message);
+        if (converted) messages.push(converted);
+      }
+    }
+  }
+
+  // Pi CLI may emit tool_execution_start/tool_execution_end events whose tool
+  // calls are absent from the messages collected above (agent_end, or the
+  // turn_end fallback). Reconstruct them and inject into the last assistant
+  // message so evaluators (e.g. skill-trigger) can detect them.
+  const eventToolCalls = extractToolCallsFromEvents(events);
+  if (eventToolCalls.length > 0) {
+    injectEventToolCalls(messages, eventToolCalls);
+  }
+
+  return messages;
+}
+
+/**
+ * Reconstruct ToolCall objects from tool_execution_start events in the parsed JSONL events;
+ * a tool_execution_end event with the same toolCallId, when present, supplies the output.
+ */
+function extractToolCallsFromEvents(events: unknown[]): ToolCall[] {
+  const starts = new Map<string, { tool: string; input: unknown }>();
+  const results = new Map<string, unknown>();
+
   for (const event of events) {
     if (!event || typeof event !== 'object') continue;
-    const record = event as Record<string, unknown>;
-    if (record.type === 'turn_end') {
-      const converted = convertPiMessage(record.message);
-      if (converted) output.push(converted);
+    const r = event as Record<string, unknown>;
+    const type = r.type;
+    if (type === 'tool_execution_start' && typeof r.toolName === 'string') {
+      const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
+      starts.set(id ?? `anon-${starts.size}`, { tool: r.toolName, input: r.args });
+    } else if (type === 'tool_execution_end') {
+      const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
+      if (id) results.set(id, r.result);
+    }
+  }
+
+  const toolCalls: ToolCall[] = [];
+  for (const [id, { tool, input }] of starts) {
+    toolCalls.push({
+      tool,
+      input: input as Record<string, unknown> | undefined,
+      id: id.startsWith('anon-') ? undefined : id,
+      output: results.get(id),
+    });
+  }
+  return toolCalls;
+}
+
+/**
+ * Merge event-sourced tool calls into `messages` in place. A call is skipped when its
+ * id or its tool+input signature already appears in some message; the rest are added to
+ * a copy of the last assistant message (or to a new synthetic one if none exists).
+ */
+function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void {
+  const existingIds = new Set<string>();
+  const existingTools = new Set<string>();
+  for (const msg of messages) {
+    if (!msg.toolCalls) continue;
+    for (const tc of msg.toolCalls) {
+      if (tc.id) existingIds.add(tc.id);
+      // Tool+input signature: a second dedup key, checked whenever the id was not found
+      existingTools.add(`${tc.tool}:${JSON.stringify(tc.input)}`);
+    }
+  }
+
+  const missing = eventToolCalls.filter((tc) => {
+    if (tc.id && existingIds.has(tc.id)) return false;
+    if (existingTools.has(`${tc.tool}:${JSON.stringify(tc.input)}`)) return false;
+    return true;
+  });
+
+  if (missing.length === 0) return;
+
+  // Find the last assistant message and replace it with an enriched copy
+  let targetIdx = -1;
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i].role === 'assistant') {
+      targetIdx = i;
+      break;
     }
   }
 
-  return output;
+  if (targetIdx >= 0) {
+    const target = messages[targetIdx];
+    messages[targetIdx] = { ...target, toolCalls: [...(target.toolCalls ?? []), ...missing] };
+  } else {
+    // No assistant message — create a synthetic one
+    messages.push({ role: 'assistant', content: '', toolCalls: missing });
+  }
 }
 
 function extractTokenUsage(events: unknown[]): ProviderTokenUsage | undefined {
@@ -720,15 +814,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
         input: p.input,
         id: typeof p.id === 'string' ? p.id : undefined,
       });
-    }
-    if (p.type === 'toolCall' && typeof p.name === 'string') {
+    } else if ((p.type === 'toolCall' || p.type === 'tool_call') && typeof p.name === 'string') {
       toolCalls.push({
         tool: p.name,
-        input: p.arguments,
+        input: p.arguments ?? p.input,
         id: typeof p.id === 'string' ? p.id : undefined,
       });
-    }
-    if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
+    } else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
       const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
       if (existing) {
         const idx = toolCalls.indexOf(existing);
@@ -830,3 +922,10 @@ async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
     });
   });
 }
+
+/** @internal Test-only export of the event-parsing helpers; exported for testing only. */
+export const _internal = {
+  extractMessages,
+  extractToolCallsFromEvents,
+  parsePiJsonl,
+};
diff --git a/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts b/packages/core/test/evaluation/providers/pi-cli-tool-extraction.test.ts
diff --git a/packages/eval/package.json b/packages/eval/package.json