Skip to content

Commit 6d8f631

Browse files
christsoclaude
andauthored
fix(core): extract pi-cli tool calls from streaming events (#782)
* fix(core): extract pi-cli tool calls from streaming events for skill-trigger Pi CLI emits tool_execution_start/end events in JSONL output, but the provider only extracted tool calls from message content arrays. This caused the skill-trigger evaluator to miss pi's skill file reads. Now extractMessages() also scans for tool_execution_start/end events and injects reconstructed tool calls into assistant messages. Also handles tool_call (snake_case) content type variant. Closes #780 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(core): avoid mutating readonly Message in injectEventToolCalls Replace target message with a new object instead of casting to bypass readonly constraint, per code review feedback. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(evals): restore skill-trigger assertion for agent-plugin-review eval Re-adds the skill-trigger assertion that was removed as a workaround for #780. Now that pi-cli tool call extraction is fixed, the evaluator can detect when pi loads the agent-plugin-review skill. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix(evals): configure pi-cli target with model and remove workers: 1 Pi-cli target needs subprovider/model/api_key to produce meaningful output. Without them, pi uses its default which returns empty responses. Also removes workers: 1 from agent-plugin-review eval since all test cases are read-only reviews that can safely run in parallel. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * style: fix formatting in package.json files Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c5c7a11 commit 6d8f631

File tree

7 files changed

+351
-27
lines changed

7 files changed

+351
-27
lines changed

.agentv/targets.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ targets:
1313
- name: pi-cli
1414
provider: pi-cli
1515
subprovider: openrouter
16+
model: openai/gpt-5.1-codex
17+
api_key: ${{ OPENROUTER_API_KEY }}
1618
grader_target: gemini-flash
1719

1820
- name: pi-coding-agent

apps/cli/package.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,7 @@
1414
"bin": {
1515
"agentv": "./dist/cli.js"
1616
},
17-
"files": [
18-
"dist",
19-
"README.md"
20-
],
17+
"files": ["dist", "README.md"],
2118
"scripts": {
2219
"dev": "bun src/cli.ts",
2320
"build": "tsup && bun run copy-readme",

evals/agentic-engineering/agent-plugin-review.eval.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catch
33
execution:
44
targets:
55
- pi-cli
6-
workers: 1
76

87
workspace:
98
template: ./workspace-template
@@ -20,6 +19,8 @@ tests:
2019
Review the deploy-auto plugin in this repo for completeness.
2120
Check that every skill has a corresponding eval file.
2221
assertions:
22+
- type: skill-trigger
23+
skill: agent-plugin-review
2324
- type: contains
2425
value: deploy-rollback
2526
- type: rubrics

packages/core/package.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,7 @@
3838
"diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts",
3939
"generate:schema": "bun scripts/generate-eval-schema.ts"
4040
},
41-
"files": [
42-
"dist",
43-
"README.md"
44-
],
41+
"files": ["dist", "README.md"],
4542
"dependencies": {
4643
"@agentclientprotocol/sdk": "^0.14.1",
4744
"@agentv/eval": "workspace:*",

packages/core/src/evaluation/providers/pi-cli.ts

Lines changed: 113 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,10 @@ function summarizePiEvent(event: unknown): string | undefined {
539539
}
540540
return `message_update: ${eventType}`;
541541
}
542+
case 'tool_execution_start':
543+
return `tool_start: ${record.toolName}`;
544+
case 'tool_execution_end':
545+
return `tool_end: ${record.toolName}`;
542546
default:
543547
return type;
544548
}
@@ -580,29 +584,119 @@ function parsePiJsonl(output: string): unknown[] {
580584
}
581585

582586
function extractMessages(events: unknown[]): readonly Message[] {
587+
let messages: Message[] | undefined;
588+
583589
for (let i = events.length - 1; i >= 0; i--) {
584590
const event = events[i];
585591
if (!event || typeof event !== 'object') continue;
586592
const record = event as Record<string, unknown>;
587593
if (record.type !== 'agent_end') continue;
588594

589-
const messages = record.messages;
590-
if (!Array.isArray(messages)) continue;
595+
const msgs = record.messages;
596+
if (!Array.isArray(msgs)) continue;
591597

592-
return messages.map(convertPiMessage).filter((m): m is Message => m !== undefined);
598+
messages = msgs.map(convertPiMessage).filter((m): m is Message => m !== undefined);
599+
break;
593600
}
594601

595-
const output: Message[] = [];
602+
if (!messages) {
603+
messages = [];
604+
for (const event of events) {
605+
if (!event || typeof event !== 'object') continue;
606+
const record = event as Record<string, unknown>;
607+
if (record.type === 'turn_end') {
608+
const converted = convertPiMessage(record.message);
609+
if (converted) messages.push(converted);
610+
}
611+
}
612+
}
613+
614+
// Pi CLI may emit tool_execution_start/tool_execution_end events whose tool
615+
// calls are absent from the final agent_end messages. Reconstruct them and
616+
// inject into the last assistant message so evaluators (e.g. skill-trigger)
617+
// can detect them.
618+
const eventToolCalls = extractToolCallsFromEvents(events);
619+
if (eventToolCalls.length > 0) {
620+
injectEventToolCalls(messages, eventToolCalls);
621+
}
622+
623+
return messages;
624+
}
625+
626+
/**
627+
* Scan JSONL events for tool_execution_start / tool_execution_end pairs and
628+
* reconstruct ToolCall objects from them.
629+
*/
630+
function extractToolCallsFromEvents(events: unknown[]): ToolCall[] {
631+
const starts = new Map<string, { tool: string; input: unknown }>();
632+
const results = new Map<string, unknown>();
633+
596634
for (const event of events) {
597635
if (!event || typeof event !== 'object') continue;
598-
const record = event as Record<string, unknown>;
599-
if (record.type === 'turn_end') {
600-
const converted = convertPiMessage(record.message);
601-
if (converted) output.push(converted);
636+
const r = event as Record<string, unknown>;
637+
const type = r.type;
638+
if (type === 'tool_execution_start' && typeof r.toolName === 'string') {
639+
const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
640+
starts.set(id ?? `anon-${starts.size}`, { tool: r.toolName, input: r.args });
641+
} else if (type === 'tool_execution_end') {
642+
const id = typeof r.toolCallId === 'string' ? r.toolCallId : undefined;
643+
if (id) results.set(id, r.result);
644+
}
645+
}
646+
647+
const toolCalls: ToolCall[] = [];
648+
for (const [id, { tool, input }] of starts) {
649+
toolCalls.push({
650+
tool,
651+
input: input as Record<string, unknown> | undefined,
652+
id: id.startsWith('anon-') ? undefined : id,
653+
output: results.get(id),
654+
});
655+
}
656+
return toolCalls;
657+
}
658+
659+
/**
660+
* Merge event-sourced tool calls into messages. For each tool call, if it
661+
* already exists (by id) in some message, skip it. Otherwise, append it to
662+
* the last assistant message (creating one if needed).
663+
*/
664+
function injectEventToolCalls(messages: Message[], eventToolCalls: ToolCall[]): void {
665+
const existingIds = new Set<string>();
666+
const existingTools = new Set<string>();
667+
for (const msg of messages) {
668+
if (!msg.toolCalls) continue;
669+
for (const tc of msg.toolCalls) {
670+
if (tc.id) existingIds.add(tc.id);
671+
// Track tool+input combos to avoid duplicates when there's no id
672+
existingTools.add(`${tc.tool}:${JSON.stringify(tc.input)}`);
673+
}
674+
}
675+
676+
const missing = eventToolCalls.filter((tc) => {
677+
if (tc.id && existingIds.has(tc.id)) return false;
678+
if (existingTools.has(`${tc.tool}:${JSON.stringify(tc.input)}`)) return false;
679+
return true;
680+
});
681+
682+
if (missing.length === 0) return;
683+
684+
// Find the last assistant message and replace it with an enriched copy
685+
let targetIdx = -1;
686+
for (let i = messages.length - 1; i >= 0; i--) {
687+
if (messages[i].role === 'assistant') {
688+
targetIdx = i;
689+
break;
602690
}
603691
}
604692

605-
return output;
693+
if (targetIdx >= 0) {
694+
const target = messages[targetIdx];
695+
messages[targetIdx] = { ...target, toolCalls: [...(target.toolCalls ?? []), ...missing] };
696+
} else {
697+
// No assistant message — create a synthetic one
698+
messages.push({ role: 'assistant', content: '', toolCalls: missing });
699+
}
606700
}
607701

608702
function extractTokenUsage(events: unknown[]): ProviderTokenUsage | undefined {
@@ -720,15 +814,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
720814
input: p.input,
721815
id: typeof p.id === 'string' ? p.id : undefined,
722816
});
723-
}
724-
if (p.type === 'toolCall' && typeof p.name === 'string') {
817+
} else if ((p.type === 'toolCall' || p.type === 'tool_call') && typeof p.name === 'string') {
725818
toolCalls.push({
726819
tool: p.name,
727-
input: p.arguments,
820+
input: p.arguments ?? p.input,
728821
id: typeof p.id === 'string' ? p.id : undefined,
729822
});
730-
}
731-
if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
823+
} else if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
732824
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
733825
if (existing) {
734826
const idx = toolCalls.indexOf(existing);
@@ -830,3 +922,10 @@ async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
830922
});
831923
});
832924
}
925+
926+
/** @internal Exported for testing only. */
927+
export const _internal = {
928+
extractMessages,
929+
extractToolCallsFromEvents,
930+
parsePiJsonl,
931+
};

0 commit comments

Comments
 (0)