Skip to content

Commit 9297d8e

Browse files
authored
Add tool call assertions to Agent Evals (#9357)
1 parent 299aa83 commit 9297d8e

File tree

6 files changed

+241
-20
lines changed

6 files changed

+241
-20
lines changed

scripts/agent-evals/src/runner/agent-test-runner.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import { ToolDef } from "./tool-matcher.js";
2+
13
export interface AgentTestRunner {
24
/**
35
* Simulates typing a string and waits for the turn to complete. It types one
@@ -13,8 +15,8 @@ export interface AgentTestRunner {
1315
expectText(text: string | RegExp): Promise<void>;
1416

1517
/**
16-
* Reads the agent's telemetry file and looks for the given event. Throws if
17-
* the event is not found
18+
* Reads the agent's telemetry and looks for the given tool calls. Throws if
19+
* any expected tool call is not found
1820
*/
19-
expectTelemetryEvent(eventName: string): Promise<void>;
21+
expectToolCalls(tools: ToolDef[]): Promise<void>;
2022
}

scripts/agent-evals/src/runner/gemini-cli-runner.ts

Lines changed: 137 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,44 @@
1-
import { mkdirSync, writeFileSync, readFileSync, existsSync } from "node:fs";
1+
import { mkdirSync, writeFileSync, readFileSync } from "node:fs";
22
import path from "node:path";
33
import { InteractiveCLI, poll } from "./interactive-cli.js";
44
import { AgentTestRunner } from "./agent-test-runner.js";
5+
import {
6+
ParsedToolLog,
7+
getToolName,
8+
toolArgumentsMatch,
9+
getToolArgumentsDebug,
10+
} from "./tool-matcher.js";
11+
import fs from "fs";
12+
import { throwFailure } from "./logging.js";
513

614
const READY_PROMPT = "Type your message";
715

16+
/**
 * Shape of one JSON record parsed from the telemetry log file. Only the
 * fields this file declares are listed; every one of them may be absent.
 */
interface ParsedTelemetryLog {
17+
// Event attributes. readToolLogs() treats a record as a tool call when
// "event.name" is "gemini_cli.tool_call" and function_name is set.
attributes?: {
18+
"event.name"?: string;
19+
function_name?: string;
20+
// Tool arguments as a string; readToolLogs() defaults it to "{}" when missing
function_args?: string;
21+
success?: boolean;
22+
duration_ms?: number;
23+
};
24+
// NOTE(review): not read anywhere in the visible code — presumably present on
// metric records rather than event records; confirm against the telemetry format.
scopeMetrics?: {
25+
metrics: {
26+
descriptor: {
27+
name: string;
28+
};
29+
}[];
30+
}[];
31+
}
32+
833
export class GeminiCliRunner implements AgentTestRunner {
934
private readonly cli: InteractiveCLI;
1035
private readonly telemetryPath: string;
1136
private readonly telemetryTimeout = 15000;
1237

38+
// Determines which tools to start from for this turn so we don't detect tool
39+
// calls from previous turns
40+
private turnToolIndex = 0;
41+
1342
constructor(
1443
private readonly testName: string,
1544
testDir: string,
@@ -29,8 +58,6 @@ export class GeminiCliRunner implements AgentTestRunner {
2958
},
3059
mcpServers: {
3160
firebase: {
32-
// TODO: Add a mode where developers can run against their npm run watch command
33-
// command: path.resolve(runDir, "../../../../../lib/bin/firebase.js"),
3461
command: "firebase",
3562
args: ["experimental:mcp"],
3663
},
@@ -52,6 +79,8 @@ export class GeminiCliRunner implements AgentTestRunner {
5279
}
5380

5481
async type(text: string): Promise<void> {
82+
const toolLogs = this.readToolLogs();
83+
this.turnToolIndex = toolLogs.length;
5584
return this.cli.type(text);
5685
}
5786

@@ -67,21 +96,115 @@ export class GeminiCliRunner implements AgentTestRunner {
6796
* Reads the agent's telemetry file and looks for the given event. Throws if
6897
* the event is not found
6998
*/
70-
async expectTelemetryEvent(eventName: string): Promise<void> {
71-
// NOTE: This doesn't take into account "turns" yet. It will likely look
72-
// through the entire history, not just the last turn
73-
const found = await poll(() => {
74-
if (!existsSync(this.telemetryPath)) {
99+
/**
 * Asserts that every expected tool call appears in the telemetry recorded
 * since the last call to `type()`.
 *
 * An expectation is met when at least one tool log of this turn has the
 * expected name and also satisfies `toolArgumentsMatch` for that expectation.
 * The check is retried via `poll` for up to `telemetryTimeout` ms because
 * telemetry can take time to be written each turn.
 *
 * @param tools Expected tool calls, either bare tool names or matcher objects.
 *   The type is taken from `AgentTestRunner` so the implementation accepts
 *   exactly what the interface promises (a plain `string[]` would reject the
 *   object form).
 * @throws Through `throwFailure`, with one line per unmet expectation, when
 *   the expectations are still unmet after the timeout.
 */
async expectToolCalls(tools: Parameters<AgentTestRunner["expectToolCalls"]>[0]): Promise<void> {
  await this.waitForTelemetryReady();

  // Rebuilt on every poll iteration so only the failures of the final attempt
  // are reported.
  let messages: string[] = [];
  const success = await poll(() => {
    messages = [];
    // Start at this.turnToolIndex so we only read the tools used this turn
    const toolLogs = this.readToolLogs().slice(this.turnToolIndex);
    const foundToolNames = toolLogs.map((log) => log.name);

    for (const toolDef of tools) {
      const toolName = getToolName(toolDef);
      const sameNameLogs = toolLogs.filter((log) => log.name === toolName);

      if (sameNameLogs.length === 0) {
        messages.push(
          `Did not find expected tool call: "${toolName}" in the telemetry log. Found [${foundToolNames}]`,
        );
      } else if (!sameNameLogs.some((log) => toolArgumentsMatch(toolDef, log))) {
        messages.push(
          `Tool arguments matcher "${getToolArgumentsDebug(toolDef)}" for "${toolName}" did not match any tool results in the telemetry log. All tools are: [${JSON.stringify(toolLogs)}]`,
        );
      }
    }

    return messages.length === 0;
  }, this.telemetryTimeout);

  if (!success) {
    throwFailure(messages.join("\n"));
  }
}
137+
138+
// Implementation for this is borrowed from the Gemini CLI's test-helper
139+
private async waitForTelemetryReady() {
140+
// Wait for telemetry file to exist and have content
141+
await poll(() => {
142+
if (!fs.existsSync(this.telemetryPath)) return false;
143+
try {
144+
const content = readFileSync(this.telemetryPath, "utf-8");
145+
// Check if file has at least one event in it
146+
return content.includes('"event.name"');
147+
} catch {
75148
return false;
76149
}
77-
const content = readFileSync(this.telemetryPath, "utf-8");
78-
return content.includes(eventName);
79150
}, this.telemetryTimeout);
151+
}
152+
153+
// Implementation for this is borrowed from the Gemini CLI's test-helper
154+
private readToolLogs(): ParsedToolLog[] {
155+
const parsedLogs = this.readAndParseTelemetryLog();
156+
const logs: ParsedToolLog[] = [];
80157

81-
if (!found) {
82-
throw new Error(`Did not find expected telemetry event: "${eventName}" in the telemetry log`);
83-
} else {
84-
console.log(` [FOUND] expectTelemetryEvent: ${eventName}`);
158+
for (const logData of parsedLogs) {
159+
// Look for tool call logs
160+
if (
161+
logData.attributes?.function_name &&
162+
logData.attributes["event.name"] === "gemini_cli.tool_call"
163+
) {
164+
logs.push({
165+
name: logData.attributes.function_name,
166+
args: logData.attributes.function_args ?? "{}",
167+
success: logData.attributes.success ?? false,
168+
duration_ms: logData.attributes.duration_ms ?? 0,
169+
});
170+
}
171+
}
172+
173+
return logs;
174+
}
175+
176+
// Implementation for this is borrowed from the Gemini CLI's test-helper
177+
/**
 * Reads the telemetry file and parses it into individual log records.
 *
 * The file is treated as a sequence of JSON objects separated by a closing
 * brace, a line break and an opening brace. Chunks that fail to parse, or
 * that parse to something other than an object, are skipped.
 *
 * @returns The parsed records in file order, or an empty array when no
 *   telemetry path is set or the file does not exist.
 */
private readAndParseTelemetryLog(): ParsedTelemetryLog[] {
  const logFilePath = this.telemetryPath;
  if (!logFilePath || !fs.existsSync(logFilePath)) {
    return [];
  }

  const content = readFileSync(logFilePath, "utf-8");

  // Split the content into individual JSON objects. They are separated by
  // "}\n{"; "\r?" additionally accepts CRLF line endings.
  const jsonObjects = content
    .split(/}\r?\n{/)
    .map((obj, index, array) => {
      // Add back the braces we removed during split
      if (index > 0) obj = "{" + obj;
      if (index < array.length - 1) obj = obj + "}";
      return obj.trim();
    })
    .filter((obj) => obj);

  const logs: ParsedTelemetryLog[] = [];

  for (const jsonStr of jsonObjects) {
    try {
      const logData: unknown = JSON.parse(jsonStr);
      // Callers dereference `.attributes` on every record, so keep only real
      // objects (JSON.parse can also yield null or primitives).
      if (typeof logData === "object" && logData !== null) {
        logs.push(logData as ParsedTelemetryLog);
      }
    } catch {
      // Skip objects that aren't valid JSON (deliberately best-effort;
      // presumably this covers records that are still being written).
    }
  }

  return logs;
}
87210
}

scripts/agent-evals/src/runner/interactive-cli.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import * as pty from "node-pty";
22
import { IPty } from "node-pty";
33
import stripAnsi from "strip-ansi";
4+
import { throwFailure } from "./logging.js";
45

56
export async function poll(predicate: () => boolean, timeout: number): Promise<boolean> {
67
const startTime = Date.now();
@@ -96,7 +97,7 @@ export class InteractiveCLI {
9697
}, this.timeout);
9798

9899
if (!found) {
99-
throw new Error(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
100+
throwFailure(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
100101
}
101102
}
102103

@@ -121,7 +122,7 @@ export class InteractiveCLI {
121122
}, timeout);
122123

123124
if (!stoppedChanging) {
124-
throw new Error(`CLI did not stop changing output within ${timeout}ms`);
125+
throwFailure(`CLI did not stop changing output within ${timeout}ms`);
125126
}
126127
}
127128

@@ -140,7 +141,7 @@ export class InteractiveCLI {
140141
}
141142

142143
if (!found) {
143-
throw new Error(`Did not find expected text: "${text}" in the latest output`);
144+
throwFailure(`Did not find expected text: "${text}" in the latest output`);
144145
} else {
145146
console.log(` [FOUND] expectText: ${text}`);
146147
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/** ANSI escape sequences used to style console output. */
const COLORS = {
  RESET: "\x1b[0m",
  BRIGHT: "\x1b[1m",
  BLUE: "\x1b[34m",
  GREEN: "\x1b[32m",
  RED: "\x1b[31m",
} as const;

/**
 * Prints `message` to the console prefixed with the `color` escape sequence
 * and followed by a reset so later output is unaffected.
 */
function colorLog(color: string, message: string): void {
  console.log(`${color}${message}${COLORS.RESET}`);
}

/**
 * Logs `message` in bright red and then throws it as an `Error`.
 *
 * The message is logged separately because mocha doesn't print errors from
 * failures that happen before the final repetition. Seeing the failure early
 * can be a helpful signal that the test is going to fail all repetitions.
 *
 * @param message Failure description; used for both the log line and the
 *   thrown error's message.
 * @throws Error Always. The `never` return type lets callers rely on that
 *   for control-flow analysis.
 */
export function throwFailure(message: string): never {
  colorLog(COLORS.BRIGHT + COLORS.RED, message);
  throw new Error(message);
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/**
 * Describes one expected tool call: either a bare tool name or an object that
 * adds optional matchers on the call's arguments and success flag.
 */
export type ToolDef =
2+
// Asserts that the tool with this name was called successfully
3+
| string
4+
| {
5+
// Name of the tool
6+
name: string;
7+
// Asserts that the tool arguments contain this string (substring match
// against the logged args string; an empty string is treated as "not set")
8+
argumentContains?: string;
9+
// Asserts that the tool's success equals this value. When omitted, the
// call is expected to have succeeded.
10+
successIs?: boolean;
11+
};
12+
13+
/** One tool call, in the form the matcher helpers in this file compare against. */
export interface ParsedToolLog {
14+
// Name of the tool that was called
name: string;
15+
// Tool arguments as a string; toolArgumentsMatch() searches it with includes()
args: string;
16+
// Whether the call succeeded; compared against ToolDef.successIs
success: boolean;
17+
// Presumably the call duration in milliseconds (from the name) — not read
// by the helpers in this file
duration_ms: number;
18+
}
19+
20+
/**
 * Returns the tool name an expectation refers to.
 *
 * @param toolDef A bare tool name, or a matcher object carrying a `name`.
 * @returns The string itself, or the object's `name` property.
 */
export function getToolName(toolDef: ToolDef): string {
  return typeof toolDef === "string" ? toolDef : toolDef.name;
}
26+
27+
/**
 * Builds a short description of what an expectation asserts, for use in
 * failure messages.
 *
 * The output mirrors `toolArgumentsMatch`: the expected success value is
 * always listed and defaults to `true` when `successIs` is not given. An
 * explicit `successIs: false` is reported as `success=false`.
 *
 * @param toolDef A bare tool name or a matcher object.
 * @returns A comma-separated list such as `success=true,contains=foo`.
 */
export function getToolArgumentsDebug(toolDef: ToolDef): string {
  // If you just pass a string, assert that the tool was successful
  if (typeof toolDef === "string") {
    return "success=true";
  }

  // `??` rather than a truthiness check, so an explicit `false` is not
  // mistaken for "not provided".
  const parts: string[] = [`success=${toolDef.successIs ?? true}`];
  if (toolDef.argumentContains) {
    parts.push(`contains=${toolDef.argumentContains}`);
  }
  return parts.join(",");
}
44+
45+
/**
 * Checks whether a logged tool call satisfies an expectation's argument and
 * success matchers. The tool name is not compared here.
 *
 * @param toolDef A bare tool name or a matcher object.
 * @param log The logged tool call to test.
 * @returns `true` when the log's success flag has the expected value (which
 *   is "successful" unless `successIs` says otherwise) and, if a non-empty
 *   `argumentContains` is given, the log's args contain it.
 */
export function toolArgumentsMatch(toolDef: ToolDef, log: ParsedToolLog): boolean {
  // If you just pass a string, assert that the tool was successful
  if (typeof toolDef === "string") {
    return Boolean(log.success);
  }

  const { argumentContains, successIs } = toolDef;

  // If you don't pass successIs, assert that it was successful
  const successMatches = successIs === undefined ? Boolean(log.success) : log.success === successIs;
  const argsMatch = !argumentContains || log.args.includes(argumentContains);

  return successMatches && argsMatch;
}

scripts/agent-evals/src/tests/firebase-init.spec.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,13 @@ describe("/firebase:init", function (this: Mocha.Suite) {
1919
);
2020

2121
await run.type("Yes that looks good. Use Firebase Project gcli-ext-sam-01");
22+
await run.expectToolCalls([
23+
"firebase_update_environment",
24+
{
25+
name: "firebase_read_resources",
26+
argumentContains: "firebase://guides/init/backend",
27+
successIs: true,
28+
},
29+
]);
2230
});
2331
});

0 commit comments

Comments
 (0)