Add tool call assertions to Agent Evals (#9357)

samedson · web-flow · commit 9297d8e7cafc · 2025-10-23T14:06:22.000Z
diff --git a/scripts/agent-evals/src/runner/agent-test-runner.ts b/scripts/agent-evals/src/runner/agent-test-runner.ts
@@ -1,3 +1,5 @@
+import { ToolDef } from "./tool-matcher.js";
+
 export interface AgentTestRunner {
   /**
    * Simulates typing a string and waits for the turn to complete. It types one
@@ -13,8 +15,8 @@ export interface AgentTestRunner {
   expectText(text: string | RegExp): Promise<void>;
 
   /**
-   * Reads the agent's telemetry file and looks for the given event. Throws if
-   * the event is not found
+   * Reads the agent's telemetry and looks for the given tool calls. Throws if
+   * any of the expected tool calls is not found
    */
-  expectTelemetryEvent(eventName: string): Promise<void>;
+  expectToolCalls(tools: ToolDef[]): Promise<void>;
 }
diff --git a/scripts/agent-evals/src/runner/gemini-cli-runner.ts b/scripts/agent-evals/src/runner/gemini-cli-runner.ts
@@ -1,15 +1,44 @@
-import { mkdirSync, writeFileSync, readFileSync, existsSync } from "node:fs";
+import { mkdirSync, writeFileSync, readFileSync } from "node:fs";
 import path from "node:path";
 import { InteractiveCLI, poll } from "./interactive-cli.js";
 import { AgentTestRunner } from "./agent-test-runner.js";
+import {
+  ParsedToolLog,
+  ToolDef,
+  getToolName,
+  toolArgumentsMatch,
+  getToolArgumentsDebug,
+} from "./tool-matcher.js";
+import fs from "fs";
+import { throwFailure } from "./logging.js";
 
 const READY_PROMPT = "Type your message";
 
+// The parts of a parsed telemetry log entry that this runner reads. Every
+// top-level field is optional, so an entry may carry attributes, metrics,
+// both, or neither.
+interface ParsedTelemetryLog {
+  // Event attributes. readToolLogs treats an entry as a tool call when
+  // "event.name" is "gemini_cli.tool_call" and function_name is set.
+  attributes?: {
+    "event.name"?: string;
+    function_name?: string;
+    function_args?: string;
+    success?: boolean;
+    duration_ms?: number;
+  };
+  // Metric groups, each listing metrics identified by their descriptor name
+  scopeMetrics?: {
+    metrics: {
+      descriptor: {
+        name: string;
+      };
+    }[];
+  }[];
+}
+
 export class GeminiCliRunner implements AgentTestRunner {
   private readonly cli: InteractiveCLI;
   private readonly telemetryPath: string;
   private readonly telemetryTimeout = 15000;
 
+  // Determines which tools to start from for this turn so we don't detect tool
+  // calls from previous turns
+  private turnToolIndex = 0;
+
   constructor(
     private readonly testName: string,
     testDir: string,
@@ -29,8 +58,6 @@ export class GeminiCliRunner implements AgentTestRunner {
       },
       mcpServers: {
         firebase: {
-          // TODO: Add a mode where developers can run against their npm run watch command
-          // command: path.resolve(runDir, "../../../../../lib/bin/firebase.js"),
           command: "firebase",
           args: ["experimental:mcp"],
         },
@@ -52,6 +79,8 @@ export class GeminiCliRunner implements AgentTestRunner {
   }
 
   async type(text: string): Promise<void> {
+    // Remember how many tool calls are already logged so that expectToolCalls
+    // only looks at the calls made after this input was typed
+    this.turnToolIndex = this.readToolLogs().length;
     return this.cli.type(text);
   }
 
@@ -67,21 +96,115 @@ export class GeminiCliRunner implements AgentTestRunner {
    * Reads the agent's telemetry file and looks for the given event. Throws if
    * the event is not found
    */
-  async expectTelemetryEvent(eventName: string): Promise<void> {
-    // NOTE: This doesn't take into account "turns" yet. It will likely look
-    // through the entire history, not just the last turn
-    const found = await poll(() => {
-      if (!existsSync(this.telemetryPath)) {
+  async expectToolCalls(tools: ToolDef[]): Promise<void> {
+    // Asserts that every entry in `tools` is satisfied by a tool call logged
+    // since the last call to type(). Each entry may be a bare tool name or an
+    // object matcher, matching the AgentTestRunner interface. If the
+    // expectations are not all met before polling gives up, throwFailure is
+    // called with one message per unmet expectation.
+    await this.waitForTelemetryReady();
+
+    // We still need to poll because telemetry can take time to write each turn
+    let messages: string[] = [];
+    const success = await poll(() => {
+      // Rebuild the list on every attempt so only the latest attempt's
+      // failures are reported
+      messages = [];
+      // Start at this.turnToolIndex so we only read the tools used this turn
+      const toolLogs = this.readToolLogs().slice(this.turnToolIndex);
+      const foundToolNames = toolLogs.map((log) => log.name);
+      for (const toolDef of tools) {
+        const toolName = getToolName(toolDef);
+        const sameNameLogs = toolLogs.filter((log) => log.name === toolName);
+        if (sameNameLogs.length === 0) {
+          // The tool was never called this turn
+          messages.push(
+            `Did not find expected tool call: "${toolName}" in the telemetry log. Found [${foundToolNames}]`,
+          );
+        } else if (!sameNameLogs.some((log) => toolArgumentsMatch(toolDef, log))) {
+          // The tool was called, but no call satisfied the matcher
+          messages.push(
+            `Tool arguments matcher "${getToolArgumentsDebug(toolDef)}" for "${toolName}" did not match any tool results in the telemetry log. All tools are: [${JSON.stringify(toolLogs)}]`,
+          );
+        }
+      }
+      return messages.length === 0;
+    }, this.telemetryTimeout);
+
+    if (!success) {
+      throwFailure(messages.join("\n"));
+    }
+  }
+
+  // Implementation for this is borrowed from the Gemini CLI's test-helper
+  // Polls, with telemetryTimeout as the poll timeout, until the telemetry file
+  // exists and contains the text '"event.name"'. The poll result is not
+  // checked, so this returns normally even if that never happens.
+  private async waitForTelemetryReady() {
+    // Wait for telemetry file to exist and have content
+    await poll(() => {
+      if (!fs.existsSync(this.telemetryPath)) return false;
+      try {
+        const content = readFileSync(this.telemetryPath, "utf-8");
+        // Check if file has at least one event in it
+        return content.includes('"event.name"');
+      } catch {
         return false;
       }
-      const content = readFileSync(this.telemetryPath, "utf-8");
-      return content.includes(eventName);
     }, this.telemetryTimeout);
+  }
+
+  // Implementation for this is borrowed from the Gemini CLI's test-helper
+  // Returns one ParsedToolLog for each parsed telemetry entry whose event name
+  // is "gemini_cli.tool_call" and which has a function_name, in the order the
+  // entries were parsed.
+  private readToolLogs(): ParsedToolLog[] {
+    const parsedLogs = this.readAndParseTelemetryLog();
+    const logs: ParsedToolLog[] = [];
 
-    if (!found) {
-      throw new Error(`Did not find expected telemetry event: "${eventName}" in the telemetry log`);
-    } else {
-      console.log(`  [FOUND] expectTelemetryEvent: ${eventName}`);
+    for (const logData of parsedLogs) {
+      // Look for tool call logs
+      if (
+        logData.attributes?.function_name &&
+        logData.attributes["event.name"] === "gemini_cli.tool_call"
+      ) {
+        logs.push({
+          name: logData.attributes.function_name,
+          // Missing attributes get defaults; note that a missing success flag
+          // is recorded as a failed call
+          args: logData.attributes.function_args ?? "{}",
+          success: logData.attributes.success ?? false,
+          duration_ms: logData.attributes.duration_ms ?? 0,
+        });
+      }
+    }
+
+    return logs;
+  }
+
+  // Implementation for this is borrowed from the Gemini CLI's test-helper
+  // Reads the telemetry file and returns every entry that parses as JSON.
+  // Returns an empty array when the file does not exist. The parsed values are
+  // not validated against ParsedTelemetryLog.
+  private readAndParseTelemetryLog(): ParsedTelemetryLog[] {
+    const logFilePath = this.telemetryPath;
+    if (!logFilePath || !fs.existsSync(logFilePath)) {
+      return [];
+    }
+
+    const content = readFileSync(logFilePath, "utf-8");
+
+    // Split the content into individual JSON objects
+    // They are separated by "}\n{"
+    const jsonObjects = content
+      .split(/}\n{/)
+      .map((obj, index, array) => {
+        // Add back the braces we removed during split
+        if (index > 0) obj = "{" + obj;
+        if (index < array.length - 1) obj = obj + "}";
+        return obj.trim();
+      })
+      // Drop empty pieces, e.g. the single empty string produced by an empty file
+      .filter((obj) => obj);
+
+    const logs: ParsedTelemetryLog[] = [];
+
+    for (const jsonStr of jsonObjects) {
+      try {
+        const logData = JSON.parse(jsonStr);
+        logs.push(logData);
+      } catch (e) {
+        // Skip objects that aren't valid JSON
+      }
     }
+
+    return logs;
   }
 }
diff --git a/scripts/agent-evals/src/runner/interactive-cli.ts b/scripts/agent-evals/src/runner/interactive-cli.ts
@@ -1,6 +1,7 @@
 import * as pty from "node-pty";
 import { IPty } from "node-pty";
 import stripAnsi from "strip-ansi";
+import { throwFailure } from "./logging.js";
 
 export async function poll(predicate: () => boolean, timeout: number): Promise<boolean> {
   const startTime = Date.now();
@@ -96,7 +97,7 @@ export class InteractiveCLI {
     }, this.timeout);
 
     if (!found) {
-      throw new Error(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
+      throwFailure(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
     }
   }
 
@@ -121,7 +122,7 @@ export class InteractiveCLI {
     }, timeout);
 
     if (!stoppedChanging) {
-      throw new Error(`CLI did not stop changing output within ${timeout}ms`);
+      throwFailure(`CLI did not stop changing output within ${timeout}ms`);
     }
   }
 
@@ -140,7 +141,7 @@ export class InteractiveCLI {
     }
 
     if (!found) {
-      throw new Error(`Did not find expected text: "${text}" in the latest output`);
+      throwFailure(`Did not find expected text: "${text}" in the latest output`);
     } else {
       console.log(`  [FOUND] expectText: ${text}`);
     }
diff --git a/scripts/agent-evals/src/runner/logging.ts b/scripts/agent-evals/src/runner/logging.ts
@@ -0,0 +1,19 @@
+// ANSI escape sequences for styling console output. RESET ends whatever
+// styling the other sequences started.
+const COLORS = {
+  RESET: "\x1b[0m",
+  BRIGHT: "\x1b[1m",
+  BLUE: "\x1b[34m",
+  GREEN: "\x1b[32m",
+  RED: "\x1b[31m",
+};
+
+/**
+ * Prints `message` with console.log, preceded by the `color` escape sequence
+ * and followed by COLORS.RESET.
+ */
+function colorLog(color: string, message: string): void {
+  const styled = [color, message, COLORS.RESET].join("");
+  console.log(styled);
+}
+
+/**
+ * Prints `message` in bright red and then throws an Error carrying the same
+ * message. The return type is `never` so that callers' control flow is
+ * narrowed after the call.
+ *
+ * @param message - description of the failure
+ * @throws Error always, with `message` as its message
+ */
+export function throwFailure(message: string): never {
+  // Log this separately because mocha doesn't print errors from failures
+  // that happen before the final repetition. The failure can be helpful to get
+  // early signal that the test is going to fail all repetitions
+  colorLog(COLORS.BRIGHT + COLORS.RED, message);
+  throw new Error(message);
+}
diff --git a/scripts/agent-evals/src/runner/tool-matcher.ts b/scripts/agent-evals/src/runner/tool-matcher.ts
@@ -0,0 +1,68 @@
+/**
+ * An expectation about one tool call: either a bare tool name or an object
+ * with optional extra matchers.
+ */
+export type ToolDef =
+  // Asserts that the tool with this name was called successfully
+  | string
+  | {
+      /** Name of the tool */
+      name: string;
+      /** Asserts that the tool arguments contain this string */
+      argumentContains?: string;
+      /**
+       * Asserts that the tool's success equals this value. When omitted, the
+       * call is asserted to have been successful
+       */
+      successIs?: boolean;
+    };
+
+/** A single tool call as extracted from the telemetry log. */
+export interface ParsedToolLog {
+  /** Name of the tool that was called */
+  name: string;
+  /** The call's arguments as a string; matchers search it as plain text */
+  args: string;
+  /** Whether the call was recorded as successful */
+  success: boolean;
+  /** Recorded duration of the call, in milliseconds going by the field name */
+  duration_ms: number;
+}
+
+/**
+ * Returns the tool name a ToolDef refers to, whether it was given as a bare
+ * string or as an object matcher.
+ */
+export function getToolName(toolDef: ToolDef): string {
+  return typeof toolDef === "string" ? toolDef : toolDef.name;
+}
+
+/**
+ * Describes, for failure messages, the matcher that a ToolDef applies.
+ *
+ * @param toolDef - the expectation to describe
+ * @returns a comma-separated list such as `success=true,contains=foo`
+ */
+export function getToolArgumentsDebug(toolDef: ToolDef): string {
+  // If you just pass a string, assert that the tool was successful
+  if (typeof toolDef === "string") {
+    return "success=true";
+  }
+  // If you don't pass successIs, assert that it was successful. Only an
+  // omitted value falls back to true: an explicit `successIs: false` must be
+  // reported as success=false, which is what toolArgumentsMatch checks.
+  const out = [`success=${toolDef.successIs ?? true}`];
+  if (toolDef.argumentContains) {
+    out.push(`contains=${toolDef.argumentContains}`);
+  }
+  return out.join(",");
+}
+
+/**
+ * Reports whether a logged tool call satisfies the matchers of a ToolDef. The
+ * tool name itself is not compared here.
+ *
+ * @param toolDef - the expectation to check
+ * @param log - the logged tool call to check it against
+ * @returns true when every matcher in `toolDef` is satisfied by `log`
+ */
+export function toolArgumentsMatch(toolDef: ToolDef, log: ParsedToolLog): boolean {
+  // If you just pass a string, assert that the tool was successful
+  if (typeof toolDef === "string") {
+    return Boolean(log.success);
+  }
+
+  const { argumentContains, successIs } = toolDef;
+  if (argumentContains && !log.args.includes(argumentContains)) {
+    return false;
+  }
+  if (successIs !== undefined) {
+    return log.success === successIs;
+  }
+  // If you don't pass successIs, assert that it was successful
+  return Boolean(log.success);
+}
diff --git a/scripts/agent-evals/src/tests/firebase-init.spec.ts b/scripts/agent-evals/src/tests/firebase-init.spec.ts
@@ -19,5 +19,13 @@ describe("/firebase:init", function (this: Mocha.Suite) {
     );
 
     await run.type("Yes that looks good. Use Firebase Project gcli-ext-sam-01");
+    await run.expectToolCalls([
+      "firebase_update_environment",
+      {
+        name: "firebase_read_resources",
+        argumentContains: "firebase://guides/init/backend",
+        successIs: true,
+      },
+    ]);
   });
 });

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`import * as pty from "node-pty";`
`2`	`2`	`import { IPty } from "node-pty";`
`3`	`3`	`import stripAnsi from "strip-ansi";`
	`4`	`+import { throwFailure } from "./logging.js";`
`4`	`5`
`5`	`6`	`export async function poll(predicate: () => boolean, timeout: number): Promise<boolean> {`
`6`	`7`	`const startTime = Date.now();`
`@@ -96,7 +97,7 @@ export class InteractiveCLI {`
`96`	`97`	`}, this.timeout);`
`97`	`98`
`98`	`99`	`if (!found) {`
`99`		- throw new Error(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
	`100`	+ throwFailure(`Did not find expected text: "${text}" in output within ${this.timeout}ms`);
`100`	`101`	`}`
`101`	`102`	`}`
`102`	`103`
`@@ -121,7 +122,7 @@ export class InteractiveCLI {`
`121`	`122`	`}, timeout);`
`122`	`123`
`123`	`124`	`if (!stoppedChanging) {`
`124`		- throw new Error(`CLI did not stop changing output within ${timeout}ms`);
	`125`	+ throwFailure(`CLI did not stop changing output within ${timeout}ms`);
`125`	`126`	`}`
`126`	`127`	`}`
`127`	`128`
`@@ -140,7 +141,7 @@ export class InteractiveCLI {`
`140`	`141`	`}`
`141`	`142`
`142`	`143`	`if (!found) {`
`143`		- throw new Error(`Did not find expected text: "${text}" in the latest output`);
	`144`	+ throwFailure(`Did not find expected text: "${text}" in the latest output`);
`144`	`145`	`} else {`
`145`	`146`	console.log(` [FOUND] expectText: ${text}`);
`146`	`147`	`}`