Skip to content

Commit 257b994

Browse files
authored
chore: add basic eval (ChromeDevTools#766)
This PR adds a tool, based on the Node.js test runner, that runs a loop to see which tools a model chooses for a given prompt. The expectations are encoded for each prompt. Run `npm run eval` to get results. Currently, only Gemini is supported, and it needs an API key.
1 parent e8e2910 commit 257b994

File tree

7 files changed

+338
-2
lines changed

7 files changed

+338
-2
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ yarn-error.log*
77
lerna-debug.log*
88
.pnpm-debug.log*
99

10+
trace.json
11+
trace.json.gz
12+
1013
# Diagnostic reports (https://nodejs.org/api/report.html)
1114
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
1215

GEMINI.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,15 @@
33
- Use only scripts from `package.json` to run commands.
44
- Use `npm run build` to run tsc and test build.
55
- Use `npm run test` to build and run tests, run all tests to verify correctness.
6-
- use `npm run test path-to-test.ts` to build and run a single test file, for example, `npm run test tests/McpContext.test.ts`.
6+
- Use `npm run test path-to-test.ts` to build and run a single test file, for example, `npm run test tests/McpContext.test.ts`.
7+
- Use `npm run format` to fix formatting and get linting errors.
8+
9+
## Rules for TypeScript
10+
11+
- Do not use `any` type.
12+
- Do not use `as` keyword for type casting.
13+
- Do not use `!` operator for type assertion.
14+
- Do not use `// @ts-ignore` comments.
15+
- Do not use `// @ts-nocheck` comments.
16+
- Do not use `// @ts-expect-error` comments.
17+
- Prefer `for..of` instead of `forEach`.

package-lock.json

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
"test:only": "npm run build && node scripts/test.mjs --test-only",
2222
"test:update-snapshots": "npm run build && node scripts/test.mjs --test-update-snapshots",
2323
"prepare": "node --experimental-strip-types scripts/prepare.ts",
24-
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts"
24+
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts",
25+
"eval": "npm run build && node --experimental-strip-types --test scripts/eval_gemini.ts"
2526
},
2627
"files": [
2728
"build/src",
@@ -39,6 +40,7 @@
3940
"mcpName": "io.github.ChromeDevTools/chrome-devtools-mcp",
4041
"devDependencies": {
4142
"@eslint/js": "^9.35.0",
43+
"@google/generative-ai": "^0.24.1",
4244
"@modelcontextprotocol/sdk": "1.25.2",
4345
"@rollup/plugin-commonjs": "^29.0.0",
4446
"@rollup/plugin-json": "^6.1.0",

scripts/eval_gemini.ts

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import fs from 'node:fs';
8+
import path from 'node:path';
9+
import {describe, test} from 'node:test';
10+
11+
import {
12+
GoogleGenerativeAI,
13+
type FunctionDeclaration,
14+
SchemaType,
15+
} from '@google/generative-ai';
16+
import {Client} from '@modelcontextprotocol/sdk/client/index.js';
17+
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
18+
19+
const ROOT_DIR = path.resolve(import.meta.dirname, '..');
20+
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
21+
22+
// Define schema for our test scenarios
23+
/**
 * A single tool invocation requested by the model, as recorded by the eval
 * runner before the tool is executed.
 */
export interface CapturedFunctionCall {
24+
/** Original MCP tool name (not the sanitized name sent to the model). */
name: string;
25+
/**
 * Arguments the model supplied for the call; the runner records `{}` when
 * the model's args are not a plain object.
 */
args: Record<string, unknown>;
26+
}
27+
28+
/**
 * Shape of the `scenario` object that every eval scenario module exports.
 */
export interface TestScenario {
29+
/** Text sent to the model as the first user message. */
prompt: string;
30+
/** Upper bound on the number of model turns the runner processes. */
maxTurns: number;
31+
/**
 * Called once, after the interaction ends, with every captured tool call in
 * the order the model requested them. Expected to throw (e.g. via
 * `node:assert`) when the calls do not match what the scenario requires.
 */
expectations: (calls: CapturedFunctionCall[]) => void;
32+
}
33+
34+
/**
 * Loads an eval scenario module and returns its exported `scenario` object.
 *
 * @param scenarioPath - Path (or module specifier) of a module that exports a
 *   `scenario` object.
 * @returns The `scenario` export of the module.
 * @throws Error when the module has no `scenario` export, or when the export
 *   lacks a string `prompt`, a numeric `maxTurns` or an `expectations`
 *   function.
 */
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
  // Absolute filesystem paths are not valid ESM specifiers on Windows
  // (`C:\...` is parsed as a URL with scheme `c:`), so hand `import()` a
  // file:// URL instead. Non-absolute specifiers are passed through untouched.
  const {pathToFileURL} = await import('node:url');
  const specifier = path.isAbsolute(scenarioPath)
    ? pathToFileURL(scenarioPath).href
    : scenarioPath;

  // We assume the test file exports a 'scenario' object.
  const module = await import(specifier);
  const scenario = module.scenario;
  if (!scenario) {
    throw new Error(
      `Scenario file ${scenarioPath} does not export a 'scenario' object.`,
    );
  }

  // The import is untyped, so check the shape here rather than letting a
  // malformed scenario fail later in a confusing place (for example an
  // undefined `maxTurns` would make the turn loop silently run zero times).
  if (
    typeof scenario.prompt !== 'string' ||
    typeof scenario.maxTurns !== 'number' ||
    typeof scenario.expectations !== 'function'
  ) {
    throw new Error(
      `Scenario file ${scenarioPath} exports a malformed 'scenario': expected a string 'prompt', a numeric 'maxTurns' and an 'expectations' function.`,
    );
  }
  return scenario;
}
45+
46+
/** Returns true when `v` is a non-null object that is not an array. */
function isRecord(v: unknown): v is Record<string, unknown> {
  return typeof v === 'object' && v !== null && !Array.isArray(v);
}

// JSON Schema keywords that are removed before a schema is sent to Gemini.
const STRIPPED_SCHEMA_KEYWORDS: ReadonlySet<string> = new Set([
  'default',
  'additionalProperties',
  'exclusiveMinimum',
]);

/**
 * Helper to sanitize a schema for Gemini: returns a deep copy of `schema` with
 * the keywords `default`, `additionalProperties` and `exclusiveMinimum`
 * removed at every nesting level.
 *
 * The keys of a `properties` map are parameter names, not schema keywords, so
 * they are always preserved (a parameter called `default` stays in the
 * schema); only the schema attached to each name is cleaned.
 *
 * @param schema - Any value. Values that are not plain objects (primitives,
 *   `null`, top-level arrays) are returned unchanged.
 * @returns The cleaned copy, or `schema` itself when it is not a plain object.
 */
const cleanSchemaRecursive = (schema: unknown): unknown => {
  if (!isRecord(schema)) {
    return schema;
  }

  const out: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(schema)) {
    if (STRIPPED_SCHEMA_KEYWORDS.has(key)) {
      continue;
    }

    if (key === 'properties' && isRecord(value)) {
      // Do not filter by name here: these keys are user-chosen property names.
      const properties: Record<string, unknown> = {};
      for (const [propertyName, propertySchema] of Object.entries(value)) {
        properties[propertyName] = cleanSchemaRecursive(propertySchema);
      }
      out[key] = properties;
    } else if (Array.isArray(value)) {
      out[key] = value.map(item => cleanSchemaRecursive(item));
    } else {
      // Handles nested schema objects; primitives pass straight through.
      out[key] = cleanSchemaRecursive(value);
    }
  }
  return out;
};
77+
78+
/**
 * Maps an MCP tool name onto the form sent to Gemini by replacing every `-`
 * and `.` with `_`.
 */
function sanitizeToolName(name: string): string {
  return name.replace(/[-.]/g, '_');
}

/**
 * Runs one eval scenario end to end:
 *
 * 1. loads the scenario module,
 * 2. starts the built MCP server (`build/src/index.js`) over stdio and lists
 *    its tools,
 * 3. exposes those tools to Gemini as function declarations,
 * 4. sends the scenario prompt and, for up to `scenario.maxTurns` model turns,
 *    executes every tool call the model requests and feeds the results back,
 * 5. passes all captured tool calls to `scenario.expectations`.
 *
 * The MCP client and transport are always closed, whether the run succeeds or
 * fails.
 *
 * @param scenarioPath - Path to the scenario module; resolved to an absolute
 *   path before loading.
 * @param apiKey - API key used to construct the Gemini client.
 * @throws Error when the built MCP server is missing; also rethrows anything
 *   thrown by scenario loading, the MCP client, the Gemini SDK or the
 *   scenario's `expectations` callback.
 */
async function runSingleScenario(
  scenarioPath: string,
  apiKey: string,
): Promise<void> {
  const absolutePath = path.resolve(scenarioPath);
  console.log(`\n### Running Scenario: ${absolutePath} ###`);

  let client: Client | undefined;
  let transport: StdioClientTransport | undefined;

  try {
    const scenario = await loadScenario(absolutePath);

    // Path to the compiled MCP server
    const serverPath = path.join(ROOT_DIR, 'build/src/index.js');
    if (!fs.existsSync(serverPath)) {
      throw new Error(
        `MCP server not found at ${serverPath}. Please run 'npm run build' first.`,
      );
    }

    // Forward the current environment to the server process, dropping
    // entries whose value is undefined.
    const env: Record<string, string> = {};
    for (const [key, value] of Object.entries(process.env)) {
      if (value !== undefined) {
        env[key] = value;
      }
    }

    transport = new StdioClientTransport({
      command: 'node',
      args: [serverPath],
      env,
    });

    client = new Client(
      {name: 'gemini-eval-client', version: '1.0.0'},
      {capabilities: {}},
    );

    await client.connect(transport);

    const toolsResult = await client.listTools();
    const mcpTools = toolsResult.tools;

    // Convert MCP tools to Gemini function declarations
    const functionDeclarations: FunctionDeclaration[] = mcpTools.map(tool => ({
      name: sanitizeToolName(tool.name),
      description: tool.description?.substring(0, 1024) || '',
      parameters: cleanSchemaRecursive({
        type: SchemaType.OBJECT,
        properties:
          isRecord(tool.inputSchema) && 'properties' in tool.inputSchema
            ? tool.inputSchema.properties
            : {},
        required:
          isRecord(tool.inputSchema) &&
          'required' in tool.inputSchema &&
          Array.isArray(tool.inputSchema.required)
            ? tool.inputSchema.required
            : [],
      }) as FunctionDeclaration['parameters'],
    }));

    // Keep a map of sanitized names to original names for execution
    const contentToolsMap = new Map<string, string>();
    for (const tool of mcpTools) {
      contentToolsMap.set(sanitizeToolName(tool.name), tool.name);
    }

    const genAI = new GoogleGenerativeAI(apiKey);
    const model = genAI.getGenerativeModel({
      model: 'gemini-3-pro-preview',
      tools: [{functionDeclarations}],
    });

    const chat = model.startChat({
      systemInstruction: {
        role: 'system',
        parts: [{text: `Use available tools.`}],
      },
    });

    const allCalls: CapturedFunctionCall[] = [];

    // Execute turns
    let turnCount = 0;
    console.log(`\n--- Turn 1 (User) ---`);
    console.log(scenario.prompt);

    let response = (await chat.sendMessage(scenario.prompt)).response;

    while (turnCount < scenario.maxTurns) {
      turnCount++;
      console.log(`\n--- Turn ${turnCount} (Model) ---`);
      const text = response.text();
      if (text) {
        console.log(`Text: ${text}`);
      }

      const functionCalls = response.functionCalls();
      if (!functionCalls || functionCalls.length === 0) {
        console.log('No tool calls. Interaction finished.');
        break;
      }

      console.log(`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`);

      const functionResponses = [];
      for (const call of functionCalls) {
        const originalName = contentToolsMap.get(call.name);
        if (!originalName) {
          // Report the problem to the model instead of aborting the run.
          console.error(`Unknown tool called: ${call.name}`);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: `Unknown tool: ${call.name}`},
            },
          });
          continue;
        }

        const safeArgs = isRecord(call.args) ? call.args : {};

        console.log(
          `Executing tool: ${originalName} with args: ${JSON.stringify(call.args)}`,
        );

        // Record the call before executing it so that a failing tool still
        // counts as a call the model chose to make.
        allCalls.push({
          name: originalName,
          args: safeArgs,
        });

        try {
          const toolResult = await client.callTool({
            name: originalName,
            arguments: safeArgs,
          });

          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {name: call.name, content: toolResult},
            },
          });
        } catch (e) {
          const errorMessage = e instanceof Error ? e.message : String(e);
          console.error(`Error executing tool ${originalName}:`, e);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: errorMessage},
            },
          });
        }
      }

      if (turnCount >= scenario.maxTurns) {
        // Turn budget exhausted: the model's reply to these tool outputs
        // would never be read, so skip the extra API round trip.
        break;
      }

      // Send tool results back
      console.log(`Sending ${functionResponses.length} tool outputs back...`);
      response = (await chat.sendMessage(functionResponses)).response;
    }

    console.log('\nVerifying expectations...');
    scenario.expectations(allCalls);
  } finally {
    // Close the transport even when closing the client throws, so the
    // server child process is not left behind.
    try {
      await client?.close();
    } finally {
      await transport?.close();
    }
  }
}
253+
// The eval cannot talk to Gemini without a key, so fail at module load,
// before any test is registered.
const apiKey = process.env.GEMINI_API_KEY;
if (apiKey === undefined || apiKey === '') {
  throw new Error('GEMINI_API_KEY environment variable is required.');
}

// Register one test per scenario module (.ts or .js) found in SCENARIOS_DIR;
// the test is named after the file.
void describe('Gemini Eval Scenarios', () => {
  const isScenarioModule = (name: string): boolean => /\.(ts|js)$/.test(name);

  for (const name of fs.readdirSync(SCENARIOS_DIR)) {
    if (!isScenarioModule(name)) {
      continue;
    }
    void test(name, () =>
      runSingleScenario(path.join(SCENARIOS_DIR, name), apiKey),
    );
  }
});
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
import assert from 'node:assert';
7+
8+
import type {TestScenario} from '../eval_gemini.ts';
9+
10+
/**
 * Scenario: a plain navigation request. Passes only when the model makes
 * exactly one tool call, `navigate_page`, with the requested URL as its sole
 * argument.
 */
export const scenario: TestScenario = {
  prompt: 'Navigate to https://developers.chrome.com and tell me if it worked.',
  maxTurns: 1,
  expectations(calls) {
    const expectedCalls = [
      {name: 'navigate_page', args: {url: 'https://developers.chrome.com'}},
    ];
    assert.deepStrictEqual(calls, expectedCalls);
  },
};
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
import assert from 'node:assert';
7+
8+
import type {TestScenario} from '../eval_gemini.ts';
9+
10+
/**
 * Scenario: a performance check. Passes only when the model makes exactly two
 * tool calls: first `navigate_page` or `new_page`, then
 * `performance_start_trace`. Arguments are not inspected.
 */
export const scenario: TestScenario = {
  prompt: 'Check the performance of https://developers.chrome.com',
  maxTurns: 2,
  expectations: calls => {
    // Collect the names up front so that every failure message can show what
    // the model actually called.
    const names = calls.map(call => call.name);
    assert.strictEqual(
      names.length,
      2,
      `Expected exactly 2 tool calls, got ${names.length}: [${names.join(', ')}]`,
    );

    const [first, second] = names;
    assert.ok(
      first === 'navigate_page' || first === 'new_page',
      `Expected the first call to be 'navigate_page' or 'new_page', got '${first}'`,
    );
    assert.strictEqual(
      second,
      'performance_start_trace',
      `Expected the second call to be 'performance_start_trace', got '${second}'`,
    );
  },
};

0 commit comments

Comments
 (0)