Skip to content

Commit a534ab6

Browse files
jackfranklin authored and Devtools-frontend LUCI CQ committed
AI: basic building blocks of an evaluation framework
This CL lays the building blocks for a primitive evaluation framework that we can build out. Its structure is as follows: 1. Outputs from DevTools get gathered by the auto-ai tool (this CL does not change any of that functionality). 2. You then take those outputs, and run them through to_eval_output.ts script. This creates a JSON file containing the output we need for the evaluation suite (the full output is much more verbose and contains details that are not important for evaluation). 3. You then move the output into the right place: `outputs/type/YYYY-MM-DD/label/`. Type here should map roughly to an agent, e.g. "performance". Within each `label/` folder, you can have any number of output files. You could have 1 file with 3 examples, 5 files with 5 examples, or any combination. It doesn't matter. The tool will parse them all out. 4. You then write your evaluation suite using two helpers: - `evalGroup` loads the outputs you want to test. - `itEval` lets you define the evals. Currently you can evaluate if a function was called, and you can use the LLMComparison to judge the response. 5. To use the LLM Comparison, you define instructions in `suite/instructions/X.md`. There are some shared instructions that get prepended to every command. Running it gives an output like this: Results for: performance/lcp-breakdown ┌────────────┬────────────────────────────────────────────────────┐ │ (index) │ getMainThreadActivity │ is an accurate response │ ├────────────┼────────────────────────────────────────────────────┤ │ 2025-07-10 │ '1 / 1 passed' │ 1.0 average from 1 inputs. │ │ 2025-07-28 │ '1 / 1 passed' │ 2.0 average from 1 inputs. │ │ 2025-08-01 │ '3 / 3 passed' │ 3.0 average from 3 inputs. │ └────────────┴────────────────────────────────────────────────────┘ This CL also doesn't propose a robust scoring system; I am still thinking about that. I don't know if a table like above is enough, or if we want to try to produce an overall score for an example's evaluation. 
Bug: 425270170 Change-Id: I55c3cb2f47b759bc0854ef133104401b1932dc87 Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/6810886 Auto-Submit: Jack Franklin <[email protected]> Reviewed-by: Ergün Erdoğmuş <[email protected]> Reviewed-by: Alex Rudenko <[email protected]> Commit-Queue: Jack Franklin <[email protected]>
1 parent e9b4e2b commit a534ab6

File tree

13 files changed

+702
-2
lines changed

13 files changed

+702
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ npm-debug.log
2828
/scripts/ai_assistance/auto-run/data
2929
/scripts/ai_assistance/performance-trace-downloads
3030
/scripts/ai_assistance/auto-run/performance-trace-downloads
31+
/scripts/ai_assistance/suite/outputs/**/*.json
3132

3233
/build
3334
/buildtools

scripts/ai_assistance/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"type": "module",
33
"scripts": {
44
"auto-run": "node --no-warnings --experimental-strip-types auto-run/auto-run.ts",
5-
"auto-run:test": "npx --node-options='--no-warnings --experimental-strip-types' mocha auto-run/**/*.test.ts"
5+
"auto-run:test": "npx --node-options='--no-warnings --experimental-strip-types' mocha auto-run/**/*.test.ts",
6+
"eval-suite": "node --no-warnings --experimental-strip-types suite/*.eval.ts"
67
}
78
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// Copyright 2025 The Chromium Authors
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file.
4+
5+
import assert from 'node:assert';
6+
7+
import {loadInstructions} from '../instructions/load.ts';
8+
import type {Conversation} from '../types';
9+
10+
import {generateGeminiContent} from './gemini.ts';
11+
import {getMarkdownConversation, getOutputs, type Output} from './outputs.ts';
12+
13+
abstract class Evaluator {}
14+
15+
export class FunctionCalled extends Evaluator {
16+
static nameOnly(example: Conversation, funcName: string): boolean {
17+
return example.queries.some(q => {
18+
return q.response.functionCallRequests?.some(call => call.name === funcName);
19+
});
20+
}
21+
}
22+
23+
export class LLMComparison extends Evaluator {
24+
static async judge(example: Conversation, prompt: string): Promise<{score: number, reasons: string}> {
25+
const scoringInstructions = loadInstructions('scoring');
26+
const exampleAsMarkdown = getMarkdownConversation(example);
27+
const response = await generateGeminiContent(
28+
`${scoringInstructions}
29+
30+
${prompt}.
31+
32+
## Conversation to score:
33+
${exampleAsMarkdown}`,
34+
'gemini-2.5-flash', {
35+
type: 'object',
36+
properties: {
37+
score: {type: 'number', description: 'A numerical score assigned by the AI.'},
38+
reasons: {type: 'string', description: 'A string containing the reasons for the assigned score.'}
39+
},
40+
required: ['score', 'reasons']
41+
});
42+
const r = JSON.parse(response) as {score: number, reasons: string};
43+
return {score: r.score, reasons: r.reasons};
44+
}
45+
}
46+
47+
interface GroupTestState {
48+
store: ResultStore;
49+
outputsByDate: Partial<Record<string, Output[]>>;
50+
}
51+
52+
let state: GroupTestState|null = null;
53+
54+
export type ItEval = {
55+
test: string,
56+
}&({
57+
succeed: (example: Conversation) => boolean,
58+
}|{
59+
judge: (example: Conversation) => Promise<{score: number, reasons: string}>,
60+
});
61+
62+
export async function itEval(config: ItEval): Promise<void> {
63+
assert.ok(state);
64+
if ('succeed' in config) {
65+
for (const [date, outputs] of Object.entries(state.outputsByDate)) {
66+
if (!outputs) {
67+
continue;
68+
}
69+
70+
const allDevToolsConversations = outputs.flatMap(o => o.contents.conversations);
71+
72+
let total = 0;
73+
let succeeded = 0;
74+
for (const conversation of allDevToolsConversations) {
75+
total++;
76+
if (config.succeed(conversation)) {
77+
succeeded++;
78+
}
79+
state.store.saveResult(config.test, date, {type: 'BINARY', success: succeeded, total});
80+
}
81+
}
82+
} else if ('judge' in config) {
83+
for (const [date, outputs] of Object.entries(state.outputsByDate)) {
84+
if (!outputs) {
85+
continue;
86+
}
87+
const allDevToolsConversations = outputs.flatMap(o => o.contents.conversations);
88+
const scores = await Promise.all(allDevToolsConversations.map(async example => {
89+
const result = await config.judge(example);
90+
return result.score;
91+
}));
92+
const totalOfAllScores = scores.reduce((acc: number, score: number) => acc + score, 0);
93+
const average = totalOfAllScores / scores.length;
94+
state.store.saveResult(config.test, date, {type: 'JUDGE', average, allScores: scores, total: totalOfAllScores});
95+
}
96+
}
97+
}
98+
99+
/**
 * Identifies which recorded outputs an eval group should load, mapping to the
 * on-disk layout `outputs/<type>/<YYYY-MM-DD>/<label>/`.
 */
export interface GroupConfig {
  // Roughly maps to an agent, e.g. "performance".
  type: string;
  // The example's folder name within each date folder.
  label: string;
}
103+
104+
export async function evalGroup(config: GroupConfig, cb: (() => Promise<void>)): Promise<void> {
105+
const store = new ResultStore(config.type, config.label);
106+
const outputs = await getOutputs(config.type, config.label);
107+
const outputsByDate = Object.groupBy(outputs, o => o.dateFolder);
108+
state = {
109+
store,
110+
outputsByDate,
111+
};
112+
113+
await cb();
114+
printResults(state.store);
115+
}
116+
117+
function log(indentation: number, message: string): void {
118+
console.log(`${' '.repeat(indentation)}${message}`);
119+
}
120+
121+
function printResults(store: ResultStore): void {
122+
log(0, `Results for: ${store.type}/${store.label}`);
123+
124+
// Structures the results in Date => <Test Name, Test Output>.
125+
const dataForTable: Record<string, Record<string, string>> = {};
126+
127+
for (const [test, dateToResult] of store.results) {
128+
for (const [date, result] of dateToResult) {
129+
dataForTable[date] ??= {};
130+
switch (result.type) {
131+
case 'BINARY':
132+
dataForTable[date][test] = `${result.success} / ${result.total} passed`;
133+
break;
134+
case 'JUDGE':
135+
dataForTable[date][test] = `${result.average.toFixed(1)} average from ${result.allScores.length} inputs.`;
136+
break;
137+
default:
138+
throw new Error('Unknown result type!');
139+
}
140+
}
141+
}
142+
console.table(dataForTable);
143+
}
144+
145+
type Result = {
146+
type: 'BINARY',
147+
total: number,
148+
success: number,
149+
}|{
150+
type: 'JUDGE',
151+
average: number,
152+
total: number,
153+
allScores: number[],
154+
};
155+
156+
class ResultStore {
157+
// Map of testName => YYYY-MM-DD => Result
158+
#results = new Map<string, Map<string, Result>>();
159+
#type: string;
160+
#label: string;
161+
162+
constructor(type: string, label: string) {
163+
this.#type = type;
164+
this.#label = label;
165+
}
166+
167+
get type(): string {
168+
return this.#type;
169+
}
170+
get label(): string {
171+
return this.#label;
172+
}
173+
174+
get results(): ReadonlyMap<string, ReadonlyMap<string, Result>> {
175+
return this.#results;
176+
}
177+
178+
saveResult(testName: string, dateFolder: string, result: Result): void {
179+
const forTest = this.#results.get(testName) ?? new Map<string, Result>();
180+
forTest.set(dateFolder, result);
181+
this.#results.set(testName, forTest);
182+
}
183+
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// Copyright 2025 The Chromium Authors
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file.
4+
5+
/**
6+
* Makes requests to the Gemini API. Assumes process.env.GEMINI_API_KEY is available.
7+
* Note: the reason we do not use any Google Gemini SDK here is to save on the
8+
* node module install & committing it to the repo. Our API usage of Gemini is
9+
* lightweight so it doesn't feel worth it vs just wrapping the XHR requests
10+
* ourselves.
11+
*/
12+
13+
// Request and response shapes for the Gemini generateContent REST API, as
// used by generateGeminiContent() below. Only the fields this file actually
// reads or writes are modelled.

// One chunk of prompt text within a request content entry.
interface Part {
  text: string;
}

// A single content entry in the request; holds the prompt parts.
interface Content {
  parts: Part[];
}

// The JSON body POSTed to the generateContent endpoint.
interface GenerateContentRequestBody {
  contents: Content[];
  // Only present when the caller supplies a JSON schema for the response;
  // see buildGeminiRequestBody().
  generationConfig?: {
    responseMimeType: string,
    responseSchema: object,
  };
}

// One chunk of generated text within a response candidate.
interface CandidatePart {
  text: string;
}

// The content of a response candidate; role is set by the API.
interface CandidateContent {
  parts: CandidatePart[];
  role: string;
}

// A single generated response candidate.
interface Candidate {
  content: CandidateContent;
  finishReason: string;
  index: number;
}

// Top-level response from generateContent; only candidates[0] is consumed.
interface GenerateContentResponse {
  candidates: Candidate[];
}
48+
49+
/**
50+
* Helper function to construct the request body for the Gemini generateContent API.
51+
*
52+
* @param promptText The text prompt for the Gemini model.
53+
* @param jsonSchema An optional JSON schema to define the expected response structure.
54+
* @returns The constructed GenerateContentRequestBody object.
55+
*/
56+
function buildGeminiRequestBody(promptText: string, jsonSchema?: object): GenerateContentRequestBody {
57+
const requestBody: GenerateContentRequestBody = {
58+
contents: [
59+
{
60+
parts: [
61+
{
62+
text: promptText,
63+
},
64+
],
65+
},
66+
],
67+
};
68+
69+
if (jsonSchema) {
70+
requestBody.generationConfig = {
71+
responseMimeType: 'application/json',
72+
responseSchema: jsonSchema,
73+
};
74+
}
75+
76+
return requestBody;
77+
}
78+
79+
/**
80+
* Makes a request to the Gemini API's generateContent endpoint.
81+
*
82+
* @param promptText The text prompt to send to the Gemini model.
83+
* @param modelName The name of the Gemini model to use (e.g., 'gemini-pro', 'gemini-1.5-flash').
84+
* @returns A Promise that resolves to the generated text, or an error string.
85+
*/
86+
export async function generateGeminiContent(
87+
promptText: string, modelName = 'gemini-2.5-flash', jsonSchema?: object): Promise<string> {
88+
const apiKey = process.env.GEMINI_API_KEY;
89+
90+
if (!apiKey) {
91+
throw new Error('GEMINI_API_KEY environment variable is not set. Please provide your API key.');
92+
}
93+
94+
const apiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent`;
95+
96+
const requestBody = buildGeminiRequestBody(promptText, jsonSchema);
97+
98+
try {
99+
const response = await fetch(apiUrl, {
100+
method: 'POST',
101+
headers: {
102+
'Content-Type': 'application/json',
103+
'x-goog-api-key': apiKey,
104+
},
105+
body: JSON.stringify(requestBody),
106+
});
107+
108+
if (!response.ok) {
109+
const errorData = await response.json();
110+
throw new Error(`API error: ${response.status} ${response.statusText} - ${JSON.stringify(errorData)}`);
111+
}
112+
113+
const data: GenerateContentResponse = await response.json();
114+
115+
// Safely access the generated content
116+
return (
117+
data.candidates?.[0]?.content?.parts?.[0]?.text || 'No content generated or content not in expected format.');
118+
} catch (error) {
119+
console.error('Error generating content:', error);
120+
return `Error: Could not generate content. Details: ${error instanceof Error ? error.message : String(error)}`;
121+
}
122+
}

0 commit comments

Comments
 (0)