Skip to content

Commit 81e40ab

Browse files
jackfranklin authored and Devtools-frontend LUCI CQ committed
Eval: update auto-run to output eval format
This CL updates the auto run script with a `--eval` flag which will cause the runner to output a secondary file named `*.eval.json` which then can be moved into the `suites/output` folder and synced with GCP. [email protected] Fixed: 436224413 Change-Id: I2f7c39973e6d17b88579e17e1aae2f3c5433ffb9 Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/6905366 Reviewed-by: Alex Rudenko <[email protected]> Commit-Queue: Jack Franklin <[email protected]>
1 parent 4a33e3c commit 81e40ab

File tree

6 files changed

+134
-104
lines changed

6 files changed

+134
-104
lines changed

DEPS

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,10 @@ deps = {
222222
"bucket": "chrome-devtools-ai-evals",
223223
"objects": [
224224
{
225-
"object_name": "f0e8e7b99dc61f7a943bfdf284552982c63bdf8d6217091f5260bc8ebd84ca9f",
226-
"sha256sum": "af579f30f2384089e1bece67db9afb71b902aa6ff99cb9749d4694ce53783670",
227-
"size_bytes": 3582,
228-
"generation": 1755705853621054
225+
"object_name": "8aaaea341cac9e6dad90a0685f4eeae39fabb9f655761eed9c3af16795a40f14",
226+
"sha256sum": "c2e5b18a77095451dbaae27cea913aace1fc267e8e9e0f0b4f297a6215eb5299",
227+
"size_bytes": 6801,
228+
"generation": 1756718606230139
229229
}
230230
]
231231
},

scripts/ai_assistance/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ This directory contains scripts for the prompt iteration & evaluation process fo
44

55
Mainly, `auto-run/auto-run.ts` script takes example URLs, runs the examples and outputs the results to the `auto-run/data/` directory. Then, the HTML page in `eval/` folder takes these results and presents them in a UI for evaluation.
66

7+
**NOTE: looking for the automatic evaluation suite?**
8+
As of September 2025, we also have an evaluation suite where we can define evaluations to apply to an output and have them automatically evaluated, including using an LLM as judge. See the README in `suites/` for more detail on this.
9+
710
## Running
811

912
**Prerequisites**

scripts/ai_assistance/auto-run/auto-run.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import puppeteer from 'puppeteer-core';
99
import {hideBin} from 'yargs/helpers';
1010
import yargs from 'yargs/yargs';
1111

12+
import {convertRawOutputToEval, type RawOutput} from '../suite/to_eval_output.ts';
1213
import type {
1314
ExampleMetadata, ExecutedExample, IndividualPromptRequestResponse, Logs, RpcGlobalId, RunResult, TestTarget} from
1415
'../types';
@@ -57,6 +58,11 @@ const globalUserArgs =
5758
const,
5859
demandOption: true,
5960
})
61+
.option('eval', {
62+
describe: 'Also output to the format required for the DevTools Eval framework',
63+
boolean: true,
64+
default: false,
65+
})
6066
.parseSync();
6167

6268
const exampleUrls: string[] = [];
@@ -420,6 +426,17 @@ async function main() {
420426
fs.mkdirSync(OUTPUT_DIR);
421427
}
422428
fs.writeFileSync(outputPath, JSON.stringify(output, null, 2));
429+
if (globalUserArgs.eval) {
430+
const convertedOutput = convertRawOutputToEval({
431+
inputFromAutoRun: output as RawOutput,
432+
label: globalUserArgs.label,
433+
});
434+
const evalOutputPath = outputPath.replace('.json', '.eval.json');
435+
fs.writeFileSync(evalOutputPath, JSON.stringify(convertedOutput, null, 2));
436+
console.info(
437+
`\n[Info]: Exported eval output to ${evalOutputPath}`,
438+
);
439+
}
423440
console.info(
424441
`\n[Info]: Finished exporting results to ${outputPath}, it took ${formatElapsedTime()}`,
425442
);

scripts/ai_assistance/suite/README.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ At this time, this is being heavily iterated on and may change rapidly. Chat to
66

77
## Getting started
88

9-
### 1: get the outputs from GCP
9+
### 1: download the outputs from GCP
1010

1111
The actual output files you need to run the suite are hosted in a GCP bucket. The contents are fetched for you by `gclient sync` but only if you set the `checkout_ai_evals` arg in your `.gclient` config:
1212

@@ -37,7 +37,9 @@ Run `cd scripts/ai_assistance && npm run eval-suite` to execute the suite.
3737

3838
## Adding new outputs
3939

40-
Once you have new outputs you want to put into the set, move them into the right place in the `suite/outputs/outputs` folder.:
40+
To get outputs, you should use the auto-run tool but pass the `--eval` flag. This will cause it to output a secondary file named `*.eval.json` that contains the output in the format the evaluation suite expects.
41+
42+
Once you have new outputs you want to put into the set, move them into the right place in the `suite/outputs/outputs` folder.
4143

4244
The structure of files in this folder is like so: `outputs/type/YYYY-MM-DD/label-XYZ.json`.
4345

@@ -51,7 +53,12 @@ Then, run (from the DevTools root directory in this case, but it doesn't matter)
5153
node scripts/ai_assistance/suite/upload_to_gcp.ts
5254
```
5355

54-
This will upload the changes to the GCP bucket and update the `DEPS` file for you, which you should ensure you commit in a CL.
56+
This will upload the changes to the GCP bucket and update the `DEPS` file for you, which you should ensure you commit in a CL. The best workflow is:
57+
58+
1. Generate your new output file(s).
59+
2. Move any new files into `suites/outputs/...`.
60+
3. Use the `upload_to_gcp.ts` script.
61+
4. Commit the `DEPS` change and send the CL for review.
5562

5663
If you get any authorisation errors, run `gsutil.py config` to refresh your authentication status.
5764

scripts/ai_assistance/suite/to_eval_output.ts

Lines changed: 100 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,8 @@ import yargs from 'yargs/yargs';
1111

1212
import type {Conversation, EvalFileOutput, ProcessedQuery} from './types';
1313

14-
const userArgs =
15-
yargs(hideBin(process.argv))
16-
.option('file', {type: 'string', demandOption: true, description: 'The raw JSON file from Auto Run.'})
17-
.option('label', {type: 'string', demandOption: true, desc: 'A human readable, short label to use.'})
18-
.option('pretty', {
19-
type: 'boolean',
20-
demandOption: false,
21-
default: false,
22-
description: 'Output formatted JSON rather than minified.'
23-
})
24-
.parseSync();
25-
26-
const inputPath = path.isAbsolute(userArgs.file) ? userArgs.file : path.join(process.cwd(), userArgs.file);
27-
const contents = fs.readFileSync(inputPath, 'utf8');
28-
29-
const INPUT_HASH = hash(contents, null);
30-
3114
// Note: non-exhaustive.
32-
interface RawOutput {
15+
export interface RawOutput {
3316
metadata: Array<{exampleId: string, explanation: string}>;
3417
examples: Array<{
3518
exampleId: string,
@@ -59,8 +42,8 @@ interface RawOutput {
5942
},
6043
aidaResponse: {
6144
metadata: {
62-
rcpGlobalId: string,
63-
inferenceOptionMetadata: {
45+
rcpGlobalId?: string,
46+
inferenceOptionMetadata?: {
6447
modelId: string,
6548
modelVersion: string,
6649
},
@@ -72,87 +55,108 @@ interface RawOutput {
7255
}>;
7356
}
7457

75-
const json = JSON.parse(contents) as RawOutput;
76-
77-
const examples = json.metadata.map(m => m.exampleId);
78-
79-
const processedExamples: Conversation[] =
80-
examples
81-
.map((exampleIdFromInput, index) => {
82-
const data = json.examples.filter(e => e.exampleId === exampleIdFromInput);
83-
if (!data.length) {
84-
return null;
85-
}
86-
87-
const exampleMetadata = json.metadata[index];
88-
89-
const id = INPUT_HASH + '-' + index;
90-
const chromeVersion = data.at(0)?.request.metadata.client_version;
91-
assert.ok(chromeVersion, 'No client_version');
92-
const modelData = data.at(0)?.aidaResponse.metadata.inferenceOptionMetadata;
93-
assert.ok(modelData, 'No inferenceOptionMetadata');
94-
const processed: Conversation = {
95-
id,
96-
chromeVersion,
97-
explanation: exampleMetadata?.explanation ?? '',
98-
model: {
99-
id: modelData?.modelId,
100-
version: modelData?.modelVersion,
101-
},
102-
queries: [],
103-
};
104-
105-
for (const {request, aidaResponse} of data) {
106-
if (!aidaResponse.completed) {
107-
continue;
108-
}
58+
interface RawToEvalOptions {
59+
inputFromAutoRun: RawOutput;
60+
label: string;
61+
}
10962

110-
const responseText = aidaResponse.explanation?.trim() ?? undefined;
63+
export function convertRawOutputToEval(opts: RawToEvalOptions): EvalFileOutput {
64+
const inputHash = hash(JSON.stringify(opts.inputFromAutoRun));
65+
const exampleIds = opts.inputFromAutoRun.metadata.map(m => m.exampleId);
11166

112-
const query: ProcessedQuery = {
113-
request: {
114-
prompt: request.current_message.parts[0].text,
115-
functionCallResponse: request.current_message.parts[0].functionResponse?.name,
116-
availableFunctionNames: request.function_declarations.map(dec => dec.name),
67+
const processedExamples: Conversation[] =
68+
exampleIds
69+
.map((exampleIdFromInput, index) => {
70+
const data = opts.inputFromAutoRun.examples.filter(e => e.exampleId === exampleIdFromInput);
71+
if (!data.length) {
72+
return null;
73+
}
74+
const exampleMetadata = opts.inputFromAutoRun.metadata[index];
75+
76+
const id = inputHash + '-' + index;
77+
const chromeVersion = data.at(0)?.request.metadata.client_version;
78+
assert.ok(chromeVersion, 'No client_version');
79+
const modelData = data.at(0)?.aidaResponse.metadata.inferenceOptionMetadata;
80+
assert.ok(modelData, 'No inferenceOptionMetadata');
81+
const processed: Conversation = {
82+
id,
83+
chromeVersion,
84+
explanation: exampleMetadata?.explanation ?? '',
85+
model: {
86+
id: modelData?.modelId,
87+
version: modelData?.modelVersion,
11788
},
118-
response: {
119-
rpcGlobalId: aidaResponse.metadata.rcpGlobalId,
120-
text: responseText,
121-
functionCallRequests: aidaResponse.functionCalls?.map(call => {
122-
return {
123-
name: call.name,
124-
args: call.args,
125-
};
126-
}),
127-
}
89+
queries: [],
12890
};
129-
processed.queries.push(query);
130-
}
131-
return processed;
132-
})
133-
.filter(x => x !== null);
134-
135-
const finalOutput: EvalFileOutput = {
136-
metadata: {
137-
createdAt: new Date().toISOString(),
138-
id: hash(processedExamples.map(x => x.id).join(''), 16),
139-
label: userArgs.label,
140-
},
141-
conversations: processedExamples,
142-
};
143-
144-
const stringified = userArgs.pretty ? JSON.stringify(finalOutput, null, 2) : JSON.stringify(finalOutput);
145-
146-
const fileName = `${slug(userArgs.label)}-${finalOutput.metadata.id}.json`;
147-
fs.writeFileSync(path.join(process.cwd(), fileName), stringified, 'utf8');
148-
console.log(`Wrote ${fileName} to disk.`);
149-
150-
function hash(str: string, length: number|null) {
91+
92+
for (const {request, aidaResponse} of data) {
93+
if (!aidaResponse.completed) {
94+
continue;
95+
}
96+
97+
const responseText = aidaResponse.explanation?.trim() ?? undefined;
98+
99+
const query: ProcessedQuery = {
100+
request: {
101+
prompt: request.current_message.parts[0].text,
102+
functionCallResponse: request.current_message.parts[0].functionResponse?.name,
103+
availableFunctionNames: request.function_declarations.map(dec => dec.name),
104+
},
105+
response: {
106+
rpcGlobalId: aidaResponse.metadata.rcpGlobalId ?? '',
107+
text: responseText,
108+
functionCallRequests: aidaResponse.functionCalls?.map(call => {
109+
return {
110+
name: call.name,
111+
args: call.args,
112+
};
113+
}),
114+
}
115+
};
116+
processed.queries.push(query);
117+
}
118+
return processed;
119+
})
120+
.filter(x => x !== null);
121+
const finalOutput: EvalFileOutput = {
122+
metadata: {
123+
createdAt: new Date().toISOString(),
124+
id: hash(processedExamples.map(x => x.id).join('')),
125+
},
126+
conversations: processedExamples,
127+
};
128+
return finalOutput;
129+
}
130+
131+
const isBeingRunOnCommandLine = process.argv[1] === import.meta.url.replace('file://', '');
132+
133+
if (isBeingRunOnCommandLine) {
134+
const userArgs =
135+
yargs(hideBin(process.argv))
136+
.option('file', {type: 'string', demandOption: true, description: 'The raw JSON file from Auto Run.'})
137+
.option('label', {type: 'string', demandOption: true, desc: 'A human readable, short label to use.'})
138+
.option('pretty', {
139+
type: 'boolean',
140+
demandOption: false,
141+
default: false,
142+
description: 'Output formatted JSON rather than minified.'
143+
})
144+
.parseSync();
145+
146+
const inputPath = path.isAbsolute(userArgs.file) ? userArgs.file : path.join(process.cwd(), userArgs.file);
147+
const contents = fs.readFileSync(inputPath, 'utf8');
148+
const finalOutput =
149+
convertRawOutputToEval({inputFromAutoRun: JSON.parse(contents) as RawOutput, label: userArgs.label});
150+
151+
const stringified = userArgs.pretty ? JSON.stringify(finalOutput, null, 2) : JSON.stringify(finalOutput);
152+
const fileName = `${slug(userArgs.label)}-${finalOutput.metadata.id}.json`;
153+
fs.writeFileSync(path.join(process.cwd(), fileName), stringified, 'utf8');
154+
console.log(`Wrote ${fileName} to disk.`);
155+
}
156+
157+
function hash(str: string) {
151158
const hash = crypto.createHash('md5').update(str).digest('hex');
152-
if (length) {
153-
return hash.substring(0, length);
154-
}
155-
return hash;
159+
return hash.substring(0, 15);
156160
}
157161

158162
function slug(str: string): string {

scripts/ai_assistance/suite/types.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ export interface EvalFileOutput {
2424
metadata: {
2525
createdAt: string,
2626
id: string,
27-
label: string,
2827
};
2928
conversations: Conversation[];
3029
}

0 commit comments

Comments
 (0)