Skip to content

Commit 81e40ab

Browse files
jackfranklin authored and Devtools-frontend LUCI CQ committed
Eval: update auto-run to output eval format
This CL updates the auto run script with a `--eval` flag which will cause the runner to output a secondary file named `*.eval.json` which then can be moved into the `suites/output` folder and synced with GCP. [email protected] Fixed: 436224413 Change-Id: I2f7c39973e6d17b88579e17e1aae2f3c5433ffb9 Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/6905366 Reviewed-by: Alex Rudenko <[email protected]> Commit-Queue: Jack Franklin <[email protected]>
1 parent 4a33e3c commit 81e40ab

File tree

6 files changed

+134
-104
lines changed

6 files changed

+134
-104
lines changed

DEPS

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,10 @@ deps = {
222222
"bucket": "chrome-devtools-ai-evals",
223223
"objects": [
224224
{
225-
"object_name": "f0e8e7b99dc61f7a943bfdf284552982c63bdf8d6217091f5260bc8ebd84ca9f",
226-
"sha256sum": "af579f30f2384089e1bece67db9afb71b902aa6ff99cb9749d4694ce53783670",
227-
"size_bytes": 3582,
228-
"generation": 1755705853621054
225+
"object_name": "8aaaea341cac9e6dad90a0685f4eeae39fabb9f655761eed9c3af16795a40f14",
226+
"sha256sum": "c2e5b18a77095451dbaae27cea913aace1fc267e8e9e0f0b4f297a6215eb5299",
227+
"size_bytes": 6801,
228+
"generation": 1756718606230139
229229
}
230230
]
231231
},

scripts/ai_assistance/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ This directory contains scripts for the prompt iteration & evaluation process fo
44

55
Mainly, `auto-run/auto-run.ts` script takes example URLs, runs the examples and outputs the results to the `auto-run/data/` directory. Then, the HTML page in `eval/` folder takes these results and presents them in a UI for evaluation.
66

7+
**NOTE: looking for the automatic evaluation suite?**
8+
As of September 2025, we also have an evaluation suite where we can define evaluations to apply to an output and have them automatically evaluated, including using an LLM as judge. See the README in `suites/` for more detail on this.
9+
710
## Running
811

912
**Prerequisites**

scripts/ai_assistance/auto-run/auto-run.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import puppeteer from 'puppeteer-core';
99
import {hideBin} from 'yargs/helpers';
1010
import yargs from 'yargs/yargs';
1111

12+
import {convertRawOutputToEval, type RawOutput} from '../suite/to_eval_output.ts';
1213
import type {
1314
ExampleMetadata, ExecutedExample, IndividualPromptRequestResponse, Logs, RpcGlobalId, RunResult, TestTarget} from
1415
'../types';
@@ -57,6 +58,11 @@ const globalUserArgs =
5758
const,
5859
demandOption: true,
5960
})
61+
.option('eval', {
62+
describe: 'Also output to the format required for the DevTools Eval framework',
63+
boolean: true,
64+
default: false,
65+
})
6066
.parseSync();
6167

6268
const exampleUrls: string[] = [];
@@ -420,6 +426,17 @@ async function main() {
420426
fs.mkdirSync(OUTPUT_DIR);
421427
}
422428
fs.writeFileSync(outputPath, JSON.stringify(output, null, 2));
429+
if (globalUserArgs.eval) {
430+
const convertedOutput = convertRawOutputToEval({
431+
inputFromAutoRun: output as RawOutput,
432+
label: globalUserArgs.label,
433+
});
434+
const evalOutputPath = outputPath.replace('.json', '.eval.json');
435+
fs.writeFileSync(evalOutputPath, JSON.stringify(convertedOutput, null, 2));
436+
console.info(
437+
`\n[Info]: Exported eval output to ${evalOutputPath}`,
438+
);
439+
}
423440
console.info(
424441
`\n[Info]: Finished exporting results to ${outputPath}, it took ${formatElapsedTime()}`,
425442
);

scripts/ai_assistance/suite/README.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ At this time, this is being heavily iterated on and may change rapidly. Chat to
66

77
## Getting started
88

9-
### 1: get the outputs from GCP
9+
### 1: download the outputs from GCP
1010

1111
The actual output files you need to run the suite are hosted in a GCP bucket. The contents are fetched for you by `gclient sync` but only if you set the `checkout_ai_evals` arg in your `.gclient` config:
1212

@@ -37,7 +37,9 @@ Run `cd scripts/ai_assistance && npm run eval-suite` to execute the suite.
3737

3838
## Adding new outputs
3939

40-
Once you have new outputs you want to put into the set, move them into the right place in the `suite/outputs/outputs` folder.:
40+
To get outputs, you should use the auto-run tool but pass the `--eval` flag. This will cause it to output a secondary file named `*.eval.json` that contains the output in the format the evaluation suite expects.
41+
42+
Once you have new outputs you want to put into the set, move them into the right place in the `suite/outputs/outputs` folder.
4143

4244
The structure of files in this folder is like so: `outputs/type/YYYY-MM-DD/label-XYZ.json`.
4345

@@ -51,7 +53,12 @@ Then, run (from the DevTools root directory in this case, but it doesn't matter)
5153
node scripts/ai_assistance/suite/upload_to_gcp.ts
5254
```
5355

54-
This will upload the changes to the GCP bucket and update the `DEPS` file for you, which you should ensure you commit in a CL.
56+
This will upload the changes to the GCP bucket and update the `DEPS` file for you, which you should ensure you commit in a CL. The best workflow is:
57+
58+
1. Generate your new output file(s).
59+
2. Move any new files into `suites/outputs/...`.
60+
3. Use the `upload_to_gcp.ts` script.
61+
4. Commit the `DEPS` change and send the CL for review.
5562

5663
If you get any authorisation errors, run `gsutil.py config` to refresh your authentication status.
5764

scripts/ai_assistance/suite/to_eval_output.ts

Lines changed: 100 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,8 @@ import yargs from 'yargs/yargs';
1111

1212
import type {Conversation, EvalFileOutput, ProcessedQuery} from './types';
1313

14-
const userArgs =
15-
yargs(hideBin(process.argv))
16-
.option('file', {type: 'string', demandOption: true, description: 'The raw JSON file from Auto Run.'})
17-
.option('label', {type: 'string', demandOption: true, desc: 'A human readable, short label to use.'})
18-
.option('pretty', {
19-
type: 'boolean',
20-
demandOption: false,
21-
default: false,
22-
description: 'Output formatted JSON rather than minified.'
23-
})
24-
.parseSync();
25-
26-
const inputPath = path.isAbsolute(userArgs.file) ? userArgs.file : path.join(process.cwd(), userArgs.file);
27-
const contents = fs.readFileSync(inputPath, 'utf8');
28-
29-
const INPUT_HASH = hash(contents, null);
30-
3114
// Note: non-exhaustive.
32-
interface RawOutput {
15+
export interface RawOutput {
3316
metadata: Array<{exampleId: string, explanation: string}>;
3417
examples: Array<{
3518
exampleId: string,
@@ -59,8 +42,8 @@ interface RawOutput {
5942
},
6043
aidaResponse: {
6144
metadata: {
62-
rcpGlobalId: string,
63-
inferenceOptionMetadata: {
45+
rcpGlobalId?: string,
46+
inferenceOptionMetadata?: {
6447
modelId: string,
6548
modelVersion: string,
6649
},
@@ -72,87 +55,108 @@ interface RawOutput {
7255
}>;
7356
}
7457

75-
const json = JSON.parse(contents) as RawOutput;
76-
77-
const examples = json.metadata.map(m => m.exampleId);
78-
79-
const processedExamples: Conversation[] =
80-
examples
81-
.map((exampleIdFromInput, index) => {
82-
const data = json.examples.filter(e => e.exampleId === exampleIdFromInput);
83-
if (!data.length) {
84-
return null;
85-
}
86-
87-
const exampleMetadata = json.metadata[index];
88-
89-
const id = INPUT_HASH + '-' + index;
90-
const chromeVersion = data.at(0)?.request.metadata.client_version;
91-
assert.ok(chromeVersion, 'No client_version');
92-
const modelData = data.at(0)?.aidaResponse.metadata.inferenceOptionMetadata;
93-
assert.ok(modelData, 'No inferenceOptionMetadata');
94-
const processed: Conversation = {
95-
id,
96-
chromeVersion,
97-
explanation: exampleMetadata?.explanation ?? '',
98-
model: {
99-
id: modelData?.modelId,
100-
version: modelData?.modelVersion,
101-
},
102-
queries: [],
103-
};
104-
105-
for (const {request, aidaResponse} of data) {
106-
if (!aidaResponse.completed) {
107-
continue;
108-
}
58+
interface RawToEvalOptions {
59+
inputFromAutoRun: RawOutput;
60+
label: string;
61+
}
10962

110-
const responseText = aidaResponse.explanation?.trim() ?? undefined;
63+
export function convertRawOutputToEval(opts: RawToEvalOptions): EvalFileOutput {
64+
const inputHash = hash(JSON.stringify(opts.inputFromAutoRun));
65+
const exampleIds = opts.inputFromAutoRun.metadata.map(m => m.exampleId);
11166

112-
const query: ProcessedQuery = {
113-
request: {
114-
prompt: request.current_message.parts[0].text,
115-
functionCallResponse: request.current_message.parts[0].functionResponse?.name,
116-
availableFunctionNames: request.function_declarations.map(dec => dec.name),
67+
const processedExamples: Conversation[] =
68+
exampleIds
69+
.map((exampleIdFromInput, index) => {
70+
const data = opts.inputFromAutoRun.examples.filter(e => e.exampleId === exampleIdFromInput);
71+
if (!data.length) {
72+
return null;
73+
}
74+
const exampleMetadata = opts.inputFromAutoRun.metadata[index];
75+
76+
const id = inputHash + '-' + index;
77+
const chromeVersion = data.at(0)?.request.metadata.client_version;
78+
assert.ok(chromeVersion, 'No client_version');
79+
const modelData = data.at(0)?.aidaResponse.metadata.inferenceOptionMetadata;
80+
assert.ok(modelData, 'No inferenceOptionMetadata');
81+
const processed: Conversation = {
82+
id,
83+
chromeVersion,
84+
explanation: exampleMetadata?.explanation ?? '',
85+
model: {
86+
id: modelData?.modelId,
87+
version: modelData?.modelVersion,
11788
},
118-
response: {
119-
rpcGlobalId: aidaResponse.metadata.rcpGlobalId,
120-
text: responseText,
121-
functionCallRequests: aidaResponse.functionCalls?.map(call => {
122-
return {
123-
name: call.name,
124-
args: call.args,
125-
};
126-
}),
127-
}
89+
queries: [],
12890
};
129-
processed.queries.push(query);
130-
}
131-
return processed;
132-
})
133-
.filter(x => x !== null);
134-
135-
const finalOutput: EvalFileOutput = {
136-
metadata: {
137-
createdAt: new Date().toISOString(),
138-
id: hash(processedExamples.map(x => x.id).join(''), 16),
139-
label: userArgs.label,
140-
},
141-
conversations: processedExamples,
142-
};
143-
144-
const stringified = userArgs.pretty ? JSON.stringify(finalOutput, null, 2) : JSON.stringify(finalOutput);
145-
146-
const fileName = `${slug(userArgs.label)}-${finalOutput.metadata.id}.json`;
147-
fs.writeFileSync(path.join(process.cwd(), fileName), stringified, 'utf8');
148-
console.log(`Wrote ${fileName} to disk.`);
149-
150-
function hash(str: string, length: number|null) {
91+
92+
for (const {request, aidaResponse} of data) {
93+
if (!aidaResponse.completed) {
94+
continue;
95+
}
96+
97+
const responseText = aidaResponse.explanation?.trim() ?? undefined;
98+
99+
const query: ProcessedQuery = {
100+
request: {
101+
prompt: request.current_message.parts[0].text,
102+
functionCallResponse: request.current_message.parts[0].functionResponse?.name,
103+
availableFunctionNames: request.function_declarations.map(dec => dec.name),
104+
},
105+
response: {
106+
rpcGlobalId: aidaResponse.metadata.rcpGlobalId ?? '',
107+
text: responseText,
108+
functionCallRequests: aidaResponse.functionCalls?.map(call => {
109+
return {
110+
name: call.name,
111+
args: call.args,
112+
};
113+
}),
114+
}
115+
};
116+
processed.queries.push(query);
117+
}
118+
return processed;
119+
})
120+
.filter(x => x !== null);
121+
const finalOutput: EvalFileOutput = {
122+
metadata: {
123+
createdAt: new Date().toISOString(),
124+
id: hash(processedExamples.map(x => x.id).join('')),
125+
},
126+
conversations: processedExamples,
127+
};
128+
return finalOutput;
129+
}
130+
131+
const isBeingRunOnCommandLine = process.argv[1] === import.meta.url.replace('file://', '');
132+
133+
if (isBeingRunOnCommandLine) {
134+
const userArgs =
135+
yargs(hideBin(process.argv))
136+
.option('file', {type: 'string', demandOption: true, description: 'The raw JSON file from Auto Run.'})
137+
.option('label', {type: 'string', demandOption: true, desc: 'A human readable, short label to use.'})
138+
.option('pretty', {
139+
type: 'boolean',
140+
demandOption: false,
141+
default: false,
142+
description: 'Output formatted JSON rather than minified.'
143+
})
144+
.parseSync();
145+
146+
const inputPath = path.isAbsolute(userArgs.file) ? userArgs.file : path.join(process.cwd(), userArgs.file);
147+
const contents = fs.readFileSync(inputPath, 'utf8');
148+
const finalOutput =
149+
convertRawOutputToEval({inputFromAutoRun: JSON.parse(contents) as RawOutput, label: userArgs.label});
150+
151+
const stringified = userArgs.pretty ? JSON.stringify(finalOutput, null, 2) : JSON.stringify(finalOutput);
152+
const fileName = `${slug(userArgs.label)}-${finalOutput.metadata.id}.json`;
153+
fs.writeFileSync(path.join(process.cwd(), fileName), stringified, 'utf8');
154+
console.log(`Wrote ${fileName} to disk.`);
155+
}
156+
157+
function hash(str: string) {
151158
const hash = crypto.createHash('md5').update(str).digest('hex');
152-
if (length) {
153-
return hash.substring(0, length);
154-
}
155-
return hash;
159+
return hash.substring(0, 15);
156160
}
157161

158162
function slug(str: string): string {

scripts/ai_assistance/suite/types.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ export interface EvalFileOutput {
2424
metadata: {
2525
createdAt: string,
2626
id: string,
27-
label: string,
2827
};
2928
conversations: Conversation[];
3029
}

0 commit comments

Comments
 (0)