Skip to content

Commit a534ab6

Browse files
jackfranklin authored and Devtools-frontend LUCI CQ committed
AI: basic building blocks of an evaluation framework
This CL lays the building blocks for a primitive evaluation framework that we can build out. Its structure is as follows: 1. Outputs from DevTools get gathered by the auto-ai tool (this CL does not change any of that functionality). 2. You then take those outputs, and run them through to_eval_output.ts script. This creates a JSON file containing the output we need for the evaluation suite (the full output is much more verbose and contains details that are not important for evaluation). 3. You then move the output into the right place: `outputs/type/YYYY-MM-DD/label/`. Type here should map roughly to an agent, e.g. "performance". Within each `label/` folder, you can have any number of output files. You could have 1 file with 3 examples, 5 files with 5 examples, or any combination. It doesn't matter. The tool will parse them all out. 4. You then write your evaluation suite using two helpers: - `evalGroup` loads the outputs you want to test. - `itEval` lets you define the evals. Currently you can evaluate if a function was called, and you can use the LLMComparison to judge the response. 5. To use the LLM Comparison, you define instructions in `suite/instructions/X.md`. There are some shared instructions that get prepended to every command. Running it gives an output like this: Results for: performance/lcp-breakdown ┌────────────┬────────────────────────────────────────────────────┐ │ (index) │ getMainThreadActivity │ is an accurate response │ ├────────────┼────────────────────────────────────────────────────┤ │ 2025-07-10 │ '1 / 1 passed' │ 1.0 average from 1 inputs. │ │ 2025-07-28 │ '1 / 1 passed' │ 2.0 average from 1 inputs. │ │ 2025-08-01 │ '3 / 3 passed' │ 3.0 average from 3 inputs. │ └────────────┴────────────────────────────────────────────────────┘ This CL also doesn't propose a robust scoring system; I am still thinking about that. I don't know if a table like above is enough, or if we want to try to produce an overall score for an example's evaluation. 
Bug: 425270170 Change-Id: I55c3cb2f47b759bc0854ef133104401b1932dc87 Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/6810886 Auto-Submit: Jack Franklin <[email protected]> Reviewed-by: Ergün Erdoğmuş <[email protected]> Reviewed-by: Alex Rudenko <[email protected]> Commit-Queue: Jack Franklin <[email protected]>
1 parent e9b4e2b commit a534ab6

File tree

13 files changed

+702
-2
lines changed

13 files changed

+702
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ npm-debug.log
2828
/scripts/ai_assistance/auto-run/data
2929
/scripts/ai_assistance/performance-trace-downloads
3030
/scripts/ai_assistance/auto-run/performance-trace-downloads
31+
/scripts/ai_assistance/suite/outputs/**/*.json
3132

3233
/build
3334
/buildtools

scripts/ai_assistance/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"type": "module",
33
"scripts": {
44
"auto-run": "node --no-warnings --experimental-strip-types auto-run/auto-run.ts",
5-
"auto-run:test": "npx --node-options='--no-warnings --experimental-strip-types' mocha auto-run/**/*.test.ts"
5+
"auto-run:test": "npx --node-options='--no-warnings --experimental-strip-types' mocha auto-run/**/*.test.ts",
6+
"eval-suite": "node --no-warnings --experimental-strip-types suite/*.eval.ts"
67
}
78
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// Copyright 2025 The Chromium Authors
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file.
4+
5+
import assert from 'node:assert';
6+
7+
import {loadInstructions} from '../instructions/load.ts';
8+
import type {Conversation} from '../types';
9+
10+
import {generateGeminiContent} from './gemini.ts';
11+
import {getMarkdownConversation, getOutputs, type Output} from './outputs.ts';
12+
13+
abstract class Evaluator {}
14+
15+
export class FunctionCalled extends Evaluator {
16+
static nameOnly(example: Conversation, funcName: string): boolean {
17+
return example.queries.some(q => {
18+
return q.response.functionCallRequests?.some(call => call.name === funcName);
19+
});
20+
}
21+
}
22+
23+
export class LLMComparison extends Evaluator {
24+
static async judge(example: Conversation, prompt: string): Promise<{score: number, reasons: string}> {
25+
const scoringInstructions = loadInstructions('scoring');
26+
const exampleAsMarkdown = getMarkdownConversation(example);
27+
const response = await generateGeminiContent(
28+
`${scoringInstructions}
29+
30+
${prompt}.
31+
32+
## Conversation to score:
33+
${exampleAsMarkdown}`,
34+
'gemini-2.5-flash', {
35+
type: 'object',
36+
properties: {
37+
score: {type: 'number', description: 'A numerical score assigned by the AI.'},
38+
reasons: {type: 'string', description: 'A string containing the reasons for the assigned score.'}
39+
},
40+
required: ['score', 'reasons']
41+
});
42+
const r = JSON.parse(response) as {score: number, reasons: string};
43+
return {score: r.score, reasons: r.reasons};
44+
}
45+
}
46+
47+
interface GroupTestState {
48+
store: ResultStore;
49+
outputsByDate: Partial<Record<string, Output[]>>;
50+
}
51+
52+
let state: GroupTestState|null = null;
53+
54+
export type ItEval = {
55+
test: string,
56+
}&({
57+
succeed: (example: Conversation) => boolean,
58+
}|{
59+
judge: (example: Conversation) => Promise<{score: number, reasons: string}>,
60+
});
61+
62+
export async function itEval(config: ItEval): Promise<void> {
63+
assert.ok(state);
64+
if ('succeed' in config) {
65+
for (const [date, outputs] of Object.entries(state.outputsByDate)) {
66+
if (!outputs) {
67+
continue;
68+
}
69+
70+
const allDevToolsConversations = outputs.flatMap(o => o.contents.conversations);
71+
72+
let total = 0;
73+
let succeeded = 0;
74+
for (const conversation of allDevToolsConversations) {
75+
total++;
76+
if (config.succeed(conversation)) {
77+
succeeded++;
78+
}
79+
state.store.saveResult(config.test, date, {type: 'BINARY', success: succeeded, total});
80+
}
81+
}
82+
} else if ('judge' in config) {
83+
for (const [date, outputs] of Object.entries(state.outputsByDate)) {
84+
if (!outputs) {
85+
continue;
86+
}
87+
const allDevToolsConversations = outputs.flatMap(o => o.contents.conversations);
88+
const scores = await Promise.all(allDevToolsConversations.map(async example => {
89+
const result = await config.judge(example);
90+
return result.score;
91+
}));
92+
const totalOfAllScores = scores.reduce((acc: number, score: number) => acc + score, 0);
93+
const average = totalOfAllScores / scores.length;
94+
state.store.saveResult(config.test, date, {type: 'JUDGE', average, allScores: scores, total: totalOfAllScores});
95+
}
96+
}
97+
}
98+
99+
/**
 * Identifies which recorded outputs an eval group should load, mapping to the
 * on-disk layout `outputs/<type>/<YYYY-MM-DD>/<label>/`.
 */
export interface GroupConfig {
  // Roughly maps to an agent, e.g. "performance".
  type: string;
  // The example's folder name within each date folder.
  label: string;
}
103+
104+
export async function evalGroup(config: GroupConfig, cb: (() => Promise<void>)): Promise<void> {
105+
const store = new ResultStore(config.type, config.label);
106+
const outputs = await getOutputs(config.type, config.label);
107+
const outputsByDate = Object.groupBy(outputs, o => o.dateFolder);
108+
state = {
109+
store,
110+
outputsByDate,
111+
};
112+
113+
await cb();
114+
printResults(state.store);
115+
}
116+
117+
function log(indentation: number, message: string): void {
118+
console.log(`${' '.repeat(indentation)}${message}`);
119+
}
120+
121+
function printResults(store: ResultStore): void {
122+
log(0, `Results for: ${store.type}/${store.label}`);
123+
124+
// Structures the results in Date => <Test Name, Test Output>.
125+
const dataForTable: Record<string, Record<string, string>> = {};
126+
127+
for (const [test, dateToResult] of store.results) {
128+
for (const [date, result] of dateToResult) {
129+
dataForTable[date] ??= {};
130+
switch (result.type) {
131+
case 'BINARY':
132+
dataForTable[date][test] = `${result.success} / ${result.total} passed`;
133+
break;
134+
case 'JUDGE':
135+
dataForTable[date][test] = `${result.average.toFixed(1)} average from ${result.allScores.length} inputs.`;
136+
break;
137+
default:
138+
throw new Error('Unknown result type!');
139+
}
140+
}
141+
}
142+
console.table(dataForTable);
143+
}
144+
145+
type Result = {
146+
type: 'BINARY',
147+
total: number,
148+
success: number,
149+
}|{
150+
type: 'JUDGE',
151+
average: number,
152+
total: number,
153+
allScores: number[],
154+
};
155+
156+
class ResultStore {
157+
// Map of testName => YYYY-MM-DD => Result
158+
#results = new Map<string, Map<string, Result>>();
159+
#type: string;
160+
#label: string;
161+
162+
constructor(type: string, label: string) {
163+
this.#type = type;
164+
this.#label = label;
165+
}
166+
167+
get type(): string {
168+
return this.#type;
169+
}
170+
get label(): string {
171+
return this.#label;
172+
}
173+
174+
get results(): ReadonlyMap<string, ReadonlyMap<string, Result>> {
175+
return this.#results;
176+
}
177+
178+
saveResult(testName: string, dateFolder: string, result: Result): void {
179+
const forTest = this.#results.get(testName) ?? new Map<string, Result>();
180+
forTest.set(dateFolder, result);
181+
this.#results.set(testName, forTest);
182+
}
183+
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// Copyright 2025 The Chromium Authors
2+
// Use of this source code is governed by a BSD-style license that can be
3+
// found in the LICENSE file.
4+
5+
/**
6+
* Makes requests to the Gemini API. Assumes process.env.GEMINI_API_KEY is available.
7+
* Note: the reason we do not use any Google Gemini SDK here is to save on the
8+
* node module install & committing it to the repo. Our API usage of Gemini is
9+
* lightweight so it doesn't feel worth it vs just wrapping the XHR requests
10+
* ourselves.
11+
*/
12+
13+
// Request and response shapes for the Gemini generateContent REST API, as
// used by generateGeminiContent() below. Only the fields this file actually
// reads or writes are modelled.

// One chunk of prompt text within a request content entry.
interface Part {
  text: string;
}

// A single content entry in the request; holds the prompt parts.
interface Content {
  parts: Part[];
}

// The JSON body POSTed to the generateContent endpoint.
interface GenerateContentRequestBody {
  contents: Content[];
  // Only present when the caller supplies a JSON schema for the response;
  // see buildGeminiRequestBody().
  generationConfig?: {
    responseMimeType: string,
    responseSchema: object,
  };
}

// One chunk of generated text within a response candidate.
interface CandidatePart {
  text: string;
}

// The content of a response candidate; role is set by the API.
interface CandidateContent {
  parts: CandidatePart[];
  role: string;
}

// A single generated response candidate.
interface Candidate {
  content: CandidateContent;
  finishReason: string;
  index: number;
}

// Top-level response from generateContent; only candidates[0] is consumed.
interface GenerateContentResponse {
  candidates: Candidate[];
}
48+
49+
/**
50+
* Helper function to construct the request body for the Gemini generateContent API.
51+
*
52+
* @param promptText The text prompt for the Gemini model.
53+
* @param jsonSchema An optional JSON schema to define the expected response structure.
54+
* @returns The constructed GenerateContentRequestBody object.
55+
*/
56+
function buildGeminiRequestBody(promptText: string, jsonSchema?: object): GenerateContentRequestBody {
57+
const requestBody: GenerateContentRequestBody = {
58+
contents: [
59+
{
60+
parts: [
61+
{
62+
text: promptText,
63+
},
64+
],
65+
},
66+
],
67+
};
68+
69+
if (jsonSchema) {
70+
requestBody.generationConfig = {
71+
responseMimeType: 'application/json',
72+
responseSchema: jsonSchema,
73+
};
74+
}
75+
76+
return requestBody;
77+
}
78+
79+
/**
80+
* Makes a request to the Gemini API's generateContent endpoint.
81+
*
82+
* @param promptText The text prompt to send to the Gemini model.
83+
* @param modelName The name of the Gemini model to use (e.g., 'gemini-pro', 'gemini-1.5-flash').
84+
* @returns A Promise that resolves to the generated text, or an error string.
85+
*/
86+
export async function generateGeminiContent(
87+
promptText: string, modelName = 'gemini-2.5-flash', jsonSchema?: object): Promise<string> {
88+
const apiKey = process.env.GEMINI_API_KEY;
89+
90+
if (!apiKey) {
91+
throw new Error('GEMINI_API_KEY environment variable is not set. Please provide your API key.');
92+
}
93+
94+
const apiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent`;
95+
96+
const requestBody = buildGeminiRequestBody(promptText, jsonSchema);
97+
98+
try {
99+
const response = await fetch(apiUrl, {
100+
method: 'POST',
101+
headers: {
102+
'Content-Type': 'application/json',
103+
'x-goog-api-key': apiKey,
104+
},
105+
body: JSON.stringify(requestBody),
106+
});
107+
108+
if (!response.ok) {
109+
const errorData = await response.json();
110+
throw new Error(`API error: ${response.status} ${response.statusText} - ${JSON.stringify(errorData)}`);
111+
}
112+
113+
const data: GenerateContentResponse = await response.json();
114+
115+
// Safely access the generated content
116+
return (
117+
data.candidates?.[0]?.content?.parts?.[0]?.text || 'No content generated or content not in expected format.');
118+
} catch (error) {
119+
console.error('Error generating content:', error);
120+
return `Error: Could not generate content. Details: ${error instanceof Error ? error.message : String(error)}`;
121+
}
122+
}

0 commit comments

Comments
 (0)