Skip to content

Commit 4ce8bbf

Browse files
committed
feat: support changing the autorater model
Adds a flag that overrides the model used for auto rating. Fixes angular#14.
1 parent fac7e30 commit 4ce8bbf

File tree

7 files changed

+39
-15
lines changed

7 files changed

+39
-15
lines changed

runner/eval-cli.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ import { Arguments, Argv, CommandModule } from 'yargs';
22
import chalk from 'chalk';
33
import { join } from 'path';
44
import { assertValidModelName, LlmRunner } from './codegen/llm-runner.js';
5-
import { DEFAULT_MODEL_NAME } from './configuration/constants.js';
5+
import {
6+
DEFAULT_AUTORATER_MODEL_NAME,
7+
DEFAULT_MODEL_NAME,
8+
} from './configuration/constants.js';
69
import { generateCodeAndAssess } from './orchestration/generate.js';
710
import {
811
logReportToConsole,
@@ -48,6 +51,7 @@ interface Options {
4851
skipAxeTesting?: boolean;
4952
enableUserJourneyTesting?: boolean;
5053
enableAutoCsp?: boolean;
54+
autoraterModel?: string;
5155
logging?: 'text-only' | 'dynamic';
5256
}
5357

@@ -156,6 +160,11 @@ function builder(argv: Argv): Argv<Options> {
156160
description:
157161
'Whether to include an automatic hash-based Content-Security-Policy and Trusted Types to find incompatibilities.',
158162
})
163+
.option('autorater-model', {
164+
type: 'string',
165+
default: DEFAULT_AUTORATER_MODEL_NAME,
166+
description: 'Model to use when automatically rating generated code',
167+
})
159168
.strict()
160169
.version(false)
161170
.help()
@@ -204,6 +213,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
204213
enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
205214
enableAutoCsp: cliArgs.enableAutoCsp,
206215
logging: cliArgs.logging,
216+
autoraterModel: cliArgs.autoraterModel,
207217
});
208218

209219
logReportToConsole(runInfo);

runner/orchestration/generate.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@ import PQueue from 'p-queue';
66
import { basename, join } from 'path';
77
import { existsSync, readdirSync } from 'fs';
88
import { LlmGenerateFilesContext, LlmRunner } from '../codegen/llm-runner.js';
9-
import { LLM_OUTPUT_DIR, REPORT_VERSION } from '../configuration/constants.js';
9+
import {
10+
DEFAULT_AUTORATER_MODEL_NAME,
11+
LLM_OUTPUT_DIR,
12+
REPORT_VERSION,
13+
} from '../configuration/constants.js';
1014
import { Environment } from '../configuration/environment.js';
1115
import { rateGeneratedCode } from '../ratings/rate-code.js';
1216
import { summarizeReportWithAI } from '../reporting/ai-summarize.js';
@@ -77,6 +81,7 @@ export async function generateCodeAndAssess(options: {
7781
enableUserJourneyTesting?: boolean;
7882
enableAutoCsp?: boolean;
7983
logging?: 'text-only' | 'dynamic';
84+
autoraterModel?: string;
8085
}): Promise<RunInfo> {
8186
const env = await getEnvironmentByPath(options.environmentConfigPath);
8287
const promptsToProcess = getCandidateExecutablePrompts(
@@ -163,7 +168,8 @@ export async function generateCodeAndAssess(options: {
163168
!!options.enableUserJourneyTesting,
164169
!!options.enableAutoCsp,
165170
workerConcurrencyQueue,
166-
progress
171+
progress,
172+
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
167173
),
168174
// 10min max per app evaluation. We just want to make sure it never gets stuck.
169175
10
@@ -291,7 +297,8 @@ async function startEvaluationTask(
291297
enableUserJourneyTesting: boolean,
292298
enableAutoCsp: boolean,
293299
workerConcurrencyQueue: PQueue,
294-
progress: ProgressLogger
300+
progress: ProgressLogger,
301+
autoraterModel: string
295302
): Promise<AssessmentResult[]> {
296303
// Set up the project structure once for the root project.
297304
const { directory, cleanup } = await setupProjectStructure(
@@ -444,7 +451,8 @@ async function startEvaluationTask(
444451
attempt.repairAttempts,
445452
attempt.axeRepairAttempts,
446453
abortSignal,
447-
progress
454+
progress,
455+
autoraterModel
448456
);
449457

450458
results.push({

runner/ratings/built-in-ratings/code-quality-rating.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
21
import { autoRateCode } from '../autoraters/code-rater.js';
32
import {
43
LLMBasedRating,
@@ -12,7 +11,6 @@ export const codeQualityRating: LLMBasedRating = {
1211
kind: RatingKind.LLM_BASED,
1312
name: 'Code Quality (LLM-rated)',
1413
description: `Rates the app's source code via LLM`,
15-
model: DEFAULT_AUTORATER_MODEL_NAME,
1614
category: RatingCategory.MEDIUM_IMPACT,
1715
id: 'common-autorater-code-quality',
1816
scoreReduction: '30%',

runner/ratings/built-in-ratings/visual-appearance-rating.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import { TimeoutError } from 'puppeteer';
2-
import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
32
import { AutoRateResult } from '../autoraters/auto-rate-shared.js';
43
import { autoRateAppearance } from '../autoraters/visuals-rater.js';
54
import {
@@ -18,7 +17,6 @@ export const visualAppearanceRating: LLMBasedRating = {
1817
category: RatingCategory.MEDIUM_IMPACT,
1918
scoreReduction: '30%',
2019
id: 'common-autorater-visuals',
21-
model: DEFAULT_AUTORATER_MODEL_NAME,
2220
rate: async (ctx) => {
2321
if (ctx.buildResult.screenshotPngUrl === undefined) {
2422
return {

runner/ratings/rate-code.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ export async function rateGeneratedCode(
5454
repairAttempts: number,
5555
axeRepairAttempts: number,
5656
abortSignal: AbortSignal,
57-
progress: ProgressLogger
57+
progress: ProgressLogger,
58+
autoraterModel: string
5859
): Promise<CodeAssessmentScore> {
5960
let categorizedFiles: CategorizedFiles | null = null;
6061
let totalPoints = 0;
@@ -107,7 +108,8 @@ export async function rateGeneratedCode(
107108
buildResult,
108109
repairAttempts,
109110
axeRepairAttempts,
110-
abortSignal
111+
abortSignal,
112+
autoraterModel
111113
);
112114
} else {
113115
throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -269,14 +271,15 @@ async function runLlmBasedRating(
269271
buildResult: BuildResult,
270272
repairAttempts: number,
271273
axeRepairAttempts: number,
272-
abortSignal: AbortSignal
274+
abortSignal: AbortSignal,
275+
autoraterModel: string
273276
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
274277
const result = await rating.rate({
275278
environment,
276279
fullPromptText,
277280
currentPromptDef,
278281
llm,
279-
model: rating.model,
282+
model: autoraterModel,
280283
outputFiles,
281284
buildResult,
282285
repairAttempts,

runner/ratings/rating-types.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ const llmBasedRatingSchema = z
9999
.object({
100100
...ratingSchemaCommonFields,
101101
kind: z.literal(RatingKind.LLM_BASED),
102-
model: z.string(),
103102
rate: z
104103
.function()
105104
.args(z.custom<LLMBasedRatingContext>())

runner/reporting/report-logging.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ import {
66
RunInfo,
77
ScoreBucket,
88
} from '../shared-interfaces.js';
9-
import { REPORTS_ROOT_DIR } from '../configuration/constants.js';
9+
import {
10+
DEFAULT_AUTORATER_MODEL_NAME,
11+
REPORTS_ROOT_DIR,
12+
} from '../configuration/constants.js';
1013
import { calculateBuildAndCheckStats } from '../ratings/stats.js';
1114
import { safeWriteFile } from '../file-system-utils.js';
1215
import { BuildResultStatus } from '../builder/builder-types.js';
@@ -160,13 +163,18 @@ export function logReportHeader(
160163
llm: LlmRunner;
161164
labels: string[];
162165
startMcp?: boolean;
166+
autoraterModel?: string;
163167
}
164168
): void {
165169
const titleCardText = [
166170
'Running a codegen evaluation with configuration:',
167171
'',
168172
` - Environment: ${env.displayName}`,
169173
` - Model: ${options.model}`,
174+
options.autoraterModel &&
175+
options.autoraterModel !== DEFAULT_AUTORATER_MODEL_NAME
176+
? ` - Autorater model: ${options.autoraterModel}`
177+
: null,
170178
` - Runner: ${options.llm.displayName}`,
171179
` - MCP servers: ${options.startMcp && env.mcpServerOptions.length ? env.mcpServerOptions.length : 'none'}`,
172180
options.labels.length ? ` - Labels: ${options.labels.join(', ')}` : null,

0 commit comments

Comments
 (0)