Skip to content

Commit 4ce8bbf

Browse files
committed
feat: support changing the autorater model
Adds a flag that overrides the model used for auto rating. Fixes angular#14.
1 parent fac7e30 commit 4ce8bbf

File tree

7 files changed

+39
-15
lines changed

7 files changed

+39
-15
lines changed

runner/eval-cli.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ import { Arguments, Argv, CommandModule } from 'yargs';
22
import chalk from 'chalk';
33
import { join } from 'path';
44
import { assertValidModelName, LlmRunner } from './codegen/llm-runner.js';
5-
import { DEFAULT_MODEL_NAME } from './configuration/constants.js';
5+
import {
6+
DEFAULT_AUTORATER_MODEL_NAME,
7+
DEFAULT_MODEL_NAME,
8+
} from './configuration/constants.js';
69
import { generateCodeAndAssess } from './orchestration/generate.js';
710
import {
811
logReportToConsole,
@@ -48,6 +51,7 @@ interface Options {
4851
skipAxeTesting?: boolean;
4952
enableUserJourneyTesting?: boolean;
5053
enableAutoCsp?: boolean;
54+
autoraterModel?: string;
5155
logging?: 'text-only' | 'dynamic';
5256
}
5357

@@ -156,6 +160,11 @@ function builder(argv: Argv): Argv<Options> {
156160
description:
157161
'Whether to include an automatic hash-based Content-Security-Policy and Trusted Types to find incompatibilities.',
158162
})
163+
.option('autorater-model', {
164+
type: 'string',
165+
default: DEFAULT_AUTORATER_MODEL_NAME,
166+
description: 'Model to use when automatically rating generated code',
167+
})
159168
.strict()
160169
.version(false)
161170
.help()
@@ -204,6 +213,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
204213
enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
205214
enableAutoCsp: cliArgs.enableAutoCsp,
206215
logging: cliArgs.logging,
216+
autoraterModel: cliArgs.autoraterModel,
207217
});
208218

209219
logReportToConsole(runInfo);

runner/orchestration/generate.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@ import PQueue from 'p-queue';
66
import { basename, join } from 'path';
77
import { existsSync, readdirSync } from 'fs';
88
import { LlmGenerateFilesContext, LlmRunner } from '../codegen/llm-runner.js';
9-
import { LLM_OUTPUT_DIR, REPORT_VERSION } from '../configuration/constants.js';
9+
import {
10+
DEFAULT_AUTORATER_MODEL_NAME,
11+
LLM_OUTPUT_DIR,
12+
REPORT_VERSION,
13+
} from '../configuration/constants.js';
1014
import { Environment } from '../configuration/environment.js';
1115
import { rateGeneratedCode } from '../ratings/rate-code.js';
1216
import { summarizeReportWithAI } from '../reporting/ai-summarize.js';
@@ -77,6 +81,7 @@ export async function generateCodeAndAssess(options: {
7781
enableUserJourneyTesting?: boolean;
7882
enableAutoCsp?: boolean;
7983
logging?: 'text-only' | 'dynamic';
84+
autoraterModel?: string;
8085
}): Promise<RunInfo> {
8186
const env = await getEnvironmentByPath(options.environmentConfigPath);
8287
const promptsToProcess = getCandidateExecutablePrompts(
@@ -163,7 +168,8 @@ export async function generateCodeAndAssess(options: {
163168
!!options.enableUserJourneyTesting,
164169
!!options.enableAutoCsp,
165170
workerConcurrencyQueue,
166-
progress
171+
progress,
172+
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
167173
),
168174
// 10min max per app evaluation. We just want to make sure it never gets stuck.
169175
10
@@ -291,7 +297,8 @@ async function startEvaluationTask(
291297
enableUserJourneyTesting: boolean,
292298
enableAutoCsp: boolean,
293299
workerConcurrencyQueue: PQueue,
294-
progress: ProgressLogger
300+
progress: ProgressLogger,
301+
autoraterModel: string
295302
): Promise<AssessmentResult[]> {
296303
// Set up the project structure once for the root project.
297304
const { directory, cleanup } = await setupProjectStructure(
@@ -444,7 +451,8 @@ async function startEvaluationTask(
444451
attempt.repairAttempts,
445452
attempt.axeRepairAttempts,
446453
abortSignal,
447-
progress
454+
progress,
455+
autoraterModel
448456
);
449457

450458
results.push({

runner/ratings/built-in-ratings/code-quality-rating.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
21
import { autoRateCode } from '../autoraters/code-rater.js';
32
import {
43
LLMBasedRating,
@@ -12,7 +11,6 @@ export const codeQualityRating: LLMBasedRating = {
1211
kind: RatingKind.LLM_BASED,
1312
name: 'Code Quality (LLM-rated)',
1413
description: `Rates the app's source code via LLM`,
15-
model: DEFAULT_AUTORATER_MODEL_NAME,
1614
category: RatingCategory.MEDIUM_IMPACT,
1715
id: 'common-autorater-code-quality',
1816
scoreReduction: '30%',

runner/ratings/built-in-ratings/visual-appearance-rating.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import { TimeoutError } from 'puppeteer';
2-
import { DEFAULT_AUTORATER_MODEL_NAME } from '../../configuration/constants.js';
32
import { AutoRateResult } from '../autoraters/auto-rate-shared.js';
43
import { autoRateAppearance } from '../autoraters/visuals-rater.js';
54
import {
@@ -18,7 +17,6 @@ export const visualAppearanceRating: LLMBasedRating = {
1817
category: RatingCategory.MEDIUM_IMPACT,
1918
scoreReduction: '30%',
2019
id: 'common-autorater-visuals',
21-
model: DEFAULT_AUTORATER_MODEL_NAME,
2220
rate: async (ctx) => {
2321
if (ctx.buildResult.screenshotPngUrl === undefined) {
2422
return {

runner/ratings/rate-code.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ export async function rateGeneratedCode(
5454
repairAttempts: number,
5555
axeRepairAttempts: number,
5656
abortSignal: AbortSignal,
57-
progress: ProgressLogger
57+
progress: ProgressLogger,
58+
autoraterModel: string
5859
): Promise<CodeAssessmentScore> {
5960
let categorizedFiles: CategorizedFiles | null = null;
6061
let totalPoints = 0;
@@ -107,7 +108,8 @@ export async function rateGeneratedCode(
107108
buildResult,
108109
repairAttempts,
109110
axeRepairAttempts,
110-
abortSignal
111+
abortSignal,
112+
autoraterModel
111113
);
112114
} else {
113115
throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -269,14 +271,15 @@ async function runLlmBasedRating(
269271
buildResult: BuildResult,
270272
repairAttempts: number,
271273
axeRepairAttempts: number,
272-
abortSignal: AbortSignal
274+
abortSignal: AbortSignal,
275+
autoraterModel: string
273276
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
274277
const result = await rating.rate({
275278
environment,
276279
fullPromptText,
277280
currentPromptDef,
278281
llm,
279-
model: rating.model,
282+
model: autoraterModel,
280283
outputFiles,
281284
buildResult,
282285
repairAttempts,

runner/ratings/rating-types.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ const llmBasedRatingSchema = z
9999
.object({
100100
...ratingSchemaCommonFields,
101101
kind: z.literal(RatingKind.LLM_BASED),
102-
model: z.string(),
103102
rate: z
104103
.function()
105104
.args(z.custom<LLMBasedRatingContext>())

runner/reporting/report-logging.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ import {
66
RunInfo,
77
ScoreBucket,
88
} from '../shared-interfaces.js';
9-
import { REPORTS_ROOT_DIR } from '../configuration/constants.js';
9+
import {
10+
DEFAULT_AUTORATER_MODEL_NAME,
11+
REPORTS_ROOT_DIR,
12+
} from '../configuration/constants.js';
1013
import { calculateBuildAndCheckStats } from '../ratings/stats.js';
1114
import { safeWriteFile } from '../file-system-utils.js';
1215
import { BuildResultStatus } from '../builder/builder-types.js';
@@ -160,13 +163,18 @@ export function logReportHeader(
160163
llm: LlmRunner;
161164
labels: string[];
162165
startMcp?: boolean;
166+
autoraterModel?: string;
163167
}
164168
): void {
165169
const titleCardText = [
166170
'Running a codegen evaluation with configuration:',
167171
'',
168172
` - Environment: ${env.displayName}`,
169173
` - Model: ${options.model}`,
174+
options.autoraterModel &&
175+
options.autoraterModel !== DEFAULT_AUTORATER_MODEL_NAME
176+
? ` - Autorater model: ${options.autoraterModel}`
177+
: null,
170178
` - Runner: ${options.llm.displayName}`,
171179
` - MCP servers: ${options.startMcp && env.mcpServerOptions.length ? env.mcpServerOptions.length : 'none'}`,
172180
options.labels.length ? ` - Labels: ${options.labels.join(', ')}` : null,

0 commit comments

Comments
 (0)