Skip to content

Commit a454390

Browse files
aaronshim authored and devversion committed
Auto-rater allows some previous rater results to be passed in as context.
1 parent 96aba2d commit a454390

File tree

6 files changed

+80
-19
lines changed

6 files changed

+80
-19
lines changed

runner/ratings/autoraters/code-rater.ts

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@ import { readFileSync } from 'node:fs';
22
import { z } from 'zod';
33
import { prepareContextFilesMessage } from '../../orchestration/codegen.js';
44
import { Environment } from '../../configuration/environment.js';
5-
import { LlmResponseFile } from '../../shared-interfaces.js';
5+
import {
6+
IndividualAssessment,
7+
IndividualAssessmentState,
8+
LlmResponseFile,
9+
SkippedIndividualAssessment,
10+
} from '../../shared-interfaces.js';
611
import {
712
AutoRateResult,
813
getCoefficient,
914
MAX_RATING,
1015
} from './auto-rate-shared.js';
1116
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
1217
import defaultCodeRaterPrompt from './code-rating-prompt.js';
18+
import { RatingsContext } from '../rating-types.js';
1319

1420
/** Framework-specific hints for the rating prompt. */
1521
const FW_HINTS: Record<string, string | undefined> = {
@@ -33,14 +39,16 @@ const CACHED_RATING_PROMPTS: Record<string, string> = {};
3339
* @param environment Environment in which the rating is running.
3440
* @param files Files to be rated.
3541
* @param appPrompt Prompt to be used for the rating.
42+
* @param ratingsContext Context containing results from previous ratings.
3643
*/
3744
export async function autoRateCode(
3845
llm: GenkitRunner,
3946
abortSignal: AbortSignal,
4047
model: string,
4148
environment: Environment,
4249
files: LlmResponseFile[],
43-
appPrompt: string
50+
appPrompt: string,
51+
ratingsContext: RatingsContext
4452
): Promise<AutoRateResult> {
4553
const contextMessage = prepareContextFilesMessage(
4654
files.map((o) => ({
@@ -61,10 +69,22 @@ export async function autoRateCode(
6169
promptText = defaultCodeRaterPrompt;
6270
}
6371

64-
const prompt = environment.renderPrompt(promptText, null, {
65-
APP_PROMPT: appPrompt,
66-
FRAMEWORK_SPECIFIC_HINTS: FW_HINTS[environment.fullStackFramework.id] ?? '',
67-
}).result;
72+
const safetyRating = ratingsContext['safety-web'];
73+
const safetyWebResultsJson =
74+
safetyRating?.state === IndividualAssessmentState.EXECUTED
75+
? JSON.stringify(safetyRating, null, 2)
76+
: '';
77+
78+
const prompt = environment.renderPrompt(
79+
promptText,
80+
environment.codeRatingPromptPath,
81+
{
82+
APP_PROMPT: appPrompt,
83+
FRAMEWORK_SPECIFIC_HINTS:
84+
FW_HINTS[environment.fullStackFramework.id] ?? '',
85+
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
86+
}
87+
).result;
6888

6989
const result = await llm.generateConstrained({
7090
abortSignal,

runner/ratings/autoraters/rate-files.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
import { greenCheckmark } from '../../reporting/format.js';
2-
import { AutoraterRunInfo, LlmResponseFile } from '../../shared-interfaces.js';
2+
import {
3+
AutoraterRunInfo,
4+
IndividualAssessment,
5+
LlmResponseFile,
6+
SkippedIndividualAssessment,
7+
} from '../../shared-interfaces.js';
38
import { autoRateCode } from './code-rater.js';
49
import { autoRateAppearance } from './visuals-rater.js';
510
import { Environment } from '../../configuration/environment.js';
611
import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
12+
import { RatingsContext } from '../rating-types.js';
713

814
/**
915
* Automatically rates the code inside of a file.
@@ -13,6 +19,7 @@ import { GenkitRunner } from '../../codegen/genkit/genkit-runner.js';
1319
* @param filePath Path to the file to be rated.
1420
* @param appPrompt Prompt that should be checked.
1521
* @param screenshotPngUrl Screenshot (PNG) to use for visual rating; may be `null`.
22+
* @param ratingsContext Context containing results from previous ratings.
1623
*/
1724
export async function autoRateFiles(
1825
llm: GenkitRunner,
@@ -21,7 +28,8 @@ export async function autoRateFiles(
2128
environment: Environment,
2229
files: LlmResponseFile[],
2330
appPrompt: string,
24-
screenshotPngUrl: string | null
31+
screenshotPngUrl: string | null,
32+
ratingsContext: RatingsContext
2533
): Promise<AutoraterRunInfo> {
2634
console.log(`Autorater is using '${model}' model. \n`);
2735

@@ -33,7 +41,8 @@ export async function autoRateFiles(
3341
model,
3442
environment,
3543
files,
36-
appPrompt
44+
appPrompt,
45+
ratingsContext
3746
);
3847
console.log(`${greenCheckmark()} Code scoring is successful.`);
3948

runner/ratings/built-in-ratings/code-quality-rating.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ export const codeQualityRating: LLMBasedRating = {
2121
ctx.model,
2222
ctx.environment,
2323
ctx.outputFiles,
24-
ctx.fullPromptText
24+
ctx.fullPromptText,
25+
ctx.ratingsContext
2526
);
2627

2728
return {

runner/ratings/built-in.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ import {
2020
export function getBuiltInRatings(): Rating[] {
2121
return [
2222
successfulBuildRating,
23+
safetyWebRating,
2324
noRuntimeExceptionsRating,
2425
sufficientCodeSizeRating,
2526
sufficientGeneratedFilesRating,
2627
codeQualityRating,
2728
visualAppearanceRating,
2829
validCssRating,
2930
axeRating,
30-
safetyWebRating,
3131
userJourneysRating,
3232
NoInnerHtmlBindingsRating,
3333
NoDangerouslySetInnerHtmlRating,

runner/ratings/rate-code.ts

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import {
2020
POINTS_FOR_CATEGORIES,
2121
Rating,
2222
CATEGORY_NAMES,
23+
RatingsContext,
2324
} from './rating-types.js';
2425
import { extractEmbeddedCodeFromTypeScript } from './embedded-languages.js';
2526
import { Environment } from '../configuration/environment.js';
@@ -62,6 +63,7 @@ export async function rateGeneratedCode(
6263
let categorizedFiles: CategorizedFiles | null = null;
6364
let totalPoints = 0;
6465
let maxOverallPoints = 0;
66+
const ratingsContext: RatingsContext = {};
6567

6668
// Rating may also invoke LLMs. Track the usage.
6769
const tokenUsage = {
@@ -95,11 +97,16 @@ export async function rateGeneratedCode(
9597
serveTestingResult,
9698
repairAttempts,
9799
outputFiles.length,
98-
axeRepairAttempts
100+
axeRepairAttempts,
101+
ratingsContext
99102
);
100103
} else if (current.kind === RatingKind.PER_FILE) {
101104
categorizedFiles ??= splitFilesIntoCategories(outputFiles);
102-
result = await runPerFileRating(current, categorizedFiles);
105+
result = await runPerFileRating(
106+
current,
107+
categorizedFiles,
108+
ratingsContext
109+
);
103110
} else if (current.kind === RatingKind.LLM_BASED) {
104111
result = await runLlmBasedRating(
105112
environment,
@@ -113,7 +120,8 @@ export async function rateGeneratedCode(
113120
repairAttempts,
114121
axeRepairAttempts,
115122
abortSignal,
116-
autoraterModel
123+
autoraterModel,
124+
ratingsContext
117125
);
118126
} else {
119127
throw new UserFacingError(`Unsupported rating type ${current}`);
@@ -139,6 +147,7 @@ export async function rateGeneratedCode(
139147
);
140148
}
141149

150+
ratingsContext[current.id] = result;
142151
category.assessments.push(result);
143152
}
144153

@@ -178,14 +187,16 @@ function runPerBuildRating(
178187
serveResult: ServeTestingResult | null,
179188
repairAttempts: number,
180189
generatedFileCount: number,
181-
axeRepairAttempts: number
190+
axeRepairAttempts: number,
191+
ratingsContext: RatingsContext
182192
): IndividualAssessment | SkippedIndividualAssessment {
183193
const rateResult = rating.rate({
184194
buildResult,
185195
serveResult,
186196
repairAttempts,
187197
generatedFileCount,
188198
axeRepairAttempts,
199+
ratingsContext,
189200
});
190201

191202
// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
@@ -203,7 +214,8 @@ function runPerBuildRating(
203214

204215
async function runPerFileRating(
205216
rating: PerFileRating,
206-
categorizedFiles: CategorizedFiles
217+
categorizedFiles: CategorizedFiles,
218+
ratingsContext: RatingsContext
207219
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
208220
const errorMessages: string[] = [];
209221
let contentType: PerFileRatingContentType;
@@ -234,7 +246,7 @@ async function runPerFileRating(
234246
// Remove comments from the code to avoid false-detection of bad patterns.
235247
// Some keywords like `NgModule` can be used in code comments.
236248
const code = removeComments(file.code, contentType);
237-
const result = await rating.rate(code, file.filePath);
249+
const result = await rating.rate(code, file.filePath, ratingsContext);
238250
let coeff: number;
239251

240252
if (typeof result === 'number') {
@@ -279,7 +291,8 @@ async function runLlmBasedRating(
279291
repairAttempts: number,
280292
axeRepairAttempts: number,
281293
abortSignal: AbortSignal,
282-
autoraterModel: string
294+
autoraterModel: string,
295+
ratingsContext: RatingsContext
283296
): Promise<IndividualAssessment | SkippedIndividualAssessment> {
284297
const result = await rating.rate({
285298
environment,
@@ -293,6 +306,7 @@ async function runLlmBasedRating(
293306
repairAttempts,
294307
axeRepairAttempts,
295308
abortSignal,
309+
ratingsContext,
296310
});
297311

298312
if (result.state === RatingState.SKIPPED) {

runner/ratings/rating-types.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import z from 'zod';
22
import { BuildResult } from '../workers/builder/builder-types.js';
33
import type {
4+
IndividualAssessment,
45
LlmResponseFile,
56
PromptDefinition,
7+
SkippedIndividualAssessment,
68
Usage,
79
} from '../shared-interfaces.js';
810
import { Environment } from '../configuration/environment.js';
@@ -64,6 +66,9 @@ const perBuildRatingSchema = z
6466
repairAttempts: z.number(),
6567
axeRepairAttempts: z.number(),
6668
generatedFileCount: z.number(),
69+
ratingsContext: z.record(
70+
z.custom<IndividualAssessment | SkippedIndividualAssessment>()
71+
),
6772
})
6873
)
6974
.returns(z.custom<PerBuildRatingResult>()),
@@ -76,7 +81,13 @@ const perFileRatingSchema = z
7681
kind: z.literal(RatingKind.PER_FILE),
7782
rate: z
7883
.function()
79-
.args(z.string(), z.string().optional())
84+
.args(
85+
z.string(),
86+
z.string().optional(),
87+
z.record(
88+
z.custom<IndividualAssessment | SkippedIndividualAssessment>()
89+
)
90+
)
8091
.returns(z.custom<PerFileRatingResult>()),
8192
filter: z.union([
8293
z
@@ -171,6 +182,11 @@ export interface ExecutedLLMBasedRating {
171182
};
172183
}
173184

185+
export type RatingsContext = Record<
186+
string,
187+
IndividualAssessment | SkippedIndividualAssessment
188+
>;
189+
174190
export interface LLMBasedRatingContext {
175191
environment: Environment;
176192
fullPromptText: string;
@@ -183,6 +199,7 @@ export interface LLMBasedRatingContext {
183199
repairAttempts: number;
184200
axeRepairAttempts: number;
185201
abortSignal: AbortSignal;
202+
ratingsContext: RatingsContext;
186203
}
187204

188205
/** Rating that applies over build results. */

0 commit comments

Comments
 (0)