Commit 4184520

feat: allow running WCS as standalone auto-rater
In some cases it's useful to run WCS with a custom executor that provides pre-scraped LLM output for rating. Such environments might not have access to a Gemini key for auto-rating. In those cases, WCS should be smart enough to detect that no prompts declare LLM-based autorater ratings and avoid requiring a Gemini key, by skipping the eager `getRunnerByName('genkit')` call.
1 parent aed9302 commit 4184520
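
The core of the change, condensed into a sketch. The structural `Prompt` type below is a hypothetical stand-in for the real `PromptDefinition`/`MultiStepPromptDefinition` types (the `'multi'` discriminant is a placeholder); `RatingKind` and `getRunnerByName` are the actual imports used in the diff:

```ts
import {getRunnerByName} from '../codegen/runner-creation.js';
import {RatingKind} from '../ratings/rating-types.js';

// Hypothetical structural stand-ins for the real prompt definition types.
type Prompt =
  | {kind: 'single'; ratings: {kind: RatingKind}[]}
  | {kind: 'multi'; steps: {ratings: {kind: RatingKind}[]}[]};

// Construct the Genkit runner (and thus require a Gemini key) only when at
// least one prompt actually declares an LLM-based rating.
async function createAutoraterIfNeeded(prompts: Prompt[]) {
  const hasLlmBasedRatings = prompts.some(p =>
    p.kind === 'single'
      ? p.ratings.some(r => r.kind === RatingKind.LLM_BASED)
      : p.steps.some(s => s.ratings.some(r => r.kind === RatingKind.LLM_BASED)),
  );
  return hasLlmBasedRatings ? await getRunnerByName('genkit') : null;
}
```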

File tree

5 files changed: +73, -55 lines

runner/orchestration/generate-eval-task.ts

Lines changed: 24 additions & 32 deletions
```diff
@@ -12,11 +12,12 @@ import {EvalID} from './executors/executor.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js';
 import {generateInitialFiles} from './generate-initial-files.js';
-import {generateUserJourneysForApp} from './user-journeys.js';
+import {generateUserJourneysForApp, UserJourneysResult} from './user-journeys.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
 import {attemptBuildAndTest} from './build-serve-test-loop.js';
 import {rateGeneratedCode} from '../ratings/rate-code.js';
 import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
+import assert from 'node:assert';
 
 /**
  * Creates and executes a task to generate or load code for a given prompt,
@@ -25,24 +26,14 @@ import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
  * This function handles both online (AI-generated) and local (file-based) code retrieval.
  * It manages build attempts and AI-driven repair cycles.
  *
- * @param evalID ID of the evaluation task.
- * @param env Environment for this evaluation.
- * @param model Name of the LLM to use.
- * @param rootPromptDef Definition of the root prompt being processed.
- * @param localMode A boolean indicating whether to load code from local files instead of generating it.
- * @param skipScreenshots Whether to skip taking screenshot of a running application.
- * @param outputDirectory Directory in which to generate the output. Convenient for debugging.
- * @param abortSignal Abort signal for when the evaluation task should be aborted.
- * @param skipAxeTesting Whether or not to skip Axe testing of the app.
- * @param enableUserJourneyTesting Whether to enable user journey testing of generated apps.
- * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
  * @returns A Promise that resolves to an AssessmentResult object containing all details of the task's execution.
  */
 export async function startEvaluationTask(
   config: AssessmentConfig,
   evalID: EvalID,
   env: Environment,
-  ratingLlm: GenkitRunner,
+  autoraterLlm: GenkitRunner | null,
+  cujGenerationLlm: GenkitRunner | null,
   rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
   abortSignal: AbortSignal,
   workerConcurrencyQueue: PQueue,
@@ -128,24 +119,25 @@ export async function startEvaluationTask(
       break;
     }
 
-    const userJourneys = config.enableUserJourneyTesting
-      ? await generateUserJourneysForApp(
-          ratingLlm,
-          rootPromptDef.name,
-          defsToExecute[0].prompt,
-          initialResponse.files,
-          abortSignal,
-        )
-      : undefined;
-
-    // TODO: Only execute the serve command on the "final working attempt".
-    // TODO: Incorporate usage.
-    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = userJourneys
-      ? {
-          userJourneys: userJourneys.result,
-          appPrompt: defsToExecute[0].prompt,
-        }
-      : undefined;
+    let userJourneys: UserJourneysResult | undefined = undefined;
+    let userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = undefined;
+
+    if (config.enableUserJourneyTesting) {
+      assert(cujGenerationLlm, 'Expected a CUJ generation LLM to be available.');
+      userJourneys = await generateUserJourneysForApp(
+        cujGenerationLlm,
+        rootPromptDef.name,
+        defsToExecute[0].prompt,
+        initialResponse.files,
+        abortSignal,
+      );
+
+      // TODO: Incorporate usage.
+      userJourneyAgentTaskInput = {
+        userJourneys: userJourneys.result,
+        appPrompt: defsToExecute[0].prompt,
+      };
+    }
 
     const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
 
@@ -172,7 +164,7 @@ export async function startEvaluationTask(
     }
 
     const score = await rateGeneratedCode(
-      ratingLlm,
+      autoraterLlm,
      env,
      promptDef,
      fullPromptText,
```

runner/orchestration/generate-summary.ts

Lines changed: 9 additions & 13 deletions
```diff
@@ -9,13 +9,12 @@ import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interface
  * and also some extra metadata about the run.
  */
 export async function prepareSummary(
-  genkit: GenkitRunner,
+  generateAiSummaryLlm: GenkitRunner | null,
   abortSignal: AbortSignal,
   model: string,
   env: Environment,
   assessments: AssessmentResult[],
   completionStats: CompletionStats,
-  opts: {skipAiSummary?: boolean},
 ): Promise<RunSummary> {
   let inputTokens = 0;
   let outputTokens = 0;
@@ -40,22 +39,19 @@ export async function prepareSummary(
   });
 
   let aiSummary: string | undefined = undefined;
-  if (!opts.skipAiSummary) {
+  if (generateAiSummaryLlm) {
     console.log(`✨ Generating AI summary for evaluation run..`);
     try {
-      const result = await summarizeReportWithAI(genkit, abortSignal, assessments);
-
-      if (result !== null) {
-        inputTokens += result.usage.inputTokens;
-        outputTokens += result.usage.outputTokens;
-        totalTokens += result.usage.totalTokens;
-        aiSummary = result.responseHtml;
-        console.log(`✅ Generated AI summary.`);
-      }
+      const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments);
+      inputTokens += result.usage.inputTokens;
+      outputTokens += result.usage.outputTokens;
+      totalTokens += result.usage.totalTokens;
+      aiSummary = result.responseHtml;
+      console.log(`✅ Generated AI summary.`);
     } catch (e) {
       console.log(`${redX()} Failed to generate AI summary, skipping summary.`);
 
-      if ((e as Partial<Error>).stack) {
+      if (process.env.DEBUG === '1' && (e as Partial<Error>).stack) {
         console.error((e as Error).stack);
       }
     }
```
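
The skip-summary control has moved from the `opts.skipAiSummary` flag to the nullability of the first argument. A hypothetical call under the new signature, assuming the remaining arguments are in scope as they are in `generate.ts`:

```ts
// Passing null as generateAiSummaryLlm skips the AI summary entirely,
// replacing the old opts.skipAiSummary flag.
const summary = await prepareSummary(null, abortSignal, model, env, assessments, completionStats);
```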

runner/orchestration/generate.ts

Lines changed: 35 additions & 7 deletions
```diff
@@ -3,7 +3,7 @@ import {existsSync, readdirSync} from 'fs';
 import {availableParallelism} from 'os';
 import PQueue from 'p-queue';
 import {basename, join} from 'path';
-import {assertValidModelName} from '../codegen/llm-runner.js';
+import {assertValidModelName, LlmRunner} from '../codegen/llm-runner.js';
 import {getRunnerByName} from '../codegen/runner-creation.js';
 import {LLM_OUTPUT_DIR, REPORT_VERSION} from '../configuration/constants.js';
 import {getEnvironmentByPath} from '../configuration/environment-resolution.js';
@@ -27,6 +27,7 @@ import {startEvaluationTask} from './generate-eval-task.js';
 import {prepareSummary} from './generate-summary.js';
 import {getRunGroupId} from './grouping.js';
 import {combineAbortSignals} from '../utils/abort-signal.js';
+import {RatingKind} from '../ratings/rating-types.js';
 
 /**
  * Orchestrates the entire assessment process for each prompt defined in the `prompts` array.
@@ -43,10 +44,14 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
  */
 export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
+  const extraCleanupFns: (() => Promise<void>)[] = [];
   const cleanup = async () => {
     // Clean-up should never interrupt a potentially passing completion.
     try {
       await env.executor.destroy();
+      for (const cleanupFn of extraCleanupFns) {
+        await cleanupFn();
+      }
     } catch (e) {
       console.error(`Failed to destroy executor: ${e}`);
       if (e instanceof Error) {
@@ -58,16 +63,39 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
   // Ensure cleanup logic runs when the evaluation is aborted.
   options.abortSignal?.addEventListener('abort', cleanup);
 
-  await assertValidModelName(options.model, env.executor);
-
-  const ratingLlm = await getRunnerByName('genkit');
   const allTasksAbortCtrl = new AbortController();
 
   try {
+    await assertValidModelName(options.model, env.executor);
+
     const promptsToProcess = (
       await getCandidateExecutablePrompts(env, options.localMode, options.promptFilter)
     ).slice(0, options.limit);
 
+    const hasLlmBasedRatings = promptsToProcess.some(p =>
+      p.kind === 'single'
+        ? // Check if some ratings are LLM based.
+          p.ratings.some(r => r.kind === RatingKind.LLM_BASED)
+        : // Check if some steps contain LLM based ratings.
+          p.steps.some(s => s.ratings.some(r => r.kind === RatingKind.LLM_BASED)),
+    );
+
+    // Only construct LLMs when necessary. This is helpful in cases where WCS is invoked
+    // as an auto-rater that doesn't have access to other LLMs.
+    const autoraterLlm = hasLlmBasedRatings ? await getRunnerByName('genkit') : null;
+    const cujGenerationLlm = options.enableUserJourneyTesting
+      ? (autoraterLlm ?? (await getRunnerByName('genkit')))
+      : null;
+    const generateAiSummaryLlm = !options.skipAiSummary
+      ? (autoraterLlm ?? cujGenerationLlm ?? (await getRunnerByName('genkit')))
+      : null;
+
+    extraCleanupFns.push(async () => {
+      await autoraterLlm?.dispose();
+      await cujGenerationLlm?.dispose();
+      await generateAiSummaryLlm?.dispose();
+    });
+
     const progress =
       options.logging === 'dynamic' ? new DynamicProgressLogger() : new TextProgressLogger();
     const appConcurrency =
@@ -128,7 +156,8 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
       options,
       evalID,
       env,
-      ratingLlm,
+      autoraterLlm,
+      cujGenerationLlm,
       rootPromptDef,
       combineAbortSignals(
         allTasksAbortCtrl.signal,
@@ -187,7 +216,7 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
     const timestamp = new Date();
     const details = {
       summary: await prepareSummary(
-        ratingLlm,
+        generateAiSummaryLlm,
         allTasksAbortCtrl.signal,
         options.model,
         env,
@@ -196,7 +225,6 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
           allPromptsCount: promptsToProcess.length,
          failedPrompts,
        },
-        options,
      ),
      timestamp: timestamp.toISOString(),
      reportName: options.reportName,
```
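
Note that the `??` fallbacks mean two or three of these variables can alias the same runner instance, so the cleanup closure may call `dispose()` repeatedly on one object. If `dispose()` turns out not to be idempotent, one option is to deduplicate before disposing; a sketch, not part of the commit:

```ts
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';

// Dispose each distinct runner exactly once, even when several of the
// variables reference the same instance.
async function disposeRunners(...runners: (GenkitRunner | null)[]): Promise<void> {
  for (const runner of new Set(runners)) {
    await runner?.dispose();
  }
}

// Usage: extraCleanupFns.push(() => disposeRunners(autoraterLlm, cujGenerationLlm, generateAiSummaryLlm));
```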

runner/ratings/rate-code.ts

Lines changed: 4 additions & 2 deletions
```diff
@@ -29,6 +29,7 @@ import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {UserFacingError} from '../utils/errors.js';
 import {ServeTestingResult} from '../workers/serve-testing/worker-types.js';
+import assert from 'assert';
 
 interface FileOrEmbeddedSyntheticFile {
   /**
@@ -45,7 +46,7 @@
 type CategorizedFiles = Record<PerFileRatingContentType, FileOrEmbeddedSyntheticFile[]>;
 
 export async function rateGeneratedCode(
-  llm: GenkitRunner,
+  autoraterLlm: GenkitRunner | null,
   environment: Environment,
   currentPromptDef: PromptDefinition,
   fullPromptText: string,
@@ -107,12 +108,13 @@
       categorizedFiles ??= splitFilesIntoCategories(outputFiles);
       result = await runPerFileRating(currentPromptDef, current, categorizedFiles, ratingsResult);
     } else if (current.kind === RatingKind.LLM_BASED) {
+      assert(autoraterLlm !== null, 'Expected an auto-rater LLM to be available.');
       result = await runLlmBasedRating(
         environment,
         current,
         fullPromptText,
         currentPromptDef,
-        llm,
+        autoraterLlm,
        outputFiles,
        buildResult,
        serveTestingResult,
```
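
The `assert` also serves as a type guard: with `@types/node`, `assert(value)` is declared as `asserts value`, so `autoraterLlm` narrows from `GenkitRunner | null` to `GenkitRunner` for the `runLlmBasedRating` call. A minimal sketch of the narrowing (the helper is hypothetical):

```ts
import assert from 'assert';
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';

// Hypothetical helper: after the assert, TypeScript treats the value as non-null.
function requireAutorater(autoraterLlm: GenkitRunner | null): GenkitRunner {
  assert(autoraterLlm !== null, 'Expected an auto-rater LLM to be available.');
  return autoraterLlm; // narrowed to GenkitRunner
}
```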

runner/reporting/report-ai-summary.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ export async function summarizeReportWithAI(
   const model = 'gemini-2.5-flash-lite';
 
   if (!llm.getSupportedModels().includes(model)) {
-    return null;
+    throw new Error(`Unable to generate AI summary due to unsupported model: ${model}`);
   }
 
   return chatWithReportAI(
```
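
This pairs with the `prepareSummary` change above: an unsupported model now surfaces through the existing try/catch as a logged "Failed to generate AI summary" message, instead of silently producing a report with no summary.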
