Commit 4184520

feat: allow running WCS as standalone auto-rater
In some cases it's useful to run WCS with a custom executor that provides pre-scraped LLM output for rating. Such environments might not have access to a Gemini key for auto-rating. In those cases, WCS should be smart enough to detect that no prompts declare LLM-based autorater ratings and avoid requiring a Gemini key, by skipping the eager `getRunnerByName('genkit')` call.
1 parent aed9302 commit 4184520
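
The core of the change, condensed into a sketch. The structural `Prompt` type below is a hypothetical stand-in for the real `PromptDefinition`/`MultiStepPromptDefinition` types (the `'multi'` discriminant is a placeholder); `RatingKind` and `getRunnerByName` are the actual imports used in the diff:

```ts
import {getRunnerByName} from '../codegen/runner-creation.js';
import {RatingKind} from '../ratings/rating-types.js';

// Hypothetical structural stand-ins for the real prompt definition types.
type Prompt =
  | {kind: 'single'; ratings: {kind: RatingKind}[]}
  | {kind: 'multi'; steps: {ratings: {kind: RatingKind}[]}[]};

// Construct the Genkit runner (and thus require a Gemini key) only when at
// least one prompt actually declares an LLM-based rating.
async function createAutoraterIfNeeded(prompts: Prompt[]) {
  const hasLlmBasedRatings = prompts.some(p =>
    p.kind === 'single'
      ? p.ratings.some(r => r.kind === RatingKind.LLM_BASED)
      : p.steps.some(s => s.ratings.some(r => r.kind === RatingKind.LLM_BASED)),
  );
  return hasLlmBasedRatings ? await getRunnerByName('genkit') : null;
}
```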

File tree

5 files changed: +73, -55 lines

runner/orchestration/generate-eval-task.ts

Lines changed: 24 additions & 32 deletions
```diff
@@ -12,11 +12,12 @@ import {EvalID} from './executors/executor.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js';
 import {generateInitialFiles} from './generate-initial-files.js';
-import {generateUserJourneysForApp} from './user-journeys.js';
+import {generateUserJourneysForApp, UserJourneysResult} from './user-journeys.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
 import {attemptBuildAndTest} from './build-serve-test-loop.js';
 import {rateGeneratedCode} from '../ratings/rate-code.js';
 import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
+import assert from 'node:assert';
 
 /**
  * Creates and executes a task to generate or load code for a given prompt,
@@ -25,24 +26,14 @@ import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
  * This function handles both online (AI-generated) and local (file-based) code retrieval.
  * It manages build attempts and AI-driven repair cycles.
  *
- * @param evalID ID of the evaluation task.
- * @param env Environment for this evaluation.
- * @param model Name of the LLM to use.
- * @param rootPromptDef Definition of the root prompt being processed.
- * @param localMode A boolean indicating whether to load code from local files instead of generating it.
- * @param skipScreenshots Whether to skip taking screenshot of a running application.
- * @param outputDirectory Directory in which to generate the output. Convenient for debugging.
- * @param abortSignal Abort signal for when the evaluation task should be aborted.
- * @param skipAxeTesting Whether or not to skip Axe testing of the app.
- * @param enableUserJourneyTesting Whether to enable user journey testing of generated apps.
- * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
  * @returns A Promise that resolves to an AssessmentResult object containing all details of the task's execution.
  */
 export async function startEvaluationTask(
   config: AssessmentConfig,
   evalID: EvalID,
   env: Environment,
-  ratingLlm: GenkitRunner,
+  autoraterLlm: GenkitRunner | null,
+  cujGenerationLlm: GenkitRunner | null,
   rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
   abortSignal: AbortSignal,
   workerConcurrencyQueue: PQueue,
@@ -128,24 +119,25 @@ export async function startEvaluationTask(
       break;
     }
 
-    const userJourneys = config.enableUserJourneyTesting
-      ? await generateUserJourneysForApp(
-          ratingLlm,
-          rootPromptDef.name,
-          defsToExecute[0].prompt,
-          initialResponse.files,
-          abortSignal,
-        )
-      : undefined;
-
-    // TODO: Only execute the serve command on the "final working attempt".
-    // TODO: Incorporate usage.
-    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = userJourneys
-      ? {
-          userJourneys: userJourneys.result,
-          appPrompt: defsToExecute[0].prompt,
-        }
-      : undefined;
+    let userJourneys: UserJourneysResult | undefined = undefined;
+    let userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = undefined;
+
+    if (config.enableUserJourneyTesting) {
+      assert(cujGenerationLlm, 'Expected a CUJ generation LLM to be available.');
+      userJourneys = await generateUserJourneysForApp(
+        cujGenerationLlm,
+        rootPromptDef.name,
+        defsToExecute[0].prompt,
+        initialResponse.files,
+        abortSignal,
+      );
+
+      // TODO: Incorporate usage.
+      userJourneyAgentTaskInput = {
+        userJourneys: userJourneys.result,
+        appPrompt: defsToExecute[0].prompt,
+      };
+    }
 
     const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
 
@@ -172,7 +164,7 @@ export async function startEvaluationTask(
     }
 
     const score = await rateGeneratedCode(
-      ratingLlm,
+      autoraterLlm,
      env,
      promptDef,
      fullPromptText,
```

runner/orchestration/generate-summary.ts

Lines changed: 9 additions & 13 deletions
```diff
@@ -9,13 +9,12 @@ import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interface
  * and also some extra metadata about the run.
  */
 export async function prepareSummary(
-  genkit: GenkitRunner,
+  generateAiSummaryLlm: GenkitRunner | null,
   abortSignal: AbortSignal,
   model: string,
   env: Environment,
   assessments: AssessmentResult[],
   completionStats: CompletionStats,
-  opts: {skipAiSummary?: boolean},
 ): Promise<RunSummary> {
   let inputTokens = 0;
   let outputTokens = 0;
@@ -40,22 +39,19 @@ export async function prepareSummary(
   });
 
   let aiSummary: string | undefined = undefined;
-  if (!opts.skipAiSummary) {
+  if (generateAiSummaryLlm) {
     console.log(`✨ Generating AI summary for evaluation run..`);
     try {
-      const result = await summarizeReportWithAI(genkit, abortSignal, assessments);
-
-      if (result !== null) {
-        inputTokens += result.usage.inputTokens;
-        outputTokens += result.usage.outputTokens;
-        totalTokens += result.usage.totalTokens;
-        aiSummary = result.responseHtml;
-        console.log(`✅ Generated AI summary.`);
-      }
+      const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments);
+      inputTokens += result.usage.inputTokens;
+      outputTokens += result.usage.outputTokens;
+      totalTokens += result.usage.totalTokens;
+      aiSummary = result.responseHtml;
+      console.log(`✅ Generated AI summary.`);
     } catch (e) {
       console.log(`${redX()} Failed to generate AI summary, skipping summary.`);
 
-      if ((e as Partial<Error>).stack) {
+      if (process.env.DEBUG === '1' && (e as Partial<Error>).stack) {
         console.error((e as Error).stack);
       }
     }
```
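
The skip-summary control has moved from the `opts.skipAiSummary` flag to the nullability of the first argument. A hypothetical call under the new signature, assuming the remaining arguments are in scope as they are in `generate.ts`:

```ts
// Passing null as generateAiSummaryLlm skips the AI summary entirely,
// replacing the old opts.skipAiSummary flag.
const summary = await prepareSummary(null, abortSignal, model, env, assessments, completionStats);
```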

runner/orchestration/generate.ts

Lines changed: 35 additions & 7 deletions
```diff
@@ -3,7 +3,7 @@ import {existsSync, readdirSync} from 'fs';
 import {availableParallelism} from 'os';
 import PQueue from 'p-queue';
 import {basename, join} from 'path';
-import {assertValidModelName} from '../codegen/llm-runner.js';
+import {assertValidModelName, LlmRunner} from '../codegen/llm-runner.js';
 import {getRunnerByName} from '../codegen/runner-creation.js';
 import {LLM_OUTPUT_DIR, REPORT_VERSION} from '../configuration/constants.js';
 import {getEnvironmentByPath} from '../configuration/environment-resolution.js';
@@ -27,6 +27,7 @@ import {startEvaluationTask} from './generate-eval-task.js';
 import {prepareSummary} from './generate-summary.js';
 import {getRunGroupId} from './grouping.js';
 import {combineAbortSignals} from '../utils/abort-signal.js';
+import {RatingKind} from '../ratings/rating-types.js';
 
 /**
  * Orchestrates the entire assessment process for each prompt defined in the `prompts` array.
@@ -43,10 +44,14 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
  */
 export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
+  const extraCleanupFns: (() => Promise<void>)[] = [];
   const cleanup = async () => {
     // Clean-up should never interrupt a potentially passing completion.
     try {
       await env.executor.destroy();
+      for (const cleanupFn of extraCleanupFns) {
+        await cleanupFn();
+      }
     } catch (e) {
       console.error(`Failed to destroy executor: ${e}`);
       if (e instanceof Error) {
@@ -58,16 +63,39 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
   // Ensure cleanup logic runs when the evaluation is aborted.
   options.abortSignal?.addEventListener('abort', cleanup);
 
-  await assertValidModelName(options.model, env.executor);
-
-  const ratingLlm = await getRunnerByName('genkit');
   const allTasksAbortCtrl = new AbortController();
 
   try {
+    await assertValidModelName(options.model, env.executor);
+
     const promptsToProcess = (
       await getCandidateExecutablePrompts(env, options.localMode, options.promptFilter)
     ).slice(0, options.limit);
 
+    const hasLlmBasedRatings = promptsToProcess.some(p =>
+      p.kind === 'single'
+        ? // Check if some ratings are LLM based.
+          p.ratings.some(r => r.kind === RatingKind.LLM_BASED)
+        : // Check if some steps contain LLM based ratings.
+          p.steps.some(s => s.ratings.some(r => r.kind === RatingKind.LLM_BASED)),
+    );
+
+    // Only construct LLMs when necessary. This is helpful in cases where WCS is invoked
+    // as an auto-rater that doesn't have access to other LLMs.
+    const autoraterLlm = hasLlmBasedRatings ? await getRunnerByName('genkit') : null;
+    const cujGenerationLlm = options.enableUserJourneyTesting
+      ? (autoraterLlm ?? (await getRunnerByName('genkit')))
+      : null;
+    const generateAiSummaryLlm = !options.skipAiSummary
+      ? (autoraterLlm ?? cujGenerationLlm ?? (await getRunnerByName('genkit')))
+      : null;
+
+    extraCleanupFns.push(async () => {
+      await autoraterLlm?.dispose();
+      await cujGenerationLlm?.dispose();
+      await generateAiSummaryLlm?.dispose();
+    });
+
     const progress =
       options.logging === 'dynamic' ? new DynamicProgressLogger() : new TextProgressLogger();
     const appConcurrency =
@@ -128,7 +156,8 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
       options,
       evalID,
       env,
-      ratingLlm,
+      autoraterLlm,
+      cujGenerationLlm,
       rootPromptDef,
       combineAbortSignals(
         allTasksAbortCtrl.signal,
@@ -187,7 +216,7 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
     const timestamp = new Date();
     const details = {
       summary: await prepareSummary(
-        ratingLlm,
+        generateAiSummaryLlm,
         allTasksAbortCtrl.signal,
         options.model,
         env,
@@ -196,7 +225,6 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
           allPromptsCount: promptsToProcess.length,
          failedPrompts,
        },
-        options,
      ),
      timestamp: timestamp.toISOString(),
      reportName: options.reportName,
```
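
Note that the `??` fallbacks mean two or three of these variables can alias the same runner instance, so the cleanup closure may call `dispose()` repeatedly on one object. If `dispose()` turns out not to be idempotent, one option is to deduplicate before disposing; a sketch, not part of the commit:

```ts
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';

// Dispose each distinct runner exactly once, even when several of the
// variables reference the same instance.
async function disposeRunners(...runners: (GenkitRunner | null)[]): Promise<void> {
  for (const runner of new Set(runners)) {
    await runner?.dispose();
  }
}

// Usage: extraCleanupFns.push(() => disposeRunners(autoraterLlm, cujGenerationLlm, generateAiSummaryLlm));
```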

runner/ratings/rate-code.ts

Lines changed: 4 additions & 2 deletions
```diff
@@ -29,6 +29,7 @@ import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {UserFacingError} from '../utils/errors.js';
 import {ServeTestingResult} from '../workers/serve-testing/worker-types.js';
+import assert from 'assert';
 
 interface FileOrEmbeddedSyntheticFile {
   /**
@@ -45,7 +46,7 @@
 type CategorizedFiles = Record<PerFileRatingContentType, FileOrEmbeddedSyntheticFile[]>;
 
 export async function rateGeneratedCode(
-  llm: GenkitRunner,
+  autoraterLlm: GenkitRunner | null,
   environment: Environment,
   currentPromptDef: PromptDefinition,
   fullPromptText: string,
@@ -107,12 +108,13 @@
       categorizedFiles ??= splitFilesIntoCategories(outputFiles);
       result = await runPerFileRating(currentPromptDef, current, categorizedFiles, ratingsResult);
     } else if (current.kind === RatingKind.LLM_BASED) {
+      assert(autoraterLlm !== null, 'Expected an auto-rater LLM to be available.');
       result = await runLlmBasedRating(
         environment,
         current,
         fullPromptText,
         currentPromptDef,
-        llm,
+        autoraterLlm,
        outputFiles,
        buildResult,
        serveTestingResult,
```
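
The `assert` also serves as a type guard: with `@types/node`, `assert(value)` is declared as `asserts value`, so `autoraterLlm` narrows from `GenkitRunner | null` to `GenkitRunner` for the `runLlmBasedRating` call. A minimal sketch of the narrowing (the helper is hypothetical):

```ts
import assert from 'assert';
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';

// Hypothetical helper: after the assert, TypeScript treats the value as non-null.
function requireAutorater(autoraterLlm: GenkitRunner | null): GenkitRunner {
  assert(autoraterLlm !== null, 'Expected an auto-rater LLM to be available.');
  return autoraterLlm; // narrowed to GenkitRunner
}
```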

runner/reporting/report-ai-summary.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ export async function summarizeReportWithAI(
   const model = 'gemini-2.5-flash-lite';
 
   if (!llm.getSupportedModels().includes(model)) {
-    return null;
+    throw new Error(`Unable to generate AI summary due to unsupported model: ${model}`);
   }
 
   return chatWithReportAI(
```
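
This pairs with the `prepareSummary` change above: an unsupported model now surfaces through the existing try/catch as a logged "Failed to generate AI summary" message, instead of silently producing a report with no summary.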
