diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts
index 665d8a0..db806e9 100644
--- a/runner/orchestration/build-serve-loop.ts
+++ b/runner/orchestration/build-serve-loop.ts
@@ -2,7 +2,12 @@ import PQueue from 'p-queue';
 import {LlmGenerateFilesResponse} from '../codegen/llm-runner.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {Environment} from '../configuration/environment.js';
-import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js';
+import {
+  AssessmentConfig,
+  AttemptDetails,
+  LlmContextFile,
+  RootPromptDefinition,
+} from '../shared-interfaces.js';
 import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {runBuild} from './build-worker.js';
@@ -31,9 +36,9 @@ import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
  * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
  */
 export async function attemptBuild(
+  config: AssessmentConfig,
   evalID: EvalID,
   gateway: Gateway,
-  model: string,
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
@@ -43,12 +48,7 @@ export async function attemptBuild(
   abortSignal: AbortSignal,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  skipScreenshots: boolean,
-  skipAxeTesting: boolean,
-  enableAutoCsp: boolean,
-  skipLighthouse: boolean,
   userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
-  maxAxeRepairAttempts: number,
 ) {
   const initialBuildResult = await runBuild(
     evalID,
@@ -93,7 +93,7 @@ export async function attemptBuild(
     const attempt = await repairAndBuild(
       evalID,
       gateway,
-      model,
+      config.model,
       env,
       rootPromptDef,
       directory,
@@ -115,6 +115,7 @@ export async function attemptBuild(
     // Now that we got a working app, try to serve it and collect
     // findings from the running app.
     lastAttempt.serveTestingResult = await serveAndTestApp(
+      config,
       evalID,
       gateway,
       directory,
@@ -123,10 +124,6 @@ export async function attemptBuild(
       workerConcurrencyQueue,
       abortSignal,
       progress,
-      skipScreenshots,
-      skipAxeTesting,
-      enableAutoCsp,
-      skipLighthouse,
       userJourneyAgentTaskInput,
     );
   }
@@ -138,7 +135,7 @@ export async function attemptBuild(
   while (
     lastAttempt.serveTestingResult &&
     (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
-    axeRepairAttempts < maxAxeRepairAttempts
+    axeRepairAttempts < (config.a11yRepairAttempts ?? 0)
   ) {
     axeRepairAttempts++;
     progress.log(
@@ -158,7 +155,7 @@ export async function attemptBuild(
     const attempt = await repairAndBuild(
       evalID,
       gateway,
-      model,
+      config.model,
       env,
       rootPromptDef,
       directory,
@@ -185,6 +182,7 @@ export async function attemptBuild(
     // Re-run serving & tests after Axe repair.
     // This allows us to check if we fixed the violations.
     attempt.serveTestingResult = await serveAndTestApp(
+      config,
       evalID,
       gateway,
       directory,
@@ -193,10 +191,6 @@ export async function attemptBuild(
       workerConcurrencyQueue,
       abortSignal,
       progress,
-      skipScreenshots,
-      skipAxeTesting,
-      enableAutoCsp,
-      skipLighthouse,
       userJourneyAgentTaskInput,
     );

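A small sketch (not part of the patch) of how the Axe repair bound above now resolves: `a11yRepairAttempts` is optional on `AssessmentConfig`, so a config that omits it yields a bound of zero and the repair loop is skipped, matching the old caller-side `options.a11yRepairAttempts ?? 0` default. The helper name below is illustrative:

```ts
import type {AssessmentConfig} from '../shared-interfaces.js';

// Illustrative helper mirroring the loop condition in attemptBuild().
function maxAxeRepairPasses(config: AssessmentConfig): number {
  // undefined -> 0, so leaving `a11yRepairAttempts` unset disables Axe repair passes.
  return config.a11yRepairAttempts ?? 0;
}
```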
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index 891e27e..f322502 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -19,6 +19,7 @@ import {Environment} from '../configuration/environment.js';
 import {rateGeneratedCode} from '../ratings/rate-code.js';
 import {redX} from '../reporting/format.js';
 import {
+  AssessmentConfig,
   AssessmentResult,
   AttemptDetails,
   CompletionStats,
@@ -49,7 +50,7 @@ import {getRunGroupId} from './grouping.js';
 import {executeCommand} from '../utils/exec.js';
 import {EvalID, Gateway} from './gateway.js';
 import {LocalEnvironment} from '../configuration/environment-local.js';
-import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js';
+import {getRunnerByName} from '../codegen/runner-creation.js';
 import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';

 /**
@@ -64,29 +65,7 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
  * @returns A Promise that resolves to an array of AssessmentResult objects,
  * each containing the prompt, generated code, and final validation status.
  */
-export async function generateCodeAndAssess(options: {
-  model: string;
-  runner: RunnerName;
-  environmentConfigPath: string;
-  localMode: boolean;
-  limit: number;
-  concurrency: number | 'auto';
-  reportName: string;
-  skipScreenshots: boolean;
-  startMcp?: boolean;
-  ragEndpoint?: string;
-  outputDirectory?: string;
-  promptFilter?: string;
-  labels: string[];
-  skipAiSummary?: boolean;
-  skipAxeTesting: boolean;
-  enableUserJourneyTesting?: boolean;
-  enableAutoCsp?: boolean;
-  logging?: 'text-only' | 'dynamic';
-  autoraterModel?: string;
-  a11yRepairAttempts?: number;
-  skipLighthouse?: boolean;
-}): Promise {
+export async function generateCodeAndAssess(options: AssessmentConfig): Promise {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
   const ratingLlm = await getRunnerByName('genkit');

@@ -162,25 +141,15 @@ export async function generateCodeAndAssess(options: {
         `Evaluation of ${rootPromptDef.name}`,
         async abortSignal =>
           startEvaluationTask(
+            options,
             evalID,
             env,
             env.gateway,
             ratingLlm,
-            options.model,
             rootPromptDef,
-            options.localMode,
-            options.skipScreenshots,
-            options.outputDirectory,
-            options.ragEndpoint,
             abortSignal,
-            options.skipAxeTesting,
-            !!options.enableUserJourneyTesting,
-            !!options.enableAutoCsp,
             workerConcurrencyQueue,
             progress,
-            options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
-            options.a11yRepairAttempts ?? 0,
-            !!options.skipLighthouse,
           ),
         // 10min max per app evaluation. We just want to make sure it never gets stuck.
         10,
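Since `AssessmentConfig` mirrors the old inline options type field for field, existing call sites of `generateCodeAndAssess` should keep compiling unchanged. A minimal sketch of a call against the new signature (import path and all values are illustrative; only the required, non-optional fields are set):

```ts
import {generateCodeAndAssess} from './orchestration/generate.js';

// Placeholder values. Optional fields (ragEndpoint, autoraterModel, a11yRepairAttempts,
// skipLighthouse, ...) are omitted and fall back to the defaults applied inside the runner.
const results = await generateCodeAndAssess({
  model: 'example-model',
  runner: 'genkit',
  environmentConfigPath: './environments/example-config.js',
  localMode: false,
  limit: 5,
  concurrency: 'auto',
  reportName: 'example-report',
  skipScreenshots: false,
  skipAxeTesting: false,
  labels: [],
});
```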
@@ -291,32 +260,22 @@
  * @returns A Promise that resolves to an AssessmentResult object containing all details of the task's execution.
  */
 async function startEvaluationTask(
+  config: AssessmentConfig,
   evalID: EvalID,
   env: Environment,
   gateway: Gateway,
   ratingLlm: GenkitRunner,
-  model: string,
   rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
-  localMode: boolean,
-  skipScreenshots: boolean,
-  outputDirectory: string | undefined,
-  ragEndpoint: string | undefined,
   abortSignal: AbortSignal,
-  skipAxeTesting: boolean,
-  enableUserJourneyTesting: boolean,
-  enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  autoraterModel: string,
-  a11yRepairAttempts: number,
-  skipLighthouse: boolean,
 ): Promise {
   // Set up the project structure once for the root project.
   const {directory, cleanup} = await setupProjectStructure(
     env,
     rootPromptDef,
     progress,
-    outputDirectory,
+    config.outputDirectory,
   );

   const results: AssessmentResult[] = [];
@@ -324,7 +283,7 @@ async function startEvaluationTask(

   for (const promptDef of defsToExecute) {
     const [fullPromptText, systemInstructions] = await Promise.all([
-      env.getPrompt(promptDef.systemPromptType, promptDef.prompt, ragEndpoint),
+      env.getPrompt(promptDef.systemPromptType, promptDef.prompt, config.ragEndpoint),
       env.getPrompt(promptDef.systemPromptType, ''),
     ]);

@@ -334,9 +293,8 @@ async function startEvaluationTask(

     // Generate the initial set of files through the LLM.
     const initialResponse = await generateInitialFiles(
+      config,
       evalID,
-      gateway,
-      model,
       env,
       promptDef,
       {
@@ -349,7 +307,6 @@ async function startEvaluationTask(
         possiblePackageManagers: getPossiblePackageManagers().slice(),
       },
       contextFiles,
-      localMode,
       abortSignal,
       progress,
     );
@@ -406,21 +363,22 @@ async function startEvaluationTask(
     // TODO: Only execute the serve command on the "final working attempt".
     // TODO: Incorporate usage.

-    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = enableUserJourneyTesting
-      ? {
-          userJourneys: userJourneys.result,
-          appPrompt: defsToExecute[0].prompt,
-        }
-      : undefined;
+    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined =
+      config.enableUserJourneyTesting
+        ? {
+            userJourneys: userJourneys.result,
+            appPrompt: defsToExecute[0].prompt,
+          }
+        : undefined;

     const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json

     // Try to build the files in the root prompt directory.
     // This will also attempt to fix issues with the generated code.
     const attempt = await attemptBuild(
+      config,
       evalID,
       gateway,
-      model,
       env,
       rootPromptDef,
       directory,
@@ -430,12 +388,7 @@ async function startEvaluationTask(
       abortSignal,
       workerConcurrencyQueue,
       progress,
-      skipScreenshots,
-      skipAxeTesting,
-      enableAutoCsp,
-      skipLighthouse,
       userJourneyAgentTaskInput,
-      a11yRepairAttempts,
     );

     if (!attempt) {
@@ -455,7 +408,7 @@ async function startEvaluationTask(
       attempt.axeRepairAttempts,
       abortSignal,
       progress,
-      autoraterModel,
+      config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
     );

     results.push({
@@ -493,18 +446,16 @@
  * @param abortSignal Signal to fire when this process should be aborted.
  */
 async function generateInitialFiles(
+  options: AssessmentConfig,
   evalID: EvalID,
-  gateway: Gateway,
-  model: string,
   env: Environment,
   promptDef: RootPromptDefinition,
   codegenContext: LlmGenerateFilesContext,
   contextFiles: LlmContextFile[],
-  localMode: boolean,
   abortSignal: AbortSignal,
   progress: ProgressLogger,
 ): Promise {
-  if (localMode) {
+  if (options.localMode) {
     const localFilesDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name);
     const filePaths = globSync('**/*', {cwd: localFilesDirectory});

@@ -531,10 +482,10 @@ async function generateInitialFiles(

   progress.log(promptDef, 'codegen', 'Generating code with AI');

-  const response = await gateway.generateInitialFiles(
+  const response = await env.gateway.generateInitialFiles(
     evalID,
     codegenContext,
-    model,
+    options.model,
     contextFiles,
     abortSignal,
   );
diff --git a/runner/orchestration/serve-testing-worker.ts b/runner/orchestration/serve-testing-worker.ts
index 88e8390..c5ae35f 100644
--- a/runner/orchestration/serve-testing-worker.ts
+++ b/runner/orchestration/serve-testing-worker.ts
@@ -2,7 +2,7 @@ import {ChildProcess, fork} from 'node:child_process';
 import path from 'node:path';
 import {Environment} from '../configuration/environment.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
-import {RootPromptDefinition} from '../shared-interfaces.js';
+import {AssessmentConfig, RootPromptDefinition} from '../shared-interfaces.js';
 import {killChildProcessGracefully} from '../utils/kill-gracefully.js';
 import {
   ServeTestingResult,
@@ -15,6 +15,7 @@ import PQueue from 'p-queue';

 /** Attempts to run & test an eval app. */
 export async function serveAndTestApp(
+  config: AssessmentConfig,
   evalID: EvalID,
   gateway: Gateway,
   appDirectoryPath: string,
@@ -23,10 +24,6 @@ export async function serveAndTestApp(
   workerConcurrencyQueue: PQueue,
   abortSignal: AbortSignal,
   progress: ProgressLogger,
-  skipScreenshots: boolean,
-  skipAxeTesting: boolean,
-  enableAutoCsp: boolean,
-  skipLighthouse: boolean,
   userJourneyAgentTaskInput?: BrowserAgentTaskInput,
 ): Promise {
   progress.log(rootPromptDef, 'serve-testing', `Testing the app`);
@@ -41,10 +38,10 @@ export async function serveAndTestApp(
   const serveParams: ServeTestingWorkerMessage = {
     serveUrl,
     appName: rootPromptDef.name,
-    enableAutoCsp,
-    includeAxeTesting: skipAxeTesting === false,
-    takeScreenshots: skipScreenshots === false,
-    includeLighthouseData: skipLighthouse === false,
+    enableAutoCsp: !!config.enableAutoCsp,
+    includeAxeTesting: config.skipAxeTesting === false,
+    takeScreenshots: config.skipScreenshots === false,
+    includeLighthouseData: config.skipLighthouse !== true,
     userJourneyAgentTaskInput,
   };

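For reference, a sketch (not part of the patch) of how the worker flags above derive from the shared config when the optional fields are left unset: screenshots, Axe testing, and Lighthouse stay on unless explicitly skipped, and auto-CSP stays off unless explicitly enabled. The helper name below is illustrative:

```ts
import type {AssessmentConfig} from '../shared-interfaces.js';

// Illustrative helper applying the same mapping as serveAndTestApp() above.
function deriveServeFlags(config: AssessmentConfig) {
  return {
    enableAutoCsp: !!config.enableAutoCsp, // optional; undefined -> false (opt-in)
    includeAxeTesting: config.skipAxeTesting === false, // required flag, set explicitly by the caller
    takeScreenshots: config.skipScreenshots === false, // required flag, set explicitly by the caller
    includeLighthouseData: config.skipLighthouse !== true, // optional; undefined -> true (opt-out)
  };
}
```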
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index a43cb9a..9d78eea 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -4,6 +4,32 @@ import type {UserJourneysResult} from './orchestration/user-journeys.js';
 import type {AutoRateResult} from './ratings/autoraters/auto-rate-shared.js';
 import type {Rating, RatingCategory} from './ratings/rating-types.js';
 import type {ServeTestingResult} from './workers/serve-testing/worker-types.js';
+import type {RunnerName} from './codegen/runner-creation.js';
+
+/** Configuration options necessary for kicking off an assessment run. */
+export interface AssessmentConfig {
+  model: string;
+  runner: RunnerName;
+  environmentConfigPath: string;
+  localMode: boolean;
+  limit: number;
+  concurrency: number | 'auto';
+  reportName: string;
+  skipScreenshots: boolean;
+  startMcp?: boolean;
+  ragEndpoint?: string;
+  outputDirectory?: string;
+  promptFilter?: string;
+  labels: string[];
+  skipAiSummary?: boolean;
+  skipAxeTesting: boolean;
+  enableUserJourneyTesting?: boolean;
+  enableAutoCsp?: boolean;
+  logging?: 'text-only' | 'dynamic';
+  autoraterModel?: string;
+  a11yRepairAttempts?: number;
+  skipLighthouse?: boolean;
+}

 /**
  * Represents a single prompt definition and extra metadata for it.