From d1c3eaafa0ffb84b87467671fa839d42a287ba47 Mon Sep 17 00:00:00 2001
From: Andrew Scott
Date: Mon, 29 Sep 2025 12:46:12 -0700
Subject: [PATCH] feat(runner): add support for running and repairing tests

This commit introduces the ability to run tests against the generated code
as part of the evaluation process. A new optional `testCommand` can be
specified in the environment configuration. If provided, this command will
be executed after a successful build. If the tests fail, the tool will
attempt to repair the code using the LLM, similar to how build failures are
handled. The number of repair attempts is configurable.

The report has been updated to display the test results for each run,
including whether the tests passed, failed, or passed after repair. The
summary view also includes aggregated statistics about the test results.
---
 docs/environment-reference.md                | 5 +
 .../pages/report-viewer/report-viewer.html   | 64 +++++++-
 .../app/pages/report-viewer/report-viewer.ts | 30 +++-
 .../configuration/base-environment-config.ts | 2 -
 runner/configuration/environment-config.ts   | 5 -
 runner/configuration/environment-local.ts    | 9 +-
 runner/configuration/package-managers.ts     | 4 +
 runner/eval-cli.ts                           | 9 +-
 runner/orchestration/build-serve-loop.ts     | 140 +++++++++++++-----
 runner/orchestration/codegen.ts              | 19 ++-
 runner/orchestration/gateway.ts              | 15 +-
 .../orchestration/gateways/local_gateway.ts  | 49 +++++-
 runner/orchestration/generate.ts             | 22 ++-
 .../{build-repair.ts => repair.ts}           | 64 ++++----
 runner/orchestration/test-worker.ts          | 44 ++++++
 runner/progress/dynamic-progress-logger.ts   | 1 +
 runner/progress/progress-logger.ts           | 11 +-
 .../successful-tests-rating.ts               | 28 ++++
 runner/ratings/rate-code.ts                  | 9 ++
 runner/ratings/rating-types.ts               | 3 +
 runner/ratings/stats.ts                      | 24 +++
 runner/shared-interfaces.ts                  | 32 +++-
 22 files changed, 480 insertions(+), 109 deletions(-)
 create mode 100644 runner/configuration/package-managers.ts
 rename runner/orchestration/{build-repair.ts => repair.ts} (95%)
 create mode 100644 runner/orchestration/test-worker.ts
 create mode 100644 runner/ratings/built-in-ratings/successful-tests-rating.ts

diff --git a/docs/environment-reference.md b/docs/environment-reference.md
index 3b3f7ba..c37fbea 100644
--- a/docs/environment-reference.md
+++ b/docs/environment-reference.md
@@ -179,3 +179,8 @@ Defaults to ` run build`.
 Command used to start a local dev server as a part of the evaluation.
 
 Defaults to ` run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
+
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index 274842c..0a74fd7 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -73,6 +73,20 @@

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -281,9 +295,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { - Build failed after a11y repair } + @if (hasBuildFailureDuringTestRepair(result)) { + Build failed during test repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -355,12 +379,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+ @if (result.testResult.passed) { + ✔ Tests passed + @if ((result.testRepairAttempts || 0) > 0) { +  after {{ result.testRepairAttempts }} repair attempt(s) + } + } @else { + ✘ Tests failed + }
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -385,6 +433,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -421,6 +478,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 8e02168..78a9085 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -25,6 +25,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -271,6 +272,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -427,7 +453,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/base-environment-config.ts b/runner/configuration/base-environment-config.ts index fe311ae..4aa3e40 100644 --- a/runner/configuration/base-environment-config.ts +++ b/runner/configuration/base-environment-config.ts @@ -1,8 +1,6 @@ import z from 'zod'; import {ratingSchema} from '../ratings/rating-types.js'; import {MultiStepPrompt} from './multi-step-prompt.js'; -import {mcpServerOptionsSchema} from '../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from './environment-config.js'; export const baseEnvironmentConfigSchema = z.strictObject({ /** Display name for the environment. */ diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index e15361e..77bdf28 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([ */ export type EnvironmentConfig = z.infer; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config. 
*/ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema.safeParse(value); diff --git a/runner/configuration/environment-local.ts b/runner/configuration/environment-local.ts index 9eefed2..041f9a8 100644 --- a/runner/configuration/environment-local.ts +++ b/runner/configuration/environment-local.ts @@ -3,7 +3,7 @@ import z from 'zod'; import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js'; import {LocalGateway} from '../orchestration/gateways/local_gateway.js'; import {BaseEnvironment} from './base-environment.js'; -import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js'; +import {getPossiblePackageManagers} from './package-managers.js'; import {baseEnvironmentConfigSchema} from './base-environment-config.js'; export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ @@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ * Defaults to ` run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Command to run when testing the code. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. @@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment { readonly buildCommand: string; /** Command to run when starting a development server inside the app. */ readonly serveCommand: string; + /** Command to run when starting tests inside the app. */ + readonly testCommand: string | null; /** * Absolute path at which files specific to this environment are located. Will be merged in * with the files from the `projectTemplatePath` to get the final project structure. @@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment { this.installCommand = `${packageManager} install --silent`; this.buildCommand = config.buildCommand || `${packageManager} run build`; this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager); + this.testCommand = config.testCommand ?? null; this.projectTemplatePath = projectTemplatePath; this.sourceDirectory = sourceDirectory; this.mcpServerOptions = config.mcpServers || []; diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported. 
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 8734583..d6eb23c 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -36,7 +36,7 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; + testRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; } @@ -148,10 +148,11 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { + .option('test-repair-attempts', { type: 'number', default: 0, - description: 'Number of repair attempts for discovered a11y violations', + description: + 'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)', }) .strict() .version(false) @@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, + testRepairAttempts: cliArgs.testRepairAttempts, }); logReportToConsole(runInfo); diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index 9bbd849..7f7d1b5 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -6,9 +6,10 @@ import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-in import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; +import {repairAndBuild} from './repair.js'; import {EvalID, Gateway} from './gateway.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; /** @@ -30,7 +31,7 @@ import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls). 
*/ -export async function attemptBuild( +export async function attemptBuildAndTest( evalID: EvalID, gateway: Gateway, model: string, @@ -47,7 +48,7 @@ export async function attemptBuild( skipAxeTesting: boolean, enableAutoCsp: boolean, userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined, - maxAxeRepairAttempts: number, + maxTestRepairAttempts: number, ) { const initialBuildResult = await runBuild( evalID, @@ -97,13 +98,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -127,31 +133,70 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < maxAxeRepairAttempts - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. 
Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -161,28 +206,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. + lastAttempt.serveTestingResult = await serveAndTestApp( evalID, gateway, directory, @@ -196,10 +241,27 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -207,6 +269,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index 0ff1097..7cfed2b 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -8,10 +8,10 @@ import { } from '../shared-interfaces.js'; import {LlmGenerateFilesContext, LlmRunner, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +94,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -121,13 +120,13 @@ export async function repairCodeWithAI( possiblePackageManagers: getPossiblePackageManagers().slice(), }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); - const response = await gateway.repairBuild( + const response = await gateway.repairCode( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal, diff --git a/runner/orchestration/gateway.ts b/runner/orchestration/gateway.ts index 7e2bf01..f761366 100644 --- a/runner/orchestration/gateway.ts +++ b/runner/orchestration/gateway.ts @@ -7,6 +7,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../shared-interfaces.js'; import {BuildResult} from '../workers/builder/builder-types.js'; @@ -25,7 +26,7 @@ export interface Gateway { abortSignal: AbortSignal, ): Promise; - repairBuild( + repairCode( id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -47,6 +48,18 @@ export interface Gateway { progress: ProgressLogger, ): Promise; + tryTest( + id: EvalID, + env: Env, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise; + + shouldRetryFailedTestExecution(evalID: EvalID): boolean; + serveBuild( id: EvalID, env: Env, diff --git a/runner/orchestration/gateways/local_gateway.ts b/runner/orchestration/gateways/local_gateway.ts index 75cd970..69cbcc8 100644 --- a/runner/orchestration/gateways/local_gateway.ts +++ b/runner/orchestration/gateways/local_gateway.ts @@ -10,6 +10,7 @@ import { LlmContextFile, LlmResponse, LlmResponseFile, + TestExecutionResult, } from '../../shared-interfaces.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Gateway} from '../gateway.js'; @@ -19,6 +20,9 @@ import {ProgressLogger} from '../../progress/progress-logger.js'; import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {LocalEnvironment} from '../../configuration/environment-local.js'; import PQueue from 'p-queue'; +import {executeCommand} from '../../utils/exec.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -39,7 +43,7 @@ export class LocalGateway implements Gateway { return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal); } - async repairBuild( + async repairCode( _id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -88,6 +92,45 @@ export class LocalGateway implements Gateway { ); } + async tryTest( + _id: EvalID, + env: LocalEnvironment, + appDirectoryPath: string, + rootPromptDef: 
RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise { + if (!env.testCommand) { + return Promise.resolve(null); + } + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + const stdout = await callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(env.testCommand!, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary. Lots of parallelism can slow-down. + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveBuild( _id: EvalID, env: LocalEnvironment, @@ -109,5 +152,9 @@ export class LocalGateway implements Gateway { return this.llm.hasBuiltInRepairLoop === false; } + shouldRetryFailedTestExecution(): boolean { + return this.llm.hasBuiltInRepairLoop === false; + } + async finalizeEval(_id: EvalID): Promise {} } diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 95b12c4..45d7ce8 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -29,17 +29,17 @@ import { RunDetails, RunInfo, RunSummary, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; import {GenkitRunner} from '../codegen/genkit/genkit-runner.js'; import {getEnvironmentByPath} from '../configuration/environment-resolution.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {TextProgressLogger} from '../progress/text-progress-logger.js'; import {logReportHeader} from '../reporting/report-logging.js'; @@ -51,6 +51,7 @@ import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -59,7 +60,8 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; * 1. Makes a request to Gemini to generate code. * 2. Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status. 
@@ -84,7 +86,7 @@ export async function generateCodeAndAssess(options: {
   enableAutoCsp?: boolean;
   logging?: 'text-only' | 'dynamic';
   autoraterModel?: string;
-  a11yRepairAttempts?: number;
+  testRepairAttempts?: number;
 }): Promise {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
   const ratingLlm = await getRunnerByName('genkit');
@@ -179,7 +181,7 @@ export async function generateCodeAndAssess(options: {
           workerConcurrencyQueue,
           progress,
           options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
-          options.a11yRepairAttempts ?? 0,
+          options.testRepairAttempts ?? 0,
         ),
       // 10min max per app evaluation. We just want to make sure it never gets stuck.
       10,
@@ -309,7 +311,7 @@ async function startEvaluationTask(
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
   autoraterModel: string,
-  a11yRepairAttempts: number,
+  testRepairAttempts: number,
 ): Promise {
   // Set up the project structure once for the root project.
   const {directory, cleanup} = await setupProjectStructure(
@@ -417,7 +419,7 @@ async function startEvaluationTask(
     // Try to build the files in the root prompt directory.
     // This will also attempt to fix issues with the generated code.
-    const attempt = await attemptBuild(
+    const attempt = await attemptBuildAndTest(
       evalID,
       gateway,
       model,
@@ -434,7 +436,7 @@ async function startEvaluationTask(
       skipAxeTesting,
       enableAutoCsp,
       userJourneyAgentTaskInput,
-      a11yRepairAttempts,
+      testRepairAttempts,
     );
 
     if (!attempt) {
@@ -455,6 +457,8 @@ async function startEvaluationTask(
       abortSignal,
       progress,
       autoraterModel,
+      attempt.testResult ?? null,
+      attempt.testRepairAttempts,
     );
 
     results.push({
@@ -472,6 +476,8 @@ async function startEvaluationTask(
       userJourneys: userJourneys,
       axeRepairAttempts: attempt.axeRepairAttempts,
       toolLogs,
+      testResult: attempt.testResult ?? null,
+      testRepairAttempts: attempt.testRepairAttempts,
     } satisfies AssessmentResult);
   }
diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts
similarity index 95%
rename from runner/orchestration/build-repair.ts
rename to runner/orchestration/repair.ts
index b275fa1..8f6e736 100644
--- a/runner/orchestration/build-repair.ts
+++ b/runner/orchestration/repair.ts
@@ -1,4 +1,6 @@
+import {Environment} from '../configuration/environment.js';
 import PQueue from 'p-queue';
+import {ProgressLogger} from '../progress/progress-logger.js';
 import {
   AttemptDetails,
   LlmContextFile,
@@ -6,12 +8,10 @@ import {
   LlmResponseFile,
   RootPromptDefinition,
 } from '../shared-interfaces.js';
-import {Environment} from '../configuration/environment.js';
-import {repairCodeWithAI} from './codegen.js';
-import {writeResponseFiles} from './file-system.js';
 import {runBuild} from './build-worker.js';
-import {ProgressLogger} from '../progress/progress-logger.js';
+import {writeResponseFiles} from './file-system.js';
 import {EvalID, Gateway} from './gateway.js';
+import {repairCodeWithAI} from './codegen.js';
 
 /**
  * Calls the LLM to repair code, handles the response, and attempts to build the project again.
@@ -23,12 +23,11 @@ import {EvalID, Gateway} from './gateway.js';
  * @param directory The working directory.
- * @param finalOutputFiles The list of output files to be modified.
- * @param errorMessage The error message from the failed build.
- * @param errorContext Additional context for the error.
+ * @param previousAttemptFiles The files from the previous attempt, to be copied and modified.
+ * @param errors The errors to repair, each pairing a context description with the raw error message.
 * @param contextFiles A list of context files for the LLM.
 * @param abortSignal An AbortSignal to cancel the operation.
 * @param workerConcurrencyQueue The queue for managing worker concurrency.
 * @param attempts The current attempt number.
- * @param repairType The type of repair being performed.
+ * @param repairType The type of repair being performed ('build' or 'test').
 * @returns A promise that resolves to the new BuildResult.
 */
 export async function repairAndBuild(
@@ -39,13 +38,13 @@
   rootPromptDef: RootPromptDefinition,
   directory: string,
   previousAttemptFiles: LlmResponseFile[],
-  errorMessage: string,
-  errorContext: string,
+  errors: Array<{errorContext: string; errorMessage: string}>,
   contextFiles: LlmContextFile[],
   abortSignal: AbortSignal,
   workerConcurrencyQueue: PQueue,
   attempts: number,
   progress: ProgressLogger,
+  repairType: 'build' | 'test',
 ): Promise {
   const repairResponse = await repairCodeWithAI(
@@ -55,11 +54,11 @@ export async function repairAndBuild(
     rootPromptDef,
     directory,
     previousAttemptFiles,
-    errorMessage,
-    errorContext,
+    errors,
     contextFiles,
     abortSignal,
     progress,
+    repairType,
   );
 
   return await handleRepairResponse(
@@ -77,6 +76,27 @@
 }
 
+/**
+ * Merges a set of new or updated files from a repair attempt into the
+ * current set of files.
+ * @param repairOutputFiles The array of new or updated files to merge.
+ * @param finalFiles The array of files to be updated.
+ */
+function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) {
+  // Merge the repair response into the original files. Otherwise we may end up dropping
+  // files that were valid in the initial response and the LLM decided not to touch, because
+  // they're still valid.
+  for (const file of repairOutputFiles) {
+    const existingFile = finalFiles.find(f => f.filePath === file.filePath);
+
+    if (existingFile) {
+      existingFile.code = file.code;
+    } else {
+      finalFiles.push(file);
+    }
+  }
+}
+
 /**
  * Processes an LLM repair response by merging the suggested file changes,
  * writing them to disk, rebuilding the application, and logging the outcome.
@@ -93,7 +113,7 @@ async function handleRepairResponse(
   abortSignal: AbortSignal,
   attempts: number,
   progress: ProgressLogger,
-) {
+): Promise {
   if (!repairResponse.success) {
     progress.log(
       rootPromptDef,
@@ -104,7 +124,6 @@ async function handleRepairResponse(
     // Stop trying to repair if AI can't suggest a fix (API request fails)
     throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`);
   }
-
   // Clone the previous files because `mergeRepairFiles` mutates the attempt files.
   // We don't want to change files of a previous attempt.
   const newAttemptFiles = previousAttemptFiles.map(f => ({...f}));
@@ -132,24 +151,3 @@ async function handleRepairResponse(
     attempt: attempts,
   };
 }
-
-/**
- * Merges a set of new or updated files from a repair attempt into the
- * current set of files.
- * @param repairOutputFiles The array of new or updated files to merge.
- * @param finalFiles The array of files to be updated.
- */
-function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) {
-  // Merge the repair response into the original files. Otherwise we may end up dropping
-  // files that were valid in the initial response and the LLM decided not to touch, because
-  // they're still valid.
-  for (const file of repairOutputFiles) {
-    const existingFile = finalFiles.find(f => f.filePath === file.filePath);
-
-    if (existingFile) {
-      existingFile.code = file.code;
-    } else {
-      finalFiles.push(file);
-    }
-  }
-}
diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts
new file mode 100644
index 0000000..53b4fe6
--- /dev/null
+++ b/runner/orchestration/test-worker.ts
@@ -0,0 +1,44 @@
+import PQueue from 'p-queue';
+import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js';
+import {ProgressLogger} from '../progress/progress-logger.js';
+import {EvalID, Gateway} from './gateway.js';
+import {Environment} from '../configuration/environment.js';
+
+export async function runTest(
+  evalID: EvalID,
+  gateway: Gateway,
+  appDirectoryPath: string,
+  env: Environment,
+  rootPromptDef: RootPromptDefinition,
+  abortSignal: AbortSignal,
+  workerConcurrencyQueue: PQueue,
+  progress: ProgressLogger,
+): Promise {
+  progress.log(rootPromptDef, 'test', `Running tests`);
+
+  try {
+    const result = await gateway.tryTest(
+      evalID,
+      env,
+      appDirectoryPath,
+      rootPromptDef,
+      workerConcurrencyQueue,
+      abortSignal,
+      progress,
+    );
+    if (result === null) {
+      return result;
+    }
+
+    if (result.passed) {
+      progress.log(rootPromptDef, 'success', 'Tests have passed');
+    } else {
+      progress.log(rootPromptDef, 'error', 'Tests have failed');
+    }
+
+    return result;
+  } catch (err) {
+    progress.log(rootPromptDef, 'error', `Error when executing tests`, err + '');
+    throw err;
+  }
+}
diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts
index 949cf96..0e68632 100644
--- a/runner/progress/dynamic-progress-logger.ts
+++ b/runner/progress/dynamic-progress-logger.ts
@@ -148,6 +148,7 @@
     switch (type) {
       case 'success':
       case 'serve-testing':
+      case 'test':
       case 'build':
         return chalk.green;
       case 'error':
diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts
index c888aba..b029aa6 100644
--- a/runner/progress/progress-logger.ts
+++ b/runner/progress/progress-logger.ts
@@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js';
 import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js';
 
 /** Possible progress event types. */
-export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval';
+export type ProgressType =
+  | 'codegen'
+  | 'build'
+  | 'test'
+  | 'serve-testing'
+  | 'success'
+  | 'error'
+  | 'eval';
 
 /** Maps a ProgressType to an icon that can represent it. */
 export function progressTypeToIcon(type: ProgressType): string {
@@ -12,6 +19,8 @@ export function progressTypeToIcon(type: ProgressType): string {
       return '🤖';
     case 'build':
       return '🔨';
+    case 'test':
+      return '🧪';
     case 'serve-testing':
       return '🌊';
     case 'success':
diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts
new file mode 100644
index 0000000..2941fd3
--- /dev/null
+++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts
@@ -0,0 +1,28 @@
+import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js';
+
+/** Rating which verifies that unit tests pass successfully. */
+export const successfulTestsRating: PerBuildRating = {
+  name: 'Tests pass successfully',
+  description: 'Ensures tests run and pass without errors.',
+  id: 'common-successful-tests',
+  kind: RatingKind.PER_BUILD,
+  category: RatingCategory.MEDIUM_IMPACT,
+  scoreReduction: '30%',
+  // Reduce the number of points in case we've had test repair attempts.
+  rate: ({testResult, testRepairAttempts}) => {
+    // If no test results are available, skip this rating
+    if (!testResult) {
+      return {
+        state: RatingState.SKIPPED,
+        message: 'Unit testing not configured.',
+      };
+    }
+
+    return {
+      state: RatingState.EXECUTED,
+      coefficient: testResult.passed
+        ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts
+        : 0, // No points if tests failed
+    };
+  },
+};
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
index 99d0874..c0500ec 100644
--- a/runner/ratings/rate-code.ts
+++ b/runner/ratings/rate-code.ts
@@ -8,6 +8,7 @@ import {
   IndividualAssessmentState,
   PromptDefinition,
   AssessmentCategory,
+  TestExecutionResult,
 } from '../shared-interfaces.js';
 import {
   RatingState,
@@ -56,6 +57,8 @@ export async function rateGeneratedCode(
   abortSignal: AbortSignal,
   progress: ProgressLogger,
   autoraterModel: string,
+  testResult: TestExecutionResult | null,
+  testRepairAttempts: number,
 ): Promise {
   let categorizedFiles: CategorizedFiles | null = null;
   let totalPoints = 0;
@@ -93,6 +96,8 @@ export async function rateGeneratedCode(
       buildResult,
       serveTestingResult,
       repairAttempts,
+      testResult,
+      testRepairAttempts,
       outputFiles.length,
       axeRepairAttempts,
       ratingsResult,
@@ -173,6 +178,8 @@ function runPerBuildRating(
   buildResult: BuildResult,
   serveResult: ServeTestingResult | null,
   repairAttempts: number,
+  testResult: TestExecutionResult | null,
+  testRepairAttempts: number,
   generatedFileCount: number,
   axeRepairAttempts: number,
   ratingsResult: RatingsResult,
@@ -184,6 +191,8 @@ function runPerBuildRating(
     generatedFileCount,
     axeRepairAttempts,
     ratingsResult,
+    testResult,
+    testRepairAttempts,
   });
 
   // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
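
Reviewer note: the coefficient logic above is easy to misread in diff form, so here is a standalone TypeScript sketch of how `successfulTestsRating` scores a run. The `TestExecutionResult` shape is copied from this patch; `testsCoefficient` is an illustrative helper, not code from the change:

```ts
// Standalone sketch of the scoring behavior in successful-tests-rating.ts.
// TestExecutionResult mirrors the interface added in shared-interfaces.ts.
interface TestExecutionResult {
  passed: boolean;
  output: string;
}

function testsCoefficient(
  testResult: TestExecutionResult | null,
  testRepairAttempts: number,
): number | null {
  // No test command configured: the rating is skipped entirely.
  if (!testResult) {
    return null;
  }
  // Full credit for a clean pass, 1/(n + 1) after n repair attempts,
  // and no credit when the tests still fail at the end of the run.
  return testResult.passed ? 1 / ((testRepairAttempts || 0) + 1) : 0;
}

console.log(testsCoefficient({passed: true, output: ''}, 0)); // 1
console.log(testsCoefficient({passed: true, output: ''}, 2)); // 0.333...
console.log(testsCoefficient({passed: false, output: 'FAIL'}, 1)); // 0
console.log(testsCoefficient(null, 0)); // null (skipped)
```

In other words, a clean pass keeps full credit for this rating, each repair attempt divides the credit (1/2 after one repair, 1/3 after two), and a run whose tests still fail gets nothing.
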
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
index fceb104..6dcbf1c 100644
--- a/runner/ratings/rating-types.ts
+++ b/runner/ratings/rating-types.ts
@@ -5,6 +5,7 @@ import type {
   LlmResponseFile,
   PromptDefinition,
   SkippedIndividualAssessment,
+  TestExecutionResult,
   Usage,
 } from '../shared-interfaces.js';
 import {Environment} from '../configuration/environment.js';
@@ -64,6 +65,8 @@ const perBuildRatingSchema = z
       buildResult: z.custom(),
       serveResult: z.custom(),
       repairAttempts: z.number(),
+      testResult: z.custom(),
+      testRepairAttempts: z.number(),
       axeRepairAttempts: z.number(),
       generatedFileCount: z.number(),
       ratingsResult: z.record(z.custom()),
diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts
index 7d94753..a97e927 100644
--- a/runner/ratings/stats.ts
+++ b/runner/ratings/stats.ts
@@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
   let successfulInitialBuilds = 0;
   let successfulBuildsAfterRepair = 0;
   let failedBuilds = 0;
+  let successfulInitialTests = 0;
+  let successfulTestsAfterRepair = 0;
+  let failedTests = 0;
+  let noTestsRun = 0;
   let runtimeStats: RuntimeStats | undefined;
   let accessibilityStats:
     | {
@@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
       }
     }
 
+    // Calculate test statistics
+    if (result.testResult) {
+      if (result.testResult.passed) {
+        if ((result.testRepairAttempts || 0) === 0) {
+          successfulInitialTests++;
+        } else {
+          successfulTestsAfterRepair++;
+        }
+      } else {
+        failedTests++;
+      }
+    } else {
+      noTestsRun++;
+    }
     if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) {
       runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0};
       if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') {
@@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
       failedBuilds,
       errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined,
     },
+    tests: {
+      successfulInitialTests,
+      successfulTestsAfterRepair,
+      failedTests,
+      noTestsRun,
+    },
     buckets,
     runtime: runtimeStats
       ? {
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index a43cb9a..6ca6251 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -221,8 +221,12 @@ export interface AttemptDetails {
   // Note: May not be set in older reports.
   reasoning?: string;
 
-  /** Whether the build failed during an accessibility repair attempt. */
-  buildFailedDuringA11yRepair?: boolean;
+  /** Whether the build failed during a test repair attempt (a11y or unit). */
+  buildFailedDuringTestRepair?: boolean;
+  /** Result of running tests for this attempt. */
+  testResult?: TestExecutionResult;
+  /** The number of repair attempts made for tests in this attempt. */
+  testRepairAttempts?: number;
 }
 
 /** Statistics related to the build process of the generated applications. */
@@ -237,6 +241,18 @@ export interface RunSummaryBuilds {
   errorDistribution?: Partial>;
 }
 
+/** Statistics related to the test process of the generated applications. */
+export interface RunSummaryTests {
+  /** The number of applications that had tests run and all tests passed on the first attempt. */
+  successfulInitialTests: number;
+  /** The number of applications that had tests run and all tests passed after repair attempts. */
+  successfulTestsAfterRepair: number;
+  /** The number of applications that had tests run but tests failed even after repair attempts. */
+  failedTests: number;
+  /** The number of applications that did not have tests run (no test command configured). */
+  noTestsRun: number;
+}
+
 /** Buckets into which scores can be categorized. */
 export interface ScoreBucket {
   /** Plain name of the bucket, e.g. "Good" */
@@ -271,6 +287,8 @@ export interface AggregatedRunStats {
   buckets: ScoreBucket[];
   /** Runtime stats. Not present for reports that didn't request runtime error collection. */
   runtime?: RuntimeStats;
+  /** Test stats. Not present for reports that didn't run tests or older reports. */
+  tests?: RunSummaryTests;
 
   accessibility?: {
     appsWithErrors: number;
@@ -449,6 +467,10 @@ export interface AssessmentResult {
   axeRepairAttempts: number;
   /** Tool requests logs (e.g. MCP requests and responses). */
   toolLogs?: ToolLogEntry[];
+  /** Result of running unit tests. */
+  testResult: TestExecutionResult | null;
+  /** Number of repair attempts for tests. */
+  testRepairAttempts?: number;
 }
 
 /**
@@ -521,3 +543,11 @@ export interface RunGroup {
   /** Runner used to generate code for the runs in the group. */
   runner?: CodegenRunnerInfo;
 }
+
+/** Result of running tests. */
+export interface TestExecutionResult {
+  /** Whether the test command exited with code 0 (i.e. all tests passed). */
+  passed: boolean;
+  /** The captured output (`stdout` and `stderr`) of the test command. */
+  output: string;
+}
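
Reviewer note: for a concrete picture of the configuration surface, here is a hypothetical local environment config fragment using the new option. Only the command fields touched by this patch appear; the real `localEnvironmentConfigSchema` has additional fields, and the command strings are examples rather than defaults:

```ts
// Hypothetical fragment of a local environment config. Only the command
// options involved in this patch are shown; all values are examples.
export const environmentConfig = {
  // New in this patch: run tests after a successful build. The command must
  // exit with code 0 on success; stdout and stderr are captured and, on
  // failure, fed back to the LLM by the repair loop. Hard 4-minute timeout.
  testCommand: 'npm run test -- --watch=false',

  // Pre-existing options shown for comparison. Unlike these, testCommand has
  // no default: when it is omitted, tests simply do not run.
  buildCommand: 'npm run build',
  serveCommand: 'npm run start -- --port 0',
};
```

Repairs remain opt-in: failures from `testCommand` only trigger LLM repair requests when the run passes a non-zero `--test-repair-attempts`, which defaults to 0 just like the a11y repair behavior it generalizes.
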
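Reviewer note: the contract enforced by `LocalGateway.tryTest` can be restated without the runner's `executeCommand`/`callWithTimeout` helpers. This is a simplified stand-in built on Node's `child_process`, assuming only what the patch states (exit code 0 means pass, combined output is captured, 4-minute cap); the real implementation also routes the output through `cleanupBuildMessage`:

```ts
// Simplified stand-in for the tryTest contract, using Node's child_process
// instead of the runner's executeCommand/callWithTimeout utilities.
import {exec} from 'node:child_process';

interface TestExecutionResult {
  passed: boolean;
  output: string;
}

function runTestCommand(testCommand: string, cwd: string): Promise<TestExecutionResult> {
  return new Promise(resolve => {
    // `timeout` kills the process after 4 minutes, mirroring the safety
    // boundary in local_gateway.ts; a non-null `error` means a non-zero
    // exit code, a signal, or the timeout itself.
    exec(testCommand, {cwd, timeout: 4 * 60_000}, (error, stdout, stderr) => {
      resolve({
        passed: error === null,
        output: `${stdout}${stderr}`,
      });
    });
  });
}

// Usage: a failing run resolves (rather than rejects) so the caller can hand
// the captured output to the repair loop.
runTestCommand('npm run test -- --watch=false', '/tmp/generated-app').then(result => {
  console.log(result.passed ? 'Tests passed' : `Tests failed:\n${result.output}`);
});
```
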
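Reviewer note: the new `errors` array changes how the repair prompt is assembled in `repairCodeWithAI`. The sketch below mirrors the prompt-building code from codegen.ts so the resulting layout is visible; `buildRepairPrompt` and the sample data are illustrative:

```ts
// Sketch of the repair-prompt assembly from codegen.ts. Each failure source
// (build, axe, unit tests) contributes a context line plus a fenced block.
interface RepairError {
  errorContext: string;
  errorMessage: string;
}

interface AppFile {
  filePath: string;
  code: string;
}

const fence = '`'.repeat(3); // Avoids embedding literal triple backticks here.

function buildRepairPrompt(errors: RepairError[], appFiles: AppFile[]): string {
  return [
    ...errors.map(({errorContext, errorMessage}) =>
      [errorContext, fence, errorMessage, fence].join('\n'),
    ),
    '',
    'In the following source code:',
    ...appFiles.map(file => `${file.filePath}:\n${fence}\n${file.code}${fence}\n\n`),
  ].join('\n');
}

// With an axe failure and a unit-test failure queued in the same attempt,
// the LLM receives two error sections followed by the project files.
console.log(
  buildRepairPrompt(
    [
      {
        errorContext:
          'There are the following accessibility errors from axe accessibility violations:',
        errorMessage: '[{"id": "image-alt", "impact": "critical"}]',
      },
      {
        errorContext: 'Application tests failed. Attempt to fix them. Test output was:',
        errorMessage: 'Executed 3 of 3 specs (1 FAILED)',
      },
    ],
    [{filePath: 'src/app/app.ts', code: 'export class App {}\n'}],
  ),
);
```
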
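Reviewer note: the summary statistics in stats.ts place every assessment into exactly one `RunSummaryTests` counter. The branching is restated here with an illustrative `classify` helper to make the bucketing rules explicit:

```ts
// Restatement of the test-statistics branching in calculateBuildAndCheckStats.
// The classify helper is illustrative; the patch increments counters inline.
type TestOutcome = 'initial-pass' | 'pass-after-repair' | 'fail' | 'not-run';

function classify(
  testResult: {passed: boolean} | null,
  testRepairAttempts: number | undefined,
): TestOutcome {
  if (!testResult) {
    return 'not-run'; // Counted as noTestsRun (no test command configured).
  }
  if (!testResult.passed) {
    return 'fail'; // failedTests, even if repairs were attempted.
  }
  return (testRepairAttempts || 0) === 0
    ? 'initial-pass' // successfulInitialTests
    : 'pass-after-repair'; // successfulTestsAfterRepair
}

console.log(classify({passed: true}, 0)); // 'initial-pass'
console.log(classify({passed: true}, 1)); // 'pass-after-repair'
console.log(classify({passed: false}, 2)); // 'fail'
console.log(classify(null, 0)); // 'not-run'
```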