diff --git a/docs/environment-reference.md b/docs/environment-reference.md index 3b3f7ba..c37fbea 100644 --- a/docs/environment-reference.md +++ b/docs/environment-reference.md @@ -179,3 +179,8 @@ Defaults to `<packageManager> run build`. Command used to start a local dev server as part of the evaluation. Defaults to `<packageManager> run start --port 0`. + +### `testCommand` + +Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes. +
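For illustration, a config using this property might look like the sketch below (the command value is a placeholder; any command honoring the exit-code contract works):

```ts
// Hypothetical excerpt of a local environment config.
{
  // ...other environment options...
  buildCommand: 'npm run build',
  testCommand: 'npm run test -- --no-watch', // exit code 0 = tests passed
}
```

diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index 274842c..0a74fd7 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -73,6 +73,20 @@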

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -281,9 +295,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { + @if (hasBuildFailureDuringTestRepair(result)) { Build failed during test repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -355,12 +379,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+ @if (result.testResult.passed) { + ✔ Tests passed + @if ((result.testRepairAttempts || 0) > 0) { + &nbsp;after {{ result.testRepairAttempts }} repair attempt(s) + } + } @else { + ✘ Tests failed + } +
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -385,6 +433,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -421,6 +478,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 8e02168..78a9085 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -25,6 +25,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -271,6 +272,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -427,7 +453,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/base-environment-config.ts b/runner/configuration/base-environment-config.ts index fe311ae..4aa3e40 100644 --- a/runner/configuration/base-environment-config.ts +++ b/runner/configuration/base-environment-config.ts @@ -1,8 +1,6 @@ import z from 'zod'; import {ratingSchema} from '../ratings/rating-types.js'; import {MultiStepPrompt} from './multi-step-prompt.js'; -import {mcpServerOptionsSchema} from '../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from './environment-config.js'; export const baseEnvironmentConfigSchema = z.strictObject({ /** Display name for the environment. */ diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index e15361e..77bdf28 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([ */ export type EnvironmentConfig = z.infer<typeof environmentConfigSchema>; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config.
*/ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema.safeParse(value); diff --git a/runner/configuration/environment-local.ts b/runner/configuration/environment-local.ts index 9eefed2..041f9a8 100644 --- a/runner/configuration/environment-local.ts +++ b/runner/configuration/environment-local.ts @@ -3,7 +3,7 @@ import z from 'zod'; import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js'; import {LocalGateway} from '../orchestration/gateways/local_gateway.js'; import {BaseEnvironment} from './base-environment.js'; -import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js'; +import {getPossiblePackageManagers} from './package-managers.js'; import {baseEnvironmentConfigSchema} from './base-environment-config.js'; export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ @@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ * Defaults to `<packageManager> run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Command used to run tests against the generated code. Tests are skipped if omitted. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. @@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment { readonly buildCommand: string; /** Command to run when starting a development server inside the app. */ readonly serveCommand: string; + /** Command to run tests inside the app, or `null` when tests aren't configured. */ + readonly testCommand: string | null; /** * Absolute path at which files specific to this environment are located. Will be merged in * with the files from the `projectTemplatePath` to get the final project structure. @@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment { this.installCommand = `${packageManager} install --silent`; this.buildCommand = config.buildCommand || `${packageManager} run build`; this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager); + this.testCommand = config.testCommand ?? null; this.projectTemplatePath = projectTemplatePath; this.sourceDirectory = sourceDirectory; this.mcpServerOptions = config.mcpServers || [];
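To make the defaulting above concrete, a short sketch (assuming `packageManager` is `npm` and no explicit commands in the config):

```ts
// Resolved LocalEnvironment commands under those assumptions:
//   installCommand -> 'npm install --silent'
//   buildCommand   -> 'npm run build'
//   testCommand    -> null (tests and the test-repair loop are skipped)
// A config that sets testCommand (e.g. the hypothetical 'npm run test:ci')
// opts the environment into test execution and repair.
```

diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported.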
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 8734583..d6eb23c 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -36,7 +36,7 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; + testRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; } @@ -148,10 +148,11 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { + .option('test-repair-attempts', { type: 'number', default: 0, - description: 'Number of repair attempts for discovered a11y violations', + description: + 'Number of repair attempts for discovered test failures (both a11y violations and failures from the configured testCommand)', }) .strict() .version(false) @@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, + testRepairAttempts: cliArgs.testRepairAttempts, }); logReportToConsole(runInfo);
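For reference, a hypothetical invocation that opts into two repair attempts (prompt and environment values are placeholders):

```
wcs run --prompt=example-prompt --env=./environment.config.mjs --test-repair-attempts=2
```

diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index 9bbd849..7f7d1b5 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -6,9 +6,10 @@ import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js'; import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; +import {repairAndBuild} from './repair.js'; import {EvalID, Gateway} from './gateway.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; /** * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).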
*/ -export async function attemptBuild( +export async function attemptBuildAndTest( evalID: EvalID, gateway: Gateway, model: string, @@ -47,7 +48,7 @@ export async function attemptBuild( skipAxeTesting: boolean, enableAutoCsp: boolean, userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined, - maxAxeRepairAttempts: number, + maxTestRepairAttempts: number, ) { const initialBuildResult = await runBuild( evalID, @@ -97,13 +98,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -127,31 +133,70 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < maxAxeRepairAttempts - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. 
Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -161,28 +206,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. + lastAttempt.serveTestingResult = await serveAndTestApp( evalID, gateway, directory, @@ -196,10 +241,27 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -207,6 +269,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index 0ff1097..7cfed2b 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -8,10 +8,10 @@ import { } from '../shared-interfaces.js'; import {LlmGenerateFilesContext, LlmRunner, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +94,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise<LlmResponse> { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -121,13 +120,13 @@ possiblePackageManagers: getPossiblePackageManagers().slice(), }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); - const response = await gateway.repairBuild( + const response = await gateway.repairCode( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal,
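To make the new errors-array shape concrete, here is a sketch of how one entry renders into the repair prompt, mirroring the map/join above (the error text is illustrative):

```ts
// Sketch: rendering of a single repair error into prompt text.
const errors = [
  {
    errorContext: 'There are the following build errors:',
    errorMessage: "error TS2304: Cannot find name 'foo'.",
  },
];
const rendered = errors
  .map(({errorContext, errorMessage}) => [errorContext, '```', errorMessage, '```'].join('\n'))
  .join('\n');
// rendered ===
// "There are the following build errors:\n```\nerror TS2304: Cannot find name 'foo'.\n```"
```

diff --git a/runner/orchestration/gateway.ts b/runner/orchestration/gateway.ts index 7e2bf01..f761366 100644 --- a/runner/orchestration/gateway.ts +++ b/runner/orchestration/gateway.ts @@ -7,6 +7,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../shared-interfaces.js'; import {BuildResult} from '../workers/builder/builder-types.js'; @@ -25,7 +26,7 @@ export interface Gateway { abortSignal: AbortSignal, ): Promise; - repairBuild( + repairCode( id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -47,6 +48,18 @@ progress: ProgressLogger, ): Promise; + tryTest( + id: EvalID, + env: Env, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise<TestExecutionResult | null>; + + shouldRetryFailedTestExecution(evalID: EvalID): boolean; + serveBuild( id: EvalID, env: Env, diff --git a/runner/orchestration/gateways/local_gateway.ts b/runner/orchestration/gateways/local_gateway.ts index 75cd970..69cbcc8 100644 --- a/runner/orchestration/gateways/local_gateway.ts +++ b/runner/orchestration/gateways/local_gateway.ts @@ -10,6 +10,7 @@ import { LlmContextFile, LlmResponse, LlmResponseFile, + TestExecutionResult, } from '../../shared-interfaces.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Gateway} from '../gateway.js'; @@ -19,6 +20,9 @@ import {ProgressLogger} from '../../progress/progress-logger.js'; import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {LocalEnvironment} from '../../configuration/environment-local.js'; import PQueue from 'p-queue'; +import {executeCommand} from '../../utils/exec.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -39,7 +43,7 @@ export class LocalGateway implements Gateway { return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal); } - async repairBuild( + async repairCode( _id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -88,6 +92,45 @@ + async tryTest( + _id: EvalID, + env: LocalEnvironment, + appDirectoryPath: string, + rootPromptDef: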
RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise<TestExecutionResult | null> { + if (!env.testCommand) { + return Promise.resolve(null); + } + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + const stdout = await callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(env.testCommand!, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary; lots of parallelism can slow tests down. + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveBuild( _id: EvalID, env: LocalEnvironment, @@ -109,5 +152,9 @@ return this.llm.hasBuiltInRepairLoop === false; } + shouldRetryFailedTestExecution(): boolean { + return this.llm.hasBuiltInRepairLoop === false; + } + async finalizeEval(_id: EvalID): Promise<void> {} }
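A brief sketch of how callers can interpret the three possible outcomes of `tryTest` (surrounding variables assumed in scope; for illustration only):

```ts
// Sketch: the three outcomes of Gateway.tryTest().
async function reportTestOutcome(): Promise<void> {
  const result = await gateway.tryTest(id, env, dir, promptDef, queue, signal, progress);
  if (result === null) {
    // No `testCommand` configured: tests were skipped entirely.
  } else if (result.passed) {
    // The command exited with code 0; `result.output` is the captured stdout.
  } else {
    // Non-zero exit code or the 4-minute timeout; `result.output` is the
    // cleaned-up error output that feeds the test-repair prompt.
  }
}
```

diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 95b12c4..45d7ce8 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -29,17 +29,17 @@ import { RunDetails, RunInfo, RunSummary, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; import {GenkitRunner} from '../codegen/genkit/genkit-runner.js'; import {getEnvironmentByPath} from '../configuration/environment-resolution.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {TextProgressLogger} from '../progress/text-progress-logger.js'; import {logReportHeader} from '../reporting/report-logging.js'; @@ -51,6 +51,7 @@ import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -59,7 +60,8 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; * 1. Makes a request to Gemini to generate code. * 2. Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status.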
@@ -84,7 +86,7 @@ export async function generateCodeAndAssess(options: { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; - a11yRepairAttempts?: number; + testRepairAttempts?: number; }): Promise { const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner); const ratingLlm = await getRunnerByName('genkit'); @@ -179,7 +181,7 @@ export async function generateCodeAndAssess(options: { workerConcurrencyQueue, progress, options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, - options.a11yRepairAttempts ?? 0, + options.testRepairAttempts ?? 0, ), // 10min max per app evaluation. We just want to make sure it never gets stuck. 10, @@ -309,7 +311,7 @@ async function startEvaluationTask( workerConcurrencyQueue: PQueue, progress: ProgressLogger, autoraterModel: string, - a11yRepairAttempts: number, + testRepairAttempts: number, ): Promise { // Set up the project structure once for the root project. const {directory, cleanup} = await setupProjectStructure( @@ -417,7 +419,7 @@ async function startEvaluationTask( // Try to build the files in the root prompt directory. // This will also attempt to fix issues with the generated code. - const attempt = await attemptBuild( + const attempt = await attemptBuildAndTest( evalID, gateway, model, @@ -434,7 +436,7 @@ async function startEvaluationTask( skipAxeTesting, enableAutoCsp, userJourneyAgentTaskInput, - a11yRepairAttempts, + testRepairAttempts, ); if (!attempt) { @@ -455,6 +457,8 @@ async function startEvaluationTask( abortSignal, progress, autoraterModel, + attempt.testResult ?? null, + attempt.testRepairAttempts, ); results.push({ @@ -472,6 +476,8 @@ async function startEvaluationTask( userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, toolLogs, + testResult: attempt.testResult ?? null, + testRepairAttempts: attempt.testRepairAttempts, } satisfies AssessmentResult); } diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts similarity index 95% rename from runner/orchestration/build-repair.ts rename to runner/orchestration/repair.ts index b275fa1..8f6e736 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/repair.ts @@ -1,4 +1,6 @@ +import {Environment} from '../configuration/environment.js'; import PQueue from 'p-queue'; +import {ProgressLogger} from '../progress/progress-logger.js'; import { AttemptDetails, LlmContextFile, @@ -6,12 +8,10 @@ import { LlmResponseFile, RootPromptDefinition, } from '../shared-interfaces.js'; -import {Environment} from '../configuration/environment.js'; -import {repairCodeWithAI} from './codegen.js'; -import {writeResponseFiles} from './file-system.js'; import {runBuild} from './build-worker.js'; -import {ProgressLogger} from '../progress/progress-logger.js'; +import {writeResponseFiles} from './file-system.js'; import {EvalID, Gateway} from './gateway.js'; +import {repairCodeWithAI} from './codegen.js'; /** * Calls the LLM to repair code, handles the response, and attempts to build the project again. @@ -23,12 +23,11 @@ import {EvalID, Gateway} from './gateway.js'; * @param directory The working directory. * @param finalOutputFiles The list of output files to be modified. * @param errorMessage The error message from the failed build. - * @param errorContext Additional context for the error. + * @param errors Additional context for the error. * @param contextFiles A list of context files for the LLM. * @param abortSignal An AbortSignal to cancel the operation. 
* @param workerConcurrencyQueue The queue for managing worker concurrency. * @param attempts The current attempt number. - * @param repairType The type of repair being performed. * @returns A promise that resolves to the new BuildResult. */ export async function repairAndBuild( @@ -39,13 +38,13 @@ export async function repairAndBuild( rootPromptDef: RootPromptDefinition, directory: string, previousAttemptFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, workerConcurrencyQueue: PQueue, attempts: number, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairResponse = await repairCodeWithAI( evalID, @@ -55,11 +54,11 @@ export async function repairAndBuild( rootPromptDef, directory, previousAttemptFiles, - errorMessage, - errorContext, + errors, contextFiles, abortSignal, progress, + repairType, ); return await handleRepairResponse( @@ -77,6 +76,27 @@ export async function repairAndBuild( ); } +/** + * Merges a set of new or updated files from a repair attempt into the + * current set of files. + * @param repairOutputFiles The array of new or updated files to merge. + * @param finalFiles The array of files to be updated. + */ +function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { + // Merge the repair response into the original files. Otherwise we may end up dropping + // files that were valid in the initial response and the LLM decided not to touch, because + // they're still valid. + for (const file of repairOutputFiles) { + const existingFile = finalFiles.find(f => f.filePath === file.filePath); + + if (existingFile) { + existingFile.code = file.code; + } else { + finalFiles.push(file); + } + } +} + /** * Processes an LLM repair response by merging the suggested file changes, * writing them to disk, rebuilding the application, and logging the outcome. @@ -93,7 +113,7 @@ async function handleRepairResponse( abortSignal: AbortSignal, attempts: number, progress: ProgressLogger, -) { +): Promise { if (!repairResponse.success) { progress.log( rootPromptDef, @@ -104,7 +124,6 @@ async function handleRepairResponse( // Stop trying to repair if AI can't suggest a fix (API request fails) throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`); } - // Clone the previous files because `mergeRepairFiles` mutates the attempt files. // We don't want to change files of a previous attempt. const newAttemptFiles = previousAttemptFiles.map(f => ({...f})); @@ -132,24 +151,3 @@ async function handleRepairResponse( attempt: attempts, }; } - -/** - * Merges a set of new or updated files from a repair attempt into the - * current set of files. - * @param repairOutputFiles The array of new or updated files to merge. - * @param finalFiles The array of files to be updated. - */ -function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { - // Merge the repair response into the original files. Otherwise we may end up dropping - // files that were valid in the initial response and the LLM decided not to touch, because - // they're still valid. 
- for (const file of repairOutputFiles) { - const existingFile = finalFiles.find(f => f.filePath === file.filePath); - - if (existingFile) { - existingFile.code = file.code; - } else { - finalFiles.push(file); - } - } -} diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts new file mode 100644 index 0000000..53b4fe6 --- /dev/null +++ b/runner/orchestration/test-worker.ts @@ -0,0 +1,44 @@ +import PQueue from 'p-queue'; +import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js'; +import {ProgressLogger} from '../progress/progress-logger.js'; +import {EvalID, Gateway} from './gateway.js'; +import {Environment} from '../configuration/environment.js'; + +export async function runTest( + evalID: EvalID, + gateway: Gateway, + appDirectoryPath: string, + env: Environment, + rootPromptDef: RootPromptDefinition, + abortSignal: AbortSignal, + workerConcurrencyQueue: PQueue, + progress: ProgressLogger, +): Promise { + progress.log(rootPromptDef, 'test', `Running tests`); + + try { + const result = await gateway.tryTest( + evalID, + env, + appDirectoryPath, + rootPromptDef, + workerConcurrencyQueue, + abortSignal, + progress, + ); + if (result === null) { + return result; + } + + if (result.passed) { + progress.log(rootPromptDef, 'success', 'Tests have passed'); + } else { + progress.log(rootPromptDef, 'error', 'Tests have failed'); + } + + return result; + } catch (err) { + progress.log(rootPromptDef, 'error', `Error when executing tests`, err + ''); + throw err; + } +} diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts index 949cf96..0e68632 100644 --- a/runner/progress/dynamic-progress-logger.ts +++ b/runner/progress/dynamic-progress-logger.ts @@ -148,6 +148,7 @@ export class DynamicProgressLogger implements ProgressLogger { switch (type) { case 'success': case 'serve-testing': + case 'test': case 'build': return chalk.green; case 'error': diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts index c888aba..b029aa6 100644 --- a/runner/progress/progress-logger.ts +++ b/runner/progress/progress-logger.ts @@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js'; import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; /** Possible progress event types. */ -export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval'; +export type ProgressType = + | 'codegen' + | 'build' + | 'test' + | 'serve-testing' + | 'success' + | 'error' + | 'eval'; /** Maps a ProgressType to an icon that can represent it. */ export function progressTypeToIcon(type: ProgressType): string { @@ -12,6 +19,8 @@ export function progressTypeToIcon(type: ProgressType): string { return '๐Ÿค–'; case 'build': return '๐Ÿ”จ'; + case 'test': + return '๐Ÿงช'; case 'serve-testing': return '๐ŸŒŠ'; case 'success': diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts new file mode 100644 index 0000000..2941fd3 --- /dev/null +++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts @@ -0,0 +1,28 @@ +import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js'; + +/** Rating which verifies that unit tests pass successfully. 
*/ +export const successfulTestsRating: PerBuildRating = { + name: 'Tests pass successfully', + description: 'Ensures tests run and pass without errors.', + id: 'common-successful-tests', + kind: RatingKind.PER_BUILD, + category: RatingCategory.MEDIUM_IMPACT, + scoreReduction: '30%', + // Reduce the amount of points in case we've had test repair attempts. + rate: ({testResult, testRepairAttempts}) => { + // If no test results are available, skip this rating + if (!testResult) { + return { + state: RatingState.SKIPPED, + message: 'Unit testing not configured.', + }; + } + + return { + state: RatingState.EXECUTED, + coefficient: testResult.passed + ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts + : 0, // No points if tests failed + }; + }, +}; diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 99d0874..c0500ec 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -8,6 +8,7 @@ import { IndividualAssessmentState, PromptDefinition, AssessmentCategory, + TestExecutionResult, } from '../shared-interfaces.js'; import { RatingState, @@ -56,6 +57,8 @@ export async function rateGeneratedCode( abortSignal: AbortSignal, progress: ProgressLogger, autoraterModel: string, + testResult: TestExecutionResult | null, + testRepairAttempts: number, ): Promise { let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; @@ -93,6 +96,8 @@ export async function rateGeneratedCode( buildResult, serveTestingResult, repairAttempts, + testResult, + testRepairAttempts, outputFiles.length, axeRepairAttempts, ratingsResult, @@ -173,6 +178,8 @@ function runPerBuildRating( buildResult: BuildResult, serveResult: ServeTestingResult | null, repairAttempts: number, + testResult: TestExecutionResult | null, + testRepairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, ratingsResult: RatingsResult, @@ -184,6 +191,8 @@ function runPerBuildRating( generatedFileCount, axeRepairAttempts, ratingsResult, + testResult, + testRepairAttempts, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index fceb104..6dcbf1c 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -5,6 +5,7 @@ import type { LlmResponseFile, PromptDefinition, SkippedIndividualAssessment, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {Environment} from '../configuration/environment.js'; @@ -64,6 +65,8 @@ const perBuildRatingSchema = z buildResult: z.custom(), serveResult: z.custom(), repairAttempts: z.number(), + testResult: z.custom<TestExecutionResult | null>(), + testRepairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), ratingsResult: z.record(z.custom()), diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts index 7d94753..a97e927 100644 --- a/runner/ratings/stats.ts +++ b/runner/ratings/stats.ts @@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag let successfulInitialBuilds = 0; let successfulBuildsAfterRepair = 0; let failedBuilds = 0; + let successfulInitialTests = 0; + let successfulTestsAfterRepair = 0; + let failedTests = 0; + let noTestsRun = 0; let runtimeStats: RuntimeStats | undefined; let accessibilityStats: | { @@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag } } + // Calculate test statistics + if (result.testResult) { + if (result.testResult.passed) { + if ((result.testRepairAttempts || 0) === 0) { + successfulInitialTests++; + } else { + successfulTestsAfterRepair++; + } + } else { + failedTests++; + } + } else { + noTestsRun++; + } if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) { runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0}; if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') { @@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag failedBuilds, errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined, }, + tests: { + successfulInitialTests, + successfulTestsAfterRepair, + failedTests, + noTestsRun, + }, buckets, runtime: runtimeStats ? { diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index a43cb9a..6ca6251 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -221,8 +221,12 @@ export interface AttemptDetails { // Note: May not be set in older reports. reasoning?: string; - /** Whether the build failed during an accessibility repair attempt. */ - buildFailedDuringA11yRepair?: boolean; + /** Whether the build failed during a test repair attempt (a11y or unit). */ + buildFailedDuringTestRepair?: boolean; + /** Result of running tests for this attempt. */ + testResult?: TestExecutionResult; + /** The number of repair attempts made for tests in this attempt. */ + testRepairAttempts?: number; } /** Statistics related to the build process of the generated applications. */ @@ -237,6 +241,18 @@ export interface RunSummaryBuilds { errorDistribution?: Partial>; } +/** Statistics related to the test process of the generated applications. */ +export interface RunSummaryTests { + /** The number of applications that had tests run and all tests passed on the first attempt. */ + successfulInitialTests: number; + /** The number of applications that had tests run and all tests passed after repair attempts. */ + successfulTestsAfterRepair: number; + /** The number of applications that had tests run but tests failed even after repair attempts.
*/ + failedTests: number; + /** The number of applications that did not have tests run (no test command configured). */ + noTestsRun: number; +} + /** Buckets into which scores can be categorized. */ export interface ScoreBucket { /** Plain name of the bucket, e.g. "Good" */ @@ -271,6 +287,8 @@ export interface AggregatedRunStats { buckets: ScoreBucket[]; /** Runtime stats. Not present for reports that didn't request runtime error collection. */ runtime?: RuntimeStats; + /** Test stats. Not present for reports that didn't run tests or older reports. */ + tests?: RunSummaryTests; accessibility?: { appsWithErrors: number; @@ -449,6 +467,10 @@ export interface AssessmentResult { axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ toolLogs?: ToolLogEntry[]; + /** Result of running unit tests. */ + testResult: TestExecutionResult | null; + /** Number of repair attempts for tests. */ + testRepairAttempts?: number; } /** @@ -521,3 +543,9 @@ export interface RunGroup { /** Runner used to generate code for the runs in the group. */ runner?: CodegenRunnerInfo; } + +/** Result of running tests. */ +export interface TestExecutionResult { + passed: boolean; + output: string; +}
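To tie the new pieces together, a standalone worked sketch of how the `common-successful-tests` rating's coefficient (the `1 / (testRepairAttempts + 1)` formula in successful-tests-rating.ts above) scales scores:

```ts
// Standalone sketch of the scoring behavior of `common-successful-tests`.
interface TestExecutionResult {
  passed: boolean;
  output: string;
}

function testRatingCoefficient(result: TestExecutionResult, repairAttempts: number): number {
  return result.passed ? 1 / (repairAttempts + 1) : 0;
}

testRatingCoefficient({passed: true, output: ''}, 0); // 1.0 — passed initially
testRatingCoefficient({passed: true, output: ''}, 1); // 0.5 — passed after one repair
testRatingCoefficient({passed: false, output: 'FAIL'}, 2); // 0 — failed even after repairs
```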