diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index af25e46..85e3390 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -258,18 +258,16 @@

Generated applications

@let initialAttempt = result.attemptDetails[0]; - @let repairAttempt = - result.attemptDetails.length > 1 - ? result.attemptDetails[1] - : null; @let finalAttempt = result.attemptDetails.at(-1)!; @if (finalAttempt.serveTestingResult?.runtimeErrors) { Runtime error } - @if (repairAttempt?.buildResult?.status === 'error') { - Build after repair + @if (finalAttempt?.buildResult?.status === 'error') { + @if (result.attemptDetails.length > 1) { + Build failed + } } @if (initialAttempt?.buildResult?.status === 'error') { @@ -366,15 +364,19 @@

Additional info

? 'Initial response' : `Repair attempt #${$index}` }} - @if (!isBuilt) { - Build - } - @if (hasAxeViolations) { + + Build + + @if (isBuilt) { A11y } diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 00dcd5b..c8a6c9f 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -12,7 +12,10 @@ import { viewChild, } from '@angular/core'; import { NgxJsonViewerModule } from 'ngx-json-viewer'; -import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types'; +import { + BuildErrorType, + BuildResultStatus, +} from '../../../../../runner/workers/builder/builder-types'; import { AssessmentResult, IndividualAssessment, diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 78fdb77..f9be502 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -39,6 +39,7 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; + a11yRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; } @@ -156,6 +157,11 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) + .option('a11y-repair-attempts', { + type: 'number', + default: 0, + description: 'Number of repair attempts for discovered a11y violations', + }) .strict() .version(false) .help() @@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments): Promise { enableAutoCsp: cliArgs.enableAutoCsp, logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, + skipAiSummary: cliArgs.skipAiSummary, + a11yRepairAttempts: cliArgs.a11yRepairAttempts, }); logReportToConsole(runInfo); diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/build-repair.ts index 0a44536..6a53fe7 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/build-repair.ts @@ -38,7 +38,7 @@ export async function repairAndBuild( env: Environment, rootPromptDef: RootPromptDefinition, directory: string, - finalOutputFiles: LlmResponseFile[], + previousAttemptFiles: LlmResponseFile[], errorMessage: string, errorContext: string, contextFiles: LlmContextFile[], @@ -54,7 +54,7 @@ export async function repairAndBuild( env, rootPromptDef, directory, - finalOutputFiles, + previousAttemptFiles, errorMessage, errorContext, contextFiles, @@ -66,7 +66,7 @@ export async function repairAndBuild( evalID, gateway, repairResponse, - finalOutputFiles, + previousAttemptFiles, env, rootPromptDef, directory, @@ -85,7 +85,7 @@ async function handleRepairResponse( evalID: EvalID, gateway: Gateway, repairResponse: LlmResponse, - finalOutputFiles: LlmResponseFile[], + previousAttemptFiles: LlmResponseFile[], env: Environment, rootPromptDef: RootPromptDefinition, directory: string, @@ -106,8 +106,13 @@ async function handleRepairResponse( `Repair request failed: ${repairResponse.errors.join('\n')}` ); } - mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles); - writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name); + + // Clone the previous files because `mergeRepairFiles` mutates the attempt files. + // We don't want to change files of a previous attempt. + const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f })); + + mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles); + writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name); const buildResult = await runBuild( evalID, @@ -120,12 +125,8 @@ async function handleRepairResponse( progress ); - // Capture attempt's full files. Copy because `finalOutputFiles` can be - // mutated in subsequent repair attempts. - const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f })); - return { - outputFiles: attemptFullFiles, + outputFiles: newAttemptFiles, usage: repairResponse.usage, reasoning: repairResponse.reasoning, buildResult, diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index 1bb5ed5..03a6220 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -50,13 +50,9 @@ export async function attemptBuild( skipScreenshots: boolean, skipAxeTesting: boolean, enableAutoCsp: boolean, - userJourneyAgentTaskInput?: BrowserAgentTaskInput + userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined, + maxAxeRepairAttempts: number ) { - // Clone the original files, because we're going to mutate them between repair - // attempts and we don't want the different runs to influence each other. - const finalOutputFiles = initialResponse.files.map((file) => ({ - ...file, - })); const initialBuildResult = await runBuild( evalID, gateway, @@ -104,7 +100,7 @@ export async function attemptBuild( env, rootPromptDef, directory, - finalOutputFiles, + lastAttempt.outputFiles, lastAttempt.buildResult.message, 'There are the following build errors:', contextFiles, @@ -137,13 +133,14 @@ export async function attemptBuild( ); } - // Attempt to repair axe testing. - // This only runs when the last build completed & serving did run. + // Attempt to repair axe testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run axe repair + // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; while ( lastAttempt.serveTestingResult && (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < maxRepairAttempts + axeRepairAttempts < maxAxeRepairAttempts ) { axeRepairAttempts++; progress.log( @@ -167,7 +164,7 @@ export async function attemptBuild( env, rootPromptDef, directory, - finalOutputFiles, + lastAttempt.outputFiles, axeViolationsError, 'There are the following accessibility errors from axe accessibility violations:', contextFiles, @@ -180,8 +177,9 @@ export async function attemptBuild( attemptDetails.push(attempt); lastAttempt = attempt; - // If we somehow introduced build errors via the Axe repair loop, - // then we should abort and let that last attempt have "build errors". + // If we somehow introduced build errors via the Axe repair loop, we abort + // further a11y repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair a11y violations. if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) { break; } @@ -215,7 +213,7 @@ export async function attemptBuild( return { buildResult: lastAttempt.buildResult, serveTestingResult: lastAttempt.serveTestingResult, - outputFiles: finalOutputFiles, + outputFiles: lastAttempt.outputFiles, repairAttempts, axeRepairAttempts, }; diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index e5c2347..dd3012f 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; + a11yRepairAttempts?: number; }): Promise { const env = await getEnvironmentByPath( options.environmentConfigPath, @@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: { !!options.enableAutoCsp, workerConcurrencyQueue, progress, - options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME + options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, + options.a11yRepairAttempts ?? 0 ), // 10min max per app evaluation. We just want to make sure it never gets stuck. 10 @@ -328,7 +330,8 @@ async function startEvaluationTask( enableAutoCsp: boolean, workerConcurrencyQueue: PQueue, progress: ProgressLogger, - autoraterModel: string + autoraterModel: string, + a11yRepairAttempts: number ): Promise { // Set up the project structure once for the root project. const { directory, cleanup } = await setupProjectStructure( @@ -469,7 +472,8 @@ async function startEvaluationTask( skipScreenshots, skipAxeTesting, enableAutoCsp, - userJourneyAgentTaskInput + userJourneyAgentTaskInput, + a11yRepairAttempts ); if (!attempt) {