refactor: do not repair a11y failures by default

devversion · crisbeto · commit d3df340b0cd4 · 2025-09-30T13:36:53.000+02:00
We will stop repairing a11y failures by default because
agents by default don't do this and it would therefore
give a more realistic insight into app quality.

The main motivator triggering this change is that we occasionally see
that a11y repairs re-introduce actual build failures. Those cases are
captured as they can be useful insight, but it would rather confuse
results in the default mode. Hence the change.

The a11y repair attempts can be configured via an opt-in CLI flag.
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -258,18 +258,16 @@ <h2>Generated applications</h2>
 
               <div class="status-badge-group">
                 @let initialAttempt = result.attemptDetails[0];
-                @let repairAttempt =
-                  result.attemptDetails.length > 1
-                    ? result.attemptDetails[1]
-                    : null;
                 @let finalAttempt = result.attemptDetails.at(-1)!;
 
                 @if (finalAttempt.serveTestingResult?.runtimeErrors) {
                   <span class="status-badge error">Runtime error</span>
                 }
 
-                @if (repairAttempt?.buildResult?.status === 'error') {
-                  <span class="status-badge error">Build after repair</span>
+                @if (finalAttempt?.buildResult?.status === 'error') {
+                  @if (result.attemptDetails.length > 1) {
+                    <span class="status-badge error">Build failed</span>
+                  }
                 }
 
                 @if (initialAttempt?.buildResult?.status === 'error') {
@@ -366,15 +364,19 @@ <h4>Additional info</h4>
                           ? 'Initial response'
                           : `Repair attempt #${$index}`
                       }}
-                      @if (!isBuilt) {
-                        <span class="status-badge" [class.error]="!isBuilt"
-                          >Build</span
-                        >
-                      }
-                      @if (hasAxeViolations) {
+
+                      <span
+                        class="status-badge error"
+                        [class.error]="!isBuilt"
+                        [class.success]="isBuilt"
+                        >Build</span
+                      >
+
+                      @if (isBuilt) {
                         <span
                           class="status-badge"
                           [class.error]="hasAxeViolations"
+                          [class.success]="!hasAxeViolations"
                           >A11y</span
                         >
                       }
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts
@@ -12,7 +12,10 @@ import {
   viewChild,
 } from '@angular/core';
 import { NgxJsonViewerModule } from 'ngx-json-viewer';
-import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
+import {
+  BuildErrorType,
+  BuildResultStatus,
+} from '../../../../../runner/workers/builder/builder-types';
 import {
   AssessmentResult,
   IndividualAssessment,
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
@@ -39,6 +39,7 @@ interface Options {
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
   logging?: 'text-only' | 'dynamic';
 }
 
@@ -156,6 +157,11 @@ function builder(argv: Argv): Argv<Options> {
         default: DEFAULT_AUTORATER_MODEL_NAME,
         description: 'Model to use when automatically rating generated code',
       })
+      .option('a11y-repair-attempts', {
+        type: 'number',
+        default: 0,
+        description: 'Number of repair attempts for discovered a11y violations',
+      })
       .strict()
       .version(false)
       .help()
@@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
       enableAutoCsp: cliArgs.enableAutoCsp,
       logging: cliArgs.logging,
       autoraterModel: cliArgs.autoraterModel,
+      skipAiSummary: cliArgs.skipAiSummary,
+      a11yRepairAttempts: cliArgs.a11yRepairAttempts,
     });
 
     logReportToConsole(runInfo);
diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/build-repair.ts
@@ -38,7 +38,7 @@ export async function repairAndBuild(
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   errorMessage: string,
   errorContext: string,
   contextFiles: LlmContextFile[],
@@ -54,7 +54,7 @@ export async function repairAndBuild(
     env,
     rootPromptDef,
     directory,
-    finalOutputFiles,
+    previousAttemptFiles,
     errorMessage,
     errorContext,
     contextFiles,
@@ -66,7 +66,7 @@ export async function repairAndBuild(
     evalID,
     gateway,
     repairResponse,
-    finalOutputFiles,
+    previousAttemptFiles,
     env,
     rootPromptDef,
     directory,
@@ -85,7 +85,7 @@ async function handleRepairResponse(
   evalID: EvalID,
   gateway: Gateway<Environment>,
   repairResponse: LlmResponse,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
@@ -106,8 +106,13 @@ async function handleRepairResponse(
       `Repair request failed: ${repairResponse.errors.join('\n')}`
     );
   }
-  mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
-  writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);
+
+  // Clone the previous files because `mergeRepairFiles` mutates the attempt files.
+  // We don't want to change files of a previous attempt.
+  const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));
+
+  mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
+  writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
 
   const buildResult = await runBuild(
     evalID,
@@ -120,12 +125,8 @@ async function handleRepairResponse(
     progress
   );
 
-  // Capture attempt's full files. Copy because `finalOutputFiles` can be
-  // mutated in subsequent repair attempts.
-  const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));
-
   return {
-    outputFiles: attemptFullFiles,
+    outputFiles: newAttemptFiles,
     usage: repairResponse.usage,
     reasoning: repairResponse.reasoning,
     buildResult,
diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts
@@ -50,13 +50,9 @@ export async function attemptBuild(
   skipScreenshots: boolean,
   skipAxeTesting: boolean,
   enableAutoCsp: boolean,
-  userJourneyAgentTaskInput?: BrowserAgentTaskInput
+  userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+  maxAxeRepairAttempts: number
 ) {
-  // Clone the original files, because we're going to mutate them between repair
-  // attempts and we don't want the different runs to influence each other.
-  const finalOutputFiles = initialResponse.files.map((file) => ({
-    ...file,
-  }));
   const initialBuildResult = await runBuild(
     evalID,
     gateway,
@@ -104,7 +100,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       lastAttempt.buildResult.message,
       'There are the following build errors:',
       contextFiles,
@@ -137,13 +133,14 @@ export async function attemptBuild(
     );
   }
 
-  // Attempt to repair axe testing.
-  // This only runs when the last build completed & serving did run.
+  // Attempt to repair axe testing. This only runs when the last build
+  // passed and serving did run. Note: By default, we don't run axe repair
+  // attempts as it's not commonly done by LLMs in the ecosystem.
   let axeRepairAttempts = 0;
   while (
     lastAttempt.serveTestingResult &&
     (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
-    axeRepairAttempts < maxRepairAttempts
+    axeRepairAttempts < maxAxeRepairAttempts
   ) {
     axeRepairAttempts++;
     progress.log(
@@ -167,7 +164,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       axeViolationsError,
       'There are the following accessibility errors from axe accessibility violations:',
       contextFiles,
@@ -180,8 +177,9 @@ export async function attemptBuild(
     attemptDetails.push(attempt);
     lastAttempt = attempt;
 
-    // If we somehow introduced build errors via the Axe repair loop,
-    // then we should abort and let that last attempt have "build errors".
+    // If we somehow introduced build errors via the Axe repair loop, we abort
+    // further a11y repairs and capture the failed build. This is useful insight
+    // as LLMs seem to regress when asked to repair a11y violations.
     if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
       break;
     }
@@ -215,7 +213,7 @@ export async function attemptBuild(
   return {
     buildResult: lastAttempt.buildResult,
     serveTestingResult: lastAttempt.serveTestingResult,
-    outputFiles: finalOutputFiles,
+    outputFiles: lastAttempt.outputFiles,
     repairAttempts,
     axeRepairAttempts,
   };
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
@@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
   enableAutoCsp?: boolean;
   logging?: 'text-only' | 'dynamic';
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
 }): Promise<RunInfo> {
   const env = await getEnvironmentByPath(
     options.environmentConfigPath,
@@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
                     !!options.enableAutoCsp,
                     workerConcurrencyQueue,
                     progress,
-                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
+                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
+                    options.a11yRepairAttempts ?? 0
                   ),
                 // 10min max per app evaluation.  We just want to make sure it never gets stuck.
                 10
@@ -328,7 +330,8 @@ async function startEvaluationTask(
   enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  autoraterModel: string
+  autoraterModel: string,
+  a11yRepairAttempts: number
 ): Promise<AssessmentResult[]> {
   // Set up the project structure once for the root project.
   const { directory, cleanup } = await setupProjectStructure(
@@ -469,7 +472,8 @@ async function startEvaluationTask(
       skipScreenshots,
       skipAxeTesting,
       enableAutoCsp,
-      userJourneyAgentTaskInput
+      userJourneyAgentTaskInput,
+      a11yRepairAttempts
     );
 
     if (!attempt) {