From af7657ad1485e04933c5a6bf9ace13ee4a8fb4d0 Mon Sep 17 00:00:00 2001
From: Paul Gschwendtner <paulgschwendtner@gmail.com>
Date: Tue, 30 Sep 2025 10:18:55 +0000
Subject: [PATCH] refactor: do not repair a11y failures by default

We will stop repairing a11y failures by default because
agents by default don't do this and it would therefore
give a more realistic insight into app quality.

The main motivator triggering this change is that we occasionally see
that a11y repairs re-introduce actual build failures. Those cases are
captured as they can be useful insight, but it would rather confuse
results in the default mode. Hence the change.

The a11y repair attempts can be configured via an opt-in CLI flag.
---
 .../pages/report-viewer/report-viewer.html    | 26 ++++++++++---------
 .../app/pages/report-viewer/report-viewer.ts  |  5 +++-
 runner/eval-cli.ts                            |  8 ++++++
 runner/orchestration/build-repair.ts          | 23 ++++++++--------
 runner/orchestration/build-serve-loop.ts      | 26 +++++++++----------
 runner/orchestration/generate.ts              | 10 ++++---
 6 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index af25e46..85e3390 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -258,18 +258,16 @@ <h2>Generated applications</h2>
 
               <div class="status-badge-group">
                 @let initialAttempt = result.attemptDetails[0];
-                @let repairAttempt =
-                  result.attemptDetails.length > 1
-                    ? result.attemptDetails[1]
-                    : null;
                 @let finalAttempt = result.attemptDetails.at(-1)!;
 
                 @if (finalAttempt.serveTestingResult?.runtimeErrors) {
                   <span class="status-badge error">Runtime error</span>
                 }
 
-                @if (repairAttempt?.buildResult?.status === 'error') {
-                  <span class="status-badge error">Build after repair</span>
+                @if (finalAttempt?.buildResult?.status === 'error') {
+                  @if (result.attemptDetails.length > 1) {
+                    <span class="status-badge error">Build failed</span>
+                  }
                 }
 
                 @if (initialAttempt?.buildResult?.status === 'error') {
@@ -366,15 +364,19 @@ <h4>Additional info</h4>
                           ? 'Initial response'
                           : `Repair attempt #${$index}`
                       }}
-                      @if (!isBuilt) {
-                        <span class="status-badge" [class.error]="!isBuilt"
-                          >Build</span
-                        >
-                      }
-                      @if (hasAxeViolations) {
+
+                      <span
+                        class="status-badge error"
+                        [class.error]="!isBuilt"
+                        [class.success]="isBuilt"
+                        >Build</span
+                      >
+
+                      @if (isBuilt) {
                         <span
                           class="status-badge"
                           [class.error]="hasAxeViolations"
+                          [class.success]="!hasAxeViolations"
                           >A11y</span
                         >
                       }
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts
index 00dcd5b..c8a6c9f 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.ts
+++ b/report-app/src/app/pages/report-viewer/report-viewer.ts
@@ -12,7 +12,10 @@ import {
   viewChild,
 } from '@angular/core';
 import { NgxJsonViewerModule } from 'ngx-json-viewer';
-import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
+import {
+  BuildErrorType,
+  BuildResultStatus,
+} from '../../../../../runner/workers/builder/builder-types';
 import {
   AssessmentResult,
   IndividualAssessment,
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
index 78fdb77..f9be502 100644
--- a/runner/eval-cli.ts
+++ b/runner/eval-cli.ts
@@ -39,6 +39,7 @@ interface Options {
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
   logging?: 'text-only' | 'dynamic';
 }
 
@@ -156,6 +157,11 @@ function builder(argv: Argv): Argv<Options> {
         default: DEFAULT_AUTORATER_MODEL_NAME,
         description: 'Model to use when automatically rating generated code',
       })
+      .option('a11y-repair-attempts', {
+        type: 'number',
+        default: 0,
+        description: 'Number of repair attempts for discovered a11y violations',
+      })
       .strict()
       .version(false)
       .help()
@@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
       enableAutoCsp: cliArgs.enableAutoCsp,
       logging: cliArgs.logging,
       autoraterModel: cliArgs.autoraterModel,
+      skipAiSummary: cliArgs.skipAiSummary,
+      a11yRepairAttempts: cliArgs.a11yRepairAttempts,
     });
 
     logReportToConsole(runInfo);
diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/build-repair.ts
index 0a44536..6a53fe7 100644
--- a/runner/orchestration/build-repair.ts
+++ b/runner/orchestration/build-repair.ts
@@ -38,7 +38,7 @@ export async function repairAndBuild(
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   errorMessage: string,
   errorContext: string,
   contextFiles: LlmContextFile[],
@@ -54,7 +54,7 @@ export async function repairAndBuild(
     env,
     rootPromptDef,
     directory,
-    finalOutputFiles,
+    previousAttemptFiles,
     errorMessage,
     errorContext,
     contextFiles,
@@ -66,7 +66,7 @@ export async function repairAndBuild(
     evalID,
     gateway,
     repairResponse,
-    finalOutputFiles,
+    previousAttemptFiles,
     env,
     rootPromptDef,
     directory,
@@ -85,7 +85,7 @@ async function handleRepairResponse(
   evalID: EvalID,
   gateway: Gateway<Environment>,
   repairResponse: LlmResponse,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
@@ -106,8 +106,13 @@ async function handleRepairResponse(
       `Repair request failed: ${repairResponse.errors.join('\n')}`
     );
   }
-  mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
-  writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);
+
+  // Clone the previous files because `mergeRepairFiles` mutates the attempt files.
+  // We don't want to change files of a previous attempt.
+  const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));
+
+  mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
+  writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
 
   const buildResult = await runBuild(
     evalID,
@@ -120,12 +125,8 @@ async function handleRepairResponse(
     progress
   );
 
-  // Capture attempt's full files. Copy because `finalOutputFiles` can be
-  // mutated in subsequent repair attempts.
-  const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));
-
   return {
-    outputFiles: attemptFullFiles,
+    outputFiles: newAttemptFiles,
     usage: repairResponse.usage,
     reasoning: repairResponse.reasoning,
     buildResult,
diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts
index 1bb5ed5..03a6220 100644
--- a/runner/orchestration/build-serve-loop.ts
+++ b/runner/orchestration/build-serve-loop.ts
@@ -50,13 +50,9 @@ export async function attemptBuild(
   skipScreenshots: boolean,
   skipAxeTesting: boolean,
   enableAutoCsp: boolean,
-  userJourneyAgentTaskInput?: BrowserAgentTaskInput
+  userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+  maxAxeRepairAttempts: number
 ) {
-  // Clone the original files, because we're going to mutate them between repair
-  // attempts and we don't want the different runs to influence each other.
-  const finalOutputFiles = initialResponse.files.map((file) => ({
-    ...file,
-  }));
   const initialBuildResult = await runBuild(
     evalID,
     gateway,
@@ -104,7 +100,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       lastAttempt.buildResult.message,
       'There are the following build errors:',
       contextFiles,
@@ -137,13 +133,14 @@ export async function attemptBuild(
     );
   }
 
-  // Attempt to repair axe testing.
-  // This only runs when the last build completed & serving did run.
+  // Attempt to repair axe testing. This only runs when the last build
+  // passed and serving did run. Note: By default, we don't run axe repair
+  // attempts as it's not commonly done by LLMs in the ecosystem.
   let axeRepairAttempts = 0;
   while (
     lastAttempt.serveTestingResult &&
     (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
-    axeRepairAttempts < maxRepairAttempts
+    axeRepairAttempts < maxAxeRepairAttempts
   ) {
     axeRepairAttempts++;
     progress.log(
@@ -167,7 +164,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       axeViolationsError,
       'There are the following accessibility errors from axe accessibility violations:',
       contextFiles,
@@ -180,8 +177,9 @@ export async function attemptBuild(
     attemptDetails.push(attempt);
     lastAttempt = attempt;
 
-    // If we somehow introduced build errors via the Axe repair loop,
-    // then we should abort and let that last attempt have "build errors".
+    // If we somehow introduced build errors via the Axe repair loop, we abort
+    // further a11y repairs and capture the failed build. This is useful insight
+    // as LLMs seem to regress when asked to repair a11y violations.
     if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
       break;
     }
@@ -215,7 +213,7 @@ export async function attemptBuild(
   return {
     buildResult: lastAttempt.buildResult,
     serveTestingResult: lastAttempt.serveTestingResult,
-    outputFiles: finalOutputFiles,
+    outputFiles: lastAttempt.outputFiles,
     repairAttempts,
     axeRepairAttempts,
   };
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index e5c2347..dd3012f 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
   enableAutoCsp?: boolean;
   logging?: 'text-only' | 'dynamic';
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
 }): Promise<RunInfo> {
   const env = await getEnvironmentByPath(
     options.environmentConfigPath,
@@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
                     !!options.enableAutoCsp,
                     workerConcurrencyQueue,
                     progress,
-                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
+                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
+                    options.a11yRepairAttempts ?? 0
                   ),
                 // 10min max per app evaluation.  We just want to make sure it never gets stuck.
                 10
@@ -328,7 +330,8 @@ async function startEvaluationTask(
   enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  autoraterModel: string
+  autoraterModel: string,
+  a11yRepairAttempts: number
 ): Promise<AssessmentResult[]> {
   // Set up the project structure once for the root project.
   const { directory, cleanup } = await setupProjectStructure(
@@ -469,7 +472,8 @@ async function startEvaluationTask(
       skipScreenshots,
       skipAxeTesting,
       enableAutoCsp,
-      userJourneyAgentTaskInput
+      userJourneyAgentTaskInput,
+      a11yRepairAttempts
     );
 
     if (!attempt) {