angular · crisbeto · Sep 30, 2025 · Sep 30, 2025
@@ -258,18 +258,16 @@ <h2>Generated applications</h2>
 
               <div class="status-badge-group">
                 @let initialAttempt = result.attemptDetails[0];
-                @let repairAttempt =
-                  result.attemptDetails.length > 1
-                    ? result.attemptDetails[1]
-                    : null;
                 @let finalAttempt = result.attemptDetails.at(-1)!;
 
                 @if (finalAttempt.serveTestingResult?.runtimeErrors) {
                   <span class="status-badge error">Runtime error</span>
                 }
 
-                @if (repairAttempt?.buildResult?.status === 'error') {
-                  <span class="status-badge error">Build after repair</span>
+                @if (finalAttempt?.buildResult?.status === 'error') {
+                  @if (result.attemptDetails.length > 1) {
+                    <span class="status-badge error">Build failed</span>
+                  }
                 }
 
                 @if (initialAttempt?.buildResult?.status === 'error') {
@@ -366,15 +364,19 @@ <h4>Additional info</h4>
                           ? 'Initial response'
                           : `Repair attempt #${$index}`
                       }}
-                      @if (!isBuilt) {
-                        <span class="status-badge" [class.error]="!isBuilt"
-                          >Build</span
-                        >
-                      }
-                      @if (hasAxeViolations) {
+
+                      <span
+                        class="status-badge error"
+                        [class.error]="!isBuilt"
+                        [class.success]="isBuilt"
+                        >Build</span
+                      >
+
+                      @if (isBuilt) {
                         <span
                           class="status-badge"
                           [class.error]="hasAxeViolations"
+                          [class.success]="!hasAxeViolations"
                           >A11y</span
                         >
                       }

@@ -12,7 +12,10 @@ import {
   viewChild,
 } from '@angular/core';
 import { NgxJsonViewerModule } from 'ngx-json-viewer';
-import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
+import {
+  BuildErrorType,
+  BuildResultStatus,
+} from '../../../../../runner/workers/builder/builder-types';
 import {
   AssessmentResult,
   IndividualAssessment,

@@ -39,6 +39,7 @@ interface Options {
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
   logging?: 'text-only' | 'dynamic';
 }
 
@@ -156,6 +157,11 @@ function builder(argv: Argv): Argv<Options> {
         default: DEFAULT_AUTORATER_MODEL_NAME,
         description: 'Model to use when automatically rating generated code',
       })
+      .option('a11y-repair-attempts', {
+        type: 'number',
+        default: 0,
+        description: 'Number of repair attempts for discovered a11y violations',
+      })
       .strict()
       .version(false)
       .help()
@@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
       enableAutoCsp: cliArgs.enableAutoCsp,
       logging: cliArgs.logging,
       autoraterModel: cliArgs.autoraterModel,
+      skipAiSummary: cliArgs.skipAiSummary,
+      a11yRepairAttempts: cliArgs.a11yRepairAttempts,
     });
 
     logReportToConsole(runInfo);

@@ -38,7 +38,7 @@ export async function repairAndBuild(
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   errorMessage: string,
   errorContext: string,
   contextFiles: LlmContextFile[],
@@ -54,7 +54,7 @@ export async function repairAndBuild(
     env,
     rootPromptDef,
     directory,
-    finalOutputFiles,
+    previousAttemptFiles,
     errorMessage,
     errorContext,
     contextFiles,
@@ -66,7 +66,7 @@ export async function repairAndBuild(
     evalID,
     gateway,
     repairResponse,
-    finalOutputFiles,
+    previousAttemptFiles,
     env,
     rootPromptDef,
     directory,
@@ -85,7 +85,7 @@ async function handleRepairResponse(
   evalID: EvalID,
   gateway: Gateway<Environment>,
   repairResponse: LlmResponse,
-  finalOutputFiles: LlmResponseFile[],
+  previousAttemptFiles: LlmResponseFile[],
   env: Environment,
   rootPromptDef: RootPromptDefinition,
   directory: string,
@@ -106,8 +106,13 @@ async function handleRepairResponse(
       `Repair request failed: ${repairResponse.errors.join('\n')}`
     );
   }
-  mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
-  writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);
+
+  // Clone the previous files because `mergeRepairFiles` mutates the attempt files.
+  // We don't want to change files of a previous attempt.
+  const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));
+
+  mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
+  writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
 
   const buildResult = await runBuild(
     evalID,
@@ -120,12 +125,8 @@ async function handleRepairResponse(
     progress
   );
 
-  // Capture attempt's full files. Copy because `finalOutputFiles` can be
-  // mutated in subsequent repair attempts.
-  const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));
-
   return {
-    outputFiles: attemptFullFiles,
+    outputFiles: newAttemptFiles,
     usage: repairResponse.usage,
     reasoning: repairResponse.reasoning,
     buildResult,

@@ -50,13 +50,9 @@ export async function attemptBuild(
   skipScreenshots: boolean,
   skipAxeTesting: boolean,
   enableAutoCsp: boolean,
-  userJourneyAgentTaskInput?: BrowserAgentTaskInput
+  userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+  maxAxeRepairAttempts: number
 ) {
-  // Clone the original files, because we're going to mutate them between repair
-  // attempts and we don't want the different runs to influence each other.
-  const finalOutputFiles = initialResponse.files.map((file) => ({
-    ...file,
-  }));
   const initialBuildResult = await runBuild(
     evalID,
     gateway,
@@ -104,7 +100,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       lastAttempt.buildResult.message,
       'There are the following build errors:',
       contextFiles,
@@ -137,13 +133,14 @@ export async function attemptBuild(
     );
   }
 
-  // Attempt to repair axe testing.
-  // This only runs when the last build completed & serving did run.
+  // Attempt to repair axe testing. This only runs when the last build
+  // passed and serving did run. Note: By default, we don't run axe repair
+  // attempts as it's not commonly done by LLMs in the ecosystem.
   let axeRepairAttempts = 0;
   while (
     lastAttempt.serveTestingResult &&
     (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
-    axeRepairAttempts < maxRepairAttempts
+    axeRepairAttempts < maxAxeRepairAttempts
   ) {
     axeRepairAttempts++;
     progress.log(
@@ -167,7 +164,7 @@ export async function attemptBuild(
       env,
       rootPromptDef,
       directory,
-      finalOutputFiles,
+      lastAttempt.outputFiles,
       axeViolationsError,
       'There are the following accessibility errors from axe accessibility violations:',
       contextFiles,
@@ -180,8 +177,9 @@ export async function attemptBuild(
     attemptDetails.push(attempt);
     lastAttempt = attempt;
 
-    // If we somehow introduced build errors via the Axe repair loop,
-    // then we should abort and let that last attempt have "build errors".
+    // If we somehow introduced build errors via the Axe repair loop, we abort
+    // further a11y repairs and capture the failed build. This is useful insight
+    // as LLMs seem to regress when asked to repair a11y violations.
     if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
       break;
     }
@@ -215,7 +213,7 @@ export async function attemptBuild(
   return {
     buildResult: lastAttempt.buildResult,
     serveTestingResult: lastAttempt.serveTestingResult,
-    outputFiles: finalOutputFiles,
+    outputFiles: lastAttempt.outputFiles,
     repairAttempts,
     axeRepairAttempts,
   };

@@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
   enableAutoCsp?: boolean;
   logging?: 'text-only' | 'dynamic';
   autoraterModel?: string;
+  a11yRepairAttempts?: number;
 }): Promise<RunInfo> {
   const env = await getEnvironmentByPath(
     options.environmentConfigPath,
@@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
                     !!options.enableAutoCsp,
                     workerConcurrencyQueue,
                     progress,
-                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
+                    options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
+                    options.a11yRepairAttempts ?? 0
                   ),
                 // 10min max per app evaluation.  We just want to make sure it never gets stuck.
                 10
@@ -328,7 +330,8 @@ async function startEvaluationTask(
   enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  autoraterModel: string
+  autoraterModel: string,
+  a11yRepairAttempts: number
 ): Promise<AssessmentResult[]> {
   // Set up the project structure once for the root project.
   const { directory, cleanup } = await setupProjectStructure(
@@ -469,7 +472,8 @@ async function startEvaluationTask(
       skipScreenshots,
       skipAxeTesting,
       enableAutoCsp,
-      userJourneyAgentTaskInput
+      userJourneyAgentTaskInput,
+      a11yRepairAttempts
     );
 
     if (!attempt) {