Skip to content

Commit d3df340

Browse files
devversioncrisbeto
authored andcommitted
refactor: do not repair a11y failures by default
We will stop repairing a11y failures by default because agents by default don't do this and it would therefore give a more realistic insight into app quality. The main motivator triggering this change is that we occasionally see that a11y repairs re-introduce actual build failures. Those cases are captured as they can be useful insight, but it would rather confuse results in the default mode. Hence the change. The a11y repair attempts can be configured via an opt-in CLI flag.
1 parent 8ec4883 commit d3df340

File tree

6 files changed

+57
-41
lines changed

6 files changed

+57
-41
lines changed

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -258,18 +258,16 @@ <h2>Generated applications</h2>
258258

259259
<div class="status-badge-group">
260260
@let initialAttempt = result.attemptDetails[0];
261-
@let repairAttempt =
262-
result.attemptDetails.length > 1
263-
? result.attemptDetails[1]
264-
: null;
265261
@let finalAttempt = result.attemptDetails.at(-1)!;
266262

267263
@if (finalAttempt.serveTestingResult?.runtimeErrors) {
268264
<span class="status-badge error">Runtime error</span>
269265
}
270266

271-
@if (repairAttempt?.buildResult?.status === 'error') {
272-
<span class="status-badge error">Build after repair</span>
267+
@if (finalAttempt?.buildResult?.status === 'error') {
268+
@if (result.attemptDetails.length > 1) {
269+
<span class="status-badge error">Build failed</span>
270+
}
273271
}
274272

275273
@if (initialAttempt?.buildResult?.status === 'error') {
@@ -366,15 +364,19 @@ <h4>Additional info</h4>
366364
? 'Initial response'
367365
: `Repair attempt #${$index}`
368366
}}
369-
@if (!isBuilt) {
370-
<span class="status-badge" [class.error]="!isBuilt"
371-
>Build</span
372-
>
373-
}
374-
@if (hasAxeViolations) {
367+
368+
<span
369+
class="status-badge error"
370+
[class.error]="!isBuilt"
371+
[class.success]="isBuilt"
372+
>Build</span
373+
>
374+
375+
@if (isBuilt) {
375376
<span
376377
class="status-badge"
377378
[class.error]="hasAxeViolations"
379+
[class.success]="!hasAxeViolations"
378380
>A11y</span
379381
>
380382
}

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ import {
1212
viewChild,
1313
} from '@angular/core';
1414
import { NgxJsonViewerModule } from 'ngx-json-viewer';
15-
import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
15+
import {
16+
BuildErrorType,
17+
BuildResultStatus,
18+
} from '../../../../../runner/workers/builder/builder-types';
1619
import {
1720
AssessmentResult,
1821
IndividualAssessment,

runner/eval-cli.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ interface Options {
3939
enableUserJourneyTesting?: boolean;
4040
enableAutoCsp?: boolean;
4141
autoraterModel?: string;
42+
a11yRepairAttempts?: number;
4243
logging?: 'text-only' | 'dynamic';
4344
}
4445

@@ -156,6 +157,11 @@ function builder(argv: Argv): Argv<Options> {
156157
default: DEFAULT_AUTORATER_MODEL_NAME,
157158
description: 'Model to use when automatically rating generated code',
158159
})
160+
.option('a11y-repair-attempts', {
161+
type: 'number',
162+
default: 0,
163+
description: 'Number of repair attempts for discovered a11y violations',
164+
})
159165
.strict()
160166
.version(false)
161167
.help()
@@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
199205
enableAutoCsp: cliArgs.enableAutoCsp,
200206
logging: cliArgs.logging,
201207
autoraterModel: cliArgs.autoraterModel,
208+
skipAiSummary: cliArgs.skipAiSummary,
209+
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
202210
});
203211

204212
logReportToConsole(runInfo);

runner/orchestration/build-repair.ts

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ export async function repairAndBuild(
3838
env: Environment,
3939
rootPromptDef: RootPromptDefinition,
4040
directory: string,
41-
finalOutputFiles: LlmResponseFile[],
41+
previousAttemptFiles: LlmResponseFile[],
4242
errorMessage: string,
4343
errorContext: string,
4444
contextFiles: LlmContextFile[],
@@ -54,7 +54,7 @@ export async function repairAndBuild(
5454
env,
5555
rootPromptDef,
5656
directory,
57-
finalOutputFiles,
57+
previousAttemptFiles,
5858
errorMessage,
5959
errorContext,
6060
contextFiles,
@@ -66,7 +66,7 @@ export async function repairAndBuild(
6666
evalID,
6767
gateway,
6868
repairResponse,
69-
finalOutputFiles,
69+
previousAttemptFiles,
7070
env,
7171
rootPromptDef,
7272
directory,
@@ -85,7 +85,7 @@ async function handleRepairResponse(
8585
evalID: EvalID,
8686
gateway: Gateway<Environment>,
8787
repairResponse: LlmResponse,
88-
finalOutputFiles: LlmResponseFile[],
88+
previousAttemptFiles: LlmResponseFile[],
8989
env: Environment,
9090
rootPromptDef: RootPromptDefinition,
9191
directory: string,
@@ -106,8 +106,13 @@ async function handleRepairResponse(
106106
`Repair request failed: ${repairResponse.errors.join('\n')}`
107107
);
108108
}
109-
mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
110-
writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);
109+
110+
// Clone the previous files because `mergeRepairFiles` mutates the attempt files.
111+
// We don't want to change files of a previous attempt.
112+
const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));
113+
114+
mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
115+
writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
111116

112117
const buildResult = await runBuild(
113118
evalID,
@@ -120,12 +125,8 @@ async function handleRepairResponse(
120125
progress
121126
);
122127

123-
// Capture attempt's full files. Copy because `finalOutputFiles` can be
124-
// mutated in subsequent repair attempts.
125-
const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));
126-
127128
return {
128-
outputFiles: attemptFullFiles,
129+
outputFiles: newAttemptFiles,
129130
usage: repairResponse.usage,
130131
reasoning: repairResponse.reasoning,
131132
buildResult,

runner/orchestration/build-serve-loop.ts

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,9 @@ export async function attemptBuild(
5050
skipScreenshots: boolean,
5151
skipAxeTesting: boolean,
5252
enableAutoCsp: boolean,
53-
userJourneyAgentTaskInput?: BrowserAgentTaskInput
53+
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
54+
maxAxeRepairAttempts: number
5455
) {
55-
// Clone the original files, because we're going to mutate them between repair
56-
// attempts and we don't want the different runs to influence each other.
57-
const finalOutputFiles = initialResponse.files.map((file) => ({
58-
...file,
59-
}));
6056
const initialBuildResult = await runBuild(
6157
evalID,
6258
gateway,
@@ -104,7 +100,7 @@ export async function attemptBuild(
104100
env,
105101
rootPromptDef,
106102
directory,
107-
finalOutputFiles,
103+
lastAttempt.outputFiles,
108104
lastAttempt.buildResult.message,
109105
'There are the following build errors:',
110106
contextFiles,
@@ -137,13 +133,14 @@ export async function attemptBuild(
137133
);
138134
}
139135

140-
// Attempt to repair axe testing.
141-
// This only runs when the last build completed & serving did run.
136+
// Attempt to repair axe testing. This only runs when the last build
137+
// passed and serving did run. Note: By default, we don't run axe repair
138+
// attempts as it's not commonly done by LLMs in the ecosystem.
142139
let axeRepairAttempts = 0;
143140
while (
144141
lastAttempt.serveTestingResult &&
145142
(lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
146-
axeRepairAttempts < maxRepairAttempts
143+
axeRepairAttempts < maxAxeRepairAttempts
147144
) {
148145
axeRepairAttempts++;
149146
progress.log(
@@ -167,7 +164,7 @@ export async function attemptBuild(
167164
env,
168165
rootPromptDef,
169166
directory,
170-
finalOutputFiles,
167+
lastAttempt.outputFiles,
171168
axeViolationsError,
172169
'There are the following accessibility errors from axe accessibility violations:',
173170
contextFiles,
@@ -180,8 +177,9 @@ export async function attemptBuild(
180177
attemptDetails.push(attempt);
181178
lastAttempt = attempt;
182179

183-
// If we somehow introduced build errors via the Axe repair loop,
184-
// then we should abort and let that last attempt have "build errors".
180+
// If we somehow introduced build errors via the Axe repair loop, we abort
181+
// further a11y repairs and capture the failed build. This is useful insight
182+
// as LLMs seem to regress when asked to repair a11y violations.
185183
if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
186184
break;
187185
}
@@ -215,7 +213,7 @@ export async function attemptBuild(
215213
return {
216214
buildResult: lastAttempt.buildResult,
217215
serveTestingResult: lastAttempt.serveTestingResult,
218-
outputFiles: finalOutputFiles,
216+
outputFiles: lastAttempt.outputFiles,
219217
repairAttempts,
220218
axeRepairAttempts,
221219
};

runner/orchestration/generate.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
8888
enableAutoCsp?: boolean;
8989
logging?: 'text-only' | 'dynamic';
9090
autoraterModel?: string;
91+
a11yRepairAttempts?: number;
9192
}): Promise<RunInfo> {
9293
const env = await getEnvironmentByPath(
9394
options.environmentConfigPath,
@@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
190191
!!options.enableAutoCsp,
191192
workerConcurrencyQueue,
192193
progress,
193-
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
194+
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
195+
options.a11yRepairAttempts ?? 0
194196
),
195197
// 10min max per app evaluation. We just want to make sure it never gets stuck.
196198
10
@@ -328,7 +330,8 @@ async function startEvaluationTask(
328330
enableAutoCsp: boolean,
329331
workerConcurrencyQueue: PQueue,
330332
progress: ProgressLogger,
331-
autoraterModel: string
333+
autoraterModel: string,
334+
a11yRepairAttempts: number
332335
): Promise<AssessmentResult[]> {
333336
// Set up the project structure once for the root project.
334337
const { directory, cleanup } = await setupProjectStructure(
@@ -469,7 +472,8 @@ async function startEvaluationTask(
469472
skipScreenshots,
470473
skipAxeTesting,
471474
enableAutoCsp,
472-
userJourneyAgentTaskInput
475+
userJourneyAgentTaskInput,
476+
a11yRepairAttempts
473477
);
474478

475479
if (!attempt) {

0 commit comments

Comments
 (0)