diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index af25e46..85e3390 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -258,18 +258,16 @@
@let initialAttempt = result.attemptDetails[0];
- @let repairAttempt =
- result.attemptDetails.length > 1
- ? result.attemptDetails[1]
- : null;
@let finalAttempt = result.attemptDetails.at(-1)!;
@if (finalAttempt.serveTestingResult?.runtimeErrors) {
Runtime error
}
- @if (repairAttempt?.buildResult?.status === 'error') {
-
Build after repair
+ @if (finalAttempt?.buildResult?.status === 'error') {
+ @if (result.attemptDetails.length > 1) {
+
Build failed
+ }
}
@if (initialAttempt?.buildResult?.status === 'error') {
@@ -366,15 +364,19 @@
Additional info
? 'Initial response'
: `Repair attempt #${$index}`
}}
- @if (!isBuilt) {
-
Build
- }
- @if (hasAxeViolations) {
+
+
Build
+
+ @if (isBuilt) {
A11y
}
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts
index 00dcd5b..c8a6c9f 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.ts
+++ b/report-app/src/app/pages/report-viewer/report-viewer.ts
@@ -12,7 +12,10 @@ import {
viewChild,
} from '@angular/core';
import { NgxJsonViewerModule } from 'ngx-json-viewer';
-import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
+import {
+ BuildErrorType,
+ BuildResultStatus,
+} from '../../../../../runner/workers/builder/builder-types';
import {
AssessmentResult,
IndividualAssessment,
diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts
index 78fdb77..f9be502 100644
--- a/runner/eval-cli.ts
+++ b/runner/eval-cli.ts
@@ -39,6 +39,7 @@ interface Options {
enableUserJourneyTesting?: boolean;
enableAutoCsp?: boolean;
autoraterModel?: string;
+ a11yRepairAttempts?: number;
logging?: 'text-only' | 'dynamic';
}
@@ -156,6 +157,11 @@ function builder(argv: Argv): Argv
{
default: DEFAULT_AUTORATER_MODEL_NAME,
description: 'Model to use when automatically rating generated code',
})
+ .option('a11y-repair-attempts', {
+ type: 'number',
+ default: 0,
+ description: 'Number of repair attempts for discovered a11y violations',
+ })
.strict()
.version(false)
.help()
@@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments): Promise {
enableAutoCsp: cliArgs.enableAutoCsp,
logging: cliArgs.logging,
autoraterModel: cliArgs.autoraterModel,
+ skipAiSummary: cliArgs.skipAiSummary,
+ a11yRepairAttempts: cliArgs.a11yRepairAttempts,
});
logReportToConsole(runInfo);
diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/build-repair.ts
index 0a44536..6a53fe7 100644
--- a/runner/orchestration/build-repair.ts
+++ b/runner/orchestration/build-repair.ts
@@ -38,7 +38,7 @@ export async function repairAndBuild(
env: Environment,
rootPromptDef: RootPromptDefinition,
directory: string,
- finalOutputFiles: LlmResponseFile[],
+ previousAttemptFiles: LlmResponseFile[],
errorMessage: string,
errorContext: string,
contextFiles: LlmContextFile[],
@@ -54,7 +54,7 @@ export async function repairAndBuild(
env,
rootPromptDef,
directory,
- finalOutputFiles,
+ previousAttemptFiles,
errorMessage,
errorContext,
contextFiles,
@@ -66,7 +66,7 @@ export async function repairAndBuild(
evalID,
gateway,
repairResponse,
- finalOutputFiles,
+ previousAttemptFiles,
env,
rootPromptDef,
directory,
@@ -85,7 +85,7 @@ async function handleRepairResponse(
evalID: EvalID,
gateway: Gateway,
repairResponse: LlmResponse,
- finalOutputFiles: LlmResponseFile[],
+ previousAttemptFiles: LlmResponseFile[],
env: Environment,
rootPromptDef: RootPromptDefinition,
directory: string,
@@ -106,8 +106,13 @@ async function handleRepairResponse(
`Repair request failed: ${repairResponse.errors.join('\n')}`
);
}
- mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
- writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);
+
+ // Clone the previous files because `mergeRepairFiles` mutates the attempt files.
+ // We don't want to change files of a previous attempt.
+ const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));
+
+ mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
+ writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
const buildResult = await runBuild(
evalID,
@@ -120,12 +125,8 @@ async function handleRepairResponse(
progress
);
- // Capture attempt's full files. Copy because `finalOutputFiles` can be
- // mutated in subsequent repair attempts.
- const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));
-
return {
- outputFiles: attemptFullFiles,
+ outputFiles: newAttemptFiles,
usage: repairResponse.usage,
reasoning: repairResponse.reasoning,
buildResult,
diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts
index 1bb5ed5..03a6220 100644
--- a/runner/orchestration/build-serve-loop.ts
+++ b/runner/orchestration/build-serve-loop.ts
@@ -50,13 +50,9 @@ export async function attemptBuild(
skipScreenshots: boolean,
skipAxeTesting: boolean,
enableAutoCsp: boolean,
- userJourneyAgentTaskInput?: BrowserAgentTaskInput
+ userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+ maxAxeRepairAttempts: number
) {
- // Clone the original files, because we're going to mutate them between repair
- // attempts and we don't want the different runs to influence each other.
- const finalOutputFiles = initialResponse.files.map((file) => ({
- ...file,
- }));
const initialBuildResult = await runBuild(
evalID,
gateway,
@@ -104,7 +100,7 @@ export async function attemptBuild(
env,
rootPromptDef,
directory,
- finalOutputFiles,
+ lastAttempt.outputFiles,
lastAttempt.buildResult.message,
'There are the following build errors:',
contextFiles,
@@ -137,13 +133,14 @@ export async function attemptBuild(
);
}
- // Attempt to repair axe testing.
- // This only runs when the last build completed & serving did run.
+ // Attempt to repair axe testing. This only runs when the last build
+ // passed and serving did run. Note: By default, we don't run axe repair
+ // attempts as it's not commonly done by LLMs in the ecosystem.
let axeRepairAttempts = 0;
while (
lastAttempt.serveTestingResult &&
(lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
- axeRepairAttempts < maxRepairAttempts
+ axeRepairAttempts < maxAxeRepairAttempts
) {
axeRepairAttempts++;
progress.log(
@@ -167,7 +164,7 @@ export async function attemptBuild(
env,
rootPromptDef,
directory,
- finalOutputFiles,
+ lastAttempt.outputFiles,
axeViolationsError,
'There are the following accessibility errors from axe accessibility violations:',
contextFiles,
@@ -180,8 +177,9 @@ export async function attemptBuild(
attemptDetails.push(attempt);
lastAttempt = attempt;
- // If we somehow introduced build errors via the Axe repair loop,
- // then we should abort and let that last attempt have "build errors".
+ // If we somehow introduced build errors via the Axe repair loop, we abort
+ // further a11y repairs and capture the failed build. This is useful insight
+ // as LLMs seem to regress when asked to repair a11y violations.
if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
break;
}
@@ -215,7 +213,7 @@ export async function attemptBuild(
return {
buildResult: lastAttempt.buildResult,
serveTestingResult: lastAttempt.serveTestingResult,
- outputFiles: finalOutputFiles,
+ outputFiles: lastAttempt.outputFiles,
repairAttempts,
axeRepairAttempts,
};
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index e5c2347..dd3012f 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
enableAutoCsp?: boolean;
logging?: 'text-only' | 'dynamic';
autoraterModel?: string;
+ a11yRepairAttempts?: number;
}): Promise {
const env = await getEnvironmentByPath(
options.environmentConfigPath,
@@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
!!options.enableAutoCsp,
workerConcurrencyQueue,
progress,
- options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
+ options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
+ options.a11yRepairAttempts ?? 0
),
// 10min max per app evaluation. We just want to make sure it never gets stuck.
10
@@ -328,7 +330,8 @@ async function startEvaluationTask(
enableAutoCsp: boolean,
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
- autoraterModel: string
+ autoraterModel: string,
+ a11yRepairAttempts: number
): Promise {
// Set up the project structure once for the root project.
const { directory, cleanup } = await setupProjectStructure(
@@ -469,7 +472,8 @@ async function startEvaluationTask(
skipScreenshots,
skipAxeTesting,
enableAutoCsp,
- userJourneyAgentTaskInput
+ userJourneyAgentTaskInput,
+ a11yRepairAttempts
);
if (!attempt) {