Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions report-app/src/app/pages/report-viewer/report-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -258,18 +258,16 @@ <h2>Generated applications</h2>

<div class="status-badge-group">
@let initialAttempt = result.attemptDetails[0];
@let repairAttempt =
result.attemptDetails.length > 1
? result.attemptDetails[1]
: null;
@let finalAttempt = result.attemptDetails.at(-1)!;

@if (finalAttempt.serveTestingResult?.runtimeErrors) {
<span class="status-badge error">Runtime error</span>
}

@if (repairAttempt?.buildResult?.status === 'error') {
<span class="status-badge error">Build after repair</span>
@if (finalAttempt?.buildResult?.status === 'error') {
@if (result.attemptDetails.length > 1) {
<span class="status-badge error">Build failed</span>
}
}

@if (initialAttempt?.buildResult?.status === 'error') {
Expand Down Expand Up @@ -366,15 +364,19 @@ <h4>Additional info</h4>
? 'Initial response'
: `Repair attempt #${$index}`
}}
@if (!isBuilt) {
<span class="status-badge" [class.error]="!isBuilt"
>Build</span
>
}
@if (hasAxeViolations) {

<span
class="status-badge error"
[class.error]="!isBuilt"
[class.success]="isBuilt"
>Build</span
>

@if (isBuilt) {
<span
class="status-badge"
[class.error]="hasAxeViolations"
[class.success]="!hasAxeViolations"
>A11y</span
>
}
Expand Down
5 changes: 4 additions & 1 deletion report-app/src/app/pages/report-viewer/report-viewer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ import {
viewChild,
} from '@angular/core';
import { NgxJsonViewerModule } from 'ngx-json-viewer';
import { BuildErrorType } from '../../../../../runner/workers/builder/builder-types';
import {
BuildErrorType,
BuildResultStatus,
} from '../../../../../runner/workers/builder/builder-types';
import {
AssessmentResult,
IndividualAssessment,
Expand Down
8 changes: 8 additions & 0 deletions runner/eval-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ interface Options {
enableUserJourneyTesting?: boolean;
enableAutoCsp?: boolean;
autoraterModel?: string;
a11yRepairAttempts?: number;
logging?: 'text-only' | 'dynamic';
}

Expand Down Expand Up @@ -156,6 +157,11 @@ function builder(argv: Argv): Argv<Options> {
default: DEFAULT_AUTORATER_MODEL_NAME,
description: 'Model to use when automatically rating generated code',
})
.option('a11y-repair-attempts', {
type: 'number',
default: 0,
description: 'Number of repair attempts for discovered a11y violations',
})
.strict()
.version(false)
.help()
Expand Down Expand Up @@ -199,6 +205,8 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
enableAutoCsp: cliArgs.enableAutoCsp,
logging: cliArgs.logging,
autoraterModel: cliArgs.autoraterModel,
skipAiSummary: cliArgs.skipAiSummary,
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
});

logReportToConsole(runInfo);
Expand Down
23 changes: 12 additions & 11 deletions runner/orchestration/build-repair.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export async function repairAndBuild(
env: Environment,
rootPromptDef: RootPromptDefinition,
directory: string,
finalOutputFiles: LlmResponseFile[],
previousAttemptFiles: LlmResponseFile[],
errorMessage: string,
errorContext: string,
contextFiles: LlmContextFile[],
Expand All @@ -54,7 +54,7 @@ export async function repairAndBuild(
env,
rootPromptDef,
directory,
finalOutputFiles,
previousAttemptFiles,
errorMessage,
errorContext,
contextFiles,
Expand All @@ -66,7 +66,7 @@ export async function repairAndBuild(
evalID,
gateway,
repairResponse,
finalOutputFiles,
previousAttemptFiles,
env,
rootPromptDef,
directory,
Expand All @@ -85,7 +85,7 @@ async function handleRepairResponse(
evalID: EvalID,
gateway: Gateway<Environment>,
repairResponse: LlmResponse,
finalOutputFiles: LlmResponseFile[],
previousAttemptFiles: LlmResponseFile[],
env: Environment,
rootPromptDef: RootPromptDefinition,
directory: string,
Expand All @@ -106,8 +106,13 @@ async function handleRepairResponse(
`Repair request failed: ${repairResponse.errors.join('\n')}`
);
}
mergeRepairFiles(repairResponse.outputFiles, finalOutputFiles);
writeResponseFiles(directory, finalOutputFiles, env, rootPromptDef.name);

// Clone the previous files because `mergeRepairFiles` mutates the attempt files.
// We don't want to change files of a previous attempt.
const newAttemptFiles = previousAttemptFiles.map((f) => ({ ...f }));

mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);

const buildResult = await runBuild(
evalID,
Expand All @@ -120,12 +125,8 @@ async function handleRepairResponse(
progress
);

// Capture attempt's full files. Copy because `finalOutputFiles` can be
// mutated in subsequent repair attempts.
const attemptFullFiles = finalOutputFiles.map((f) => ({ ...f }));

return {
outputFiles: attemptFullFiles,
outputFiles: newAttemptFiles,
usage: repairResponse.usage,
reasoning: repairResponse.reasoning,
buildResult,
Expand Down
26 changes: 12 additions & 14 deletions runner/orchestration/build-serve-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,9 @@ export async function attemptBuild(
skipScreenshots: boolean,
skipAxeTesting: boolean,
enableAutoCsp: boolean,
userJourneyAgentTaskInput?: BrowserAgentTaskInput
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
maxAxeRepairAttempts: number
) {
// Clone the original files, because we're going to mutate them between repair
// attempts and we don't want the different runs to influence each other.
const finalOutputFiles = initialResponse.files.map((file) => ({
...file,
}));
const initialBuildResult = await runBuild(
evalID,
gateway,
Expand Down Expand Up @@ -104,7 +100,7 @@ export async function attemptBuild(
env,
rootPromptDef,
directory,
finalOutputFiles,
lastAttempt.outputFiles,
lastAttempt.buildResult.message,
'There are the following build errors:',
contextFiles,
Expand Down Expand Up @@ -137,13 +133,14 @@ export async function attemptBuild(
);
}

// Attempt to repair axe testing.
// This only runs when the last build completed & serving did run.
// Attempt to repair axe (accessibility) violations. This only runs when the
// last build passed and serving did run. Note: By default, we don't run axe
// repair attempts, as this is not commonly done by LLMs in the ecosystem.
let axeRepairAttempts = 0;
while (
lastAttempt.serveTestingResult &&
(lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
axeRepairAttempts < maxRepairAttempts
axeRepairAttempts < maxAxeRepairAttempts
) {
axeRepairAttempts++;
progress.log(
Expand All @@ -167,7 +164,7 @@ export async function attemptBuild(
env,
rootPromptDef,
directory,
finalOutputFiles,
lastAttempt.outputFiles,
axeViolationsError,
'There are the following accessibility errors from axe accessibility violations:',
contextFiles,
Expand All @@ -180,8 +177,9 @@ export async function attemptBuild(
attemptDetails.push(attempt);
lastAttempt = attempt;

// If we somehow introduced build errors via the Axe repair loop,
// then we should abort and let that last attempt have "build errors".
// If we somehow introduced build errors via the axe repair loop, we abort
// further a11y repairs and capture the failed build. This is a useful insight,
// as LLMs seem to regress when asked to repair a11y violations.
if (attempt.buildResult.status !== BuildResultStatus.SUCCESS) {
break;
}
Expand Down Expand Up @@ -215,7 +213,7 @@ export async function attemptBuild(
return {
buildResult: lastAttempt.buildResult,
serveTestingResult: lastAttempt.serveTestingResult,
outputFiles: finalOutputFiles,
outputFiles: lastAttempt.outputFiles,
repairAttempts,
axeRepairAttempts,
};
Expand Down
10 changes: 7 additions & 3 deletions runner/orchestration/generate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ export async function generateCodeAndAssess(options: {
enableAutoCsp?: boolean;
logging?: 'text-only' | 'dynamic';
autoraterModel?: string;
a11yRepairAttempts?: number;
}): Promise<RunInfo> {
const env = await getEnvironmentByPath(
options.environmentConfigPath,
Expand Down Expand Up @@ -190,7 +191,8 @@ export async function generateCodeAndAssess(options: {
!!options.enableAutoCsp,
workerConcurrencyQueue,
progress,
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
options.a11yRepairAttempts ?? 0
),
// 10min max per app evaluation. We just want to make sure it never gets stuck.
10
Expand Down Expand Up @@ -328,7 +330,8 @@ async function startEvaluationTask(
enableAutoCsp: boolean,
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
autoraterModel: string
autoraterModel: string,
a11yRepairAttempts: number
): Promise<AssessmentResult[]> {
// Set up the project structure once for the root project.
const { directory, cleanup } = await setupProjectStructure(
Expand Down Expand Up @@ -469,7 +472,8 @@ async function startEvaluationTask(
skipScreenshots,
skipAxeTesting,
enableAutoCsp,
userJourneyAgentTaskInput
userJourneyAgentTaskInput,
a11yRepairAttempts
);

if (!attempt) {
Expand Down
Loading