Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/environment-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.

Command used to start a local dev server as a part of the evaluation.
Defaults to `<package manager> run start --port 0`.

### `testCommand`

Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.

64 changes: 63 additions & 1 deletion report-app/src/app/pages/report-viewer/report-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,20 @@ <h3 class="chart-title">
<stacked-bar-chart [data]="buildsAsGraphData(overview.stats.builds)" [compact]="true" />
</div>
</div>
@if (overview.stats.tests) {
<div class="chart-container test-results-details">
<h3 class="chart-title">
<span class="material-symbols-outlined"> quiz </span>
<span>Tests</span>
</h3>
<div class="summary-card-item">
<stacked-bar-chart
[data]="testsAsGraphData(overview.stats.tests)"
[compact]="true"
/>
</div>
</div>
}
@if (overview.stats.runtime) {
<div class="chart-container">
<h3 class="chart-title">
Expand Down Expand Up @@ -276,9 +290,19 @@ <h2>Generated applications</h2>
<span class="status-badge error">Initial build failed</span>
}

@if (hasBuildFailureDuringA11yRepair(result)) {
@if (hasBuildFailureDuringTestRepair(result)) {
<span class="status-badge error">Build failed after a11y repair</span>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be updated? (conceptual suggestion)

Suggested change
<span class="status-badge error">Build failed after a11y repair</span>
<span class="status-badge error">Build failed after a11y/test repair</span>

}
<!-- Test status badges -->
@if (finalAttempt.testResult) {
@if (finalAttempt.testResult.passed) {
@if ((result.testRepairAttempts || 0) > 0) {
<span class="status-badge warning">Tests passed after repair</span>
}
} @else {
<span class="status-badge error">Tests failed</span>
}
}
</div>
</div>
</expansion-panel-header>
Expand Down Expand Up @@ -350,12 +374,36 @@ <h5>
</div>
</div>

@if (result.testResult) {
<div class="app-details-section">
<h4>Test Results</h4>
<div class="test-summary">
@if (result.testResult.passed) {
<span class="status-text success">✔ Tests passed</span>
@if ((result.testRepairAttempts || 0) > 0) {
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
}
} @else {
<span class="status-text error">✘ Tests failed</span>
}
</div>

@if (result.testResult.output && !result.testResult.passed) {
<details class="test-output-button">
<summary class="neutral-button">See Test Output</summary>
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
</details>
}
</div>
}

<div class="app-details-section">
<h4>Additional info</h4>
@for (attempt of result.attemptDetails; track attempt) {
@let isBuilt = attempt.buildResult.status === 'success';
@let axeViolations = attempt.serveTestingResult?.axeViolations;
@let hasAxeViolations = axeViolations && axeViolations.length > 0;
@let testsFailed = attempt.testResult?.passed === false;

<expansion-panel #expansionPanel>
<expansion-panel-header>
Expand All @@ -380,6 +428,15 @@ <h4>Additional info</h4>
>A11y</span
>
}

@if (attempt.testResult) {
<span
class="status-badge"
[class.error]="!attempt.testResult.passed"
[class.success]="attempt.testResult.passed"
>Tests</span
>
}
</expansion-panel-header>

@if (expansionPanel.opened()) {
Expand Down Expand Up @@ -416,6 +473,11 @@ <h4>A11y Violations</h4>
</pre>
}

@if (testsFailed) {
<h4>Failed Tests</h4>
<pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
}

<h4>Generated Code</h4>

@for (file of attempt.outputFiles; track file) {
Expand Down
30 changes: 28 additions & 2 deletions report-app/src/app/pages/report-viewer/report-viewer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
LlmResponseFile,
RunInfo,
RunSummaryBuilds,
RunSummaryTests,
RuntimeStats,
ScoreBucket,
SkippedIndividualAssessment,
Expand Down Expand Up @@ -265,6 +266,31 @@ export class ReportViewer {
];
}

/**
 * Turns the test summary of a run into data for the stacked bar chart.
 *
 * Always produces four segments, in this order: "Passed", "Passed after repair",
 * "Failed" and "No tests run".
 *
 * @param tests Test counters from the run summary.
 * @returns One chart segment per counter.
 */
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
  const {successfulInitialTests, successfulTestsAfterRepair, failedTests, noTestsRun} = tests;

  const segments: StackedBarChartData = [
    {label: 'Passed', color: ScoreCssVariable.excellent, value: successfulInitialTests},
    {label: 'Passed after repair', color: ScoreCssVariable.great, value: successfulTestsAfterRepair},
    {label: 'Failed', color: ScoreCssVariable.poor, value: failedTests},
    {label: 'No tests run', color: ScoreCssVariable.neutral, value: noTestsRun},
  ];

  return segments;
}

protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
return buckets.map(b => ({
label: b.nameWithLabels,
Expand Down Expand Up @@ -400,7 +426,7 @@ export class ReportViewer {
return `wcs run --prompt=${result.promptDef.name} --env=<path to ${report.details.summary.environmentId} config>`;
}

protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean {
return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair);
/**
 * Reports whether at least one attempt of the given result has its
 * `buildFailedDuringTestRepair` flag set.
 *
 * @param result Assessment result whose attempts are inspected.
 * @returns `true` if any attempt carries a truthy flag, `false` otherwise.
 */
protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean {
  const {attemptDetails} = result;
  return attemptDetails.some(({buildFailedDuringTestRepair}) => buildFailedDuringTestRepair);
}
}
8 changes: 7 additions & 1 deletion runner/configuration/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,13 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
* providing the build output and the code that causes the problem.
*/
// Note: When updating, also adjust the default description in `README.md`.
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1;

/**
 * Default number of times we'll ask the LLM to repair test failures,
 * e.g. Axe violations or test command failures.
 */
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;

/** Name of the folder where we store all generated reports */
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
Expand Down
5 changes: 0 additions & 5 deletions runner/configuration/environment-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,6 @@ export const environmentConfigSchema = z.object({
export type EnvironmentConfig = z.infer<typeof environmentConfigSchema> &
Partial<LocalExecutorConfig>;

/**
 * Lists the package managers that are currently supported.
 *
 * @returns A new readonly tuple of the supported package manager names.
 */
export function getPossiblePackageManagers() {
  const supportedPackageManagers = ['npm', 'pnpm', 'yarn'] as const;
  return supportedPackageManagers;
}

/** Asserts that the specified data is a valid environment config. */
export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
const validationResult = environmentConfigSchema
Expand Down
4 changes: 4 additions & 0 deletions runner/configuration/package-managers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Lists the package managers that are currently supported.
 *
 * @returns A new readonly tuple of the supported package manager names.
 */
export function getPossiblePackageManagers() {
  const supportedPackageManagers = ['npm', 'pnpm', 'yarn'] as const;
  return supportedPackageManagers;
}
20 changes: 11 additions & 9 deletions runner/eval-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import chalk from 'chalk';
import {
BUILT_IN_ENVIRONMENTS,
DEFAULT_AUTORATER_MODEL_NAME,
DEFAULT_MAX_REPAIR_ATTEMPTS,
DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
DEFAULT_MODEL_NAME,
} from './configuration/constants.js';
import {generateCodeAndAssess} from './orchestration/generate.js';
Expand Down Expand Up @@ -37,9 +38,9 @@ interface Options {
enableUserJourneyTesting?: boolean;
enableAutoCsp?: boolean;
autoraterModel?: string;
a11yRepairAttempts?: number;
logging?: 'text-only' | 'dynamic';
skipLighthouse?: boolean;
maxTestRepairAttempts?: number;
maxBuildRepairAttempts?: number;
}

Expand Down Expand Up @@ -151,21 +152,22 @@ function builder(argv: Argv): Argv<Options> {
default: DEFAULT_AUTORATER_MODEL_NAME,
description: 'Model to use when automatically rating generated code',
})
.option('a11y-repair-attempts', {
type: 'number',
default: 0,
description: 'Number of repair attempts for discovered a11y violations',
})
.option('skip-lighthouse', {
type: 'boolean',
default: false,
description: 'Whether to skip collecting Lighthouse data',
})
.option('max-build-repair-attempts', {
type: 'number',
default: DEFAULT_MAX_REPAIR_ATTEMPTS,
default: DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
description: 'Number of repair attempts when build errors are discovered',
})
.option('max-test-repair-attempts', {
type: 'number',
default: DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
description:
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
})
.strict()
.version(false)
.help()
Expand Down Expand Up @@ -209,9 +211,9 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
logging: cliArgs.logging,
autoraterModel: cliArgs.autoraterModel,
skipAiSummary: cliArgs.skipAiSummary,
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
skipLighthouse: cliArgs.skipLighthouse,
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
});

logReportToConsole(runInfo);
Expand Down
Loading
Loading