Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/environment-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.

Command used to start a local dev server as a part of the evaluation.
Defaults to `<package manager> run start --port 0`.

### `testCommand`

Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.

64 changes: 63 additions & 1 deletion report-app/src/app/pages/report-viewer/report-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,20 @@ <h3 class="chart-title">
<stacked-bar-chart [data]="buildsAsGraphData(overview.stats.builds)" [compact]="true" />
</div>
</div>
@if (overview.stats.tests) {
<div class="chart-container test-results-details">
<h3 class="chart-title">
<span class="material-symbols-outlined"> quiz </span>
<span>Tests</span>
</h3>
<div class="summary-card-item">
<stacked-bar-chart
[data]="testsAsGraphData(overview.stats.tests)"
[compact]="true"
/>
</div>
</div>
}
@if (overview.stats.runtime) {
<div class="chart-container">
<h3 class="chart-title">
Expand Down Expand Up @@ -281,9 +295,19 @@ <h2>Generated applications</h2>
<span class="status-badge error">Initial build failed</span>
}

@if (hasBuildFailureDuringA11yRepair(result)) {
@if (hasBuildFailureDuringTestRepair(result)) {
<span class="status-badge error">Build failed after a11y repair</span>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be updated? (conceptual suggestion)

Suggested change
<span class="status-badge error">Build failed after a11y repair</span>
<span class="status-badge error">Build failed after a11y/test repair</span>

}
<!-- Test status badges -->
@if (finalAttempt.testResult) {
@if (finalAttempt.testResult.passed) {
@if ((result.testRepairAttempts || 0) > 0) {
<span class="status-badge warning">Tests passed after repair</span>
}
} @else {
<span class="status-badge error">Tests failed</span>
}
}
</div>
</div>
</expansion-panel-header>
Expand Down Expand Up @@ -355,12 +379,36 @@ <h5>
</div>
</div>

@if (result.testResult) {
<div class="app-details-section">
<h4>Test Results</h4>
<div class="test-summary">
@if (result.testResult.passed) {
<span class="status-text success">✔ Tests passed</span>
@if ((result.testRepairAttempts || 0) > 0) {
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
}
} @else {
<span class="status-text error">✘ Tests failed</span>
}
</div>

@if (result.testResult.output && !result.testResult.passed) {
<details class="test-output-button">
<summary class="neutral-button">See Test Output</summary>
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
</details>
}
</div>
}

<div class="app-details-section">
<h4>Additional info</h4>
@for (attempt of result.attemptDetails; track attempt) {
@let isBuilt = attempt.buildResult.status === 'success';
@let axeViolations = attempt.serveTestingResult?.axeViolations;
@let hasAxeViolations = axeViolations && axeViolations.length > 0;
@let testsFailed = attempt.testResult?.passed === false;

<expansion-panel #expansionPanel>
<expansion-panel-header>
Expand All @@ -385,6 +433,15 @@ <h4>Additional info</h4>
>A11y</span
>
}

@if (attempt.testResult) {
<span
class="status-badge"
[class.error]="!attempt.testResult.passed"
[class.success]="attempt.testResult.passed"
>Tests</span
>
}
</expansion-panel-header>

@if (expansionPanel.opened()) {
Expand Down Expand Up @@ -421,6 +478,11 @@ <h4>A11y Violations</h4>
</pre>
}

@if (testsFailed) {
<h4>Failed Tests</h4>
<pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
}

<h4>Generated Code</h4>

@for (file of attempt.outputFiles; track file) {
Expand Down
30 changes: 28 additions & 2 deletions report-app/src/app/pages/report-viewer/report-viewer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
LlmResponseFile,
RunInfo,
RunSummaryBuilds,
RunSummaryTests,
RuntimeStats,
ScoreBucket,
SkippedIndividualAssessment,
Expand Down Expand Up @@ -271,6 +272,31 @@ export class ReportViewer {
];
}

/**
 * Maps the test summary of a run onto stacked bar chart segments.
 *
 * Segments are emitted in a fixed order: passed on the first try, passed
 * after repair, failed, and no tests run.
 *
 * @param tests Aggregated test counters for the run.
 * @returns One chart entry (label, color, value) per test outcome.
 */
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
  const {
    successfulInitialTests: passedInitially,
    successfulTestsAfterRepair: passedAfterRepair,
    failedTests: failed,
    noTestsRun: notRun,
  } = tests;

  const segments: StackedBarChartData = [
    {label: 'Passed', color: ScoreCssVariable.excellent, value: passedInitially},
    {label: 'Passed after repair', color: ScoreCssVariable.great, value: passedAfterRepair},
    {label: 'Failed', color: ScoreCssVariable.poor, value: failed},
    {label: 'No tests run', color: ScoreCssVariable.neutral, value: notRun},
  ];

  return segments;
}

protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
return buckets.map(b => ({
label: b.nameWithLabels,
Expand Down Expand Up @@ -427,7 +453,7 @@ export class ReportViewer {
return `wcs run --prompt=${result.promptDef.name} --env=<path to ${report.details.summary.environmentId} config>`;
}

protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean {
return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair);
/**
 * Reports whether any attempt of the given result is flagged with
 * `buildFailedDuringTestRepair`.
 *
 * @param result Assessment result whose attempts are inspected.
 * @returns `true` if at least one attempt carries a truthy flag.
 */
protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean {
  const attempts = result.attemptDetails;
  return attempts.some(({buildFailedDuringTestRepair: failed}) => failed);
}
}
2 changes: 0 additions & 2 deletions runner/configuration/base-environment-config.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import z from 'zod';
import {ratingSchema} from '../ratings/rating-types.js';
import {MultiStepPrompt} from './multi-step-prompt.js';
import {mcpServerOptionsSchema} from '../codegen/llm-runner.js';
import {getPossiblePackageManagers} from './environment-config.js';

export const baseEnvironmentConfigSchema = z.strictObject({
/** Display name for the environment. */
Expand Down
5 changes: 0 additions & 5 deletions runner/configuration/environment-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([
*/
export type EnvironmentConfig = z.infer<typeof environmentConfigSchema>;

/**
 * Lists the package managers that are currently supported.
 *
 * @returns A new readonly tuple of package manager names on every call.
 */
export function getPossiblePackageManagers(): readonly ['npm', 'pnpm', 'yarn'] {
  const supported = ['npm', 'pnpm', 'yarn'] as const;
  return supported;
}

/** Asserts that the specified data is a valid environment config. */
export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
const validationResult = environmentConfigSchema.safeParse(value);
Expand Down
9 changes: 8 additions & 1 deletion runner/configuration/environment-local.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import z from 'zod';
import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js';
import {LocalGateway} from '../orchestration/gateways/local_gateway.js';
import {BaseEnvironment} from './base-environment.js';
import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js';
import {getPossiblePackageManagers} from './package-managers.js';
import {baseEnvironmentConfigSchema} from './base-environment-config.js';

export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
Expand All @@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
* Defaults to `<package manager> run start --port 0`.
*/
serveCommand: z.string().optional(),
/**
* Command to run when testing the generated code.
* If omitted, tests are not run.
*/
testCommand: z.string().optional(),
/**
* Whether to skip installing dependencies when running evals in the environment.
* Useful if you're managing dependencies yourself.
Expand All @@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
readonly buildCommand: string;
/** Command to run when starting a development server inside the app. */
readonly serveCommand: string;
/** Command to run when testing the app, or `null` if no test command is configured. */
readonly testCommand: string | null;
/**
* Absolute path at which files specific to this environment are located. Will be merged in
* with the files from the `projectTemplatePath` to get the final project structure.
Expand Down Expand Up @@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
this.installCommand = `${packageManager} install --silent`;
this.buildCommand = config.buildCommand || `${packageManager} run build`;
this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
this.testCommand = config.testCommand ?? null;
this.projectTemplatePath = projectTemplatePath;
this.sourceDirectory = sourceDirectory;
this.mcpServerOptions = config.mcpServers || [];
Expand Down
4 changes: 4 additions & 0 deletions runner/configuration/package-managers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Lists the package managers that are currently supported.
 *
 * @returns A new readonly tuple of package manager names on every call.
 */
export function getPossiblePackageManagers(): readonly ['npm', 'pnpm', 'yarn'] {
  const supported = ['npm', 'pnpm', 'yarn'] as const;
  return supported;
}
9 changes: 5 additions & 4 deletions runner/eval-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ interface Options {
enableUserJourneyTesting?: boolean;
enableAutoCsp?: boolean;
autoraterModel?: string;
a11yRepairAttempts?: number;
testRepairAttempts?: number;
logging?: 'text-only' | 'dynamic';
}

Expand Down Expand Up @@ -148,10 +148,11 @@ function builder(argv: Argv): Argv<Options> {
default: DEFAULT_AUTORATER_MODEL_NAME,
description: 'Model to use when automatically rating generated code',
})
.option('a11y-repair-attempts', {
.option('test-repair-attempts', {
type: 'number',
default: 0,
description: 'Number of repair attempts for discovered a11y violations',
description:
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
})
.strict()
.version(false)
Expand Down Expand Up @@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
logging: cliArgs.logging,
autoraterModel: cliArgs.autoraterModel,
skipAiSummary: cliArgs.skipAiSummary,
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
testRepairAttempts: cliArgs.testRepairAttempts,
});

logReportToConsole(runInfo);
Expand Down
Loading
Loading