Commit b433915

feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be specified in the environment configuration. If provided, this command is executed after a successful build. If the tests fail, the tool attempts to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent e0aa394 commit b433915

23 files changed: +611 −32 lines
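
The flow described above, reduced to a sketch (the names follow the build-serve-loop.ts diff below, but the argument lists are abbreviated and this is not the literal implementation):

    // Simplified sketch of the new test phase in attemptBuildAndTest (build-serve-loop.ts).
    // Assumes the build already succeeded and the environment defines `testCommand`.
    let testResult = await runTest(/* evalID, gateway, directory, env, ... */);
    let testRepairAttempts = 0;
    const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
      ? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS // currently 1
      : 0;

    while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
      testRepairAttempts++;
      // Feed the failing test output back to the LLM, apply the repaired files, and re-run the tests.
      const attempt = await repairAndTest(/* ..., lastAttempt.outputFiles, testResult.output, ... */);
      testResult = attempt.testResult!;
    }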

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions

@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to `<package manager> run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
+
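
For illustration, a local environment configuration opting into this behavior might look like the following sketch. The surrounding fields, the export shape, and the specific commands are assumptions; only the `testCommand` property itself is defined by this commit:

    // Hypothetical environment config validated by `localEnvironmentConfigSchema` (see below).
    export default {
      buildCommand: 'npm run build',          // optional; defaults to `<package manager> run build`
      serveCommand: 'npm run start --port 0', // optional; default shown above
      // New: runs after a successful build; must exit with code 0 on success.
      // stdout/stderr are captured and fed back to the LLM if the tests fail.
      testCommand: 'npm run test',
    };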

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 63 additions & 0 deletions

@@ -76,6 +76,20 @@ <h3 class="chart-title">
          />
        </div>
      </div>
+      @if (overview.stats.tests) {
+        <div class="chart-container test-results-details">
+          <h3 class="chart-title">
+            <span class="material-symbols-outlined"> quiz </span>
+            <span>Tests</span>
+          </h3>
+          <div class="summary-card-item">
+            <stacked-bar-chart
+              [data]="testsAsGraphData(overview.stats.tests)"
+              [compact]="true"
+            />
+          </div>
+        </div>
+      }
      @if (overview.stats.runtime) {
        <div class="chart-container">
          <h3 class="chart-title">
@@ -273,6 +287,17 @@ <h2>Generated applications</h2>
            @if (initialAttempt?.buildResult?.status === 'error') {
              <span class="status-badge error">Initial build failed</span>
            }
+
+            <!-- Test status badges -->
+            @if (finalAttempt.testResult) {
+              @if (finalAttempt.testResult.passed) {
+                @if ((result.testRepairAttempts || 0) > 0) {
+                  <span class="status-badge warning">Tests passed after repair</span>
+                }
+              } @else {
+                <span class="status-badge error">Tests failed</span>
+              }
+            }
          </div>
        </div>
      </expansion-panel-header>
@@ -348,6 +373,29 @@ <h5>
          </div>
        </div>

+        @if (result.testResult) {
+          <div class="app-details-section">
+            <h4>Test Results</h4>
+            <div class="test-summary">
+              @if (result.testResult.passed) {
+                <span class="status-text success">✔ Tests passed</span>
+                @if ((result.testRepairAttempts || 0) > 0) {
+                  <span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
+                }
+              } @else {
+                <span class="status-text error">✘ Tests failed</span>
+              }
+            </div>
+
+            @if (result.testResult.output && !result.testResult.passed) {
+              <details class="test-output-button">
+                <summary class="neutral-button">See Test Output</summary>
+                <pre class="callout neutral code">{{ result.testResult.output }}</pre>
+              </details>
+            }
+          </div>
+        }
+
        <div class="app-details-section">
          <h4>Additional info</h4>
          @for (attempt of result.attemptDetails; track attempt) {
@@ -356,6 +404,7 @@ <h4>Additional info</h4>
              attempt.serveTestingResult?.axeViolations;
            @let hasAxeViolations =
              axeViolations && axeViolations.length > 0;
+            @let testsFailed = attempt.testResult?.passed === false;

            <expansion-panel #expansionPanel>
              <expansion-panel-header>
@@ -380,6 +429,15 @@ <h4>Additional info</h4>
                  >A11y</span
                >
              }
+
+              @if (attempt.testResult) {
+                <span
+                  class="status-badge"
+                  [class.error]="!attempt.testResult.passed"
+                  [class.success]="attempt.testResult.passed"
+                  >Tests</span
+                >
+              }
            </expansion-panel-header>

            @if (expansionPanel.opened()) {
@@ -418,6 +476,11 @@ <h4>A11y Violations</h4>
                </pre>
              }

+              @if (testsFailed) {
+                <h4>Failed Tests</h4>
+                <pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
+              }
+
              <h4>Generated Code</h4>

              @for (file of attempt.outputFiles; track file) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 26 additions & 0 deletions

@@ -23,6 +23,7 @@ import {
   LlmResponseFile,
   RunInfo,
   RunSummaryBuilds,
+  RunSummaryTests,
   RuntimeStats,
   ScoreBucket,
   SkippedIndividualAssessment,
@@ -264,6 +265,31 @@ export class ReportViewer {
     ];
   }

+  protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
+    return [
+      {
+        label: 'Passed',
+        color: ScoreCssVariable.excellent,
+        value: tests.successfulInitialTests,
+      },
+      {
+        label: 'Passed after repair',
+        color: ScoreCssVariable.great,
+        value: tests.successfulTestsAfterRepair,
+      },
+      {
+        label: 'Failed',
+        color: ScoreCssVariable.poor,
+        value: tests.failedTests,
+      },
+      {
+        label: 'No tests run',
+        color: ScoreCssVariable.neutral,
+        value: tests.noTestsRun,
+      },
+    ];
+  }
+
   protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
     return buckets.map(b => ({
       label: b.nameWithLabels,

runner/configuration/constants.ts

Lines changed: 6 additions & 0 deletions

@@ -26,6 +26,12 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
  */
 export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;

+/**
+ * Number of times we'll try to ask LLM to repair a test failure,
+ * providing the test output and the code that causes the problem.
+ */
+export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
+
 /** Name of the folder where we store all generated reports */
 export const REPORTS_ROOT_DIR = join(rootDir, 'reports');

runner/configuration/environment-local.ts

Lines changed: 7 additions & 0 deletions

@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
    * Defaults to `<package manager> run start --port 0`.
    */
   serveCommand: z.string().optional(),
+  /**
+   * Command to run when testing the code.
+   */
+  testCommand: z.string().optional(),
   /**
    * Whether to skip installing dependencies when running evals in the environment.
    * Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
   readonly buildCommand: string;
   /** Command to run when starting a development server inside the app. */
   readonly serveCommand: string;
+  /** Command to run when starting tests inside the app. */
+  readonly testCommand: string | null;
   /**
    * Absolute path at which files specific to this environment are located. Will be merged in
    * with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
     this.installCommand = `${packageManager} install --silent`;
     this.buildCommand = config.buildCommand || `${packageManager} run build`;
     this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
+    this.testCommand = config.testCommand ?? null;
     this.projectTemplatePath = projectTemplatePath;
     this.sourceDirectory = sourceDirectory;
     this.mcpServerOptions = config.mcpServers || [];

runner/orchestration/build-repair.ts

Lines changed: 2 additions & 23 deletions

@@ -12,6 +12,7 @@ import {writeResponseFiles} from './file-system.js';
 import {runBuild} from './build-worker.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {EvalID, Gateway} from './gateway.js';
+import {mergeRepairFiles} from './repair.js';

 /**
  * Calls the LLM to repair code, handles the response, and attempts to build the project again.
@@ -28,7 +29,6 @@ import {EvalID, Gateway} from './gateway.js';
  * @param abortSignal An AbortSignal to cancel the operation.
  * @param workerConcurrencyQueue The queue for managing worker concurrency.
  * @param attempts The current attempt number.
- * @param repairType The type of repair being performed.
  * @returns A promise that resolves to the new BuildResult.
  */
 export async function repairAndBuild(
@@ -49,7 +49,7 @@
 ): Promise<AttemptDetails> {
   const repairResponse = await repairCodeWithAI(
     evalID,
-    gateway,
+    gateway.repairBuild.bind(gateway),
     model,
     env,
     rootPromptDef,
@@ -132,24 +132,3 @@ async function handleRepairResponse(
     attempt: attempts,
   };
 }
-
-/**
- * Merges a set of new or updated files from a repair attempt into the
- * current set of files.
- * @param repairOutputFiles The array of new or updated files to merge.
- * @param finalFiles The array of files to be updated.
- */
-function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) {
-  // Merge the repair response into the original files. Otherwise we may end up dropping
-  // files that were valid in the initial response and the LLM decided not to touch, because
-  // they're still valid.
-  for (const file of repairOutputFiles) {
-    const existingFile = finalFiles.find(f => f.filePath === file.filePath);
-
-    if (existingFile) {
-      existingFile.code = file.code;
-    } else {
-      finalFiles.push(file);
-    }
-  }
-}

runner/orchestration/build-serve-loop.ts

Lines changed: 67 additions & 3 deletions

@@ -2,13 +2,23 @@ import PQueue from 'p-queue';
 import {LlmGenerateFilesResponse} from '../codegen/llm-runner.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {Environment} from '../configuration/environment.js';
-import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js';
-import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
+import {
+  AttemptDetails,
+  LlmContextFile,
+  RootPromptDefinition,
+  TestResult,
+} from '../shared-interfaces.js';
+import {
+  DEFAULT_MAX_REPAIR_ATTEMPTS,
+  DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
+} from '../configuration/constants.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {runBuild} from './build-worker.js';
 import {repairAndBuild} from './build-repair.js';
 import {EvalID, Gateway} from './gateway.js';
 import {serveAndTestApp} from './serve-testing-worker.js';
+import {runTest} from './test-worker.js';
+import {repairAndTest} from './test-repair.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';

 /**
@@ -30,7 +40,7 @@ import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
  * @param abortSignal Signal to fire when the build should be aborted.
  * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
  */
-export async function attemptBuild(
+export async function attemptBuildAndTest(
   evalID: EvalID,
   gateway: Gateway<Environment>,
   model: string,
@@ -202,11 +212,65 @@
     }
   }

+  // Run tests if test command is configured and build was successful
+  let testResult: TestResult | null = null;
+  let testRepairAttempts = 0;
+
+  if (lastAttempt.buildResult.status === BuildResultStatus.SUCCESS && 'testCommand' in env) {
+    testResult = await runTest(
+      evalID,
+      gateway,
+      directory,
+      env,
+      rootPromptDef,
+      abortSignal,
+      workerConcurrencyQueue,
+      progress,
+    );
+
+    const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
+      ? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS
+      : 0;
+
+    lastAttempt.testResult = testResult;
+
+    while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
+      testRepairAttempts++;
+      progress.log(
+        rootPromptDef,
+        'test',
+        `Trying to repair app tests (attempt #${testRepairAttempts + 1})`,
+      );
+
+      const attempt = await repairAndTest(
+        evalID,
+        gateway,
+        model,
+        env,
+        rootPromptDef,
+        directory,
+        lastAttempt.outputFiles,
+        testResult.output,
+        'The tests failed. Attempt to fix them. There are the following test errors:',
+        contextFiles,
+        abortSignal,
+        workerConcurrencyQueue,
+        testRepairAttempts,
+        progress,
+      );
+      attemptDetails.push(attempt);
+      lastAttempt = attempt;
+      testResult = lastAttempt.testResult!;
+    }
+  }
+
   return {
     buildResult: lastAttempt.buildResult,
     serveTestingResult: lastAttempt.serveTestingResult,
     outputFiles: lastAttempt.outputFiles,
     repairAttempts,
     axeRepairAttempts,
+    testResult,
+    testRepairAttempts,
   };
 }

runner/orchestration/codegen.ts

Lines changed: 2 additions & 2 deletions

@@ -88,7 +88,7 @@ export async function generateCodeWithAI(
  */
 export async function repairCodeWithAI(
   evalID: EvalID,
-  gateway: Gateway<Environment>,
+  repairer: Gateway<Environment>['repairBuild'] | Gateway<Environment>['repairTest'],
   model: string,
   env: Environment,
   promptDef: RootPromptDefinition,
@@ -123,7 +123,7 @@

   progress.log(promptDef, 'codegen', 'Repairing code with AI');

-  const response = await gateway.repairBuild(
+  const response = await repairer(
     evalID,
     context,
     model,
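
With this change, callers choose which gateway repair method `repairCodeWithAI` uses. A minimal sketch of the two call sites (argument lists abbreviated; the test-repair call is assumed to live in the new test-repair.ts, which is not shown in this excerpt):

    // Build repair, as updated in build-repair.ts above:
    await repairCodeWithAI(evalID, gateway.repairBuild.bind(gateway), model, env, /* ... */);
    // Test repair, presumably invoked from the new repairAndTest path:
    await repairCodeWithAI(evalID, gateway.repairTest.bind(gateway), model, env, /* ... */);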
