Skip to content

Commit 5419d02

Browse files
committed
fixup! feat(runner): add support for running and repairing tests
1 parent b433915 commit 5419d02

File tree

9 files changed, with 5 additions and 73 deletions.

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 2 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -289,14 +289,8 @@ <h2>Generated applications</h2>
289289
}
290290

291291
<!-- Test status badges -->
292-
@if (finalAttempt.testResult) {
293-
@if (finalAttempt.testResult.passed) {
294-
@if ((result.testRepairAttempts || 0) > 0) {
295-
<span class="status-badge warning">Tests passed after repair</span>
296-
}
297-
} @else {
298-
<span class="status-badge error">Tests failed</span>
299-
}
292+
@if (finalAttempt.testResult && !finalAttempt.testResult.passed) {
293+
<span class="status-badge error">Tests failed</span>
300294
}
301295
</div>
302296
</div>
@@ -379,9 +373,6 @@ <h4>Test Results</h4>
379373
<div class="test-summary">
380374
@if (result.testResult.passed) {
381375
<span class="status-text success">✔ Tests passed</span>
382-
@if ((result.testRepairAttempts || 0) > 0) {
383-
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
384-
}
385376
} @else {
386377
<span class="status-text error">✘ Tests failed</span>
387378
}

runner/configuration/constants.ts

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -26,12 +26,6 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
2626
*/
2727
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
2828

29-
/**
30-
* Number of times we'll try to ask LLM to repair a test failure,
31-
* providing the test output and the code that causes the problem.
32-
*/
33-
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
34-
3529
/** Name of the folder where we store all generated reports */
3630
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
3731

runner/orchestration/build-serve-loop.ts

Lines changed: 0 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -10,15 +10,13 @@ import {
1010
} from '../shared-interfaces.js';
1111
import {
1212
DEFAULT_MAX_REPAIR_ATTEMPTS,
13-
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
1413
} from '../configuration/constants.js';
1514
import {ProgressLogger} from '../progress/progress-logger.js';
1615
import {runBuild} from './build-worker.js';
1716
import {repairAndBuild} from './build-repair.js';
1817
import {EvalID, Gateway} from './gateway.js';
1918
import {serveAndTestApp} from './serve-testing-worker.js';
2019
import {runTest} from './test-worker.js';
21-
import {repairAndTest} from './test-repair.js';
2220
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
2321

2422
/**
@@ -214,7 +212,6 @@ export async function attemptBuildAndTest(
214212

215213
// Run tests if test command is configured and build was successful
216214
let testResult: TestResult | null = null;
217-
let testRepairAttempts = 0;
218215

219216
if (lastAttempt.buildResult.status === BuildResultStatus.SUCCESS && 'testCommand' in env) {
220217
testResult = await runTest(
@@ -227,41 +224,7 @@ export async function attemptBuildAndTest(
227224
workerConcurrencyQueue,
228225
progress,
229226
);
230-
231-
const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
232-
? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS
233-
: 0;
234-
235227
lastAttempt.testResult = testResult;
236-
237-
while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
238-
testRepairAttempts++;
239-
progress.log(
240-
rootPromptDef,
241-
'test',
242-
`Trying to repair app tests (attempt #${testRepairAttempts + 1})`,
243-
);
244-
245-
const attempt = await repairAndTest(
246-
evalID,
247-
gateway,
248-
model,
249-
env,
250-
rootPromptDef,
251-
directory,
252-
lastAttempt.outputFiles,
253-
testResult.output,
254-
'The tests failed. Attempt to fix them. There are the following test errors:',
255-
contextFiles,
256-
abortSignal,
257-
workerConcurrencyQueue,
258-
testRepairAttempts,
259-
progress,
260-
);
261-
attemptDetails.push(attempt);
262-
lastAttempt = attempt;
263-
testResult = lastAttempt.testResult!;
264-
}
265228
}
266229

267230
return {
@@ -271,6 +234,5 @@ export async function attemptBuildAndTest(
271234
repairAttempts,
272235
axeRepairAttempts,
273236
testResult,
274-
testRepairAttempts,
275237
};
276238
}

runner/orchestration/generate.ts

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -459,7 +459,6 @@ async function startEvaluationTask(
459459
progress,
460460
autoraterModel,
461461
attempt.testResult,
462-
attempt.testRepairAttempts,
463462
);
464463

465464
results.push({
@@ -478,7 +477,6 @@ async function startEvaluationTask(
478477
axeRepairAttempts: attempt.axeRepairAttempts,
479478
toolLogs,
480479
testResult: attempt.testResult,
481-
testRepairAttempts: attempt.testRepairAttempts,
482480
} satisfies AssessmentResult);
483481
}
484482

runner/ratings/built-in-ratings/successful-tests-rating.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ export const successfulTestsRating: PerBuildRating = {
99
category: RatingCategory.MEDIUM_IMPACT,
1010
scoreReduction: '30%',
1111
// Reduce the amount of points in case we've had test repair attempts.
12-
rate: ({testResult, testRepairAttempts}) => {
12+
rate: ({testResult}) => {
1313
// If no test results are available, skip this rating
1414
if (!testResult) {
1515
return {
@@ -21,7 +21,7 @@ export const successfulTestsRating: PerBuildRating = {
2121
return {
2222
state: RatingState.EXECUTED,
2323
coefficient: testResult.passed
24-
? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts
24+
? 1
2525
: 0, // No points if tests failed
2626
};
2727
},

runner/ratings/rate-code.ts

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,6 @@ export async function rateGeneratedCode(
5858
progress: ProgressLogger,
5959
autoraterModel: string,
6060
testResult: TestResult | null,
61-
testRepairAttempts: number,
6261
): Promise<CodeAssessmentScore> {
6362
let categorizedFiles: CategorizedFiles | null = null;
6463
let totalPoints = 0;
@@ -97,7 +96,6 @@ export async function rateGeneratedCode(
9796
serveTestingResult,
9897
repairAttempts,
9998
testResult,
100-
testRepairAttempts,
10199
outputFiles.length,
102100
axeRepairAttempts,
103101
ratingsResult,
@@ -179,7 +177,6 @@ function runPerBuildRating(
179177
serveResult: ServeTestingResult | null,
180178
repairAttempts: number,
181179
testResult: TestResult | null,
182-
testRepairAttempts: number,
183180
generatedFileCount: number,
184181
axeRepairAttempts: number,
185182
ratingsResult: RatingsResult,
@@ -192,7 +189,6 @@ function runPerBuildRating(
192189
axeRepairAttempts,
193190
ratingsResult,
194191
testResult,
195-
testRepairAttempts,
196192
});
197193

198194
// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.

runner/ratings/rating-types.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,6 @@ const perBuildRatingSchema = z
6666
serveResult: z.custom<ServeTestingResult | null>(),
6767
repairAttempts: z.number(),
6868
testResult: z.custom<TestResult | null>(),
69-
testRepairAttempts: z.number(),
7069
axeRepairAttempts: z.number(),
7170
generatedFileCount: z.number(),
7271
ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),

runner/ratings/stats.ts

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -66,11 +66,7 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
6666
// Calculate test statistics
6767
if (result.testResult) {
6868
if (result.testResult.passed) {
69-
if ((result.testRepairAttempts || 0) === 0) {
70-
successfulInitialTests++;
71-
} else {
72-
successfulTestsAfterRepair++;
73-
}
69+
successfulInitialTests++;
7470
} else {
7571
failedTests++;
7672
}

runner/shared-interfaces.ts

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -222,8 +222,6 @@ export interface AttemptDetails {
222222
reasoning?: string;
223223
/** Result of running tests for this attempt. */
224224
testResult?: TestResult;
225-
/** The number of repair attempts made for tests in this attempt. */
226-
testRepairAttempts?: number;
227225
}
228226

229227
/** Statistics related to the build process of the generated applications. */
@@ -436,8 +434,6 @@ export interface AssessmentResult {
436434
toolLogs?: ToolLogEntry[];
437435
/** Result of running unit tests. */
438436
testResult: TestResult | null;
439-
/** Number of repair attempts for tests. */
440-
testRepairAttempts?: number;
441437
}
442438

443439
/**

0 commit comments

Comments
 (0)