Skip to content

Commit 4332b39

Browse files
committed
fixup! feat(runner): add support for running and repairing tests
1 parent b433915 commit 4332b39

File tree

9 files changed

+6
-78
lines changed

9 files changed

+6
-78
lines changed

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -289,14 +289,8 @@ <h2>Generated applications</h2>
289289
}
290290

291291
<!-- Test status badges -->
292-
@if (finalAttempt.testResult) {
293-
@if (finalAttempt.testResult.passed) {
294-
@if ((result.testRepairAttempts || 0) > 0) {
295-
<span class="status-badge warning">Tests passed after repair</span>
296-
}
297-
} @else {
298-
<span class="status-badge error">Tests failed</span>
299-
}
292+
@if (finalAttempt.testResult && !finalAttempt.testResult.passed) {
293+
<span class="status-badge error">Tests failed</span>
300294
}
301295
</div>
302296
</div>
@@ -379,9 +373,6 @@ <h4>Test Results</h4>
379373
<div class="test-summary">
380374
@if (result.testResult.passed) {
381375
<span class="status-text success">✔ Tests passed</span>
382-
@if ((result.testRepairAttempts || 0) > 0) {
383-
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
384-
}
385376
} @else {
386377
<span class="status-text error">✘ Tests failed</span>
387378
}

runner/configuration/constants.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,6 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
2626
*/
2727
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
2828

29-
/**
30-
* Number of times we'll try to ask LLM to repair a test failure,
31-
* providing the test output and the code that causes the problem.
32-
*/
33-
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
34-
3529
/** Name of the folder where we store all generated reports */
3630
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
3731

runner/orchestration/build-serve-loop.ts

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,13 @@ import {
88
RootPromptDefinition,
99
TestResult,
1010
} from '../shared-interfaces.js';
11-
import {
12-
DEFAULT_MAX_REPAIR_ATTEMPTS,
13-
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
14-
} from '../configuration/constants.js';
11+
import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
1512
import {ProgressLogger} from '../progress/progress-logger.js';
1613
import {runBuild} from './build-worker.js';
1714
import {repairAndBuild} from './build-repair.js';
1815
import {EvalID, Gateway} from './gateway.js';
1916
import {serveAndTestApp} from './serve-testing-worker.js';
2017
import {runTest} from './test-worker.js';
21-
import {repairAndTest} from './test-repair.js';
2218
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
2319

2420
/**
@@ -214,7 +210,6 @@ export async function attemptBuildAndTest(
214210

215211
// Run tests if test command is configured and build was successful
216212
let testResult: TestResult | null = null;
217-
let testRepairAttempts = 0;
218213

219214
if (lastAttempt.buildResult.status === BuildResultStatus.SUCCESS && 'testCommand' in env) {
220215
testResult = await runTest(
@@ -227,41 +222,7 @@ export async function attemptBuildAndTest(
227222
workerConcurrencyQueue,
228223
progress,
229224
);
230-
231-
const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
232-
? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS
233-
: 0;
234-
235225
lastAttempt.testResult = testResult;
236-
237-
while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
238-
testRepairAttempts++;
239-
progress.log(
240-
rootPromptDef,
241-
'test',
242-
`Trying to repair app tests (attempt #${testRepairAttempts + 1})`,
243-
);
244-
245-
const attempt = await repairAndTest(
246-
evalID,
247-
gateway,
248-
model,
249-
env,
250-
rootPromptDef,
251-
directory,
252-
lastAttempt.outputFiles,
253-
testResult.output,
254-
'The tests failed. Attempt to fix them. There are the following test errors:',
255-
contextFiles,
256-
abortSignal,
257-
workerConcurrencyQueue,
258-
testRepairAttempts,
259-
progress,
260-
);
261-
attemptDetails.push(attempt);
262-
lastAttempt = attempt;
263-
testResult = lastAttempt.testResult!;
264-
}
265226
}
266227

267228
return {
@@ -271,6 +232,5 @@ export async function attemptBuildAndTest(
271232
repairAttempts,
272233
axeRepairAttempts,
273234
testResult,
274-
testRepairAttempts,
275235
};
276236
}

runner/orchestration/generate.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,6 @@ async function startEvaluationTask(
459459
progress,
460460
autoraterModel,
461461
attempt.testResult,
462-
attempt.testRepairAttempts,
463462
);
464463

465464
results.push({
@@ -478,7 +477,6 @@ async function startEvaluationTask(
478477
axeRepairAttempts: attempt.axeRepairAttempts,
479478
toolLogs,
480479
testResult: attempt.testResult,
481-
testRepairAttempts: attempt.testRepairAttempts,
482480
} satisfies AssessmentResult);
483481
}
484482

runner/ratings/built-in-ratings/successful-tests-rating.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export const successfulTestsRating: PerBuildRating = {
99
category: RatingCategory.MEDIUM_IMPACT,
1010
scoreReduction: '30%',
1111
// Award full points when the tests pass and no points when they fail.
12-
rate: ({testResult, testRepairAttempts}) => {
12+
rate: ({testResult}) => {
1313
// If no test results are available, skip this rating
1414
if (!testResult) {
1515
return {
@@ -20,9 +20,7 @@ export const successfulTestsRating: PerBuildRating = {
2020

2121
return {
2222
state: RatingState.EXECUTED,
23-
coefficient: testResult.passed
24-
? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts
25-
: 0, // No points if tests failed
23+
coefficient: testResult.passed ? 1 : 0, // No points if tests failed
2624
};
2725
},
2826
};

runner/ratings/rate-code.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ export async function rateGeneratedCode(
5858
progress: ProgressLogger,
5959
autoraterModel: string,
6060
testResult: TestResult | null,
61-
testRepairAttempts: number,
6261
): Promise<CodeAssessmentScore> {
6362
let categorizedFiles: CategorizedFiles | null = null;
6463
let totalPoints = 0;
@@ -97,7 +96,6 @@ export async function rateGeneratedCode(
9796
serveTestingResult,
9897
repairAttempts,
9998
testResult,
100-
testRepairAttempts,
10199
outputFiles.length,
102100
axeRepairAttempts,
103101
ratingsResult,
@@ -179,7 +177,6 @@ function runPerBuildRating(
179177
serveResult: ServeTestingResult | null,
180178
repairAttempts: number,
181179
testResult: TestResult | null,
182-
testRepairAttempts: number,
183180
generatedFileCount: number,
184181
axeRepairAttempts: number,
185182
ratingsResult: RatingsResult,
@@ -192,7 +189,6 @@ function runPerBuildRating(
192189
axeRepairAttempts,
193190
ratingsResult,
194191
testResult,
195-
testRepairAttempts,
196192
});
197193

198194
// If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.

runner/ratings/rating-types.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ const perBuildRatingSchema = z
6666
serveResult: z.custom<ServeTestingResult | null>(),
6767
repairAttempts: z.number(),
6868
testResult: z.custom<TestResult | null>(),
69-
testRepairAttempts: z.number(),
7069
axeRepairAttempts: z.number(),
7170
generatedFileCount: z.number(),
7271
ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),

runner/ratings/stats.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,7 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
6666
// Calculate test statistics
6767
if (result.testResult) {
6868
if (result.testResult.passed) {
69-
if ((result.testRepairAttempts || 0) === 0) {
70-
successfulInitialTests++;
71-
} else {
72-
successfulTestsAfterRepair++;
73-
}
69+
successfulInitialTests++;
7470
} else {
7571
failedTests++;
7672
}

runner/shared-interfaces.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,6 @@ export interface AttemptDetails {
222222
reasoning?: string;
223223
/** Result of running tests for this attempt. */
224224
testResult?: TestResult;
225-
/** The number of repair attempts made for tests in this attempt. */
226-
testRepairAttempts?: number;
227225
}
228226

229227
/** Statistics related to the build process of the generated applications. */
@@ -436,8 +434,6 @@ export interface AssessmentResult {
436434
toolLogs?: ToolLogEntry[];
437435
/** Result of running unit tests. */
438436
testResult: TestResult | null;
439-
/** Number of repair attempts for tests. */
440-
testRepairAttempts?: number;
441437
}
442438

443439
/**

0 commit comments

Comments
 (0)