fixup! feat(runner): add support for running and repairing tests

atscott · atscott · commit 5419d022eaa4 · 2025-10-01T13:37:45.000-07:00
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -289,14 +289,8 @@ <h2>Generated applications</h2>
                 }
 
                 <!-- Test status badges -->
-                @if (finalAttempt.testResult) {
-                  @if (finalAttempt.testResult.passed) {
-                    @if ((result.testRepairAttempts || 0) > 0) {
-                      <span class="status-badge warning">Tests passed after repair</span>
-                    }
-                  } @else {
-                    <span class="status-badge error">Tests failed</span>
-                  }
+                @if (finalAttempt.testResult && !finalAttempt.testResult.passed) {
+                  <span class="status-badge error">Tests failed</span>
                 }
               </div>
             </div>
@@ -379,9 +373,6 @@ <h4>Test Results</h4>
                   <div class="test-summary">
                     @if (result.testResult.passed) {
                       <span class="status-text success">✔ Tests passed</span>
-                      @if ((result.testRepairAttempts || 0) > 0) {
-                        <span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
-                      }
                     } @else {
                       <span class="status-text error">✘ Tests failed</span>
                     }
diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts
@@ -26,12 +26,6 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
  */
 export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
 
-/**
- * Number of times we'll try to ask LLM to repair a test failure,
- * providing the test output and the code that causes the problem.
- */
-export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
-
 /** Name of the folder where we store all generated reports */
 export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
 
diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts
@@ -10,15 +10,13 @@ import {
 } from '../shared-interfaces.js';
 import {
   DEFAULT_MAX_REPAIR_ATTEMPTS,
-  DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
 } from '../configuration/constants.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {runBuild} from './build-worker.js';
 import {repairAndBuild} from './build-repair.js';
 import {EvalID, Gateway} from './gateway.js';
 import {serveAndTestApp} from './serve-testing-worker.js';
 import {runTest} from './test-worker.js';
-import {repairAndTest} from './test-repair.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
 
 /**
@@ -214,7 +212,6 @@ export async function attemptBuildAndTest(
 
   // Run tests if test command is configured and build was successful
   let testResult: TestResult | null = null;
-  let testRepairAttempts = 0;
 
   if (lastAttempt.buildResult.status === BuildResultStatus.SUCCESS && 'testCommand' in env) {
     testResult = await runTest(
@@ -227,41 +224,7 @@ export async function attemptBuildAndTest(
       workerConcurrencyQueue,
       progress,
     );
-
-    const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
-      ? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS
-      : 0;
-
     lastAttempt.testResult = testResult;
-
-    while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
-      testRepairAttempts++;
-      progress.log(
-        rootPromptDef,
-        'test',
-        `Trying to repair app tests (attempt #${testRepairAttempts + 1})`,
-      );
-
-      const attempt = await repairAndTest(
-        evalID,
-        gateway,
-        model,
-        env,
-        rootPromptDef,
-        directory,
-        lastAttempt.outputFiles,
-        testResult.output,
-        'The tests failed. Attempt to fix them. There are the following test errors:',
-        contextFiles,
-        abortSignal,
-        workerConcurrencyQueue,
-        testRepairAttempts,
-        progress,
-      );
-      attemptDetails.push(attempt);
-      lastAttempt = attempt;
-      testResult = lastAttempt.testResult!;
-    }
   }
 
   return {
@@ -271,6 +234,5 @@ export async function attemptBuildAndTest(
     repairAttempts,
     axeRepairAttempts,
     testResult,
-    testRepairAttempts,
   };
 }
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
@@ -459,7 +459,6 @@ async function startEvaluationTask(
       progress,
       autoraterModel,
       attempt.testResult,
-      attempt.testRepairAttempts,
     );
 
     results.push({
@@ -478,7 +477,6 @@ async function startEvaluationTask(
       axeRepairAttempts: attempt.axeRepairAttempts,
       toolLogs,
       testResult: attempt.testResult,
-      testRepairAttempts: attempt.testRepairAttempts,
     } satisfies AssessmentResult);
   }
 
diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts
@@ -9,7 +9,7 @@ export const successfulTestsRating: PerBuildRating = {
   category: RatingCategory.MEDIUM_IMPACT,
   scoreReduction: '30%',
-  // Reduce the amount of points in case we've had test repair attempts.
-  rate: ({testResult, testRepairAttempts}) => {
+  // Full points if the tests passed, no points if they failed.
+  rate: ({testResult}) => {
     // If no test results are available, skip this rating
     if (!testResult) {
       return {
@@ -21,7 +21,7 @@ export const successfulTestsRating: PerBuildRating = {
     return {
       state: RatingState.EXECUTED,
       coefficient: testResult.passed
-        ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts
+        ? 1
         : 0, // No points if tests failed
     };
   },
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
@@ -58,7 +58,6 @@ export async function rateGeneratedCode(
   progress: ProgressLogger,
   autoraterModel: string,
   testResult: TestResult | null,
-  testRepairAttempts: number,
 ): Promise<CodeAssessmentScore> {
   let categorizedFiles: CategorizedFiles | null = null;
   let totalPoints = 0;
@@ -97,7 +96,6 @@ export async function rateGeneratedCode(
           serveTestingResult,
           repairAttempts,
           testResult,
-          testRepairAttempts,
           outputFiles.length,
           axeRepairAttempts,
           ratingsResult,
@@ -179,7 +177,6 @@ function runPerBuildRating(
   serveResult: ServeTestingResult | null,
   repairAttempts: number,
   testResult: TestResult | null,
-  testRepairAttempts: number,
   generatedFileCount: number,
   axeRepairAttempts: number,
   ratingsResult: RatingsResult,
@@ -192,7 +189,6 @@ function runPerBuildRating(
     axeRepairAttempts,
     ratingsResult,
     testResult,
-    testRepairAttempts,
   });
 
   // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment.
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
@@ -66,7 +66,6 @@ const perBuildRatingSchema = z
           serveResult: z.custom<ServeTestingResult | null>(),
           repairAttempts: z.number(),
           testResult: z.custom<TestResult | null>(),
-          testRepairAttempts: z.number(),
           axeRepairAttempts: z.number(),
           generatedFileCount: z.number(),
           ratingsResult: z.record(z.custom<IndividualAssessment | SkippedIndividualAssessment>()),
diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts
@@ -66,11 +66,7 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
     // Calculate test statistics
     if (result.testResult) {
       if (result.testResult.passed) {
-        if ((result.testRepairAttempts || 0) === 0) {
-          successfulInitialTests++;
-        } else {
-          successfulTestsAfterRepair++;
-        }
+        successfulInitialTests++;
       } else {
         failedTests++;
       }
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
@@ -222,8 +222,6 @@ export interface AttemptDetails {
   reasoning?: string;
   /** Result of running tests for this attempt. */
   testResult?: TestResult;
-  /** The number of repair attempts made for tests in this attempt. */
-  testRepairAttempts?: number;
 }
 
 /** Statistics related to the build process of the generated applications. */
@@ -436,8 +434,6 @@ export interface AssessmentResult {
   toolLogs?: ToolLogEntry[];
   /** Result of running unit tests. */
   testResult: TestResult | null;
-  /** Number of repair attempts for tests. */
-  testRepairAttempts?: number;
 }
 
 /**