Fix test_maze assertion (9, not 13) and handle exit 127 in test runner

unknown · claude · unknown · commit cc3f79d8518c · 2026-03-29T23:00:16.000-07:00
- Fix test_maze in both Python and TypeScript A* examples: the shortest
  path through the maze is 9 steps, not 13. All agents found the correct
  answer but the test was wrong.
- Detect exit 127 (command not found) in test runner and mark as skipped
  instead of failed — prevents false penalties in Copeland scoring
- Hard-error in preflight when test command doesn't exist (exit 127)
  instead of just warning — saves API tokens
- Skip tests criterion in Copeland pairwise comparison when both agents
  have skipped tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/examples/astar-python/test_pathfinding.py b/examples/astar-python/test_pathfinding.py
@@ -76,7 +76,7 @@ def test_maze(self):
         self.assertEqual(result.path[0], (0, 0))
         self.assertEqual(result.path[-1], (4, 4))
         self.assertTrue(is_valid_path(grid, result.path))
-        self.assertEqual(len(result.path), 13)
+        self.assertEqual(len(result.path), 9)
 
     def test_large_grid_performance(self):
         size = 50
diff --git a/examples/astar/tests/pathfinding.test.ts b/examples/astar/tests/pathfinding.test.ts
@@ -78,7 +78,7 @@ describe("A* Pathfinding", () => {
     assert.deepEqual(result.path[0], [0, 0]);
     assert.deepEqual(result.path[result.path.length - 1], [4, 4]);
     assert.ok(isValidPath(grid, result.path), "path must be valid");
-    assert.equal(result.path.length, 13, "shortest maze path is 13");
+    assert.equal(result.path.length, 9, "shortest maze path is 9");
   });
 
   it("handles large grid efficiently", () => {
diff --git a/src/commands/run.ts b/src/commands/run.ts
@@ -173,6 +173,10 @@ export async function retry(opts: RunOptions): Promise<void> {
     const repoRoot = await getRepoRoot();
     const testWarning = await preflightTestRun(opts.testCmd, repoRoot);
     if (testWarning) {
+      if (testWarning.includes("exit 127")) {
+        console.error(`  ${testWarning}`);
+        process.exit(1);
+      }
       console.warn(`  ⚠ ${testWarning}`);
     }
   }
@@ -338,8 +342,20 @@ export async function preflightTestRun(testCmd: string, repoRoot: string): Promi
     return null;
   } catch (err: unknown) {
     const e = err as { stdout?: string; stderr?: string; code?: number | string };
+    const exitCode = typeof e.code === "number" ? e.code : 1;
     const output = ((e.stdout ?? "") + (e.stderr ?? "")).trim();
     const snippet = output.length > 200 ? `${output.slice(0, 200)}...` : output;
+
+    // Exit 127 = command not found. This is a setup error, not a test failure.
+    // Hard-error: don't waste API tokens if the test command doesn't exist.
+    if (exitCode === 127) {
+      return (
+        `Test command not found: "${testCmd}" (exit 127). ` +
+        "Ensure the test script exists and is committed to your repository.\n" +
+        (snippet ? `  Output: ${snippet}` : "")
+      );
+    }
+
     return (
       `Test command "${testCmd}" failed on the current branch before spawning agents. ` +
       "Your test environment may already be broken.\n" +
@@ -379,6 +395,11 @@ export async function run(opts: RunOptions): Promise<void> {
     const repoRoot = await getRepoRoot();
     const testWarning = await preflightTestRun(opts.testCmd, repoRoot);
     if (testWarning) {
+      // Exit 127 = command not found — hard error, don't waste API tokens
+      if (testWarning.includes("exit 127")) {
+        console.error(`  ${testWarning}`);
+        process.exit(1);
+      }
       console.warn(`  ⚠ ${testWarning}`);
     }
   }
diff --git a/src/scoring/convergence.ts b/src/scoring/convergence.ts
@@ -231,12 +231,15 @@ export function copelandRecommend(
   // Pre-compute per-agent criteria values
   const agentData = completed.map((agent) => {
     const test = testResults.find((t) => t.agentId === agent.id);
-    const testsPassed = test?.passed ? 1 : 0;
+    // Skipped tests (exit 127 = command not found) get a sentinel value of -1. The tests
+    // criterion is bypassed only when BOTH agents in a pair were skipped (see the pairwise loop).
+    const testsSkipped = (test as { skipped?: boolean } | undefined)?.skipped === true;
+    const testsPassed = testsSkipped ? -1 : test?.passed ? 1 : 0;
     const group = convergence.find((g) => g.agents.includes(agent.id));
     const groupSize = group ? group.agents.length : 0;
     const { testFiles, nonTestFiles } = splitFilesByType(agent.filesChanged);
     const cappedTestFiles = effectiveTestFiles(testFiles, nonTestFiles);
-    return { id: agent.id, testsPassed, groupSize, nonTestFiles, cappedTestFiles };
+    return { id: agent.id, testsPassed, testsSkipped, groupSize, nonTestFiles, cappedTestFiles };
   });
 
   // Initialize scores
@@ -262,14 +265,18 @@ export function copelandRecommend(
       let bWins = 0;
 
       // Criterion 1: tests passed (more is better)
-      if (a.testsPassed > b.testsPassed) {
-        aWins++;
-        scoreMap.get(a.id)!.testsWins++;
-        scoreMap.get(b.id)!.testsWins--;
-      } else if (b.testsPassed > a.testsPassed) {
-        bWins++;
-        scoreMap.get(b.id)!.testsWins++;
-        scoreMap.get(a.id)!.testsWins--;
+      // Skip this criterion entirely when both agents have skipped tests
+      // (exit 127 = test command not found — not a code quality signal).
+      if (!(a.testsSkipped && b.testsSkipped)) {
+        if (a.testsPassed > b.testsPassed) {
+          aWins++;
+          scoreMap.get(a.id)!.testsWins++;
+          scoreMap.get(b.id)!.testsWins--;
+        } else if (b.testsPassed > a.testsPassed) {
+          bWins++;
+          scoreMap.get(b.id)!.testsWins++;
+          scoreMap.get(a.id)!.testsWins--;
+        }
       }
 
       // Criterion 2: convergence group size (larger is better)
diff --git a/src/scoring/test-runner.ts b/src/scoring/test-runner.ts
@@ -120,11 +120,26 @@ export async function runTests(
       };
     }
 
+    const exitCode = typeof e.code === "number" ? e.code : 1;
+    const output = (e.stdout ?? "") + (e.stderr ?? "");
+
+    // Exit 127 = "command not found" — the test infrastructure is broken,
+    // not the agent's code. Mark as skipped so scoring doesn't penalize.
+    if (exitCode === 127) {
+      return {
+        agentId,
+        passed: false,
+        output: `Test command not found (exit 127). The test script may not exist in the agent's clone.\n${output}`,
+        exitCode: 127,
+        skipped: true,
+      };
+    }
+
     return {
       agentId,
       passed: false,
-      output: (e.stdout ?? "") + (e.stderr ?? ""),
-      exitCode: typeof e.code === "number" ? e.code : 1,
+      output,
+      exitCode,
     };
   }
 }
diff --git a/src/types.ts b/src/types.ts
@@ -33,6 +33,8 @@ export interface TestResult {
   passed: boolean;
   output: string;
   exitCode: number;
+  /** True when the test command itself couldn't execute (e.g. exit 127 — command not found). */
+  skipped?: boolean;
 }
 
 export interface ConvergenceGroup {

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,8 @@ export interface TestResult {`
`33`	`33`	`passed: boolean;`
`34`	`34`	`output: string;`
`35`	`35`	`exitCode: number;`
	`36`	`+ /** True when the test command itself couldn't execute (e.g. exit 127 — command not found). */`
	`37`	`+ skipped?: boolean;`
`36`	`38`	`}`
`37`	`39`
`38`	`40`	`export interface ConvergenceGroup {`