Skip to content

Commit cc3f79d

Browse files
unknown and claude committed
Fix test_maze assertion (9, not 13) and handle exit 127 in test runner
- Fix test_maze in both Python and TypeScript A* examples: the shortest path through the maze is 9 steps, not 13. All agents found the correct answer, but the test was wrong.
- Detect exit 127 (command not found) in the test runner and mark the result as skipped instead of failed — prevents false penalties in Copeland scoring.
- Hard-error in preflight when the test command doesn't exist (exit 127) instead of just warning — saves API tokens.
- Skip the tests criterion in Copeland pairwise comparison when both agents have skipped tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3494934 commit cc3f79d

File tree

6 files changed

+59
-14
lines changed

6 files changed

+59
-14
lines changed

examples/astar-python/test_pathfinding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def test_maze(self):
7676
self.assertEqual(result.path[0], (0, 0))
7777
self.assertEqual(result.path[-1], (4, 4))
7878
self.assertTrue(is_valid_path(grid, result.path))
79-
self.assertEqual(len(result.path), 13)
79+
self.assertEqual(len(result.path), 9)
8080

8181
def test_large_grid_performance(self):
8282
size = 50

examples/astar/tests/pathfinding.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ describe("A* Pathfinding", () => {
7878
assert.deepEqual(result.path[0], [0, 0]);
7979
assert.deepEqual(result.path[result.path.length - 1], [4, 4]);
8080
assert.ok(isValidPath(grid, result.path), "path must be valid");
81-
assert.equal(result.path.length, 13, "shortest maze path is 13");
81+
assert.equal(result.path.length, 9, "shortest maze path is 9");
8282
});
8383

8484
it("handles large grid efficiently", () => {

src/commands/run.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@ export async function retry(opts: RunOptions): Promise<void> {
173173
const repoRoot = await getRepoRoot();
174174
const testWarning = await preflightTestRun(opts.testCmd, repoRoot);
175175
if (testWarning) {
176+
if (testWarning.includes("exit 127")) {
177+
console.error(` ${testWarning}`);
178+
process.exit(1);
179+
}
176180
console.warn(` ⚠ ${testWarning}`);
177181
}
178182
}
@@ -338,8 +342,20 @@ export async function preflightTestRun(testCmd: string, repoRoot: string): Promi
338342
return null;
339343
} catch (err: unknown) {
340344
const e = err as { stdout?: string; stderr?: string; code?: number | string };
345+
const exitCode = typeof e.code === "number" ? e.code : 1;
341346
const output = ((e.stdout ?? "") + (e.stderr ?? "")).trim();
342347
const snippet = output.length > 200 ? `${output.slice(0, 200)}...` : output;
348+
349+
// Exit 127 = command not found. This is a setup error, not a test failure.
350+
// Hard-error: don't waste API tokens if the test command doesn't exist.
351+
if (exitCode === 127) {
352+
return (
353+
`Test command not found: "${testCmd}" (exit 127). ` +
354+
"Ensure the test script exists and is committed to your repository.\n" +
355+
(snippet ? ` Output: ${snippet}` : "")
356+
);
357+
}
358+
343359
return (
344360
`Test command "${testCmd}" failed on the current branch before spawning agents. ` +
345361
"Your test environment may already be broken.\n" +
@@ -379,6 +395,11 @@ export async function run(opts: RunOptions): Promise<void> {
379395
const repoRoot = await getRepoRoot();
380396
const testWarning = await preflightTestRun(opts.testCmd, repoRoot);
381397
if (testWarning) {
398+
// Exit 127 = command not found — hard error, don't waste API tokens
399+
if (testWarning.includes("exit 127")) {
400+
console.error(` ${testWarning}`);
401+
process.exit(1);
402+
}
382403
console.warn(` ⚠ ${testWarning}`);
383404
}
384405
}

src/scoring/convergence.ts

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,15 @@ export function copelandRecommend(
231231
// Pre-compute per-agent criteria values
232232
const agentData = completed.map((agent) => {
233233
const test = testResults.find((t) => t.agentId === agent.id);
234-
const testsPassed = test?.passed ? 1 : 0;
234+
// Skipped tests (exit 127 = command not found) are treated as neutral —
235+
// don't penalize agents when the test infrastructure itself is broken.
236+
const testsSkipped = (test as { skipped?: boolean } | undefined)?.skipped === true;
237+
const testsPassed = testsSkipped ? -1 : test?.passed ? 1 : 0;
235238
const group = convergence.find((g) => g.agents.includes(agent.id));
236239
const groupSize = group ? group.agents.length : 0;
237240
const { testFiles, nonTestFiles } = splitFilesByType(agent.filesChanged);
238241
const cappedTestFiles = effectiveTestFiles(testFiles, nonTestFiles);
239-
return { id: agent.id, testsPassed, groupSize, nonTestFiles, cappedTestFiles };
242+
return { id: agent.id, testsPassed, testsSkipped, groupSize, nonTestFiles, cappedTestFiles };
240243
});
241244

242245
// Initialize scores
@@ -262,14 +265,18 @@ export function copelandRecommend(
262265
let bWins = 0;
263266

264267
// Criterion 1: tests passed (more is better)
265-
if (a.testsPassed > b.testsPassed) {
266-
aWins++;
267-
scoreMap.get(a.id)!.testsWins++;
268-
scoreMap.get(b.id)!.testsWins--;
269-
} else if (b.testsPassed > a.testsPassed) {
270-
bWins++;
271-
scoreMap.get(b.id)!.testsWins++;
272-
scoreMap.get(a.id)!.testsWins--;
268+
// Skip this criterion entirely when both agents have skipped tests
269+
// (exit 127 = test command not found — not a code quality signal).
270+
if (!(a.testsSkipped && b.testsSkipped)) {
271+
if (a.testsPassed > b.testsPassed) {
272+
aWins++;
273+
scoreMap.get(a.id)!.testsWins++;
274+
scoreMap.get(b.id)!.testsWins--;
275+
} else if (b.testsPassed > a.testsPassed) {
276+
bWins++;
277+
scoreMap.get(b.id)!.testsWins++;
278+
scoreMap.get(a.id)!.testsWins--;
279+
}
273280
}
274281

275282
// Criterion 2: convergence group size (larger is better)

src/scoring/test-runner.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,26 @@ export async function runTests(
120120
};
121121
}
122122

123+
const exitCode = typeof e.code === "number" ? e.code : 1;
124+
const output = (e.stdout ?? "") + (e.stderr ?? "");
125+
126+
// Exit 127 = "command not found" — the test infrastructure is broken,
127+
// not the agent's code. Mark as skipped so scoring doesn't penalize.
128+
if (exitCode === 127) {
129+
return {
130+
agentId,
131+
passed: false,
132+
output: `Test command not found (exit 127). The test script may not exist in the agent's clone.\n${output}`,
133+
exitCode: 127,
134+
skipped: true,
135+
};
136+
}
137+
123138
return {
124139
agentId,
125140
passed: false,
126-
output: (e.stdout ?? "") + (e.stderr ?? ""),
127-
exitCode: typeof e.code === "number" ? e.code : 1,
141+
output,
142+
exitCode,
128143
};
129144
}
130145
}

src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ export interface TestResult {
3333
passed: boolean;
3434
output: string;
3535
exitCode: number;
36+
/** True when the test command itself couldn't execute (e.g. exit 127 — command not found). */
37+
skipped?: boolean;
3638
}
3739

3840
export interface ConvergenceGroup {

0 commit comments

Comments
 (0)