fix(pr-patrol): reduce compute waste with issue-type budgets and timeout abandonment

OAGr · claude · OAGr · commit 2990c70fa828 · 2026-03-06T17:52:18.000-08:00
PR Patrol was burning 90%+ of compute on failed attempts:
- Timeouts (30 min each) didn't count toward abandonment, causing infinite retries
- All issue types got the same 40-turn / 30-min budget regardless of complexity
- No early-exit guidance in prompts, so Claude kept trying unfixable issues
- Reflection used expensive Sonnet model for simple log analysis

Changes:
- Unify failure tracking: timeouts now count toward abandonment (2 failures = abandoned)
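- Add per-issue-type budgets: missing-issue-ref gets 5 turns/3 min, ci-failure gets 25 turns/15 min, etc.; a PR with multiple issues gets the largest applicable budget, capped by the global config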
- Add "when to stop early" section to prompts with clear unfixable-scenario detection
- CI failure prompt now explicitly lists human-required checks to skip immediately
- Reflection uses the Haiku model with a 5-min timeout instead of Sonnet with a 30-min timeout
- Include pr-patrol tests in vitest config
- Add computeBudget tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/crux/pr-patrol/index.test.ts b/crux/pr-patrol/index.test.ts
@@ -3,6 +3,7 @@ import {
   checkMergeEligibility,
   findMergeCandidates,
   detectIssues,
+  computeBudget,
   type GqlPrNode,
 } from './index.ts';
 
@@ -383,6 +384,46 @@ describe('findMergeCandidates', () => {
   });
 });
 
+// ── computeBudget ────────────────────────────────────────────────────────────
+
+describe('computeBudget', () => {
+  it('gives small budget for missing-issue-ref only', () => {
+    const budget = computeBudget(['missing-issue-ref']);
+    expect(budget.maxTurns).toBe(5);
+    expect(budget.timeoutMinutes).toBe(3);
+  });
+
+  it('gives small budget for missing-testplan only', () => {
+    const budget = computeBudget(['missing-testplan']);
+    expect(budget.maxTurns).toBe(8);
+    expect(budget.timeoutMinutes).toBe(5);
+  });
+
+  it('gives medium budget for ci-failure', () => {
+    const budget = computeBudget(['ci-failure']);
+    expect(budget.maxTurns).toBe(25);
+    expect(budget.timeoutMinutes).toBe(15);
+  });
+
+  it('gives full budget for conflict', () => {
+    const budget = computeBudget(['conflict']);
+    expect(budget.maxTurns).toBe(40);
+    expect(budget.timeoutMinutes).toBe(30);
+  });
+
+  it('uses highest budget when multiple issues present', () => {
+    const budget = computeBudget(['missing-issue-ref', 'ci-failure']);
+    expect(budget.maxTurns).toBe(25);
+    expect(budget.timeoutMinutes).toBe(15);
+  });
+
+  it('conflict dominates when mixed with smaller issues', () => {
+    const budget = computeBudget(['missing-testplan', 'conflict', 'missing-issue-ref']);
+    expect(budget.maxTurns).toBe(40);
+    expect(budget.timeoutMinutes).toBe(30);
+  });
+});
+
 // ── detectIssues (regression test after refactor) ────────────────────────────
 
 describe('detectIssues', () => {
diff --git a/crux/pr-patrol/index.ts b/crux/pr-patrol/index.ts
@@ -172,20 +172,27 @@ function markProcessed(key: number | string): void {
   );
 }
 
-function getMaxTurnsFailCount(key: number | string): number {
-  const file = join(STATE_DIR, `max-turns-${key}`);
-  if (!existsSync(file)) return 0;
-  return parseInt(readFileSync(file, 'utf-8').trim(), 10) || 0;
+function getFailCount(key: number | string): number {
+  // Check both new and legacy file names for backwards compat
+  const newFile = join(STATE_DIR, `failures-${key}`);
+  const legacyFile = join(STATE_DIR, `max-turns-${key}`);
+  if (existsSync(newFile)) {
+    return parseInt(readFileSync(newFile, 'utf-8').trim(), 10) || 0;
+  }
+  if (existsSync(legacyFile)) {
+    return parseInt(readFileSync(legacyFile, 'utf-8').trim(), 10) || 0;
+  }
+  return 0;
 }
 
-function recordMaxTurnsFailure(key: number | string): number {
-  const count = getMaxTurnsFailCount(key) + 1;
-  writeFileSync(join(STATE_DIR, `max-turns-${key}`), String(count));
+function recordFailure(key: number | string): number {
+  const count = getFailCount(key) + 1;
+  writeFileSync(join(STATE_DIR, `failures-${key}`), String(count));
   return count;
 }
 
 function isAbandoned(key: number | string): boolean {
-  return getMaxTurnsFailCount(key) >= 2;
+  return getFailCount(key) >= 2;
 }
 
 // ── Main Branch CI Check ────────────────────────────────────────────────────
@@ -321,21 +328,27 @@ async function fixMainBranch(status: MainBranchStatus, config: PatrolConfig): Pr
     const elapsedS = Math.floor((Date.now() - startTime) / 1000);
 
     if (result.timedOut) {
+      const failCount = recordFailure(MAIN_BRANCH_KEY);
       outcome = 'timeout';
-      reason = `Killed after ${config.timeoutMinutes}m timeout`;
-      log(`✗ Main branch fix timed out after ${config.timeoutMinutes}m`);
+      reason = `Killed after ${config.timeoutMinutes}m timeout — attempt ${failCount}`;
+      log(`✗ Main branch fix timed out after ${config.timeoutMinutes}m (attempt ${failCount})`);
+
+      if (failCount >= 2) {
+        reason = `Abandoned after ${failCount} failures (timeout)`;
+        log(`✗ Main branch fix abandoned after ${failCount} failures`);
+      }
     } else if (result.exitCode === 0 && !result.hitMaxTurns) {
       outcome = 'fixed';
       log(`✓ Main branch CI fix processed (${elapsedS}s)`);
     } else if (result.hitMaxTurns) {
-      const failCount = recordMaxTurnsFailure(MAIN_BRANCH_KEY);
+      const failCount = recordFailure(MAIN_BRANCH_KEY);
       outcome = 'max-turns';
       reason = `Hit max turns (${config.maxTurns}) — attempt ${failCount}`;
       log(`⚠ Main branch fix hit max turns after ${elapsedS}s`);
 
       if (failCount >= 2) {
-        reason = `Abandoned after ${failCount} max-turns failures`;
-        log(`✗ Main branch fix abandoned after ${failCount} max-turns failures`);
+        reason = `Abandoned after ${failCount} failures`;
+        log(`✗ Main branch fix abandoned after ${failCount} failures`);
       }
     } else {
       outcome = 'error';
@@ -703,6 +716,37 @@ const ISSUE_SCORES: Record<PrIssueType, number> = {
   'bot-review-nitpick': 15,
 };
 
+// ── Issue-type-specific resource limits ──────────────────────────────────────
+// Scale max-turns and timeout based on the hardest issue in a PR.
+// This prevents trivial issues from consuming the full 40-turn / 30-min budget.
+
+interface IssueBudget {
+  maxTurns: number;
+  timeoutMinutes: number;
+}
+
+const ISSUE_BUDGETS: Record<PrIssueType, IssueBudget> = {
+  conflict:            { maxTurns: 40, timeoutMinutes: 30 },
+  'ci-failure':        { maxTurns: 25, timeoutMinutes: 15 },
+  'bot-review-major':  { maxTurns: 25, timeoutMinutes: 15 },
+  'missing-issue-ref': { maxTurns: 5,  timeoutMinutes: 3 },
+  stale:               { maxTurns: 10, timeoutMinutes: 5 },
+  'missing-testplan':  { maxTurns: 8,  timeoutMinutes: 5 },
+  'bot-review-nitpick':{ maxTurns: 8,  timeoutMinutes: 5 },
+};
+
+/** Compute the budget for a PR based on its hardest issue. */
+export function computeBudget(issues: PrIssueType[]): IssueBudget {
+  let maxTurns = 5;
+  let timeoutMinutes = 3;
+  for (const issue of issues) {
+    const budget = ISSUE_BUDGETS[issue];
+    if (budget.maxTurns > maxTurns) maxTurns = budget.maxTurns;
+    if (budget.timeoutMinutes > timeoutMinutes) timeoutMinutes = budget.timeoutMinutes;
+  }
+  return { maxTurns, timeoutMinutes };
+}
+
 /** Pure function — computes priority score for a detected PR. */
 export function computeScore(pr: DetectedPr): number {
   let score = 0;
@@ -927,9 +971,12 @@ ${issues.join(', ')}
 ### CI Failure
 - Check CI status: gh pr checks ${num} --repo ${repo}
 - Read the failing check logs to understand the failure
-- Fix the issue (build error, test failure, lint error)
-- Run locally to verify: pnpm build and/or pnpm test
-- Commit and push the fix`);
+- **STOP IMMEDIATELY and report** if ANY of these apply:
+  - The check requires a human action (adding a label like \`rules-change-reviewed\`, manual approval, etc.)
+  - The failure is in a Vercel deployment or external service (not a code issue)
+  - The same check is also failing on the \`main\` branch (pre-existing, not caused by this PR)
+  - The failure is a permissions or authentication issue
+- If the failure IS a code issue you can fix: fix it, run locally to verify (pnpm build / pnpm test), commit and push`);
   }
 
   if (issues.includes('missing-testplan')) {
@@ -979,7 +1026,14 @@ ${issues.join(', ')}
 - Use git push --force-with-lease (never --force) when pushing rebased branches
 - Do not modify files unrelated to the fix
 - Do NOT run /agent-session-start or /agent-session-ready-PR — this is a targeted fix, not a full session
-- Do NOT create new branches — work on the existing PR branch`);
+- Do NOT create new branches — work on the existing PR branch
+
+## When to stop early
+- **If the issue requires human intervention** (adding labels, approvals, external service fixes): output a clear summary of why and stop immediately. Do not attempt workarounds.
+- **If the issue is pre-existing** (also failing on main, not introduced by this PR): state that and stop.
+- **If you've tried 2+ approaches and none worked**: stop and summarize what you tried. Do not keep cycling through the same strategies.
+- **If the fix is "no action needed"** (e.g., no matching issue exists for missing-issue-ref): say so and stop. Not every detected issue requires a code change.
+- Stopping early with a clear explanation is BETTER than burning through all turns without progress.`);
 
   return sections.join('\n');
 }
@@ -1108,20 +1162,48 @@ async function fixPr(pr: ScoredPr, config: PatrolConfig): Promise<void> {
 
   await claimPr(pr.number, config.repo);
 
+  // Compute issue-specific budget (capped by global config)
+  const budget = computeBudget(pr.issues);
+  const effectiveMaxTurns = Math.min(budget.maxTurns, config.maxTurns);
+  const effectiveTimeout = Math.min(budget.timeoutMinutes, config.timeoutMinutes);
+
+  log(`  Budget: ${effectiveMaxTurns} max-turns, ${effectiveTimeout}m timeout (based on: ${pr.issues.join(', ')})`);
+
   const prompt = buildPrompt(pr, config.repo);
   const startTime = Date.now();
 
   let outcome: FixOutcome = 'fixed';
   let reason = '';
 
   try {
-    const result = await spawnClaude(prompt, config);
+    const result = await spawnClaude(prompt, {
+      ...config,
+      maxTurns: effectiveMaxTurns,
+      timeoutMinutes: effectiveTimeout,
+    });
     const elapsedS = Math.floor((Date.now() - startTime) / 1000);
 
     if (result.timedOut) {
+      // Timeouts count toward abandonment — a PR that times out repeatedly
+      // is likely unfixable and should not keep burning compute.
+      const failCount = recordFailure(pr.number);
       outcome = 'timeout';
-      reason = `Killed after ${config.timeoutMinutes}m timeout`;
-      log(`✗ PR #${pr.number} timed out after ${config.timeoutMinutes}m`);
+      reason = `Killed after ${effectiveTimeout}m timeout — attempt ${failCount}`;
+      log(`✗ PR #${pr.number} timed out after ${effectiveTimeout}m (attempt ${failCount})`);
+
+      if (failCount >= 2) {
+        reason = `Abandoned after ${failCount} failures (timeout)`;
+        log(`✗ PR #${pr.number} abandoned after ${failCount} consecutive failures`);
+        await githubApi(
+          `/repos/${config.repo}/issues/${pr.number}/comments`,
+          {
+            method: 'POST',
+            body: {
+              body: `🤖 **PR Patrol**: Abandoning automatic fix after ${failCount} failed attempts (timed out each time).\n\n**Issues detected**: ${pr.issues.join(', ')}\n**Last attempt**: ${elapsedS}s, ${effectiveTimeout}m timeout\n\nThis PR likely needs human intervention to resolve.`,
+            },
+          },
+        ).catch(() => log('  Warning: could not post abandonment comment'));
+      }
     } else if (result.exitCode === 0 && !result.hitMaxTurns) {
       log(`✓ PR #${pr.number} processed successfully (${elapsedS}s)`);
       outcome = 'fixed';
@@ -1132,27 +1214,27 @@ async function fixPr(pr: ScoredPr, config: PatrolConfig): Promise<void> {
         await githubApi(`/repos/${config.repo}/issues/${pr.number}/comments`, {
           method: 'POST',
           body: {
-            body: `🤖 **PR Patrol** ran for ${elapsedS}s (${config.maxTurns} max turns, model: ${config.model}).\n\n**Issues detected**: ${pr.issues.join(', ')}\n\n**Result**:\n${summary}`,
+            body: `🤖 **PR Patrol** ran for ${elapsedS}s (${effectiveMaxTurns} max turns, model: ${config.model}).\n\n**Issues detected**: ${pr.issues.join(', ')}\n\n**Result**:\n${summary}`,
           },
         }).catch(() => log('  Warning: could not post summary comment'));
       }
     } else if (result.hitMaxTurns) {
-      const failCount = recordMaxTurnsFailure(pr.number);
+      const failCount = recordFailure(pr.number);
       outcome = 'max-turns';
-      reason = `Hit max turns (${config.maxTurns}) — attempt ${failCount}`;
-      log(`⚠ PR #${pr.number} hit max turns after ${elapsedS}s`);
+      reason = `Hit max turns (${effectiveMaxTurns}) — attempt ${failCount}`;
+      log(`⚠ PR #${pr.number} hit max turns after ${elapsedS}s (attempt ${failCount})`);
 
       if (failCount >= 2) {
-        reason = `Abandoned after ${failCount} max-turns failures`;
+        reason = `Abandoned after ${failCount} failures`;
         log(
-          `✗ PR #${pr.number} abandoned after ${failCount} max-turns failures`,
+          `✗ PR #${pr.number} abandoned after ${failCount} consecutive failures`,
         );
         await githubApi(
           `/repos/${config.repo}/issues/${pr.number}/comments`,
           {
             method: 'POST',
             body: {
-              body: `🤖 **PR Patrol**: Abandoning automatic fix after ${failCount} failed attempts (hit max turns each time).\n\n**Issues detected**: ${pr.issues.join(', ')}\n**Last attempt**: ${elapsedS}s, ${config.maxTurns} turns\n\nThis PR likely needs human intervention to resolve.`,
+              body: `🤖 **PR Patrol**: Abandoning automatic fix after ${failCount} failed attempts.\n\n**Issues detected**: ${pr.issues.join(', ')}\n**Last attempt**: ${elapsedS}s, ${effectiveMaxTurns} turns\n\nThis PR likely needs human intervention to resolve.`,
             },
           },
         ).catch(() => log('  Warning: could not post abandonment comment'));
@@ -1246,6 +1328,8 @@ ${recentEntries}
     const result = await spawnClaude(prompt, {
       ...config,
       maxTurns: 10, // Reflection needs fewer turns
+      model: 'haiku', // Reflection is log analysis — doesn't need sonnet
+      timeoutMinutes: 5, // Should complete quickly
     });
     const elapsedS = Math.floor((Date.now() - startTime) / 1000);
     const filedIssue = /Created issue #|created.*#\d/.test(result.output);
diff --git a/crux/vitest.config.ts b/crux/vitest.config.ts
@@ -20,6 +20,7 @@ export default defineConfig({
       'wiki-server/**/*.test.ts',
       'evals/**/*.test.ts',
       'health/**/*.test.ts',
+      'pr-patrol/**/*.test.ts',
     ],
     exclude: [
       'claims/archive/**',