fix: harden live contracts and resume semantics

Her-xanadu · sisyphus-dev-ai · Her-xanadu · commit 29964578ec97 · 2026-03-16T00:30:25.000+08:00
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
diff --git a/scripts/innovation_loop.py b/scripts/innovation_loop.py
@@ -34,6 +34,7 @@
     ensure_repo_bootstrap_for_dvc,
     ensure_controller_not_running,
     load_goal,
+    load_pending_result,
     load_yaml_like,
     opencode_agent_model,
     proposal_round_path,
@@ -498,6 +499,7 @@ def build_live_proposal_prompt(
     cooldowns: dict,
     result_packet: dict,
     research_context: dict,
+    primary_proposal: dict | None = None,
 ) -> str:
     if os.environ.get("INNOVATION_LOOP_LIVE_TEST_MODE") == "1":
         scripted_choice = "objective" if agent == "Apollo" else "architecture"
@@ -520,7 +522,7 @@ def build_live_proposal_prompt(
                 "minimal_ablation": ["revert the single scripted change"],
                 "paper_grounding": grounding,
                 "redirect_if_underperforming": "切换到正交机制轴并停止重复当前主路线",
-                "causal_metric_path": "若该动作有效，中间稳定性指标应先改善，再传导到目标指标。",
+                "causal_metric_path": ["intermediate_stability", "target_metric"],
                 "failure_signature": "若中间指标不变而目标指标波动，则说明当前机制解释站不住。",
                 "pivot_after_failure": "切换到正交机制轴并停止重复当前主路线",
             },
@@ -547,6 +549,13 @@ def build_live_proposal_prompt(
         "research_context": prompt_ready_research_context(research_context, agent),
         "paper_grounding_seed": build_paper_grounding(research_context, agent),
     }
+    if primary_proposal is not None:
+        context["apollo_proposal"] = {
+            "choice": primary_proposal.get("choice"),
+            "family": primary_proposal.get("family"),
+            "mechanism": primary_proposal.get("mechanism"),
+            "paper_grounding": primary_proposal.get("paper_grounding", []),
+        }
     return f"""
 Return exactly one JSON object and nothing else.
 
@@ -573,6 +582,7 @@ def build_live_proposal_prompt(
 - avoid cooldown families when possible
 - keep the reply short
 - paper_grounding must contain at least two unique paper_id values from the evidence pack
+- if Apollo proposal is provided, your family and mechanism axis must be materially orthogonal to Apollo unless no meaningful divergence exists
 
 Context: {json.dumps(context, ensure_ascii=False)}
 """.strip()
@@ -584,8 +594,8 @@ def build_guard_prompt(
     if os.environ.get("INNOVATION_LOOP_LIVE_TEST_MODE") == "1":
         return 'Return exactly {"verdict":"approve","validity_risks":[],"smallest_repair":"","single_change_ok":true,"paper_support_ok":true,"redirect_if_underperforming":"切换到正交机制轴并停止重复当前主路线","failure_signature":"若中间指标不变而目标指标波动，则说明当前机制解释站不住。"}.'
     context = {
-        "primary_choice": primary.get("choice"),
-        "backup_choice": backup.get("choice") if backup else None,
+        "primary_proposal": primary,
+        "backup_proposal": backup,
         "research_context": prompt_ready_research_context(research_context, "Athena"),
         "primary_grounding": primary.get("paper_grounding", []),
         "latest_redirect_hint": primary.get("redirect_if_underperforming"),
@@ -644,6 +654,8 @@ def materialize_live_choice(
     template["causal_metric_path"] = raw.get("causal_metric_path") or dict(
         research_context.get("innovation_briefs", {})
     ).get(role.lower(), {}).get("falsifiable_prediction")
+    if isinstance(template["causal_metric_path"], str):
+        template["causal_metric_path"] = [template["causal_metric_path"]]
     template["failure_signature"] = raw.get("failure_signature") or first_guardrail(
         research_context
     )
@@ -692,6 +704,7 @@ def collect_live_round_proposals(
             cooldowns,
             result_packet,
             research_context,
+            primary_proposal=exploit_raw,
         ),
     )
     guard = run_opencode_agent(
@@ -836,14 +849,38 @@ def tick(config_path: pathlib.Path, workspace: pathlib.Path, mode: str) -> dict:
             )
             return {"phase": "poll", "poll": polled}
         if polled["status"] == "failed":
-            session["active_run_id"] = active
+            judged = run_python(
+                "judge_result.py",
+                "--config",
+                str(config_path),
+                "--workspace",
+                str(workspace),
+                "--run-id",
+                active,
+                "--monitor-state",
+                polled["status"],
+                cwd=workspace,
+            )
+            research_context = {
+                "config_path": str(research_config_path(workspace)),
+                "config": load_research_config(research_config_path(workspace)),
+            }
+            record_research_feedback(workspace, research_context, judged)
+            session = load_session(session_file)
+            session["last_failed_task"] = active
+            session["active_dvc_task"] = None
             set_session_stage(session, "crash_recoverable", f"dvc task {active} failed")
             save_session(session_file, session)
             write_json(
                 status_file,
-                {"phase": "failed", "updated_at": now_iso(), "poll": polled},
+                {
+                    "phase": "failed",
+                    "updated_at": now_iso(),
+                    "poll": polled,
+                    "judge": judged,
+                },
             )
-            return {"phase": "failed", "poll": polled}
+            return {"phase": "failed", "poll": polled, "judge": judged}
         judged = run_python(
             "judge_result.py",
             "--config",
@@ -1071,6 +1108,7 @@ def main() -> None:
         "tick",
         "status",
         "resume",
+        "branch-from-checkpoint",
         "stop",
         "_run-controller",
     ]:
@@ -1138,7 +1176,7 @@ def main() -> None:
         emit_json(status)
         return
 
-    if args.command == "resume":
+    if args.command in {"resume", "branch-from-checkpoint"}:
         session = load_session(session_path(workspace))
         checkpoint = pathlib.Path(
             workspace / "experiments" / "recovery_checkpoint.json"
@@ -1151,22 +1189,37 @@ def main() -> None:
         if not payload or not payload.get("checkpoint_path"):
             emit_json({"resumed": False, "reason": "no_checkpoint"})
             return
-        round_selection = select_round_mutation(
-            workspace,
-            load_goal(config_path),
-            int(session.get("iteration_count", 0)) + 1,
-            args.mode,
-            collect_round_research_context(
+        if args.command == "resume":
+            failed_task = str(
+                session.get("last_failed_task") or payload.get("run_id") or ""
+            )
+            mutation = (
+                load_pending_result(workspace, failed_task) if failed_task else {}
+            )
+            if not mutation:
+                emit_json({"resumed": False, "reason": "no_failed_pending_result"})
+                return
+        else:
+            round_selection = select_round_mutation(
                 workspace,
                 load_goal(config_path),
                 int(session.get("iteration_count", 0)) + 1,
-            ),
-        )
-        mutation = round_selection.get("mutation", round_selection)
+                args.mode,
+                collect_round_research_context(
+                    workspace,
+                    load_goal(config_path),
+                    int(session.get("iteration_count", 0)) + 1,
+                ),
+            )
+            mutation = round_selection.get("mutation", round_selection)
         if mutation.get("review_blocked"):
             emit_json({"resumed": False, "reason": "review_blocked"})
             return
-        run_id = f"resume-{int(session.get('iteration_count', 0)) + 1:04d}"
+        run_id = (
+            f"resume-{int(session.get('iteration_count', 0)) + 1:04d}"
+            if args.command == "resume"
+            else f"branch-{int(session.get('iteration_count', 0)) + 1:04d}"
+        )
         candidate = run_python(
             "run_candidate.py",
             "--config",
@@ -1190,6 +1243,7 @@ def main() -> None:
         emit_json(
             {
                 "resumed": True,
+                "mode": args.command,
                 "candidate": candidate,
                 "resume_from": payload["checkpoint_path"],
             }
diff --git a/scripts/kb/build_index.py b/scripts/kb/build_index.py
@@ -54,6 +54,7 @@ def build_indexes(
     *,
     scaffold_missing: bool,
     extract_claims: bool,
+    overwrite_claims: bool,
     config: Dict[str, Any],
 ) -> Dict[str, Any]:
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -73,7 +74,9 @@ def build_indexes(
         if scaffold_missing:
             scaffold_figure_note(paths, meta)
         missing_fields = validate_meta(meta)
-        if extract_claims or not paths.claims.exists():
+        if (extract_claims and (overwrite_claims or not paths.claims.exists())) or (
+            not extract_claims and not paths.claims.exists()
+        ):
             claims = extract_claims_from_markdown(paths, meta)
             write_claims(paths.claims, claims)
         else:
@@ -125,6 +128,7 @@ def main() -> None:
     parser.add_argument("--config")
     parser.add_argument("--scaffold-missing", action="store_true")
     parser.add_argument("--extract-claims", action="store_true")
+    parser.add_argument("--overwrite-claims", action="store_true")
     args = parser.parse_args()
 
     workspace_root = resolve_workspace_root(args.workspace_root)
@@ -144,6 +148,7 @@ def main() -> None:
             output_dir,
             scaffold_missing=args.scaffold_missing,
             extract_claims=args.extract_claims,
+            overwrite_claims=args.overwrite_claims,
             config=config,
         )
     )
diff --git a/scripts/kb/daily_tracker_lite.py b/scripts/kb/daily_tracker_lite.py
@@ -52,6 +52,7 @@ def main() -> None:
         index_output_dir(workspace_root, config),
         scaffold_missing=True,
         extract_claims=False,
+        overwrite_claims=False,
         config=config,
     )
     emit_json(
diff --git a/scripts/kb/retrieve_papers.py b/scripts/kb/retrieve_papers.py
@@ -464,6 +464,16 @@ def main() -> None:
                 str(goal.get("goal_text") or ""),
                 str(goal.get("target_metric") or ""),
                 str(best.get("family") or ""),
+                " ".join(
+                    str(item.get("failure_signature") or "")
+                    for item in attempts[-5:]
+                    if isinstance(item, dict)
+                ),
+                " ".join(
+                    str(item.get("reject_reason") or "")
+                    for item in attempts[-5:]
+                    if isinstance(item, dict)
+                ),
                 " ".join(
                     str(item.get("family") or "")
                     for item in attempts[-5:]
diff --git a/scripts/run_candidate.py b/scripts/run_candidate.py
@@ -118,7 +118,12 @@ def main() -> None:
     run_id = args.run_id or new_id("candidate")
     session = load_session(session_path(workspace))
     round_index = int(session.get("iteration_count", 0)) + 1
-    touched_files = mutation.get("files_to_touch") or [mutation.get("target_file")]
+    touched_files = (
+        mutation.get("files_to_touch")
+        or mutation.get("touched_files")
+        or [mutation.get("target_file")]
+    )
+    touched_files = [path for path in touched_files if path]
     save_parent_snapshot(workspace, run_id, touched_files)
     execution_result = (
         apply_mutation_live(workspace, mutation)
@@ -203,6 +208,11 @@ def main() -> None:
         "change_unit": mutation.get("change_unit"),
         "proposal_id": mutation.get("proposal_id"),
         "family": mutation.get("family"),
+        "target_file": mutation.get("target_file"),
+        "files_to_touch": touched_files,
+        "params": mutation.get("params", {}),
+        "why_not_parameter_only": mutation.get("why_not_parameter_only"),
+        "minimal_ablation": mutation.get("minimal_ablation"),
         "touched_files": touched_files,
         "diff_summary": execution_result.get(
             "diff_summary", mutation.get("change_unit")
diff --git a/src/agents/apollo.ts b/src/agents/apollo.ts
@@ -16,7 +16,7 @@ Input contract:
 
 Output contract:
 - Return strict JSON only.
-- Return one primary proposal and one backup proposal.
+- Return one proposal object only.
 - Each proposal must include: title, family, mechanism, files_to_touch, expected_gain, risk, why_not_parameter_only, minimal_ablation, paper_grounding, redirect_if_underperforming, causal_metric_path, failure_signature, and pivot_after_failure.
 - Use \`causal_metric_path\` to name the intermediate metric path that should improve before the final target metric.
 - Use \`failure_signature\` to name the observable pattern that would tell the outer loop this route is failing.
diff --git a/src/agents/hermes.ts b/src/agents/hermes.ts
@@ -16,7 +16,7 @@ Input contract:
 
 Output contract:
 - Return strict JSON only.
-- Return one divergent proposal and one backup.
+- Return one divergent proposal object only.
 - Each proposal must include: title, family, mechanism, files_to_touch, expected_gain, risk, why_not_parameter_only, minimal_ablation, paper_grounding, redirect_if_underperforming, causal_metric_path, failure_signature, and pivot_after_failure.
 - Use \`causal_metric_path\` to name the intermediate metric path that should improve before the final target metric.
 - Use \`failure_signature\` to name the observable pattern that would tell the outer loop this route is failing.

Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,7 @@ def main() -> None:`
`52`	`52`	`index_output_dir(workspace_root, config),`
`53`	`53`	`scaffold_missing=True,`
`54`	`54`	`extract_claims=False,`
	`55`	`+ overwrite_claims=False,`
`55`	`56`	`config=config,`
`56`	`57`	`)`
`57`	`58`	`emit_json(`