fix: finalize remediation risk hardening

Her-xanadu · sisyphus-dev-ai · Her-xanadu · commit 7afbc8c91339 · 2026-03-16T03:59:27.000+08:00
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
diff --git a/README.md b/README.md
@@ -302,6 +302,11 @@ Remote execution is supported as a contract pattern rather than a separate contr
 
 See `REMOTE_EXECUTION.md` for the exact contract.
 
+Validation paths:
+
+- `npm test -- tests/e2e/python-controller-real-dvc.test.ts`
+- `npm test -- tests/e2e/remote-contract.test.ts`
+
 ## Daily Scheduler Split
 
 The previous monolithic `daily-research-brain` job has been split into two clearer chains:
diff --git a/scripts/ae_common.py b/scripts/ae_common.py
@@ -715,7 +715,21 @@ def opencode_repo_dir() -> pathlib.Path:
 
 
 def opencode_agent_model() -> str:
-    return os.environ.get("INNOVATION_LOOP_AGENT_MODEL", "kimi-for-coding/kimi-k2.5")
+    """Resolve the model id passed to opencode agent runs.
+
+    Precedence: a non-empty INNOVATION_LOOP_AGENT_MODEL env var, then
+    agent.Apollo.model from the repo's opencode.json, then the built-in default.
+    """
+    env_override = os.environ.get("INNOVATION_LOOP_AGENT_MODEL")
+    if env_override:
+        return env_override
+    config = read_json(opencode_repo_dir() / "opencode.json", {})
+    # Walk the nested config defensively: a null or non-object "agent" /
+    # "Apollo" entry must fall through to the default instead of raising.
+    node: Any = config
+    for key in ("agent", "Apollo", "model"):
+        node = node.get(key) if isinstance(node, dict) else None
+    return str(node or "kimi-for-coding/kimi-k2.5")
 
 
 def run_opencode_agent(
@@ -724,6 +730,7 @@ def run_opencode_agent(
     *,
     model: Optional[str] = None,
     timeout: int = 240000,
+    workspace: Optional[pathlib.Path] = None,
 ) -> Dict[str, Any]:
     repo_dir = opencode_repo_dir()
     command = [
@@ -741,14 +748,56 @@ def run_opencode_agent(
     result = run_process(
         command, repo_dir, check=False, timeout=max(timeout / 1000.0, 1.0)
     )
+    if workspace is not None:
+        artifact_dir = workspace / "experiments" / "live-specialist-failures"
+        artifact_dir.mkdir(parents=True, exist_ok=True)
+        raw_path = artifact_dir / f"{agent.lower()}-{int(time.time() * 1000)}.json"
+    else:
+        raw_path = None
     if result.returncode != 0:
+        if raw_path is not None:
+            write_json(
+                raw_path,
+                {
+                    "agent": agent,
+                    "kind": "subprocess_error",
+                    "stdout": result.stdout,
+                    "stderr": result.stderr,
+                    "returncode": result.returncode,
+                },
+            )
         raise RuntimeError(
             result.stderr.strip()
             or result.stdout.strip()
             or f"opencode run failed for {agent}"
         )
-    parsed = extract_json_payload(result.stdout)
+    try:
+        parsed = extract_json_payload(result.stdout)
+    except Exception:
+        if raw_path is not None:
+            write_json(
+                raw_path,
+                {
+                    "agent": agent,
+                    "kind": "schema_parse_failure",
+                    "stdout": result.stdout,
+                    "stderr": result.stderr,
+                    "returncode": result.returncode,
+                },
+            )
+        raise
     if not isinstance(parsed, dict):
+        if raw_path is not None:
+            write_json(
+                raw_path,
+                {
+                    "agent": agent,
+                    "kind": "schema_parse_failure",
+                    "stdout": result.stdout,
+                    "stderr": result.stderr,
+                    "returncode": result.returncode,
+                },
+            )
         raise RuntimeError(f"expected JSON object from {agent}")
     return parsed
 
@@ -994,19 +1043,35 @@ def collect(value: Any) -> None:
 
 def save_parent_snapshot(
     workspace: pathlib.Path, run_id: str, touched_files: List[str]
-) -> Dict[str, str]:
-    snapshot: Dict[str, str] = {}
+) -> Dict[str, Dict[str, Any]]:
+    snapshot: Dict[str, Dict[str, Any]] = {}
     for relative in touched_files:
         absolute = workspace / relative
-        snapshot[relative] = read_text(absolute)
+        snapshot[relative] = {
+            "exists": absolute.exists(),
+            "content": read_text(absolute) if absolute.exists() else "",
+        }
     write_json(run_dir(workspace, run_id) / "parent_snapshot.json", snapshot)
     return snapshot
 
 
 def restore_parent_snapshot(workspace: pathlib.Path, run_id: str) -> None:
     snapshot = read_json(run_dir(workspace, run_id) / "parent_snapshot.json", {})
-    for relative, content in snapshot.items():
-        write_text(workspace / relative, content)
+    for relative, entry in snapshot.items():
+        absolute = workspace / relative
+        if isinstance(entry, dict):
+            if entry.get("exists", False):
+                write_text(absolute, str(entry.get("content", "")))
+            else:
+                absolute.unlink(missing_ok=True)
+        else:
+            write_text(absolute, str(entry))
+
+
+def save_run_manifest(
+    workspace: pathlib.Path, run_id: str, payload: Dict[str, Any]
+) -> None:
+    write_json(run_dir(workspace, run_id) / "meta.json", payload)
 
 
 def load_pending_result(workspace: pathlib.Path, run_id: str) -> Dict[str, Any]:
diff --git a/scripts/innovation_loop.py b/scripts/innovation_loop.py
@@ -692,6 +692,7 @@ def collect_live_round_proposals(
             result_packet,
             research_context,
         ),
+        workspace=workspace,
     )
     divergence_raw = run_opencode_agent(
         "Hermes",
@@ -706,10 +707,12 @@ def collect_live_round_proposals(
             research_context,
             primary_proposal=exploit_raw,
         ),
+        workspace=workspace,
     )
     guard = run_opencode_agent(
         "Athena",
         build_guard_prompt(workspace, exploit_raw, divergence_raw, research_context),
+        workspace=workspace,
     )
 
     exploit = materialize_live_choice(
diff --git a/scripts/run_candidate.py b/scripts/run_candidate.py
@@ -22,6 +22,7 @@
     run_stage,
     restore_parent_snapshot,
     save_parent_snapshot,
+    save_run_manifest,
     save_pending_result,
     session_path,
     start_dvc_queue_worker,
@@ -94,7 +95,7 @@ def apply_mutation_live(workspace: pathlib.Path, mutation: dict) -> dict:
 WORKSPACE_ROOT: {workspace}
 MUTATION_JSON: {json.dumps(mutation, ensure_ascii=False)}
 """.strip()
-    return run_opencode_agent("sisyphus-junior", prompt)
+    return run_opencode_agent("sisyphus-junior", prompt, workspace=workspace)
 
 
 def main() -> None:
@@ -143,6 +144,21 @@ def main() -> None:
             "parent_run_id": current_best_exp_ref(workspace) or "baseline",
         },
     )
+    save_run_manifest(
+        workspace,
+        run_id,
+        {
+            "run_id": run_id,
+            "proposal_id": mutation.get("proposal_id"),
+            "family": mutation.get("family"),
+            "touched_files": touched_files,
+            "created_files": [],
+            "deleted_files": [],
+            "checkpoint_path": str(checkpoint),
+            "dvc_exp_ref": run_id,
+            "resume_from": args.resume_from,
+        },
+    )
 
     try:
         smoke_metric = run_stage(goal, workspace, "smoke")
diff --git a/src/utils/paths.ts b/src/utils/paths.ts
@@ -31,10 +31,6 @@ export function getCompatBestPath(workspaceRoot: string): string {
   return path.join(getCompatExperimentsDir(workspaceRoot), "best.json");
 }
 
-export function getCompatResultPacketPath(workspaceRoot: string): string {
-  return path.join(getCompatExperimentsDir(workspaceRoot), "result_packet.json");
-}
-
 export function getCompatProposalCardsPath(workspaceRoot: string): string {
   return path.join(getCompatExperimentsDir(workspaceRoot), "proposal_cards.jsonl");
 }
diff --git a/tests/e2e/remote-contract.test.ts b/tests/e2e/remote-contract.test.ts
@@ -0,0 +1,49 @@
+import { execFile } from "node:child_process";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { promisify } from "node:util";
+import { afterEach, describe, expect, it } from "vitest";
+
+const execFileAsync = promisify(execFile);
+const tempDirs: string[] = [];
+
+async function makeWorkspace(): Promise<string> {
+  const dir = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-remote-contract-"));
+  tempDirs.push(dir);
+  await fs.mkdir(path.join(dir, "experiments", "checkpoints"), { recursive: true });
+  await fs.writeFile(
+    path.join(dir, "remote_eval.py"),
+    [
+      "import argparse, json, pathlib",
+      "p=argparse.ArgumentParser()",
+      "p.add_argument('--resume-from')",
+      "args=p.parse_args()",
+      "ckpt = pathlib.Path('experiments/checkpoints/remote.ckpt')",
+      "ckpt.write_text('checkpoint\\n', encoding='utf-8')",
+      "metrics = pathlib.Path('experiments/metrics.json')",
+      "metrics.write_text(json.dumps({'score': 0.91, 'resume_from': args.resume_from}, indent=2) + '\\n', encoding='utf-8')",
+      "print(0.91)",
+    ].join("\n") + "\n",
+    "utf8",
+  );
+  return dir;
+}
+
+afterEach(async () => {
+  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
+});
+
+describe("remote execution contract", () => {
+  it("validates that a mock remote adapter preserves metrics, checkpoint, and resume semantics", async () => {
+    const workspace = await makeWorkspace();
+    await execFileAsync("python3", ["remote_eval.py"], { cwd: workspace });
+    const firstMetrics = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "metrics.json"), "utf8"));
+    expect(firstMetrics.score).toBe(0.91);
+    expect(firstMetrics.resume_from).toBeNull();
+    await expect(fs.stat(path.join(workspace, "experiments", "checkpoints", "remote.ckpt"))).resolves.toBeTruthy();
+    await execFileAsync("python3", ["remote_eval.py", "--resume-from", "experiments/checkpoints/remote.ckpt"], { cwd: workspace });
+    const resumedMetrics = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "metrics.json"), "utf8"));
+    expect(resumedMetrics.resume_from).toContain("remote.ckpt");
+  }, 15000);
+});
diff --git a/tests/e2e/rollback-side-effects.test.ts b/tests/e2e/rollback-side-effects.test.ts
@@ -0,0 +1,37 @@
+import { execFile } from "node:child_process";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { promisify } from "node:util";
+import { afterEach, describe, expect, it } from "vitest";
+
+const execFileAsync = promisify(execFile);
+const tempDirs: string[] = [];
+// Derive the repo root from this file's location so the test runs on any checkout.
+const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../..");
+
+async function makeWorkspace(): Promise<string> {
+  const dir = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-rollback-"));
+  tempDirs.push(dir);
+  await fs.mkdir(path.join(dir, "src"), { recursive: true });
+  await fs.mkdir(path.join(dir, "experiments", "runs", "rollback-run"), { recursive: true });
+  await fs.writeFile(path.join(dir, "src", "config.json"), '{"learning_rate":0.1}\n', "utf8");
+  return dir;
+}
+
+afterEach(async () => {
+  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
+});
+
+describe("rollback side effects", () => {
+  it("restores modified files and removes files created after the snapshot", async () => {
+    const workspace = await makeWorkspace();
+    await execFileAsync("python3", ["-c", `import pathlib,sys; sys.path.insert(0, ${JSON.stringify(repoRoot + "/scripts")}); from ae_common import save_parent_snapshot; save_parent_snapshot(pathlib.Path(${JSON.stringify(workspace)}), 'rollback-run', ['src/config.json','src/new.txt'])`], { cwd: repoRoot });
+    await fs.writeFile(path.join(workspace, "src", "config.json"), '{"learning_rate":0.9}\n', "utf8");
+    await fs.writeFile(path.join(workspace, "src", "new.txt"), "temporary\n", "utf8");
+    await execFileAsync("python3", ["-c", `import pathlib,sys; sys.path.insert(0, ${JSON.stringify(repoRoot + "/scripts")}); from ae_common import restore_parent_snapshot; restore_parent_snapshot(pathlib.Path(${JSON.stringify(workspace)}), 'rollback-run')`], { cwd: repoRoot });
+    expect(await fs.readFile(path.join(workspace, "src", "config.json"), "utf8")).toContain('0.1');
+    await expect(fs.stat(path.join(workspace, "src", "new.txt"))).rejects.toThrow();
+  }, 15000);
+});
diff --git a/tests/e2e/specialist-schema-failure.test.ts b/tests/e2e/specialist-schema-failure.test.ts
@@ -0,0 +1,58 @@
+import { execFile } from "node:child_process";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { promisify } from "node:util";
+import { afterEach, describe, expect, it } from "vitest";
+
+const execFileAsync = promisify(execFile);
+const tempDirs: string[] = [];
+const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../..");
+const innovationLoopScript = path.join(repoRoot, "scripts", "innovation_loop.py");
+
+async function makeWorkspace(): Promise<{ workspace: string; configPath: string; fakeBin: string }> {
+  const workspace = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-specialist-fail-"));
+  tempDirs.push(workspace);
+  await fs.mkdir(path.join(workspace, "configs"), { recursive: true });
+  await fs.mkdir(path.join(workspace, "src"), { recursive: true });
+  await fs.mkdir(path.join(workspace, "data"), { recursive: true });
+  await fs.cp(path.join(repoRoot, "fixtures", "kb", "vault"), path.join(workspace, "vault"), { recursive: true });
+  const fakeBin = path.join(workspace, "fake-bin");
+  await fs.mkdir(fakeBin, { recursive: true });
+  await fs.writeFile(path.join(workspace, "src", "config.json"), JSON.stringify({ objective_mode: "baseline" }, null, 2) + "\n", "utf8");
+  await fs.writeFile(path.join(workspace, "src", "strategy.txt"), "baseline\n", "utf8");
+  await fs.writeFile(path.join(workspace, "src", "module.ts"), "export const variant = 0;\n", "utf8");
+  await fs.writeFile(path.join(workspace, "data", "observations.csv"), "split,value\ntrain,1\n", "utf8");
+  await fs.writeFile(path.join(workspace, "evaluate.py"), "print(0.8)\n", "utf8");
+  await fs.writeFile(path.join(workspace, "configs", "research_brain.yaml"), [`vault_root: ${path.join(workspace, "vault")}`, "index_output_dir: experiments/research/index", "retrieval_cache_dir: experiments/research/retrieval-cache", "evidence_output_dir: experiments/research", "feedback_output: experiments/research/paper-feedback.jsonl", "posterior_rank_output: experiments/research/posterior-rank.json", "paper_id_map_output: experiments/research/paper-id-map.jsonl", "frontier_map_output: experiments/research/index/frontier-map.json"].join("\n") + "\n", "utf8");
+  await fs.writeFile(path.join(fakeBin, "opencode"), `#!/usr/bin/env python3
+import sys
+args = sys.argv[1:]
+agent = args[args.index("--agent") + 1] if "--agent" in args else None
+if agent == "Apollo":
+    print("not-json-response")
+else:
+    print('{"ok": true}')
+`, "utf8");
+  await fs.chmod(path.join(fakeBin, "opencode"), 0o755);
+  await fs.writeFile(path.join(workspace, "configs", "goal.yaml"), ['goal_text: "test"', 'target_metric: "score"', 'metric_direction: "maximize"'].join("\n") + "\n", "utf8");
+  return { workspace, configPath: path.join(workspace, "configs", "goal.yaml"), fakeBin };
+}
+
+afterEach(async () => {
+  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
+});
+
+describe("specialist schema failure", () => {
+  it("fails loudly when a live specialist returns invalid JSON", async () => {
+    const { workspace, configPath, fakeBin } = await makeWorkspace();
+    const env = { ...process.env, PATH: `${fakeBin}:${process.env.PATH ?? ""}`, INNOVATION_LOOP_OPENCODE_DIR: repoRoot, INNOVATION_LOOP_AGENT_MODEL: "kimi-for-coding/kimi-k2.5", INNOVATION_LOOP_DISABLE_REAL_DVC: "1" };
+    await execFileAsync("python3", [innovationLoopScript, "bootstrap", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env });
+    await execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env });
+    await expect(execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env })).rejects.toThrow();
+    const failureDir = path.join(workspace, "experiments", "live-specialist-failures");
+    const files = await fs.readdir(failureDir);
+    expect(files.length).toBeGreaterThan(0);
+  }, 15000);
+});

Original file line number	Diff line number	Diff line change
`@@ -692,6 +692,7 @@ def collect_live_round_proposals(`
`692`	`692`	`result_packet,`
`693`	`693`	`research_context,`
`694`	`694`	`),`
	`695`	`+ workspace=workspace,`
`695`	`696`	`)`
`696`	`697`	`divergence_raw = run_opencode_agent(`
`697`	`698`	`"Hermes",`
`@@ -706,10 +707,12 @@ def collect_live_round_proposals(`
`706`	`707`	`research_context,`
`707`	`708`	`primary_proposal=exploit_raw,`
`708`	`709`	`),`
	`710`	`+ workspace=workspace,`
`709`	`711`	`)`
`710`	`712`	`guard = run_opencode_agent(`
`711`	`713`	`"Athena",`
`712`	`714`	`build_guard_prompt(workspace, exploit_raw, divergence_raw, research_context),`
	`715`	`+ workspace=workspace,`
`713`	`716`	`)`
`714`	`717`
`715`	`718`	`exploit = materialize_live_choice(`
Original file line number	Diff line number	Diff line change
`@@ -31,10 +31,6 @@ export function getCompatBestPath(workspaceRoot: string): string {`
`31`	`31`	`return path.join(getCompatExperimentsDir(workspaceRoot), "best.json");`
`32`	`32`	`}`
`33`	`33`
`34`		`-export function getCompatResultPacketPath(workspaceRoot: string): string {`
`35`		`- return path.join(getCompatExperimentsDir(workspaceRoot), "result_packet.json");`
`36`		`-}`
`37`		`-`
`38`	`34`	`export function getCompatProposalCardsPath(workspaceRoot: string): string {`
`39`	`35`	`return path.join(getCompatExperimentsDir(workspaceRoot), "proposal_cards.jsonl");`
`40`	`36`	`}`