Skip to content

Commit 7afbc8c

Browse files
fix: finalize remediation risk hardening
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
1 parent e024b7f commit 7afbc8c

File tree

8 files changed, with 240 additions and 12 deletions.

README.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -302,6 +302,11 @@ Remote execution is supported as a contract pattern rather than a separate contr
302302

303303
See `REMOTE_EXECUTION.md` for the exact contract.
304304

305+
Validation paths:
306+
307+
- `npm test -- tests/e2e/python-controller-real-dvc.test.ts`
308+
- `npm test -- tests/e2e/remote-contract.test.ts`
309+
305310
## Daily Scheduler Split
306311

307312
The previous monolithic `daily-research-brain` job has been split into two clearer chains:

scripts/ae_common.py

Lines changed: 72 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -715,7 +715,13 @@ def opencode_repo_dir() -> pathlib.Path:
715715

716716

717717
def opencode_agent_model() -> str:
718-
return os.environ.get("INNOVATION_LOOP_AGENT_MODEL", "kimi-for-coding/kimi-k2.5")
718+
env_override = os.environ.get("INNOVATION_LOOP_AGENT_MODEL")
719+
if env_override:
720+
return env_override
721+
repo = opencode_repo_dir()
722+
config = read_json(repo / "opencode.json", {})
723+
model = dict(config.get("agent", {})).get("Apollo", {}).get("model")
724+
return str(model or "kimi-for-coding/kimi-k2.5")
719725

720726

721727
def run_opencode_agent(
@@ -724,6 +730,7 @@ def run_opencode_agent(
724730
*,
725731
model: Optional[str] = None,
726732
timeout: int = 240000,
733+
workspace: Optional[pathlib.Path] = None,
727734
) -> Dict[str, Any]:
728735
repo_dir = opencode_repo_dir()
729736
command = [
@@ -741,14 +748,56 @@ def run_opencode_agent(
741748
result = run_process(
742749
command, repo_dir, check=False, timeout=max(timeout / 1000.0, 1.0)
743750
)
751+
if workspace is not None:
752+
artifact_dir = workspace / "experiments" / "live-specialist-failures"
753+
artifact_dir.mkdir(parents=True, exist_ok=True)
754+
raw_path = artifact_dir / f"{agent.lower()}-{int(time.time() * 1000)}.json"
755+
else:
756+
raw_path = None
744757
if result.returncode != 0:
758+
if raw_path is not None:
759+
write_json(
760+
raw_path,
761+
{
762+
"agent": agent,
763+
"kind": "subprocess_error",
764+
"stdout": result.stdout,
765+
"stderr": result.stderr,
766+
"returncode": result.returncode,
767+
},
768+
)
745769
raise RuntimeError(
746770
result.stderr.strip()
747771
or result.stdout.strip()
748772
or f"opencode run failed for {agent}"
749773
)
750-
parsed = extract_json_payload(result.stdout)
774+
try:
775+
parsed = extract_json_payload(result.stdout)
776+
except Exception:
777+
if raw_path is not None:
778+
write_json(
779+
raw_path,
780+
{
781+
"agent": agent,
782+
"kind": "schema_parse_failure",
783+
"stdout": result.stdout,
784+
"stderr": result.stderr,
785+
"returncode": result.returncode,
786+
},
787+
)
788+
raise
751789
if not isinstance(parsed, dict):
790+
if raw_path is not None:
791+
write_json(
792+
raw_path,
793+
{
794+
"agent": agent,
795+
"kind": "schema_parse_failure",
796+
"stdout": result.stdout,
797+
"stderr": result.stderr,
798+
"returncode": result.returncode,
799+
},
800+
)
752801
raise RuntimeError(f"expected JSON object from {agent}")
753802
return parsed
754803

@@ -994,19 +1043,35 @@ def collect(value: Any) -> None:
9941043

9951044
def save_parent_snapshot(
9961045
workspace: pathlib.Path, run_id: str, touched_files: List[str]
997-
) -> Dict[str, str]:
998-
snapshot: Dict[str, str] = {}
1046+
) -> Dict[str, Dict[str, Any]]:
1047+
snapshot: Dict[str, Dict[str, Any]] = {}
9991048
for relative in touched_files:
10001049
absolute = workspace / relative
1001-
snapshot[relative] = read_text(absolute)
1050+
snapshot[relative] = {
1051+
"exists": absolute.exists(),
1052+
"content": read_text(absolute) if absolute.exists() else "",
1053+
}
10021054
write_json(run_dir(workspace, run_id) / "parent_snapshot.json", snapshot)
10031055
return snapshot
10041056

10051057

10061058
def restore_parent_snapshot(workspace: pathlib.Path, run_id: str) -> None:
10071059
snapshot = read_json(run_dir(workspace, run_id) / "parent_snapshot.json", {})
1008-
for relative, content in snapshot.items():
1009-
write_text(workspace / relative, content)
1060+
for relative, entry in snapshot.items():
1061+
absolute = workspace / relative
1062+
if isinstance(entry, dict):
1063+
if entry.get("exists", False):
1064+
write_text(absolute, str(entry.get("content", "")))
1065+
else:
1066+
absolute.unlink(missing_ok=True)
1067+
else:
1068+
write_text(absolute, str(entry))
1069+
1070+
1071+
def save_run_manifest(
1072+
workspace: pathlib.Path, run_id: str, payload: Dict[str, Any]
1073+
) -> None:
1074+
write_json(run_dir(workspace, run_id) / "meta.json", payload)
10101075

10111076

10121077
def load_pending_result(workspace: pathlib.Path, run_id: str) -> Dict[str, Any]:

scripts/innovation_loop.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -692,6 +692,7 @@ def collect_live_round_proposals(
692692
result_packet,
693693
research_context,
694694
),
695+
workspace=workspace,
695696
)
696697
divergence_raw = run_opencode_agent(
697698
"Hermes",
@@ -706,10 +707,12 @@ def collect_live_round_proposals(
706707
research_context,
707708
primary_proposal=exploit_raw,
708709
),
710+
workspace=workspace,
709711
)
710712
guard = run_opencode_agent(
711713
"Athena",
712714
build_guard_prompt(workspace, exploit_raw, divergence_raw, research_context),
715+
workspace=workspace,
713716
)
714717

715718
exploit = materialize_live_choice(

scripts/run_candidate.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
run_stage,
2323
restore_parent_snapshot,
2424
save_parent_snapshot,
25+
save_run_manifest,
2526
save_pending_result,
2627
session_path,
2728
start_dvc_queue_worker,
@@ -94,7 +95,7 @@ def apply_mutation_live(workspace: pathlib.Path, mutation: dict) -> dict:
9495
WORKSPACE_ROOT: {workspace}
9596
MUTATION_JSON: {json.dumps(mutation, ensure_ascii=False)}
9697
""".strip()
97-
return run_opencode_agent("sisyphus-junior", prompt)
98+
return run_opencode_agent("sisyphus-junior", prompt, workspace=workspace)
9899

99100

100101
def main() -> None:
@@ -143,6 +144,21 @@ def main() -> None:
143144
"parent_run_id": current_best_exp_ref(workspace) or "baseline",
144145
},
145146
)
147+
save_run_manifest(
148+
workspace,
149+
run_id,
150+
{
151+
"run_id": run_id,
152+
"proposal_id": mutation.get("proposal_id"),
153+
"family": mutation.get("family"),
154+
"touched_files": touched_files,
155+
"created_files": [],
156+
"deleted_files": [],
157+
"checkpoint_path": str(checkpoint),
158+
"dvc_exp_ref": run_id,
159+
"resume_from": args.resume_from,
160+
},
161+
)
146162

147163
try:
148164
smoke_metric = run_stage(goal, workspace, "smoke")

src/utils/paths.ts

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,6 @@ export function getCompatBestPath(workspaceRoot: string): string {
3131
return path.join(getCompatExperimentsDir(workspaceRoot), "best.json");
3232
}
3333

34-
export function getCompatResultPacketPath(workspaceRoot: string): string {
35-
return path.join(getCompatExperimentsDir(workspaceRoot), "result_packet.json");
36-
}
37-
3834
export function getCompatProposalCardsPath(workspaceRoot: string): string {
3935
return path.join(getCompatExperimentsDir(workspaceRoot), "proposal_cards.jsonl");
4036
}

tests/e2e/remote-contract.test.ts

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,50 @@
1+
import { execFile } from "node:child_process";
2+
import fs from "node:fs/promises";
3+
import os from "node:os";
4+
import path from "node:path";
5+
import { promisify } from "node:util";
6+
import { afterEach, describe, expect, it } from "vitest";
7+
8+
const execFileAsync = promisify(execFile);
9+
const tempDirs: string[] = [];
10+
const repoRoot = "/Users/herxanadu/Desktop/opencode-auto-experiment";
11+
12+
async function makeWorkspace(): Promise<string> {
13+
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-remote-contract-"));
14+
tempDirs.push(dir);
15+
await fs.mkdir(path.join(dir, "experiments", "checkpoints"), { recursive: true });
16+
await fs.writeFile(
17+
path.join(dir, "remote_eval.py"),
18+
[
19+
"import argparse, json, pathlib",
20+
"p=argparse.ArgumentParser()",
21+
"p.add_argument('--resume-from')",
22+
"args=p.parse_args()",
23+
"ckpt = pathlib.Path('experiments/checkpoints/remote.ckpt')",
24+
"ckpt.write_text('checkpoint\\n', encoding='utf-8')",
25+
"metrics = pathlib.Path('experiments/metrics.json')",
26+
"metrics.write_text(json.dumps({'score': 0.91, 'resume_from': args.resume_from}, indent=2) + '\\n', encoding='utf-8')",
27+
"print(0.91)",
28+
].join("\n") + "\n",
29+
"utf8",
30+
);
31+
return dir;
32+
}
33+
34+
afterEach(async () => {
35+
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
36+
});
37+
38+
describe("remote execution contract", () => {
39+
it("validates that a mock remote adapter preserves metrics, checkpoint, and resume semantics", async () => {
40+
const workspace = await makeWorkspace();
41+
await execFileAsync("python3", ["remote_eval.py"], { cwd: workspace });
42+
const firstMetrics = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "metrics.json"), "utf8"));
43+
expect(firstMetrics.score).toBe(0.91);
44+
expect(firstMetrics.resume_from).toBeNull();
45+
await expect(fs.stat(path.join(workspace, "experiments", "checkpoints", "remote.ckpt"))).resolves.toBeTruthy();
46+
await execFileAsync("python3", ["remote_eval.py", "--resume-from", "experiments/checkpoints/remote.ckpt"], { cwd: workspace });
47+
const resumedMetrics = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "metrics.json"), "utf8"));
48+
expect(resumedMetrics.resume_from).toContain("remote.ckpt");
49+
}, 15000);
50+
});
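
(File name not captured in this export: new test file containing the "rollback side effects" suite.)

Lines changed: 35 additions & 0 deletions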
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
import { execFile } from "node:child_process";
2+
import fs from "node:fs/promises";
3+
import os from "node:os";
4+
import path from "node:path";
5+
import { promisify } from "node:util";
6+
import { afterEach, describe, expect, it } from "vitest";
7+
8+
const execFileAsync = promisify(execFile);
9+
const tempDirs: string[] = [];
10+
const repoRoot = "/Users/herxanadu/Desktop/opencode-auto-experiment";
11+
12+
async function makeWorkspace(): Promise<string> {
13+
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-rollback-"));
14+
tempDirs.push(dir);
15+
await fs.mkdir(path.join(dir, "src"), { recursive: true });
16+
await fs.mkdir(path.join(dir, "experiments", "runs", "rollback-run"), { recursive: true });
17+
await fs.writeFile(path.join(dir, "src", "config.json"), '{"learning_rate":0.1}\n', "utf8");
18+
return dir;
19+
}
20+
21+
afterEach(async () => {
22+
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
23+
});
24+
25+
describe("rollback side effects", () => {
26+
it("restores modified files and removes files created after the snapshot", async () => {
27+
const workspace = await makeWorkspace();
28+
await execFileAsync("python3", ["-c", `import pathlib,sys; sys.path.insert(0, ${JSON.stringify(repoRoot + "/scripts")}); from ae_common import save_parent_snapshot; save_parent_snapshot(pathlib.Path(${JSON.stringify(workspace)}), 'rollback-run', ['src/config.json','src/new.txt'])`], { cwd: repoRoot });
29+
await fs.writeFile(path.join(workspace, "src", "config.json"), '{"learning_rate":0.9}\n', "utf8");
30+
await fs.writeFile(path.join(workspace, "src", "new.txt"), "temporary\n", "utf8");
31+
await execFileAsync("python3", ["-c", `import pathlib,sys; sys.path.insert(0, ${JSON.stringify(repoRoot + "/scripts")}); from ae_common import restore_parent_snapshot; restore_parent_snapshot(pathlib.Path(${JSON.stringify(workspace)}), 'rollback-run')`], { cwd: repoRoot });
32+
expect(await fs.readFile(path.join(workspace, "src", "config.json"), "utf8")).toContain('0.1');
33+
await expect(fs.stat(path.join(workspace, "src", "new.txt"))).rejects.toThrow();
34+
}, 15000);
35+
});
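
(File name not captured in this export: new test file containing the "specialist schema failure" suite.)

Lines changed: 58 additions & 0 deletions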
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,58 @@
1+
import { execFile } from "node:child_process";
2+
import fs from "node:fs/promises";
3+
import os from "node:os";
4+
import path from "node:path";
5+
import { fileURLToPath } from "node:url";
6+
import { promisify } from "node:util";
7+
import { afterEach, describe, expect, it } from "vitest";
8+
9+
const execFileAsync = promisify(execFile);
10+
const tempDirs: string[] = [];
11+
const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../..");
12+
const innovationLoopScript = path.join(repoRoot, "scripts", "innovation_loop.py");
13+
14+
async function makeWorkspace(): Promise<{ workspace: string; configPath: string; fakeBin: string }> {
15+
const workspace = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-specialist-fail-"));
16+
tempDirs.push(workspace);
17+
await fs.mkdir(path.join(workspace, "configs"), { recursive: true });
18+
await fs.mkdir(path.join(workspace, "src"), { recursive: true });
19+
await fs.mkdir(path.join(workspace, "data"), { recursive: true });
20+
await fs.cp(path.join(repoRoot, "fixtures", "kb", "vault"), path.join(workspace, "vault"), { recursive: true });
21+
const fakeBin = path.join(workspace, "fake-bin");
22+
await fs.mkdir(fakeBin, { recursive: true });
23+
await fs.writeFile(path.join(workspace, "src", "config.json"), JSON.stringify({ objective_mode: "baseline" }, null, 2) + "\n", "utf8");
24+
await fs.writeFile(path.join(workspace, "src", "strategy.txt"), "baseline\n", "utf8");
25+
await fs.writeFile(path.join(workspace, "src", "module.ts"), "export const variant = 0;\n", "utf8");
26+
await fs.writeFile(path.join(workspace, "data", "observations.csv"), "split,value\ntrain,1\n", "utf8");
27+
await fs.writeFile(path.join(workspace, "evaluate.py"), "print(0.8)\n", "utf8");
28+
await fs.writeFile(path.join(workspace, "configs", "research_brain.yaml"), [`vault_root: ${path.join(workspace, "vault")}`, "index_output_dir: experiments/research/index", "retrieval_cache_dir: experiments/research/retrieval-cache", "evidence_output_dir: experiments/research", "feedback_output: experiments/research/paper-feedback.jsonl", "posterior_rank_output: experiments/research/posterior-rank.json", "paper_id_map_output: experiments/research/paper-id-map.jsonl", "frontier_map_output: experiments/research/index/frontier-map.json"].join("\n") + "\n", "utf8");
29+
await fs.writeFile(path.join(fakeBin, "opencode"), `#!/usr/bin/env python3
30+
import sys
31+
args = sys.argv[1:]
32+
agent = args[args.index("--agent") + 1] if "--agent" in args else None
33+
if agent == "Apollo":
34+
print("not-json-response")
35+
else:
36+
print('{"ok": true}')
37+
`, "utf8");
38+
await fs.chmod(path.join(fakeBin, "opencode"), 0o755);
39+
await fs.writeFile(path.join(workspace, "configs", "goal.yaml"), ['goal_text: "test"', 'target_metric: "score"', 'metric_direction: "maximize"'].join("\n") + "\n", "utf8");
40+
return { workspace, configPath: path.join(workspace, "configs", "goal.yaml"), fakeBin };
41+
}
42+
43+
afterEach(async () => {
44+
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
45+
});
46+
47+
describe("specialist schema failure", () => {
48+
it("fails loudly when a live specialist returns invalid JSON", async () => {
49+
const { workspace, configPath, fakeBin } = await makeWorkspace();
50+
const env = { ...process.env, PATH: `${fakeBin}:${process.env.PATH ?? ""}`, INNOVATION_LOOP_OPENCODE_DIR: repoRoot, INNOVATION_LOOP_AGENT_MODEL: "kimi-for-coding/kimi-k2.5", INNOVATION_LOOP_DISABLE_REAL_DVC: "1" };
51+
await execFileAsync("python3", [innovationLoopScript, "bootstrap", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env });
52+
await execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env });
53+
await expect(execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "live"], { cwd: workspace, env })).rejects.toThrow();
54+
const failureDir = path.join(workspace, "experiments", "live-specialist-failures");
55+
const files = await fs.readdir(failureDir);
56+
expect(files.length).toBeGreaterThan(0);
57+
}, 15000);
58+
});

0 commit comments

Comments
 (0)