Skip to content

Commit 529032f

Browse files
fix: harden controller guardrails and public e2e stability
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
1 parent 9297046 commit 529032f

File tree

6 files changed

+89 additions, -4 deletions

scripts/innovation_loop.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -724,8 +724,10 @@ def collect_live_round_proposals(
724724
"Athena": "approved the exploit proposal",
725725
"Hermes": "kept as fallback while exploit remained valid",
726726
}
727-
if str(guard.get("verdict", "approve")).lower() != "approve" or not bool(
728-
guard.get("single_change_ok", False)
727+
if (
728+
str(guard.get("verdict", "approve")).lower() != "approve"
729+
or not bool(guard.get("single_change_ok", False))
730+
or not bool(guard.get("paper_support_ok", False))
729731
):
730732
chosen = divergence
731733
reject_reasons = {

scripts/kb/retrieve_papers.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -433,6 +433,7 @@ def main() -> None:
433433
parser.add_argument("--session", required=True)
434434
parser.add_argument("--best", required=True)
435435
parser.add_argument("--attempts", required=False)
436+
parser.add_argument("--result-packet")
436437
parser.add_argument("--workspace-root")
437438
parser.add_argument("--index-dir")
438439
parser.add_argument("--config")
@@ -455,6 +456,11 @@ def main() -> None:
455456
attempts = (
456457
read_jsonl(pathlib.Path(args.attempts).resolve()) if args.attempts else []
457458
)
459+
result_packet = (
460+
load_structured(pathlib.Path(args.result_packet).resolve())
461+
if args.result_packet
462+
else {}
463+
)
458464
posterior = read_json(posterior_rank_output_path(workspace_root, config), {})
459465
papers = load_jsonl(index_dir / "paper-index.jsonl")
460466

@@ -464,6 +470,13 @@ def main() -> None:
464470
str(goal.get("goal_text") or ""),
465471
str(goal.get("target_metric") or ""),
466472
str(best.get("family") or ""),
473+
str(result_packet.get("change_class") or ""),
474+
str(result_packet.get("change_unit") or ""),
475+
str((result_packet.get("monitor_summary") or {}).get("state") or ""),
476+
str(
477+
(result_packet.get("change_manifest") or {}).get("primary_object")
478+
or ""
479+
),
467480
" ".join(
468481
str(item.get("failure_signature") or "")
469482
for item in attempts[-5:]

scripts/kb/run_inference_cycle.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ def main() -> None:
3838
session = workspace_root / "experiments" / "session.json"
3939
best = workspace_root / "experiments" / "best.json"
4040
attempts = workspace_root / "experiments" / "attempts.jsonl"
41+
result_packet = workspace_root / "experiments" / "result-packet.json"
4142
if not (session.exists() and best.exists() and attempts.exists()):
4243
emit_json(
4344
{
@@ -59,6 +60,8 @@ def main() -> None:
5960
str(best),
6061
"--attempts",
6162
str(attempts),
63+
"--result-packet",
64+
str(result_packet),
6265
"--workspace-root",
6366
args.workspace_root,
6467
"--config",
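(New file — its path was not captured in this view. The hunk header below shows it is newly added, and its `repoRoot` is resolved two directory levels above the file's own directory.)

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change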
@@ -0,0 +1,67 @@
1+
import { execFile } from "node:child_process";
2+
import fs from "node:fs/promises";
3+
import os from "node:os";
4+
import path from "node:path";
5+
import { fileURLToPath } from "node:url";
6+
import { promisify } from "node:util";
7+
import { afterEach, describe, expect, it } from "vitest";
8+
9+
/** Promise-returning form of `execFile`, used to run the Python controller script. */
const execFileAsync = promisify(execFile);

/** Workspaces created by the running test; drained and deleted in the `afterEach` hook. */
const tempDirs: string[] = [];

/** Directory that contains this test file. */
const thisDir = path.dirname(fileURLToPath(import.meta.url));

/** Repository root, resolved two directory levels above this file's directory. */
const repoRoot = path.resolve(thisDir, "../..");

/** Path of `scripts/innovation_loop.py` under the repository root. */
const innovationLoopScript = path.join(repoRoot, "scripts", "innovation_loop.py");
13+
14+
/**
 * Creates a throwaway workspace for one test and registers it in `tempDirs`
 * so the `afterEach` hook can delete it.
 *
 * The workspace receives:
 * - `configs/`, `src/` and `data/` directories with small placeholder inputs,
 * - a recursive copy of the repository's `fixtures/kb/vault` as `vault/`,
 * - `configs/research_brain.yaml` (pointing `vault_root` at that copy) and
 *   `configs/goal.yaml`,
 * - `fake-bin/opencode`, an executable Python stub that prints a canned JSON
 *   reply selected by the value following `--agent` on its command line.
 *
 * In the stub, the "Athena" reply has `verdict: "approve"` and
 * `single_change_ok: true` but `paper_support_ok: false`; the "Apollo" reply
 * uses family `objective.loss` and the "Hermes" reply uses `repr.feature`.
 *
 * @returns The workspace root, the path to `configs/goal.yaml`, and the
 *   directory that holds the stub `opencode` executable.
 */
async function makeWorkspace(): Promise<{ workspace: string; configPath: string; fakeBin: string }> {
  const workspace = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-athena-veto-"));
  tempDirs.push(workspace);

  for (const dirName of ["configs", "src", "data"]) {
    await fs.mkdir(path.join(workspace, dirName), { recursive: true });
  }
  await fs.cp(path.join(repoRoot, "fixtures", "kb", "vault"), path.join(workspace, "vault"), { recursive: true });

  const fakeBin = path.join(workspace, "fake-bin");
  await fs.mkdir(fakeBin, { recursive: true });

  const researchBrainYaml =
    [
      `vault_root: ${path.join(workspace, "vault")}`,
      "index_output_dir: experiments/research/index",
      "retrieval_cache_dir: experiments/research/retrieval-cache",
      "evidence_output_dir: experiments/research",
      "feedback_output: experiments/research/paper-feedback.jsonl",
      "posterior_rank_output: experiments/research/posterior-rank.json",
      "paper_id_map_output: experiments/research/paper-id-map.jsonl",
      "frontier_map_output: experiments/research/index/frontier-map.json",
    ].join("\n") + "\n";

  // Written one at a time, in this order, as [path segments under the workspace, file contents].
  const seedFiles: [string[], string][] = [
    [["src", "config.json"], JSON.stringify({ objective_mode: "baseline" }, null, 2) + "\n"],
    [["src", "strategy.txt"], "baseline\n"],
    [["src", "module.ts"], "export const variant = 0;\n"],
    [["data", "observations.csv"], "split,value\ntrain,1\n"],
    [["evaluate.py"], "print(0.8)\n"],
    [["configs", "research_brain.yaml"], researchBrainYaml],
  ];
  for (const [segments, contents] of seedFiles) {
    await fs.writeFile(path.join(workspace, ...segments), contents, "utf8");
  }

  // Stand-in for the `opencode` CLI. Branch bodies must stay indented: this text
  // is executed by python3 via the shebang line.
  const opencodeStub = `#!/usr/bin/env python3
import json, sys
args = sys.argv[1:]
agent = args[args.index("--agent") + 1] if "--agent" in args else None
if agent == "Apollo":
    print(json.dumps({"choice":"objective","title":"weak-support","family":"objective.loss","innovation_tags":["objective"],"mechanism":"对目标函数做正则化,预期先改善中间稳定性指标,再影响目标指标。","files_to_touch":["src/config.json"],"expected_gain":0.02,"risk":"low","why_not_parameter_only":"changes objective family","minimal_ablation":["revert objective"],"paper_grounding":[{"paper_id":"doi:10.1145/3718958.3750493"},{"paper_id":"doi:10.1145/3711896.3736964"}],"redirect_if_underperforming":"切换到表征路线","causal_metric_path":["loss_shape","optimization_stability","target_metric"],"failure_signature":"loss path stalled","pivot_after_failure":"repr.feature"}))
elif agent == "Hermes":
    print(json.dumps({"choice":"representation","title":"orthogonal","family":"repr.feature","innovation_tags":["representation"],"mechanism":"对表征层做重塑,预期先改变表征判别性,再影响目标指标。","files_to_touch":["src/strategy.txt"],"expected_gain":0.01,"risk":"medium","why_not_parameter_only":"changes representation path","minimal_ablation":["revert strategy"],"paper_grounding":[{"paper_id":"doi:10.1145/3711896.3736964"},{"paper_id":"paper:arxiv:2024:ffffeeee11"}],"redirect_if_underperforming":"切换到架构路线","causal_metric_path":["representation_quality","separation_margin","target_metric"],"failure_signature":"repr path stalled","pivot_after_failure":"arch.backbone"}))
elif agent == "Athena":
    print(json.dumps({"verdict":"approve","validity_risks":[],"smallest_repair":None,"single_change_ok":True,"paper_support_ok":False,"redirect_if_underperforming":"切换到表征路线","failure_signature":"paper support weak","causal_metric_path":["loss_shape","optimization_stability","target_metric"]}))
elif agent == "sisyphus-junior":
    print(json.dumps({"touched_files":["src/strategy.txt"],"diff_summary":"redirected change","change_manifest":{"primary_object":"representation","secondary_objects":[]}}))
else:
    print(json.dumps({"ok": True}))
`;
  const stubPath = path.join(fakeBin, "opencode");
  await fs.writeFile(stubPath, opencodeStub, "utf8");
  // The stub is launched by name from PATH, so it needs the executable bit.
  await fs.chmod(stubPath, 0o755);

  const configPath = path.join(workspace, "configs", "goal.yaml");
  const goalYaml = ['goal_text: "test"', 'target_metric: "score"', 'metric_direction: "maximize"'].join("\n") + "\n";
  await fs.writeFile(configPath, goalYaml, "utf8");

  return { workspace, configPath, fakeBin };
}
52+
53+
// Delete every workspace the finished test created. `splice(0)` empties the
// shared list, so each directory is removed exactly once.
afterEach(async () => {
  const finished = tempDirs.splice(0);
  const removals = finished.map((dir) =>
    // maxRetries/retryDelay let fs.rm retry a removal that fails transiently
    // instead of failing the hook on the first attempt.
    fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 }),
  );
  await Promise.all(removals);
});
56+
57+
/**
 * End-to-end check of the controller's paper-support guard.
 *
 * The stub `opencode` written by `makeWorkspace` makes "Athena" answer with
 * `verdict: "approve"`, `single_change_ok: true` and `paper_support_ok: false`.
 * After `bootstrap` and two `tick` runs in live mode, the proposals file for
 * round 1 must name `repr.feature` (the family in the "Hermes" stub reply) as
 * the next primary hypothesis.
 */
describe("athena paper support veto", () => {
  it("switches to Hermes when Athena reports paper_support_ok=false", async () => {
    const { workspace, configPath, fakeBin } = await makeWorkspace();

    // Put the stub first on PATH using the platform's separator, and drop an
    // unset or empty PATH rather than leaving a trailing empty entry.
    const searchPath = [fakeBin, process.env.PATH ?? ""]
      .filter((entry) => entry.length > 0)
      .join(path.delimiter);
    const env = {
      ...process.env,
      PATH: searchPath,
      INNOVATION_LOOP_OPENCODE_DIR: repoRoot,
      INNOVATION_LOOP_AGENT_MODEL: "kimi-for-coding/kimi-k2.5",
      INNOVATION_LOOP_DISABLE_REAL_DVC: "1",
    };

    // Runs one controller subcommand against the temporary workspace.
    const runController = (subcommand: string) =>
      execFileAsync(
        "python3",
        [innovationLoopScript, subcommand, "--config", configPath, "--workspace", workspace, "--mode", "live"],
        { cwd: workspace, env },
      );

    await runController("bootstrap");
    await runController("tick");
    await runController("tick");

    const proposalsPath = path.join(workspace, "experiments", "proposals", "round-0001.json");
    const proposals: { next_primary_hypothesis?: { family?: unknown } } = JSON.parse(
      await fs.readFile(proposalsPath, "utf8"),
    );
    expect(proposals.next_primary_hypothesis?.family).toBe("repr.feature");
  }, 15000);
});

tests/e2e/python-controller.test.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ describe("python controller cli", () => {
179179
const resume = await runInnovationLoop(workspace, configPath, "resume");
180180
expect(resume.resumed).toBe(true);
181181
expect(resume.candidate.queued).toBe(true);
182-
}, 30000);
182+
}, 45000);
183183

184184
it("supports detached start and stop", async () => {
185185
const { workspace, configPath } = await makeWorkspace();

tests/e2e/research-brain-mock-redirect-selection.test.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ async function makeWorkspace(): Promise<{ workspace: string; configPath: string
4141
}
4242

4343
afterEach(async () => {
44-
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
44+
await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
4545
});
4646

4747
describe("research brain mock redirect selection", () => {

0 commit comments

Comments
 (0)