|
| 1 | +import { execFile } from "node:child_process"; |
| 2 | +import fs from "node:fs/promises"; |
| 3 | +import os from "node:os"; |
| 4 | +import path from "node:path"; |
| 5 | +import { fileURLToPath } from "node:url"; |
| 6 | +import { promisify } from "node:util"; |
| 7 | +import { afterEach, describe, expect, it } from "vitest"; |
| 8 | + |
// Promise-returning form of child_process.execFile so subprocess runs can be awaited.
const execFileAsync = promisify(execFile);
// Temporary workspaces created by makeWorkspace(); drained and removed by the afterEach hook.
const tempDirs: string[] = [];
// Directory two levels above the folder that contains this test file.
const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..", "..");
// Python script that the test executes as a subprocess.
const innovationLoopScript = path.join(repoRoot, "scripts", "innovation_loop.py");
| 13 | + |
/**
 * Builds a throwaway workspace under the OS temp directory for a single test.
 *
 * The new directory is recorded in `tempDirs` immediately after creation, then populated
 * with `configs/`, `src/`, `data/`, a copy of the repository's `fixtures/kb/vault` tree,
 * seed source/data files, `configs/research_brain.yaml`, `configs/goal.yaml`, and an
 * executable Python stub named `opencode` inside `fake-bin/`.
 *
 * @returns The workspace root, the path of `configs/goal.yaml`, and the `fake-bin` directory.
 */
async function makeWorkspace(): Promise<{ workspace: string; configPath: string; fakeBin: string }> {
  const root = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-athena-veto-"));
  // Register before doing anything else so a failure further down still gets cleaned up.
  tempDirs.push(root);

  const inRoot = (...segments: string[]): string => path.join(root, ...segments);
  const writeText = (target: string, content: string): Promise<void> => fs.writeFile(target, content, "utf8");

  for (const dirName of ["configs", "src", "data"]) {
    await fs.mkdir(inRoot(dirName), { recursive: true });
  }
  await fs.cp(path.join(repoRoot, "fixtures", "kb", "vault"), inRoot("vault"), { recursive: true });
  const stubDir = inRoot("fake-bin");
  await fs.mkdir(stubDir, { recursive: true });

  // Seed project files.
  await writeText(inRoot("src", "config.json"), `${JSON.stringify({ objective_mode: "baseline" }, null, 2)}\n`);
  await writeText(inRoot("src", "strategy.txt"), "baseline\n");
  await writeText(inRoot("src", "module.ts"), "export const variant = 0;\n");
  await writeText(inRoot("data", "observations.csv"), "split,value\ntrain,1\n");
  await writeText(inRoot("evaluate.py"), "print(0.8)\n");

  const researchBrainYaml = [
    `vault_root: ${inRoot("vault")}`,
    "index_output_dir: experiments/research/index",
    "retrieval_cache_dir: experiments/research/retrieval-cache",
    "evidence_output_dir: experiments/research",
    "feedback_output: experiments/research/paper-feedback.jsonl",
    "posterior_rank_output: experiments/research/posterior-rank.json",
    "paper_id_map_output: experiments/research/paper-id-map.jsonl",
    "frontier_map_output: experiments/research/index/frontier-map.json",
  ];
  await writeText(inRoot("configs", "research_brain.yaml"), researchBrainYaml.join("\n") + "\n");

  // Python stub: reads the value following `--agent` and prints one canned JSON document
  // per agent name (Apollo, Hermes, Athena, sisyphus-junior), or {"ok": true} otherwise.
  // The Athena payload approves the proposal but sets paper_support_ok to False.
  const stubSource = [
    "#!/usr/bin/env python3",
    "import json, sys",
    "args = sys.argv[1:]",
    'agent = args[args.index("--agent") + 1] if "--agent" in args else None',
    'if agent == "Apollo":',
    '    print(json.dumps({"choice":"objective","title":"weak-support","family":"objective.loss","innovation_tags":["objective"],"mechanism":"对目标函数做正则化,预期先改善中间稳定性指标,再影响目标指标。","files_to_touch":["src/config.json"],"expected_gain":0.02,"risk":"low","why_not_parameter_only":"changes objective family","minimal_ablation":["revert objective"],"paper_grounding":[{"paper_id":"doi:10.1145/3718958.3750493"},{"paper_id":"doi:10.1145/3711896.3736964"}],"redirect_if_underperforming":"切换到表征路线","causal_metric_path":["loss_shape","optimization_stability","target_metric"],"failure_signature":"loss path stalled","pivot_after_failure":"repr.feature"}))',
    'elif agent == "Hermes":',
    '    print(json.dumps({"choice":"representation","title":"orthogonal","family":"repr.feature","innovation_tags":["representation"],"mechanism":"对表征层做重塑,预期先改变表征判别性,再影响目标指标。","files_to_touch":["src/strategy.txt"],"expected_gain":0.01,"risk":"medium","why_not_parameter_only":"changes representation path","minimal_ablation":["revert strategy"],"paper_grounding":[{"paper_id":"doi:10.1145/3711896.3736964"},{"paper_id":"paper:arxiv:2024:ffffeeee11"}],"redirect_if_underperforming":"切换到架构路线","causal_metric_path":["representation_quality","separation_margin","target_metric"],"failure_signature":"repr path stalled","pivot_after_failure":"arch.backbone"}))',
    'elif agent == "Athena":',
    '    print(json.dumps({"verdict":"approve","validity_risks":[],"smallest_repair":None,"single_change_ok":True,"paper_support_ok":False,"redirect_if_underperforming":"切换到表征路线","failure_signature":"paper support weak","causal_metric_path":["loss_shape","optimization_stability","target_metric"]}))',
    'elif agent == "sisyphus-junior":',
    '    print(json.dumps({"touched_files":["src/strategy.txt"],"diff_summary":"redirected change","change_manifest":{"primary_object":"representation","secondary_objects":[]}}))',
    "else:",
    '    print(json.dumps({"ok": True}))',
    "", // keeps the trailing newline at the end of the script
  ].join("\n");
  const stubPath = path.join(stubDir, "opencode");
  await writeText(stubPath, stubSource);
  // Mark the stub executable so it can be launched through its shebang line.
  await fs.chmod(stubPath, 0o755);

  const goalConfigPath = inRoot("configs", "goal.yaml");
  const goalYaml = ['goal_text: "test"', 'target_metric: "score"', 'metric_direction: "maximize"'];
  await writeText(goalConfigPath, goalYaml.join("\n") + "\n");

  return { workspace: root, configPath: goalConfigPath, fakeBin: stubDir };
}
| 52 | + |
// After every test, empty the shared registry and delete each recorded workspace in parallel.
afterEach(async () => {
  // splice(0) hands back every registered directory and leaves the registry empty.
  const pendingDirs = tempDirs.splice(0);
  const removals = pendingDirs.map((dir) =>
    // force: a directory that is already gone is not an error; the retry options make
    // fs.rm re-attempt the removal up to 10 times, 200 ms apart, on transient failures.
    fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 }),
  );
  await Promise.all(removals);
});
| 56 | + |
describe("athena paper support veto", () => {
  /**
   * Runs `bootstrap` followed by two `tick` invocations of the innovation-loop script in
   * a fresh workspace, with the stub `opencode` directory placed first on PATH, then
   * checks the family recorded in `experiments/proposals/round-0001.json`.
   *
   * The Athena stub approves the proposal but reports `paper_support_ok: False`; the
   * expected family `repr.feature` is the one carried by the Hermes stub's proposal.
   */
  it("switches to Hermes when Athena reports paper_support_ok=false", async () => {
    type ProposalsFile = { next_primary_hypothesis?: { family?: string } };

    const { workspace, configPath, fakeBin } = await makeWorkspace();

    // Prepend the stub directory using the platform's PATH separator. When PATH is unset,
    // avoid a trailing empty entry, which POSIX shells treat as the current directory.
    const inheritedPath = process.env.PATH;
    const searchPath = inheritedPath ? `${fakeBin}${path.delimiter}${inheritedPath}` : fakeBin;
    const env = {
      ...process.env,
      PATH: searchPath,
      INNOVATION_LOOP_OPENCODE_DIR: repoRoot,
      INNOVATION_LOOP_AGENT_MODEL: "kimi-for-coding/kimi-k2.5",
      INNOVATION_LOOP_DISABLE_REAL_DVC: "1",
    };

    // Every invocation shares the same flags; only the sub-command differs.
    const runLoop = (command: "bootstrap" | "tick") =>
      execFileAsync(
        "python3",
        [innovationLoopScript, command, "--config", configPath, "--workspace", workspace, "--mode", "live"],
        { cwd: workspace, env },
      );

    await runLoop("bootstrap");
    // NOTE(review): the round-0001 proposals are inspected only after a second tick;
    // presumably the veto is applied on that tick — confirm against innovation_loop.py.
    await runLoop("tick");
    await runLoop("tick");

    const proposalsPath = path.join(workspace, "experiments", "proposals", "round-0001.json");
    const proposals: ProposalsFile = JSON.parse(await fs.readFile(proposalsPath, "utf8"));
    // Optional chaining turns a missing hypothesis into an assertion mismatch instead of a TypeError.
    expect(proposals.next_primary_hypothesis?.family).toBe("repr.feature");
  }, 15000);
});
0 commit comments