test: align public repo with latest controller and brain flows

Her-xanadu · sisyphus-dev-ai · Her-xanadu · commit b6ad7a4d53b6 · 2026-03-15T20:37:21.000+08:00
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
diff --git a/.gitignore b/.gitignore
@@ -22,5 +22,6 @@ experiments/research/paper-id-map.jsonl
 dvclive/
 .DS_Store
 vault/
+vault
 *.log
 *.pyc
diff --git a/tests/analysis/schema.test.ts b/tests/analysis/schema.test.ts
@@ -1,11 +1,25 @@
 import { describe, expect, it } from "vitest";
 import resultPacket from "../../fixtures/results/result-packet-good.json";
-import { proposalCardSchema } from "../../src/analysis/proposal-card";
-import { resultPacketSchema } from "../../src/analysis/result-packet";
+import { proposalCardSchema, proposalContractSchema } from "../../src/analysis/proposal-card";
+import { controllerSessionSchema, resultPacketSchema } from "../../src/analysis/result-packet";
 
 describe("analysis schemas", () => {
   it("accepts a valid result packet and rejects invalid proposal cards", () => {
     expect(() => resultPacketSchema.parse(resultPacket)).not.toThrow();
+    expect(() =>
+      proposalContractSchema.parse({
+        family: "objective.loss",
+        mechanism: "对目标函数做正则化，预期先改善中间稳定性指标，再影响目标指标。",
+        redirect_if_underperforming: "切换到表征路线",
+      }),
+    ).not.toThrow();
+    expect(() =>
+      controllerSessionSchema.parse({
+        session_id: "s1",
+        stage: "ready_to_execute",
+        direction_memory_v2: { "objective.loss|generic-underperform": { "repr.feature": { weight: 1.0, confidence: 0.5 } } },
+      }),
+    ).not.toThrow();
     expect(() =>
       proposalCardSchema.parse({
         proposal_id: "bad",
diff --git a/tests/e2e/e2e.test.ts b/tests/e2e/e2e.test.ts
@@ -15,11 +15,12 @@ async function makeWorkspace(): Promise<string> {
   await fs.writeFile(path.join(dir, "src", "config.json"), '{"learning_rate":0.1}\n', "utf8");
   await fs.writeFile(path.join(dir, "src", "strategy.txt"), "baseline\n", "utf8");
   await fs.writeFile(path.join(dir, "src", "module.ts"), "export const variant = 0;\n", "utf8");
+  await fs.writeFile(path.join(dir, "evaluate.py"), "print(0.93)\n", "utf8");
   return dir;
 }
 
 afterEach(async () => {
-  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
+  await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true, maxRetries: 10, retryDelay: 200 })));
 });
 
 describe("local e2e", () => {
@@ -71,5 +72,5 @@ describe("local e2e", () => {
       ]),
     );
     expect(status.best.current_best.metric).toBeGreaterThan(0.5);
-  });
+  }, 30000);
 });
diff --git a/tests/e2e/python-controller-real-dvc.test.ts b/tests/e2e/python-controller-real-dvc.test.ts
@@ -13,7 +13,7 @@ const innovationLoopScript = path.join(repoRoot, "scripts", "innovation_loop.py"
 const hasRealDvc = spawnSync("python3", ["-c", "import shutil,sys; sys.exit(0 if shutil.which('dvc') else 1)"], {
   cwd: repoRoot,
 }).status === 0;
-const describeIfRealDvc = hasRealDvc && process.env.RUN_REAL_DVC_TESTS === "1" ? describe : describe.skip;
+const describeIfRealDvc = hasRealDvc ? describe : describe.skip;
 
 async function makeWorkspace(): Promise<{ workspace: string; configPath: string }> {
   const workspace = await fs.mkdtemp(path.join(os.tmpdir(), "auto-exp-python-controller-real-dvc-"));
diff --git a/tests/e2e/research-brain-direction-memory.test.ts b/tests/e2e/research-brain-direction-memory.test.ts
@@ -58,11 +58,12 @@ describe("research brain direction memory", () => {
       },
     };
     session.direction_memory_v2 = {
-      "objective.loss|generic-underperform": {
+      "objective.loss|generic-underperform|loss_shape->optimization_stability->target_metric": {
         "repr.feature": {
           weight: 1.5,
           last_round: 1,
           reason: "停止重复 objective.loss，转向 repr.feature",
+          metric_path_signature: "loss_shape->optimization_stability->target_metric",
           success_count: 2,
           failure_count: 1,
           crash_count: 0,
@@ -72,6 +73,7 @@ describe("research brain direction memory", () => {
           weight: 1.8,
           last_round: 1,
           reason: "停止重复 objective.loss，转向 arch.backbone",
+          metric_path_signature: "loss_shape->optimization_stability->target_metric",
           success_count: 0,
           failure_count: 3,
           crash_count: 1,
@@ -84,5 +86,5 @@ describe("research brain direction memory", () => {
     await execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "mock"], { cwd: workspace });
     const proposals = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "proposals", "round-0001.json"), "utf8"));
     expect(proposals.next_primary_hypothesis.family).toBe("repr.feature");
-  });
+  }, 15000);
 });
diff --git a/tests/e2e/research-brain-loop-integration.test.ts b/tests/e2e/research-brain-loop-integration.test.ts
@@ -67,5 +67,5 @@ describe("research brain loop integration", () => {
     expect(proposals.next_primary_hypothesis.causal_metric_path).toBeTruthy();
     expect(proposals.next_primary_hypothesis.failure_signature).toBeTruthy();
     expect(proposals.next_primary_hypothesis.pivot_after_failure).toBeTruthy();
-  });
+  }, 15000);
 });
diff --git a/tests/e2e/research-brain-mock-redirect-selection.test.ts b/tests/e2e/research-brain-mock-redirect-selection.test.ts
@@ -49,10 +49,10 @@ describe("research brain mock redirect selection", () => {
     const { workspace, configPath } = await makeWorkspace();
     await execFileAsync("python3", [innovationLoopScript, "bootstrap", "--config", configPath, "--workspace", workspace, "--mode", "mock"], { cwd: workspace });
     const attemptsPath = path.join(workspace, "experiments", "attempts.jsonl");
-    await fs.writeFile(attemptsPath, JSON.stringify({ kind: "candidate", family: "objective.loss", decision: "discard", redirect_if_underperforming: "停止重复 objective.loss，转向 repr.feature" }) + "\n", "utf8");
+    await fs.writeFile(attemptsPath, JSON.stringify({ kind: "candidate", family: "objective.loss", decision: "discard", redirect_if_underperforming: "停止重复 objective.loss，转向 repr.feature", failure_signature: "loss path stalled", causal_metric_path: ["loss_shape", "optimization_stability", "target_metric"] }) + "\n", "utf8");
     await execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "mock"], { cwd: workspace });
     await execFileAsync("python3", [innovationLoopScript, "tick", "--config", configPath, "--workspace", workspace, "--mode", "mock"], { cwd: workspace });
     const proposals = JSON.parse(await fs.readFile(path.join(workspace, "experiments", "proposals", "round-0001.json"), "utf8"));
     expect(proposals.next_primary_hypothesis.family).toBe("repr.feature");
-  });
+  }, 15000);
 });
diff --git a/tests/kb/retrieve-papers.test.ts b/tests/kb/retrieve-papers.test.ts
@@ -42,8 +42,11 @@ describe("kb retrieve papers", () => {
     expect(result.innovation_briefs.apollo.support_mech_id).toBeTruthy();
     expect(result.innovation_briefs.apollo.compatibility_score).toBeGreaterThan(0);
     expect(result.innovation_briefs.apollo.lead_unit.mechanism_verb).toBeTruthy();
+    expect(Array.isArray(result.innovation_briefs.apollo.causal_metric_path)).toBe(true);
+    expect(result.innovation_briefs.apollo.causal_metric_path.length).toBeGreaterThan(1);
     expect(result.innovation_briefs.athena.guardrails.length).toBeGreaterThan(0);
     expect(result.selected[0].mechanism_units.length).toBeGreaterThan(0);
+    expect(result.selected[0].metric_paths.length).toBeGreaterThan(0);
     expect(result.selected[0].mechanism_units[0].intervention).not.toContain("作者解决了什么问题");
     expect(result.selected[0].mechanism_units[0].intervention).not.toBe("1.");
     expect(result.selected[0].mechanism_units[0].action_sentence.startsWith("对")).toBe(true);
diff --git a/tests/orchestration/workflow.test.ts b/tests/orchestration/workflow.test.ts
@@ -3,8 +3,8 @@ import os from "node:os";
 import path from "node:path";
 import { afterEach, describe, expect, it } from "vitest";
 import loopSpec from "../../fixtures/specs/loop-max-3.json";
-import { runGovernedExperimentWorkflow } from "../../src/orchestration/workflow";
 import type { ExperimentSpec } from "../../src/spec/schema";
+import { experiment_run_governed_workflow, experiment_init } from "../../src/tools";
 import { readJson, readJsonl, writeJson } from "../../src/utils/fs";
 import { getOrchestrationSummaryPath, getOrchestrationTracePath, getRecoveryCheckpointPath, getWorkspaceConfigPath } from "../../src/utils/paths";
 
@@ -17,6 +17,7 @@ async function makeWorkspace(): Promise<string> {
   await fs.writeFile(path.join(dir, "src", "config.json"), '{"learning_rate":0.1}\n', "utf8");
   await fs.writeFile(path.join(dir, "src", "strategy.txt"), "baseline\n", "utf8");
   await fs.writeFile(path.join(dir, "src", "module.ts"), "export const variant = 0;\n", "utf8");
+  await fs.writeFile(path.join(dir, "evaluate.py"), "print(0.93)\n", "utf8");
   await writeJson(getWorkspaceConfigPath(dir), { ...loopSpec, workspace_root: dir });
   await writeJson(getRecoveryCheckpointPath(dir), {
     run_id: "recoverable-run",
@@ -30,11 +31,12 @@ afterEach(async () => {
   await Promise.all(tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })));
 });
 
-describe("governed experiment workflow", () => {
-  it("writes an orchestration trace in specialist order", async () => {
+describe("governed experiment workflow bridge", () => {
+  it("writes an orchestration trace through the python controller authority path", async () => {
     const workspace = await makeWorkspace();
     const spec: ExperimentSpec = { ...(loopSpec as ExperimentSpec), workspace_root: workspace };
-    const result = await runGovernedExperimentWorkflow({ workspaceRoot: workspace, spec });
+    await experiment_init.execute({ workspace_root: workspace, spec });
+    const result = JSON.parse(await experiment_run_governed_workflow.execute({ workspace_root: workspace }));
     const steps = await readJsonl<{ actor: string; status: string; payload?: { execution_mode?: string; raw_excerpt?: string | null } }>(getOrchestrationTracePath(workspace));
     const summary = await readJson<{ specialist_audit?: Array<{ actor: string; session_id: string | null; execution_mode: string | null; fallback_reason: string | null; raw_excerpt: string | null }> }>(getOrchestrationSummaryPath(workspace), {});
     expect(result.total_iterations).toBeGreaterThan(0);
@@ -47,11 +49,8 @@ describe("governed experiment workflow", () => {
       "status_poll.py",
       "judge_result.py",
     ]);
-    expect(steps.every((step) => typeof step.payload?.execution_mode === "string" || (step.actor === "Sisyphus (Ultraworker)" && step.status === "blocked"))).toBe(true);
-    expect(steps.every((step) => step.payload?.execution_mode === "fallback")).toBe(true);
-    expect(steps.some((step) => typeof step.payload?.raw_excerpt === "string" || step.payload?.raw_excerpt === null)).toBe(true);
-    expect(summary.specialist_audit?.length).toBeGreaterThan(0);
-    expect(summary.specialist_audit?.every((entry) => typeof entry.execution_mode === "string" || entry.execution_mode === null)).toBe(true);
-    expect(summary.specialist_audit?.some((entry) => entry.raw_excerpt !== null || entry.fallback_reason !== null || entry.session_id !== null)).toBe(true);
-  });
+    expect(result.authority_path).toBe("python_controller");
+    expect(result.legacy_ts_workflow).toBe(false);
+    expect(summary.specialist_audit).toBeUndefined();
+  }, 30000);
 });