diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..b0a1c0e3 --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,87 @@ +name: E2E (cross-agent matrix) + +# Manual trigger only. This workflow spawns real agent CLIs against real +# provider APIs and a dedicated Deeplake test workspace — every run costs +# real money and takes ~10 minutes. We deliberately do NOT run it on +# every PR; the source + bundle byte-checks in `npm test` keep gating +# merges. Use this workflow as a release-readiness gate by triggering it +# manually from the Actions tab against your feature branch. + +on: + workflow_dispatch: + inputs: + case_filter: + description: "Only run this case id (e.g. 01-capture-smoke). Leave blank for all." + required: false + type: string + agent_filter: + description: "Only run this agent id (e.g. claude-code). Leave blank for all." + required: false + type: string + +permissions: + contents: read + +jobs: + e2e: + name: Tier-1 cross-agent matrix + runs-on: ubuntu-latest + timeout-minutes: 30 + # Gate the job on creds being present. Forks without the e2e secret + # see a clean skip in the Actions UI rather than a misleading red. + if: ${{ github.event.repository.full_name == 'activeloopai/hivemind' }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install dependencies + run: npm install + + - name: Build bundles + # The harness drives the actual bundles for codex/cursor/hermes/pi + # (claude-code uses --plugin-dir against the source tree). Without + # build, `hivemind install` would copy stale or missing + # bundle files into the tmp HOME. + run: npm run build + + - name: Install agent CLIs + # Each tier-1 agent CLI must be on PATH for its driver to spawn. + # We install the npm-distributed CLIs here; cursor-agent and + # hermes are typically installed via the agent vendor's own + # installer outside the npm ecosystem. 
If those binaries are + # not on a CI runner, their driver will fail with a clear + # "spawn error" and the matrix continues. + run: | + npm install -g @anthropic-ai/claude-code @openai/codex + # Pi ships via npm too. + npm install -g @piapp/cli || true + # cursor-agent and hermes — install via curl when available; + # if not, their points fail loudly rather than silently skip. + curl -fsSL https://cursor.com/install-cli.sh | bash -s -- --print 2>/dev/null || echo "cursor-agent install skipped" + # Hermes install would go here; install method varies by vendor. + which claude codex pi cursor-agent hermes 2>&1 || true + + - name: Run e2e matrix + env: + HIVEMIND_E2E_CREDS_JSON: ${{ secrets.HIVEMIND_E2E_CREDS_JSON }} + ANTHROPIC_API_KEY: ${{ secrets.HIVEMIND_E2E_ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.HIVEMIND_E2E_OPENAI_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.HIVEMIND_E2E_GOOGLE_API_KEY }} + # Dispatch inputs are handed to the script through env vars instead + # of being expanded into the script text, so a value containing + # quotes or shell metacharacters is treated as data, never as code. + CASE_FILTER: ${{ inputs.case_filter }} + AGENT_FILTER: ${{ inputs.agent_filter }} + run: | + args=() + if [ -n "$CASE_FILTER" ]; then args+=(--case "$CASE_FILTER"); fi + if [ -n "$AGENT_FILTER" ]; then args+=(--agent "$AGENT_FILTER"); fi + npm run e2e -- "${args[@]}" + + - name: Upload summary artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-summary + path: tests/e2e/results/ + if-no-files-found: warn + retention-days: 30 diff --git a/.gitignore b/.gitignore index ff3611cc..f1fcd926 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ bun.lock deploy-to-cache.sh .followups-pr97.md .followups-pr98.md +# e2e harness per-run output artifacts (summary.json + sandbox dumps) +tests/e2e/results/ diff --git a/README.md b/README.md index b24db389..66d85c40 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,13 @@ Interactive shell against Deeplake: npm run shell ``` +Cross-agent end-to-end matrix — drives all six agent runtimes (five CLI subprocess, OpenClaw via programmatic event firing) through real prompts against a Deeplake test workspace; manually triggered, not on 
every PR: + +```bash +npm run e2e # full matrix; see tests/e2e/README.md for env vars +npm run e2e -- --list # print the matrix without spawning +``` + ## License Apache License 2.0 — © Activeloop, Inc. See [LICENSE](LICENSE) for details. diff --git a/package.json b/package.json index e5ad9bfb..f82ae84e 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "cli": "tsx src/cli/index.ts", "test": "vitest run", "typecheck": "tsc --noEmit", + "e2e": "tsx tests/e2e/runner.ts", "dup": "jscpd src", "audit:openclaw": "node scripts/audit-openclaw-bundle.mjs", "pack:check": "node scripts/pack-check.mjs", diff --git a/tests/e2e/README.md b/tests/e2e/README.md new file mode 100644 index 00000000..a3bbd3aa --- /dev/null +++ b/tests/e2e/README.md @@ -0,0 +1,255 @@ +# Cross-agent E2E matrix + +This directory drives **all six** agent runtimes hivemind supports — claude-code, codex, cursor-agent, hermes, pi, openclaw — through real prompts against a real Deeplake workspace, and asserts on real side effects (DB rows, hook log lines, captured stdout, inject text, tool-call results). It's the layer that catches plugin bugs that source + bundle tests can't, like: + +- a hook bundle that imports correctly but throws at runtime under one agent's loader, +- a per-agent install path that drifted out of sync with the runtime expectation, +- a cross-agent inconsistency where claude-code returns the synthesized index but cursor-agent ENOENTs, +- a SQL escape bug in capture that silently corrupts unicode content on JSONB roundtrip, +- a missing-table self-heal regression that drops the very first capture after a fresh workspace setup. + +The matrix is **(plugin behavior × agent runtime)**. Add a new shipped behavior → add one case file → it's automatically asserted against every applicable agent. 
+ +## Agent shapes (not all six are CLIs) + +| Agent | Driver shape | How `run()` works | +|---|---|---| +| claude-code | subprocess | `claude -p --plugin-dir --allowedTools ...` | +| codex | subprocess | `codex exec -m gpt-5-mini ` | +| cursor-agent | subprocess | `cursor-agent --print --force --model gpt-5-mini` | +| hermes | subprocess | `hermes -z --provider google --yolo` | +| pi | subprocess | `pi --print --provider google --model gemini-2.5-flash` | +| **openclaw** | **programmatic** | OpenClaw is a gateway, not a CLI. Driver loads the installed plugin module from `~/.openclaw/extensions/hivemind/dist/index.js`, provides a fake `pluginApi` that captures registered handlers + tools, then fires synthetic events (`agent_end` for capture cases) or invokes registered tools directly (`hivemind_search` / `hivemind_read` for tool cases). Plugin code paths run end-to-end — only the gateway's own event parsing / multi-event ordering / concurrency are out of scope (covered by openclaw's own tests, not ours). 
| + +## Case coverage map + +Each case asserts on a specific behavioral surface, mapped back to `RELEASE_CHECKLIST.md`: + +| Case | Surface | Applies to | Skipped on (reason) | +|---|---|---|---| +| `01-capture-smoke` | One turn → one row in sessions (checklist §2 happy path) | all 6 | — | +| `02-cat-index-md` | `cat ~/.deeplake/memory/index.md` → virtual index (§4 discoverability via Read) | 5 CLI | openclaw (no bash; equivalent via `hivemind_read` in case 08) | +| `03-grep-memory-summaries` | `grep` routes through SQL fast-path with seeded sentinel (§4 search) | 5 CLI | openclaw (no bash; equivalent via `hivemind_search` in case 08) | +| `04-session-start-inject` | 3-tier text visible in agent context (§4 SessionStart inject) | 5 CLI | openclaw (different mechanism via openclaw/skills/SKILL.md) | +| `05-sql-injection-probe` | Injection payload doesn't drop the memory table (§5 SQL identifiers + strings) | all 6 | — | +| `06-missing-table-self-heal` | Lazy CREATE TABLE IF NOT EXISTS on first INSERT after drop (§6 backend quirks) | all 6 | — | +| `07-unicode-roundtrip` | Emoji + RTL + smart quotes + backslashes survive JSONB roundtrip byte-for-byte (§2 edge content) | all 6 | — | +| `08-openclaw-tools` | `hivemind_search` returns seeded sentinel via openclaw tool registration (§3 openclaw row + §4 openclaw discoverability) | openclaw | 5 CLI (they don't register MCP tools the harness invokes directly; equivalents in 02/03) | +| `09-install-no-broken-paths` | After `hivemind install`, every hook command in the resulting config file points at a file that exists on disk. Plus claude-code-only auto-heal check: pre-seeded broken entry was removed by `cleanupBrokenSettingsHooks`. Install-shape (no agent spawn). 
| 4 hooks-config agents | pi (TS extension ref, no command paths) / openclaw (gateway loader, no hooks.json) | +| `10-invalid-identifier-rejection` | `HIVEMIND_SESSIONS_TABLE=bad-name-with-dashes` → `sqlIdent()` rejects → no SQL fires → no `bad-name-with-dashes` table exists in workspace afterward (§2 + §5 SQL identifiers) | all 6 | — | +| `11-path-traversal-rejection` | `cat ~/.deeplake/memory/../../../../etc/passwd` → virtual mount rewrite rejects/blocks; agent's stdout does NOT contain `/etc/passwd` shape `root:x:0:0:` (§5 path traversal) | 5 CLI | openclaw (different tool-arg validation path; would need a dedicated case) | +| `12-recursion-guard` | `HIVEMIND_WIKI_WORKER=1` pre-set in agent env → session-end wiki worker short-circuits → no summary row lands in memory table (§5 recursion guards) | 5 CLI | openclaw (in-band worker, different pattern) | + +Total: **72 matrix points** (60 live, 12 explicitly skipped with rationale). + +### Why case 09 matters specifically + +Case 09 is the matrix's answer to a destructive hotfix that shipped to npm: PR #128 added a `syncHivemindHooksToSettings()` helper that wrote hardcoded path entries into `~/.claude/settings.json` for marketplace-only users — every hook ENOENT'd at session start. Shipped in 0.7.23 / 0.7.24, hotfixed in PR #166. Case 09 runs the real `hivemind install` flow in a clean tmp HOME and walks the resulting config: any command pointing at a nonexistent file fails the assertion. Plus the claude-code-only auto-heal sub-assertion pre-seeds a known-broken entry and verifies `cleanupBrokenSettingsHooks` removed it. + +Earlier cases (`01-capture-smoke` etc.) didn't catch this because the claude-code driver uses `claude --plugin-dir` for runtime cases — that bypasses the install flow entirely. Case 09 is install-shape (`installOnly: true`) and triggers the real installer subprocess to exercise the path PR #128 broke. + +## Running it + +**Steady state: one command.** + +```bash +npm run e2e +``` + +That's it. 
The runner auto-resolves credentials (operator's logged-in state or `HIVEMIND_E2E_CREDS_JSON`), auto-builds `bundle/cli.js` if it's missing, auto-skips any agent with a missing provider key, and DELETEs the rows it wrote before exiting. No separate `npm install` / `npm run build` / "did I switch workspace?" steps. + +**Other invocations:** + +```bash +# Print the matrix without spawning anything (free, no creds needed) +npm run e2e -- --list + +# Single case across all agents — narrow the blast radius +npm run e2e -- --case 02-cat-index-md + +# Single agent across all cases +npm run e2e -- --agent claude-code + +# Single point — fastest dev loop, ~$0.01-0.05 +npm run e2e -- --case 01-capture-smoke --agent claude-code + +# Leave tmp HOMEs on disk for inspection +npm run e2e -- --keep-sandbox + +# Skip the auto-build (when iterating on the harness itself and the bundle is current) +HIVEMIND_E2E_SKIP_BUILD=1 npm run e2e +``` + +Test workspace resolution is **automatic** — two modes, evaluated in order: + +1. **CI / explicit** (`HIVEMIND_E2E_CREDS_JSON` env var is set): the value is parsed as a full credentials.json blob. Highest priority; no API lookup. This is how CI runs it. +2. **Local / derive from operator** (default for devs): the harness reads your `~/.deeplake/credentials.json`, keeps the token + orgId, and resolves a fresh workspaceId by **name** from the workspace named `hivemind_e2e_test` (override with `HIVEMIND_E2E_WORKSPACE_NAME`). Your real credentials.json is **read-only** — the harness never calls `hivemind workspace ` or otherwise persists a workspace switch, so a mid-run crash can't leave you on the wrong workspace. + +If both fail (no creds blob AND no logged-in operator AND no matching workspace), the runner exits 2 with a clear message describing what's missing. + +Other env vars: + +- `ANTHROPIC_API_KEY` — needed for claude-code's points (others skip cleanly). +- `OPENAI_API_KEY` — needed for codex + cursor-agent. 
+- `GOOGLE_API_KEY` — needed for hermes + pi. +- `HIVEMIND_E2E_WORKSPACE_NAME` — override the default `hivemind_e2e_test` workspace name (mode 2 only). +- `HIVEMIND_E2E_TABLE_SUFFIX` — appended to sessions/memory table names (e.g. `sessions_`). Use this only if the e2e workspace deliberately has per-dev tables; concurrent runs do NOT collide on row paths because every session_id embeds a unique runId timestamp (see `sandbox.ts:buildSessionId`). + +A missing provider key results in a **skip** (not a failure) for that agent's points, with the reason printed inline. The exit code stays 0 unless an actually-run point fails an assertion. + +### One-time setup (local mode) + +1. `hivemind login` against the org that owns the `hivemind_e2e_test` workspace. +2. Confirm `hivemind workspaces` shows `hivemind_e2e_test` in the list. If it doesn't, ask an admin to create it. Don't run e2e against your real working workspace — the harness DELETEs rows by session_id on cleanup and that's catastrophic for a real workspace. +3. Run `npm run e2e -- --list` to confirm the harness picks up the matrix. Then `npm run e2e -- --case 01-capture-smoke --agent claude-code` for the fastest live smoke. + +### One-time setup (CI mode) + +1. Provision the `hivemind_e2e_test` workspace as above. +2. Generate a credentials.json blob pointed at it (e.g. via `hivemind login` on a throwaway machine). +3. Save the blob as the `HIVEMIND_E2E_CREDS_JSON` GH secret, plus the provider keys as `HIVEMIND_E2E_ANTHROPIC_API_KEY` etc. + +### In CI + +Trigger `.github/workflows/e2e.yml` manually from the GitHub Actions tab, optionally with the `case_filter` / `agent_filter` inputs. There is **no schedule and no PR trigger** — every run costs money and burns ~10 minutes; we run it as a release-readiness gate, not as a per-PR gate. The unit/source/bundle tests in `npm test` keep gating merges. 
+ +## How a case works + +**Cases are auto-discovered.** Drop a new file in `tests/e2e/cases/` and the next `npm run e2e` runs it against every applicable agent — no `matrix.ts` edit, no registration step. + +Each case file exports one `E2ECase` object as its **default export**: + +```ts +// tests/e2e/cases/13-my-behavior.ts +import type { E2ECase } from "../types.js"; + +const myCase: E2ECase = { + id: "13-my-behavior", + description: "what this case asserts about the plugin", + prompt: "instruct the agent to do something that exercises the hook", + // optional: seed test data the agent will retrieve + async setup(ctx) { + // ctx.creds is a configured DeeplakeApi target + // ctx.sessionId is unique to this (case, agent, runId) + }, + assertions: [ + { type: "hook-log-contains", substring: "what the hook logs when this fires" }, + { type: "stdout-contains", substring: "what the agent says when it works" }, + { + type: "select-from-db", + sql: ({ ctx, run }) => `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}" WHERE path ILIKE '%${run.sessionId}%'`, + expect: (rows) => { if (Number(rows[0].n) < 1) throw new Error("no rows"); }, + }, + // Escape hatch for assertions that don't fit the four typed shapes + // (filesystem checks, per-agent config walks, etc.): + { type: "custom", label: "X", check: async ({ ctx, run }) => null /* or failure string */ }, + ], + // optional: this case doesn't apply to these agents (rationale required) + skipFor: ["pi"], // pi doesn't ship the X bundle; rationale here + // optional: install-shape case — runner skips driver.run() and goes + // straight from setup() to assertions. No model API call. + installOnly: false, +}; + +export default myCase; +``` + +**Discovery rules:** + +- File lives directly under `tests/e2e/cases/` (no nesting). +- File name ends in `.ts` and starts with a digit (`13-foo.ts`) so it sorts deterministically. +- File MUST `export default` the case object. 
+- The default export MUST satisfy the `E2ECase` shape (id, prompt, assertions[]). + +Files that don't satisfy the rules are skipped with a one-line stderr warning — a half-written case in the directory won't break the matrix. + +## How a driver works + +Each file in `agents/` exports one `AgentDriver` object: + +```ts +export const myAgentDriver: AgentDriver = { + id: "my-agent", + async install(home, repoRoot) { + // copy the bundle into the agent's directory under `home`, write any config file + }, + async run(prompt, opts) { + // spawn the real CLI with HOME=opts.home + HIVEMIND_DEBUG=1 + // forward opts.providerEnv to the spawn env + // return { stdout, stderr, exitCode, sessionId, costCents, durationMs } + }, +}; +``` + +Drivers are 50–80 lines each. `runProcess` in `agents/claude-code.ts` is exported and reusable — most drivers just compose the right argv + env and delegate. + +Assertions are **not** a driver concern. Drivers don't know what the case wants; they just spawn and capture. + +## How session_id flows + +1. Harness generates a deterministic **seed** session_id `e2e-<case>-<agent>-<runId>` (see `sandbox.ts:buildSessionId`). +2. The seed goes into the spawn so cleanup can find rows even if the agent didn't print its own session_id. +3. The agent generates its own UUID session_id at start. Driver reads it from `hook-debug.log` via the `session=<id>` line every hivemind hook writes. +4. Assertions use `run.sessionId` (the real one). +5. Cleanup uses `run.sessionId` (or falls back to the seed if discovery failed). + +## How cleanup works + +After each case: + +1. Runner calls `cleanupSessionRows(ctx, run.sessionId)` — DELETEs from `sessions` + `memory` where path ILIKE `%<sessionId>%`. +2. The tmp HOME is rm-rf'd unless `--keep-sandbox` was passed. +3. Cleanup failures are warned but **don't fail the case** — a leftover row is a small workspace-debris cost, not a signal we want to gate on. 
+ +A daily cron in the test workspace sweeps `WHERE creation_date < now() - interval '24h' AND agent ILIKE 'e2e-%'` as belt-and-suspenders against killed runs. + +## Coverage today + growth target + +The matrix ships with **12 cases** covering each major behavioral surface in `RELEASE_CHECKLIST.md` §2 / §3 / §4 / §5 / §6 that an e2e harness can deterministically assert on. As new features ship, **every new behavioral surface should add a case** — adding one is a single file in `tests/e2e/cases/` (cases are auto-discovered, so no `matrix.ts` edit is needed); the matrix runs it against every applicable agent automatically. + +A new behavior without a matrix case is the same situation as a new code path without a unit test — fine for a one-off, a slow leak in coverage at scale. + +### What the matrix does NOT cover (and shouldn't) + +Some checklist items aren't e2e-deterministic by nature: + +- **§6 UPDATE coalescing** — two rapid UPDATEs on the same row drop one silently with `row_count: 0`. Reproducing this in a deterministic test requires precise timing in a single connection; covered by unit tests around the affected helpers, not the agent runtime. +- **§3 async hook completion timing** — `claude -p` doesn't block on the Stop hook, so post-exit async work can be killed mid-flight. Asserting on "the row landed *after* the parent exited" is a race that doesn't reliably reproduce on CI hardware. Best handled at source level with timing-aware fakes. +- **§3 per-agent CLI dispatch model name** — "did claude get `haiku-3-5` and codex get `gpt-5-codex-mini`" is a dispatch-config check, not a runtime assertion. Covered by source tests that scan the agent's argv. +- **§1 / §8 unit + bundle scans** — by design, those are the `npm test` layer's job. The e2e matrix is for cross-agent runtime behavior, not bundle byte-checks. + +These are documented here so future contributors don't add a brittle case for a problem unit tests can solve more reliably. 
+ +## Why this isn't run on every PR (yet) + +Three reasons stand today: + +1. **Cost** — every run is ~$1.50 in provider API calls at 4 cases × 5 agents. PR-gating × dozens of PRs/day = real money. +2. **Flake surface** — upstream agent CLIs change flag shapes between minor releases. A PR unrelated to e2e would gate-fail because hermes 1.4.3 renamed `--yolo`. +3. **Wall time** — ~10 minutes at current case count vs the 23-second `npm test`. Slows the merge loop for marginal incremental value while coverage is thin. + +**Promotion criteria.** When the matrix has (a) stable coverage across a week of clean manual runs, (b) at least one case per major behavioral surface, and (c) a flake budget < 5% over that week, promote the workflow trigger from `workflow_dispatch` to PR-gating with a path filter on `src/hooks/**` / `openclaw/src/**` / bundle outputs. Mirrors how `npm test` + coverage thresholds gate today; the matrix becomes the equivalent gate for cross-agent behavior. That promotion lives in its own PR, with the cadence flip documented in the cost summary of a representative week of nightlies. + +Until then, run it manually before any release — the harness is the canonical replacement for the multi-hour cross-agent test pass. + +## What this matrix does NOT cover + +- **Cursor IDE GUI inside Snap** — a fundamentally different runtime (graphical session, snap sandbox); needs a long-lived test VM + Xvfb. Out of scope for an in-repo harness. Bugs that only surface in the GUI runtime (cursor-snap detached spawns, GUI-only auth flows) belong in a separate manual or VM-based pipeline. +- **Pure source-level logic** — tests that don't actually need an agent runtime stay as vitest unit tests in `claude-code/tests/`. Don't pad the matrix with cases the agent runtime adds no signal to (see "What the matrix does NOT cover" earlier in this doc for specific examples). +- **Model-quality regression** — we test what the *plugin* does, not what the model says. 
Asserting "agent gave a good answer" is out of scope; that's a separate evaluation problem with a separate tool. + +## OpenClaw driver caveats + +The openclaw driver loads the installed plugin module and fires events programmatically rather than spinning up a real gateway. What this exercises: + +- Hook handler code (`agent_end` capture, `before_prompt_build` inject, etc.) end-to-end against the real Deeplake API. +- Plugin tool registration and `execute()` paths (`hivemind_search`, `hivemind_read`, `hivemind_index`). +- Install-side surface (the plugin lands at the expected path with the expected files). + +What it doesn't exercise: + +- The gateway's own event parser (the way upstream agent_end payloads are deserialized). +- Multi-event ordering across concurrent sessions. +- Real gateway lifecycle (boot, ready signal, shutdown). + +Those gateway-side concerns have their own tests in the openclaw repo. If a future bug class lives specifically in the gateway↔plugin seam, add a dedicated case here that spawns the gateway as a subprocess — the harness is structured to accept that without changing its public shape. diff --git a/tests/e2e/agents/claude-code.ts b/tests/e2e/agents/claude-code.ts new file mode 100644 index 00000000..aa9cd88e --- /dev/null +++ b/tests/e2e/agents/claude-code.ts @@ -0,0 +1,149 @@ +/** + * Claude Code driver. + * + * No install step needed: `claude --plugin-dir ` loads hivemind + * for the session only. That keeps the sandbox tight — no permanent + * plugin install / marketplace fetch, no global state outside HOME. + * + * Session_id is generated by Claude Code at session start and isn't + * exposed via stdout in a stable format. We read it out of the + * hook-debug.log after the run; HIVEMIND_DEBUG=1 is set unconditionally + * by the harness so this is guaranteed to exist. 
+ */ + +import { spawn } from "node:child_process"; +import { readFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import type { AgentDriver, RunOpts, RunResult } from "../types.js"; +import { parseCostCents } from "../cost.js"; + +const SESSION_LINE = /session=([0-9a-f-]{8,})/i; + +/** Driver that runs `claude -p` with `<repoRoot>/claude-code` passed as `--plugin-dir`. */ +export const claudeCodeDriver: AgentDriver = { + id: "claude-code", + providerKey: "ANTHROPIC_API_KEY", + async install(_home, _repoRoot) { + // No-op: --plugin-dir handles loading per session. We deliberately + // do NOT go through `claude plugin marketplace add` here — that + // mutates global state outside HOME (marketplace cache, plugin + // registry) and would require network access to GitHub even when + // the test workspace lives elsewhere. + }, + async run(prompt, opts): Promise<RunResult> { + const pluginDir = join(opts.repoRoot, "claude-code"); + const args = [ + "-p", + "--plugin-dir", + pluginDir, + // Read-only run: don't allow Edit/Write — keeps a flaky model from + // touching the sandbox in unexpected ways. The hivemind capture and + // pre-tool-use hooks fire regardless of whether the model uses tools. + "--allowedTools", + "Bash,Read", + "--model", + "claude-haiku-4-5", + prompt, + ]; + const env: NodeJS.ProcessEnv = { + ...process.env, + HOME: opts.home, + // Hivemind reads this in every hook to write hook-debug.log; the + // session-id extraction in this driver depends on it being on. + HIVEMIND_DEBUG: "1", + }; + if (opts.providerEnv.ANTHROPIC_API_KEY) { + env.ANTHROPIC_API_KEY = opts.providerEnv.ANTHROPIC_API_KEY; + } + return runProcess("claude", args, env, opts.timeoutMs ?? 90_000, opts.sessionId); + }, +}; + +/** + * Spawn a CLI, capture stdout/stderr, and return a RunResult. Pulled out + * of the driver so other drivers can share the spawn shape — only the + * argv / env composition differs per agent. + * + * Falls back to `seedSessionId` if no `session=<id>` line appears in + * stdout, stderr, or the hook-debug.log. 
That fallback shouldn't normally + * trigger; when it does, downstream SQL assertions will SELECT against + * a session that doesn't exist and produce a clear "0 rows" failure + * rather than a mysterious silent pass. + * + * Both pipes are decoded via `setEncoding("utf-8")`, so a multi-byte + * character that straddles two chunks is reassembled rather than being + * replaced with U+FFFD on each side of the split. + * + * @param bin Executable to spawn (resolved via PATH by `spawn`). + * @param args Argument vector passed to the executable. + * @param env Full environment for the child; `env.HOME` is also where the + * hook-debug.log is looked up. + * @param timeoutMs Wall-clock budget; the child is SIGKILLed when it elapses. + * @param seedSessionId Session id used when none can be discovered. + * @returns A promise that always resolves (never rejects); a spawn failure + * resolves with `exitCode: -1` and the error message appended to `stderr`. + */ +export function runProcess( + bin: string, + args: string[], + env: NodeJS.ProcessEnv, + timeoutMs: number, + seedSessionId: string, +): Promise<RunResult> { + return new Promise((resolve) => { + const startedAt = Date.now(); + const child = spawn(bin, args, { env, stdio: ["ignore", "pipe", "pipe"] }); + let stdout = ""; + let stderr = ""; + child.stdout.setEncoding("utf-8").on("data", (chunk: string) => { stdout += chunk; }); + child.stderr.setEncoding("utf-8").on("data", (chunk: string) => { stderr += chunk; }); + const killTimer = setTimeout(() => { + child.kill("SIGKILL"); + stderr += `\n[harness] killed after ${timeoutMs}ms wall clock`; + }, timeoutMs); + // `exit` (not `close`): resolves as soon as the process ends, without + // waiting for the stdio pipes to be closed. + child.on("exit", (code) => { + clearTimeout(killTimer); + const durationMs = Date.now() - startedAt; + const home = env.HOME ?? ""; + const sessionId = extractSessionId(stdout, stderr, home) ?? seedSessionId; + const agent = inferAgentFromBin(bin); + const costCents = parseCostCents(agent, stdout); + resolve({ + stdout, + stderr, + // `code` is null when the child was terminated by a signal. + exitCode: code ?? -1, + sessionId, + costCents, + durationMs, + }); + }); + child.on("error", (err) => { + clearTimeout(killTimer); + const durationMs = Date.now() - startedAt; + stderr += `\n[harness] spawn error: ${err.message}`; + resolve({ + stdout, + stderr, + exitCode: -1, + sessionId: seedSessionId, + costCents: null, + durationMs, + }); + }); + }); +} + +function extractSessionId(stdout: string, stderr: string, home: string): string | null { + // Try stdout first (some agents print it), then stderr, then the + // hook-debug log where every hivemind hook writes one line per call. 
+ for (const haystack of [stdout, stderr]) { + const m = haystack.match(SESSION_LINE); + if (m) return m[1]; + } + if (home) { + const logPath = join(home, ".deeplake", "hook-debug.log"); + if (existsSync(logPath)) { + const log = readFileSync(logPath, "utf-8"); + const m = log.match(SESSION_LINE); + if (m) return m[1]; + } + } + return null; +} + +/** + * Map a spawned binary name to its AgentId. Any name other than the four + * listed below maps to "pi". + */ +function inferAgentFromBin(bin: string): import("../types.js").AgentId { + // Used only by parseCostCents to pick the right regex. The driver + // already knows its own id, but runProcess is exported for reuse so + // we infer instead of threading the id through every callsite. + if (bin === "claude") return "claude-code"; + if (bin === "codex") return "codex"; + if (bin === "cursor-agent") return "cursor-agent"; + if (bin === "hermes") return "hermes"; + return "pi"; +} diff --git a/tests/e2e/agents/codex.ts b/tests/e2e/agents/codex.ts new file mode 100644 index 00000000..01a10b86 --- /dev/null +++ b/tests/e2e/agents/codex.ts @@ -0,0 +1,38 @@ +/** + * Codex driver. + * + * Install: `hivemind codex install` copies the codex bundle into + * ~/.codex/hivemind/ and writes ~/.codex/hooks.json. No marketplace + * round-trip — pure local copy. + * + * Non-interactive run: `codex exec <prompt>`. Codex prints its final + * answer + a usage line to stdout. Session_id is logged by the hivemind + * hooks to ~/.deeplake/hook-debug.log, same as claude-code. 
+ */ + +import type { AgentDriver, RunOpts, RunResult } from "../types.js"; +import { runProcess } from "./claude-code.js"; +import { installOrThrow } from "./install-via-cli.js"; + +/** Driver that installs via `installOrThrow("codex", …)` and runs `codex exec`. */ +export const codexDriver: AgentDriver = { + id: "codex", + providerKey: "OPENAI_API_KEY", + async install(home, repoRoot) { + await installOrThrow("codex", home, repoRoot); + }, + async run(prompt, opts: RunOpts): Promise<RunResult> { + const env: NodeJS.ProcessEnv = { + ...process.env, + HOME: opts.home, + HIVEMIND_DEBUG: "1", + }; + if (opts.providerEnv.OPENAI_API_KEY) { + env.OPENAI_API_KEY = opts.providerEnv.OPENAI_API_KEY; + } + // `codex exec` is the explicit non-interactive subcommand. Without + // it, codex falls into its interactive TUI and blocks on stdin. + // `-m` picks the model; we use gpt-5-mini as the cheap default. + const args = ["exec", "-m", "gpt-5-mini", prompt]; + return runProcess("codex", args, env, opts.timeoutMs ?? 90_000, opts.sessionId); + }, +}; diff --git a/tests/e2e/agents/cursor-agent.ts b/tests/e2e/agents/cursor-agent.ts new file mode 100644 index 00000000..6929a586 --- /dev/null +++ b/tests/e2e/agents/cursor-agent.ts @@ -0,0 +1,46 @@ +/** + * Cursor-agent driver. + * + * Install: `hivemind cursor install` copies the cursor bundle into + * ~/.cursor/hivemind/ and registers the preToolUse + sessionStart hooks + * via cursor's hook config. + * + * Non-interactive run: `cursor-agent --print --force <prompt>`. `--force` + * auto-approves tool calls so the harness doesn't block on a prompt. + * `--print` is the headless flag (vs the default agent TUI). 
+ */ + +import type { AgentDriver, RunOpts, RunResult } from "../types.js"; +import { runProcess } from "./claude-code.js"; +import { installOrThrow } from "./install-via-cli.js"; + +/** Driver that installs via `installOrThrow("cursor", …)` and runs `cursor-agent --print --force`. */ +export const cursorAgentDriver: AgentDriver = { + id: "cursor-agent", + providerKey: "OPENAI_API_KEY", + async install(home, repoRoot) { + await installOrThrow("cursor", home, repoRoot); + }, + async run(prompt, opts: RunOpts): Promise<RunResult> { + const env: NodeJS.ProcessEnv = { + ...process.env, + HOME: opts.home, + HIVEMIND_DEBUG: "1", + }; + if (opts.providerEnv.OPENAI_API_KEY) { + env.OPENAI_API_KEY = opts.providerEnv.OPENAI_API_KEY; + // The same key is also exported as CURSOR_API_KEY (no --api-key flag + // is passed on the command line). The intent is to override any stale + // stored auth and keep the run isolated from whatever the host's + // `cursor-agent login` last persisted. + // NOTE(review): this forwards an OpenAI key under the CURSOR_API_KEY + // name — confirm cursor-agent accepts it. + env.CURSOR_API_KEY = opts.providerEnv.OPENAI_API_KEY; + } + const args = [ + "--print", + "--force", + "--model", + "gpt-5-mini", + prompt, + ]; + return runProcess("cursor-agent", args, env, opts.timeoutMs ?? 90_000, opts.sessionId); + }, +}; diff --git a/tests/e2e/agents/hermes.ts b/tests/e2e/agents/hermes.ts new file mode 100644 index 00000000..1dd1001b --- /dev/null +++ b/tests/e2e/agents/hermes.ts @@ -0,0 +1,45 @@ +/** + * Hermes driver. + * + * Install: `hivemind hermes install` deposits the hermes bundle + the + * hivemind-memory skill md + the MCP server into ~/.hermes/ and + * ~/.hivemind/mcp/ respectively. + * + * Non-interactive run: `hermes -z <prompt> --provider google --model X --yolo`. + * `-z` is hermes's headless one-shot flag. `--yolo` auto-approves tool + * calls (hermes equivalent of `--force` / `--allow-dangerously-...`). 
/**
 * Hermes driver for the e2e harness.
 *
 * `install` delegates to the shared installer helper; `run` spawns the
 * headless one-shot form `hermes -z <prompt> --provider google ... --yolo`.
 */
export const hermesDriver: AgentDriver = {
  id: "hermes",
  providerKey: "GOOGLE_API_KEY",

  /** Installs hivemind for hermes under `home`; rejects when the installer fails. */
  async install(home, repoRoot) {
    await installOrThrow("hermes", home, repoRoot);
  },

  /**
   * Runs a single hermes turn.
   *
   * @param prompt - Text passed as the value of `-z`.
   * @param opts - Sandbox HOME, provider keys, optional timeout, and session id.
   * @returns Whatever `runProcess` resolves with for the spawned process.
   */
  async run(prompt, opts: RunOpts): Promise<RunResult> {
    const env: NodeJS.ProcessEnv = {
      ...process.env,
      HOME: opts.home,
      HIVEMIND_DEBUG: "1",
    };
    if (opts.providerEnv.GOOGLE_API_KEY) {
      env.GOOGLE_API_KEY = opts.providerEnv.GOOGLE_API_KEY;
      // Hermes also reads GEMINI_API_KEY in some versions; forward both
      // to avoid an "unauthenticated" failure on the version that's
      // installed on the runner.
      env.GEMINI_API_KEY = opts.providerEnv.GOOGLE_API_KEY;
    }
    // `-z` is the headless one-shot flag; `--yolo` auto-approves tool calls.
    const args = [
      "-z",
      prompt,
      "--provider", "google",
      "--model", "gemini-2.5-flash",
      "--yolo",
    ];
    return runProcess("hermes", args, env, opts.timeoutMs ?? 90_000, opts.sessionId);
  },
};
/** Outcome of one installer subprocess run. */
export interface InstallResult {
  /** Child exit code; -1 when the child was killed, failed to spawn, or the CLI bundle is missing. */
  exitCode: number;
  stdout: string;
  stderr: string;
}

/**
 * Run `hivemind <agent> install` against the given HOME. Resolves with the
 * subprocess's exit code and captured output; never rejects — the caller
 * decides whether to throw.
 *
 * `agentArg` is the CLI subcommand name, which differs slightly from our
 * internal AgentId for openclaw (`claw` not `openclaw`).
 *
 * @param agentArg - CLI subcommand placed before `install`.
 * @param home - Directory exported to the child as HOME.
 * @param repoRoot - Repository root; must contain `bundle/cli.js`. Also the child's cwd.
 * @param timeoutMs - Kill the child with SIGKILL after this many milliseconds.
 */
export function runInstallerSubprocess(
  agentArg: string,
  home: string,
  repoRoot: string,
  timeoutMs = 60_000,
): Promise<InstallResult> {
  const cliEntry = resolve(repoRoot, "bundle", "cli.js");
  if (!existsSync(cliEntry)) {
    // Pre-flight: fail loudly instead of letting `node` exit 1 with a
    // confusing "Cannot find module" stderr. The build artifact MUST
    // exist before the harness can install hivemind into a tmp HOME.
    return Promise.resolve({
      exitCode: -1,
      stdout: "",
      stderr:
        `${cliEntry} does not exist — run \`npm run build\` before \`npm run e2e\`. ` +
        `The harness spawns the built CLI (not the TypeScript source) so what we ` +
        `test matches what users ship.`,
    });
  }
  return new Promise<InstallResult>((resolveP) => {
    const child = spawn(
      process.execPath, // current node, not "node" on PATH — works in nvm-managed envs too
      [cliEntry, agentArg, "install"],
      {
        env: { ...process.env, HOME: home },
        cwd: repoRoot,
        stdio: ["ignore", "pipe", "pipe"],
      },
    );
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    // Let the streams decode UTF-8 so a multi-byte character split across
    // two chunks is not corrupted by per-chunk toString().
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk: string) => { stdout += chunk; });
    child.stderr.on("data", (chunk: string) => { stderr += chunk; });
    const killTimer = setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
      // Drop our ends of the pipes so `close` still fires even if a
      // grandchild inherited them and outlives the kill.
      child.stdout.destroy();
      child.stderr.destroy();
    }, timeoutMs);
    // `close` (not `exit`) fires only after the stdio streams have ended,
    // so the captured output is complete when we resolve.
    child.on("close", (code) => {
      clearTimeout(killTimer);
      if (timedOut) {
        stderr += `\ninstaller timed out after ${timeoutMs}ms and was killed`;
      }
      resolveP({ exitCode: code ?? -1, stdout, stderr });
    });
    child.on("error", (err) => {
      clearTimeout(killTimer);
      resolveP({ exitCode: -1, stdout, stderr: `${stderr}\nspawn error: ${err.message}` });
    });
  });
}

/**
 * Throw if install didn't exit cleanly. Used by every non-claude driver.
 *
 * @throws Error carrying the exit code and the last 800 characters of stderr.
 */
export async function installOrThrow(agentArg: string, home: string, repoRoot: string): Promise<void> {
  const r = await runInstallerSubprocess(agentArg, home, repoRoot);
  if (r.exitCode !== 0) {
    throw new Error(
      `\`hivemind ${agentArg} install\` failed (exit=${r.exitCode}). stderr:\n${r.stderr.slice(-800)}`,
    );
  }
}
Spinning up that server inside the e2e harness + * is heavy infrastructure (separate process, port binding, settle time, + * teardown choreography) and inappropriate for the fast cross-agent loop. + * + * Instead, this driver loads the INSTALLED plugin module from + * `/.openclaw/extensions/hivemind/dist/index.js` and exercises its + * registered event handlers directly via a fake `pluginApi`. The plugin's + * own code paths run end-to-end: SKILL.md injection (`before_prompt_build`), + * capture INSERT (`agent_end`), skillify worker spawn, the works. What we + * miss vs a real gateway: event ordering across multiple agents, the + * gateway's own parsing of upstream messages, real concurrency with other + * gateway operations. + * + * That's an acceptable trade-off: the plugin's *behavior* is what we want + * cross-agent parity for; the gateway is a parallel surface that has its + * own tests in the openclaw repo. Documented as a different driver shape + * than the CLI drivers — see the comment block at the run() implementation. + * + * "Prompt" semantics for openclaw cases: + * - The prompt string is dropped into a synthetic user message inside + * a synthetic `agent_end` event payload. The plugin captures it the + * same way it would in a real session. + * - For tool-call cases (hivemind_search / hivemind_read / hivemind_index), + * the case sets a marker in opts and the driver dispatches to the + * corresponding registered tool instead of firing agent_end. + */ + +import { mkdirSync, appendFileSync } from "node:fs"; +import { resolve, join } from "node:path"; +import { randomUUID } from "node:crypto"; +import type { AgentDriver, RunOpts, RunResult } from "../types.js"; +import { installOrThrow } from "./install-via-cli.js"; + +// Marker prefix the harness uses to ask openclaw to invoke a specific tool +// instead of firing agent_end. Case file sets the prompt to one of these +// magic strings; runner.run() pivots on the prefix. 
/**
 * Marker prefix the harness uses to ask the openclaw driver to invoke a
 * registered tool instead of firing `agent_end`.
 */
export const OPENCLAW_TOOL_PROMPT_PREFIX = "__OPENCLAW_TOOL__:";

/** Logger output collected from the plugin during one run. */
interface CapturedLog {
  info: string[];
  error: string[];
}

/** Minimal stand-in for the plugin API object handed to the plugin's `register()`. */
interface FakePluginApi {
  pluginConfig: Record<string, unknown>;
  logger: {
    info?: (...args: unknown[]) => void;
    error: (...args: unknown[]) => void;
  };
  on: (event: string, handler: (event: Record<string, unknown>) => Promise<unknown>) => void;
  registerCommand: (cmd: unknown) => void;
  registerTool: (tool: AgentTool) => void;
  registerMemoryCorpusSupplement: (supplement: unknown) => void;
}

/** Shape of a tool the plugin registers through `registerTool`. */
interface AgentTool {
  name: string;
  description: string;
  parameters: Record<string, unknown>;
  execute: (
    toolCallId: string | undefined,
    rawParams: Record<string, unknown>,
  ) => Promise<{ content: Array<{ type: "text"; text: string }>; details?: unknown }>;
}

/**
 * Splits a tool prompt of the form `__OPENCLAW_TOOL__:<name>:<json>` into
 * its tool name and raw JSON argument text. When no second colon is
 * present the arguments default to `"{}"`. The caller has already checked
 * the prefix.
 */
function parseOpenclawToolPrompt(prompt: string): { toolName: string; rawArgs: string } {
  const payload = prompt.slice(OPENCLAW_TOOL_PROMPT_PREFIX.length);
  const colon = payload.indexOf(":");
  return colon === -1
    ? { toolName: payload, rawArgs: "{}" }
    : { toolName: payload.slice(0, colon), rawArgs: payload.slice(colon + 1) };
}

/** Puts one environment variable back to the value it had before the run. */
function restoreEnvVar(name: string, previous: string | undefined): void {
  if (previous === undefined) delete process.env[name];
  else process.env[name] = previous;
}

/**
 * OpenClaw driver. Unlike the CLI drivers it spawns nothing in `run`: it
 * imports the installed plugin module in-process, registers it against a
 * fake plugin API, and then either fires a synthetic `agent_end` event or
 * invokes a registered tool, depending on the prompt.
 */
export const openclawDriver: AgentDriver = {
  id: "openclaw",
  providerKey: null,

  /** Installs the plugin under `home` (CLI subcommand is `claw`); rejects when the installer fails. */
  async install(home, repoRoot) {
    await installOrThrow("claw", home, repoRoot);
  },

  /**
   * Exercises the installed plugin once.
   *
   * @param prompt - Either free text (captured via `agent_end`) or a string
   *   built by `buildOpenclawToolPrompt` (dispatched to that tool).
   * @param opts - Sandbox HOME and session id.
   * @returns Collected logger/tool output; `exitCode` is 1 when the plugin
   *   is missing a handler/tool or anything inside the run throws.
   */
  async run(prompt, opts: RunOpts): Promise<RunResult> {
    const startedAt = Date.now();
    const stdout: string[] = [];
    const stderr: string[] = [];

    // Mirror hivemind hooks' debug log convention so `hook-log-contains`
    // assertions work identically for openclaw and the CLI agents: logger
    // output is appended to hook-debug.log under the tmp HOME.
    const logPath = join(opts.home, ".deeplake", "hook-debug.log");
    mkdirSync(join(opts.home, ".deeplake"), { recursive: true, mode: 0o700 });
    const writeLog = (line: string): void => {
      try { appendFileSync(logPath, line.endsWith("\n") ? line : `${line}\n`); }
      catch { /* best-effort */ }
    };
    writeLog(`[openclaw-e2e] session=${opts.sessionId}`);

    // The plugin runs in this process, so the sandbox HOME and the debug
    // flag have to be set on process.env. Both are global state shared
    // with the rest of the runner, so remember the previous values and
    // put them back in `finally`.
    const previousHome = process.env.HOME;
    const previousDebug = process.env.HIVEMIND_DEBUG;
    process.env.HOME = opts.home;
    process.env.HIVEMIND_DEBUG = "1";

    let exitCode = 0;
    const captured: CapturedLog = { info: [], error: [] };
    try {
      const pluginPath = resolve(opts.home, ".openclaw", "extensions", "hivemind", "dist", "index.js");
      // Cache-bust via query string. If a previous case in the same runner
      // already imported this path, Node's ESM cache would serve the stale
      // module; the URL suffix forces a fresh load.
      const cacheBuster = `?e2e=${Date.now()}-${randomUUID()}`;
      const pluginUrl = `file://${pluginPath}${cacheBuster}`;
      const mod = await import(pluginUrl) as { default: { register: (api: FakePluginApi) => unknown } };

      const handlers = new Map<string, (event: Record<string, unknown>) => Promise<unknown>>();
      const tools = new Map<string, AgentTool>();
      const api: FakePluginApi = {
        pluginConfig: {},
        logger: {
          info: (...a) => { const s = a.map(String).join(" "); captured.info.push(s); stdout.push(s); writeLog(`[info] ${s}`); },
          error: (...a) => { const s = a.map(String).join(" "); captured.error.push(s); stderr.push(s); writeLog(`[error] ${s}`); },
        },
        // A later registration for the same event replaces the earlier one.
        on: (event, handler) => { handlers.set(event, handler); },
        registerCommand: () => { /* not needed for capture/tool e2e */ },
        registerTool: (tool) => { tools.set(tool.name, tool); },
        registerMemoryCorpusSupplement: () => { /* not needed */ },
      };

      // Plugin's top-level register() must be synchronous, but it kicks off
      // an async IIFE for the rest of the wiring (login, hooks). Wait a
      // fixed 1500ms for the IIFE to register agent_end + tools before
      // firing events.
      mod.default.register(api);
      await new Promise<void>((r) => setTimeout(r, 1500));

      if (prompt.startsWith(OPENCLAW_TOOL_PROMPT_PREFIX)) {
        // Tool-call shape: "__OPENCLAW_TOOL__:<name>:<json>"
        const { toolName, rawArgs } = parseOpenclawToolPrompt(prompt);
        const tool = tools.get(toolName);
        if (!tool) {
          stderr.push(`[harness] openclaw plugin did not register a tool named '${toolName}'`);
          exitCode = 1;
        } else {
          // Malformed JSON throws here and is reported by the catch below.
          const args = JSON.parse(rawArgs) as Record<string, unknown>;
          const result = await tool.execute(`e2e-${randomUUID()}`, args);
          for (const block of result.content) stdout.push(block.text);
        }
      } else {
        // Capture shape: fire a synthetic agent_end event with the prompt
        // as a user message + a canned assistant response.
        const agentEnd = handlers.get("agent_end");
        if (!agentEnd) {
          stderr.push("[harness] openclaw plugin did not register agent_end handler");
          exitCode = 1;
        } else {
          await agentEnd({
            success: true,
            session_id: opts.sessionId,
            channel: "openclaw-e2e",
            messages: [
              { role: "user", content: prompt },
              { role: "assistant", content: `[e2e simulated assistant response for case]` },
            ],
          });
        }
      }
    } catch (e: unknown) {
      exitCode = 1;
      stderr.push(`[openclaw-e2e] driver threw: ${e instanceof Error ? `${e.message}\n${e.stack ?? ""}` : String(e)}`);
    } finally {
      restoreEnvVar("HOME", previousHome);
      restoreEnvVar("HIVEMIND_DEBUG", previousDebug);
    }

    return {
      stdout: stdout.join("\n"),
      stderr: stderr.join("\n"),
      exitCode,
      sessionId: opts.sessionId,
      costCents: 0, // no model call — driver fires plugin code directly
      durationMs: Date.now() - startedAt,
    };
  },
};

/**
 * Builds the magic prompt string that makes `openclawDriver.run` invoke
 * `toolName` with `args` (serialized as JSON) instead of firing agent_end.
 */
export function buildOpenclawToolPrompt(toolName: string, args: Record<string, unknown>): string {
  return `${OPENCLAW_TOOL_PROMPT_PREFIX}${toolName}:${JSON.stringify(args)}`;
}
/**
 * Pi driver for the e2e harness.
 *
 * `install` delegates to the shared installer helper; `run` spawns
 * `pi --print --provider google --model <model> <prompt>`.
 */
export const piDriver: AgentDriver = {
  id: "pi",
  providerKey: "GOOGLE_API_KEY",

  /** Installs hivemind for pi under `home`; rejects when the installer fails. */
  async install(home, repoRoot) {
    await installOrThrow("pi", home, repoRoot);
  },

  /**
   * Runs a single pi turn.
   *
   * @param prompt - Text passed as the final positional argument.
   * @param opts - Sandbox HOME, provider keys, optional timeout, and session id.
   * @returns Whatever `runProcess` resolves with for the spawned process.
   */
  async run(prompt, opts: RunOpts): Promise<RunResult> {
    const env: NodeJS.ProcessEnv = {
      ...process.env,
      HOME: opts.home,
      HIVEMIND_DEBUG: "1",
    };
    // The same Google key is exported under both variable names.
    if (opts.providerEnv.GOOGLE_API_KEY) {
      env.GOOGLE_API_KEY = opts.providerEnv.GOOGLE_API_KEY;
      env.GEMINI_API_KEY = opts.providerEnv.GOOGLE_API_KEY;
    }
    const args = [
      "--print",
      "--provider", "google",
      "--model", "gemini-2.5-flash",
      prompt,
    ];
    return runProcess("pi", args, env, opts.timeoutMs ?? 90_000, opts.sessionId);
  },
};
/**
 * Build an assertion runner bound to the test workspace. One `DeeplakeApi`
 * (pointed at the sessions table) is constructed here and reused by every
 * `select-from-db` assertion the returned runner executes.
 *
 * The returned `run` never throws: it resolves to `null` on pass or to a
 * `[label] reason` string on failure, including when the assertion
 * machinery itself throws.
 */
export function makeAssertionRunner(ctx: CaseContext): AssertionRunner {
  const api = new DeeplakeApi(
    ctx.creds.token,
    ctx.creds.apiUrl,
    ctx.creds.orgId,
    ctx.creds.workspaceId,
    ctx.creds.sessionsTable,
  );
  return {
    async run(assertion, actx) {
      try {
        switch (assertion.type) {
          case "stdout-contains":
            return checkStdoutContains(assertion, actx.run);
          case "stdout-matches":
            return checkStdoutMatches(assertion, actx.run);
          case "select-from-db": {
            // A failing query propagates to the outer catch ("runner
            // threw"); a failing expectation is reported under the
            // assertion's own label.
            const rows = await api.query(assertion.sql(actx));
            try {
              assertion.expect(rows);
              return null;
            } catch (e: unknown) {
              return labelled(
                assertion.label ?? "select-from-db",
                e instanceof Error ? e.message : String(e),
              );
            }
          }
          case "hook-log-contains":
            return checkHookLogContains(assertion, ctx.home);
          case "custom":
            try {
              return await assertion.check(actx);
            } catch (e: unknown) {
              return labelled(assertion.label, e instanceof Error ? e.message : String(e));
            }
        }
      } catch (e: unknown) {
        return labelled(
          (assertion as { label?: string }).label ?? assertion.type,
          `runner threw: ${e instanceof Error ? e.message : String(e)}`,
        );
      }
    },
  };
}

/** Passes when `run.stdout` contains the assertion's substring. */
function checkStdoutContains(
  a: Extract<Assertion, { type: "stdout-contains" }>,
  run: RunResult,
): string | null {
  if (run.stdout.includes(a.substring)) return null;
  return labelled(
    a.label ?? "stdout-contains",
    `expected stdout to contain ${JSON.stringify(a.substring)}; got ${truncate(run.stdout, 400)}`,
  );
}

/** Passes when the assertion's regex matches anywhere in `run.stdout`. */
function checkStdoutMatches(
  a: Extract<Assertion, { type: "stdout-matches" }>,
  run: RunResult,
): string | null {
  // `RegExp.prototype.test` resumes from `lastIndex` for global/sticky
  // regexes. The same regex object is evaluated again whenever the case
  // is reused, so rewind it to keep each evaluation independent.
  a.regex.lastIndex = 0;
  if (a.regex.test(run.stdout)) return null;
  return labelled(
    a.label ?? "stdout-matches",
    `expected stdout to match ${a.regex}; got ${truncate(run.stdout, 400)}`,
  );
}

/**
 * Passes when `<home>/.deeplake/hook-debug.log` exists and contains the
 * assertion's substring.
 */
function checkHookLogContains(
  a: Extract<Assertion, { type: "hook-log-contains" }>,
  home: string,
): string | null {
  const logPath = join(home, ".deeplake", "hook-debug.log");
  if (!existsSync(logPath)) {
    return labelled(
      a.label ?? "hook-log-contains",
      `${logPath} does not exist — hook never ran, or HIVEMIND_DEBUG=1 was not set`,
    );
  }
  const text = readFileSync(logPath, "utf-8");
  if (text.includes(a.substring)) return null;
  return labelled(
    a.label ?? "hook-log-contains",
    `expected hook log to contain ${JSON.stringify(a.substring)}; got ${truncate(text, 400)}`,
  );
}

/** Formats a failure reason as `[label] msg`. */
function labelled(label: string, msg: string): string {
  return `[${label}] ${msg}`;
}

/** Keeps the first `max` characters of `s` and notes how many were dropped. */
function truncate(s: string, max: number): string {
  return s.length <= max ? s : `${s.slice(0, max)}... (${s.length - max} more chars)`;
}

/**
 * Deletes the rows a case created, from both the sessions table and the
 * memory table, matching on `path ILIKE '%<sessionId>%'`.
 *
 * Best-effort: failures are reported through the returned `error` string
 * and never thrown. Each table is attempted independently, so one failing
 * DELETE does not prevent the other.
 *
 * An empty (or whitespace-only) `sessionId` is rejected up front: the
 * pattern would collapse to `%%`, which matches every row in both tables.
 *
 * @param ctx - Case context supplying credentials and table names.
 * @param sessionId - Session id embedded in the paths to delete.
 * @returns Counts taken from the length of each DELETE's result set
 *   (NOTE(review): confirm the API returns deleted rows rather than an
 *   empty set), plus a combined error message or `null`.
 */
export async function cleanupSessionRows(
  ctx: CaseContext,
  sessionId: string,
): Promise<{ deletedSessions: number; deletedMemory: number; error: string | null }> {
  if (sessionId.trim() === "") {
    return {
      deletedSessions: 0,
      deletedMemory: 0,
      error: "cleanup skipped: empty session id would match every row",
    };
  }

  // Single quotes are doubled for the SQL literal. LIKE wildcards inside
  // the id (`%`, `_`) are not escaped, so such an id matches more broadly.
  const sidLike = `%${sessionId}%`.replace(/'/g, "''");

  /** Issues the DELETE against one table and reports the result-set length. */
  const deleteFrom = async (table: string): Promise<number> => {
    const tableApi = new DeeplakeApi(
      ctx.creds.token,
      ctx.creds.apiUrl,
      ctx.creds.orgId,
      ctx.creds.workspaceId,
      table,
    );
    const result = await tableApi.query(
      `DELETE FROM "${table}" WHERE path ILIKE '${sidLike}'`,
    );
    return result.length;
  };

  let deletedSessions = 0;
  let deletedMemory = 0;
  let error: string | null = null;
  try {
    deletedSessions = await deleteFrom(ctx.creds.sessionsTable);
  } catch (e: unknown) {
    error = `sessions cleanup failed: ${e instanceof Error ? e.message : String(e)}`;
  }
  try {
    deletedMemory = await deleteFrom(ctx.creds.memoryTable);
  } catch (e: unknown) {
    const msg = `memory cleanup failed: ${e instanceof Error ? e.message : String(e)}`;
    error = error ? `${error}; ${msg}` : msg;
  }
  return { deletedSessions, deletedMemory, error };
}
/**
 * Baseline capture case: one agent turn must leave at least one row in
 * the sessions table whose path embeds this run's session id, and the
 * hook debug log must contain a `session=` line. The agent's textual
 * answer is deliberately not asserted on.
 */
const captureSmokeCase: E2ECase = {
  id: "01-capture-smoke",
  description:
    "one agent turn → at least one row in the sessions table tagged with this run's session_id",
  prompt:
    "Reply with the single word 'pong' and nothing else. Do not call any tools.",
  assertions: [
    {
      type: "hook-log-contains",
      substring: "session=",
      label: "hook ran and wrote a session line",
    },
    {
      type: "select-from-db",
      label: "at least one sessions row landed for this session_id",
      // Filter on run.sessionId (the id recovered after the run), with
      // single quotes doubled for the SQL literal.
      sql({ ctx, run }) {
        const quotedSid = run.sessionId.replace(/'/g, "''");
        const table = ctx.creds.sessionsTable;
        return `SELECT count(*) AS n FROM "${table}" WHERE path ILIKE '%${quotedSid}%'`;
      },
      // The count may come back as a number or a numeric string.
      expect(rows) {
        if (rows.length === 0) {
          throw new Error("count query returned no rows");
        }
        const [first] = rows;
        const n = Number((first as { n: number | string }).n);
        const hasRow = Number.isFinite(n) && n >= 1;
        if (!hasRow) {
          throw new Error(`expected ≥ 1 session row, got ${n}`);
        }
      },
    },
  ],
};

export default captureSmokeCase;
// The two lines the agent is shown: the instruction and the exact command.
const CAT_INDEX_PROMPT_LINES = [
  "Run exactly this bash command and show me its full output, then say 'done':",
  "cat ~/.deeplake/memory/index.md",
];

/**
 * Asks the agent to `cat` the memory index and checks two things: the
 * hook debug log recorded a direct read of /index.md, and the agent's
 * stdout contains at least one of the index table's column names.
 * Not run for openclaw.
 */
const catIndexMdCase: E2ECase = {
  id: "02-cat-index-md",
  description:
    "agent shells `cat ~/.deeplake/memory/index.md` and the virtual mount returns the index table",
  prompt: CAT_INDEX_PROMPT_LINES.join("\n"),
  // OpenClaw doesn't shell out to bash — its agent's read path is the
  // hivemind_read MCP tool, covered by a separate openclaw-only case.
  skipFor: ["openclaw"],
  assertions: [
    {
      label: "pre-tool-use intercepted /index.md",
      type: "hook-log-contains",
      substring: "direct read: /index.md",
    },
    {
      label: "agent saw the virtual index's table headers",
      type: "stdout-matches",
      // Any single header name is enough.
      regex: /Last Updated|Created|Project|Description/,
    },
  ],
};

export default catIndexMdCase;
/** Unique marker seeded into the memory table and grepped for by the agent. */
const SENTINEL = "HIVEMIND_E2E_GREP_SENTINEL_42";

/** The subset of the case context the seed INSERT needs. */
interface SentinelSeedContext {
  sessionId: string;
  agent: string;
  creds: { memoryTable: string };
}

/** Doubles single quotes so `value` can sit inside a SQL string literal. */
function sqlQuote(value: string): string {
  return value.replace(/'/g, "''");
}

/**
 * Builds the INSERT that seeds one memory row carrying the sentinel.
 *
 * The row's path and filename embed the session id (so the harness's
 * session-scoped cleanup can find it). Every interpolated string value is
 * quote-escaped, and `size_bytes` is measured on the JSON payload BEFORE
 * SQL escaping so it reflects the bytes actually stored.
 */
function buildSentinelInsertSql(ctx: SentinelSeedContext): string {
  const path = `/summaries/e2e/${ctx.sessionId}.md`;
  const filename = `${ctx.sessionId}.md`;
  const message = JSON.stringify({
    type: "summary",
    session_id: ctx.sessionId,
    content: `## E2E grep sentinel\n\nMarker: ${SENTINEL}\n`,
  });
  const sizeBytes = Buffer.byteLength(message, "utf-8");
  return (
    `INSERT INTO "${ctx.creds.memoryTable}" ` +
    `(id, path, filename, message, author, size_bytes, project, description, agent, creation_date, last_update_date) ` +
    `VALUES (gen_random_uuid(), '${sqlQuote(path)}', '${sqlQuote(filename)}', '${sqlQuote(message)}'::jsonb, ` +
    `'e2e', ${sizeBytes}, 'e2e', 'grep-sentinel', '${sqlQuote(ctx.agent)}', ` +
    `CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)`
  );
}

/**
 * Seeds a sentinel row into the memory table, asks the agent to grep for
 * it under ~/.deeplake/memory/summaries/, then checks that the hook log
 * recorded a direct grep and that the sentinel appears in stdout.
 * Not run for openclaw.
 */
const grepMemorySummariesCase: E2ECase = {
  id: "03-grep-memory-summaries",
  description:
    "agent shells grep over ~/.deeplake/memory/summaries/ and the SQL fast-path returns the sentinel row",
  prompt:
    `Run exactly this bash command and show me its full output:\n` +
    `grep -r ${SENTINEL} ~/.deeplake/memory/summaries/`,
  /** Inserts the sentinel row; a failed INSERT rejects and is left to the caller. */
  async setup(ctx) {
    const memoryApi = new DeeplakeApi(
      ctx.creds.token,
      ctx.creds.apiUrl,
      ctx.creds.orgId,
      ctx.creds.workspaceId,
      ctx.creds.memoryTable,
    );
    await memoryApi.query(buildSentinelInsertSql(ctx));
  },
  assertions: [
    {
      type: "hook-log-contains",
      substring: "direct grep",
      label: "grep-direct intercept fired",
    },
    {
      type: "stdout-contains",
      substring: SENTINEL,
      label: "agent received the sentinel row from the SQL fast-path",
    },
  ],
  // OpenClaw doesn't shell out to grep — its agent's search path is the
  // hivemind_search MCP tool, covered by a separate openclaw-only case.
  skipFor: ["openclaw"],
};

export default grepMemorySummariesCase;
// One [pattern, label] pair per memory tier the reply is expected to name.
const TIER_CHECKS: ReadonlyArray<readonly [RegExp, string]> = [
  [/index\.md/i, "agent recalls index.md tier"],
  [/summaries/i, "agent recalls summaries/ tier"],
  [/sessions|jsonl/i, "agent recalls sessions/ (or .jsonl) tier"],
];

/**
 * Asks the agent, without tools, to describe the memory layout it was told
 * about at session start, and checks case-insensitively that the reply
 * names all three tiers: `index.md`, `summaries`, and `sessions` (or
 * `jsonl`). Not run for openclaw.
 */
const sessionStartInjectCase: E2ECase = {
  id: "04-session-start-inject",
  description:
    "agent's session-start inject is reflected back when asked about the memory layout",
  prompt:
    "Without running any tools, describe the three tiers of the ~/.deeplake/memory/ layout that your session-start instructions told you about. Mention each tier by filename.",
  assertions: TIER_CHECKS.map(([regex, label]) => ({
    type: "stdout-matches" as const,
    regex,
    label,
  })),
  // OpenClaw surfaces its instructions through a different mechanism than
  // a session-start hook, so this case does not apply to it.
  skipFor: ["openclaw"],
};

export default sessionStartInjectCase;
/** Classic injection payload embedded in the prompt. */
const INJECTION = "'; DROP TABLE memory; --";

/**
 * Sends a prompt containing a SQL injection payload, then checks that
 * (1) the memory table can still be counted, and (2) at least one sessions
 * row for this run holds the payload as plain data.
 */
const sqlInjectionProbeCase: E2ECase = {
  id: "05-sql-injection-probe",
  description:
    "prompt containing a SQL injection payload doesn't corrupt the memory table",
  prompt:
    `Reply with the literal string ${JSON.stringify(INJECTION)} ` +
    `(quotes included) once and then stop. Do not interpret it. Do not call tools.`,
  assertions: [
    {
      type: "select-from-db",
      label: "memory table still queryable post-run (didn't get dropped)",
      // count(*) on the memory table itself — if it was dropped, the query
      // errors and the assertion fails.
      sql: ({ ctx }) =>
        `SELECT count(*) AS n FROM "${ctx.creds.memoryTable}"`,
      expect: (rows) => {
        if (rows.length === 0) throw new Error("count query against memory returned no rows");
        const n = Number((rows[0] as { n: number | string }).n);
        if (!Number.isFinite(n)) throw new Error(`memory count returned non-numeric: ${JSON.stringify(rows[0])}`);
      },
    },
    {
      type: "select-from-db",
      label: "sessions row containing the injection string was stored verbatim",
      // Match on BOTH the run's session id (in `path`) and the payload
      // itself (in `message`), so the assertion checks what its label
      // claims instead of only "some row exists for this run".
      // NOTE(review): assumes the sessions table has a JSONB `message`
      // column that supports a `::text` cast — confirm against the schema.
      sql: ({ ctx, run }) =>
        `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}" ` +
        `WHERE path ILIKE '%${run.sessionId.replace(/'/g, "''")}%' ` +
        `AND message::text ILIKE '%${INJECTION.replace(/'/g, "''")}%'`,
      expect: (rows) => {
        if (rows.length === 0) throw new Error("count query returned no rows");
        const n = Number((rows[0] as { n: number | string }).n);
        if (!Number.isFinite(n) || n < 1) {
          throw new Error(
            `expected ≥ 1 sessions row for the run containing the injection payload, got ${n}`,
          );
        }
      },
    },
  ],
};

export default sqlInjectionProbeCase;
+ */ + +import { DeeplakeApi } from "../../../src/deeplake-api.js"; +import type { E2ECase } from "../types.js"; + +const missingTableSelfHealCase: E2ECase = { + id: "06-missing-table-self-heal", + description: + "after the sessions table is dropped, the next capture lazily creates it and lands the row", + prompt: + "Reply with the single word 'heal' once and stop. Do not call tools.", + async setup(ctx) { + // DROP the sessions table; the capture path must self-heal. We use + // IF EXISTS so the case is idempotent across reruns where prior + // assertions left the table in either state. + const api = new DeeplakeApi( + ctx.creds.token, + ctx.creds.apiUrl, + ctx.creds.orgId, + ctx.creds.workspaceId, + ctx.creds.sessionsTable, + ); + try { + await api.query(`DROP TABLE IF EXISTS "${ctx.creds.sessionsTable}"`); + } catch { + // Some Deeplake deployments refuse DROP TABLE for the canonical + // sessions/memory names. If the drop fails, the case effectively + // becomes a no-op smoke; the row-landed assertion still verifies + // the happy path. We don't fail the case on drop failure because + // the destructive setup is best-effort by design. 
+ } + }, + assertions: [ + { + type: "select-from-db", + label: "sessions table exists after the run (self-healed)", + sql: ({ ctx }) => + `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}"`, + expect: (rows) => { + if (rows.length === 0) throw new Error("sessions count returned no rows — table never came back"); + }, + }, + { + type: "select-from-db", + label: "this run's session_id landed at least one row in the recreated table", + sql: ({ ctx, run }) => + `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}" ` + + `WHERE path ILIKE '%${run.sessionId.replace(/'/g, "''")}%'`, + expect: (rows) => { + if (rows.length === 0) throw new Error("count query returned no rows"); + const n = Number((rows[0] as { n: number | string }).n); + if (!Number.isFinite(n) || n < 1) { + throw new Error(`expected ≥ 1 row for the run, got ${n} — lazy CREATE TABLE didn't recover`); + } + }, + }, + ], +}; + +export default missingTableSelfHealCase; diff --git a/tests/e2e/cases/07-unicode-roundtrip.ts b/tests/e2e/cases/07-unicode-roundtrip.ts new file mode 100644 index 00000000..cba48577 --- /dev/null +++ b/tests/e2e/cases/07-unicode-roundtrip.ts @@ -0,0 +1,57 @@ +/** + * Unicode roundtrip — RELEASE_CHECKLIST §2 ("edge content like quotes / + * unicode / empty fields"). + * + * A capture row whose content includes emoji, RTL script, smart quotes, + * and backslashes is the most common source of "wrote bytes, can't read + * them back". Past JSONB-escape bugs in the capture path collapsed `\\` + * → `\` on roundtrip, silently corrupting any code-block content with + * literal backslashes (Windows paths, regex examples, latex). + * + * We seed a unique marker that combines all four risk classes and assert + * the marker survives the INSERT/SELECT roundtrip byte-for-byte. Marker + * includes the runId-scoped session_id so the assertion finds *this* + * run's row and not a stale one from a previous case. 
+ */ + +import type { E2ECase } from "../types.js"; + +// Marker components — emoji (multi-byte), RTL Arabic, straight double quotes +// (not typographic ones) around a single backslash, and the euro sign. +// Avoid single-quotes in the marker so the SQL literal is unambiguous; +// the agent can still echo single-quoted content in the prompt itself. +const UNICODE_MARKER = "🐝-مرحبا-\"X\\Y\"-€-snapshot"; + +const unicodeRoundtripCase: E2ECase = { + id: "07-unicode-roundtrip", + description: + "captured message preserves emoji + RTL + smart quotes + backslashes byte-for-byte through the JSONB roundtrip", + prompt: + `Reply with exactly this string once and then stop, no commentary, ` + + `no markdown, no quotes added: ${UNICODE_MARKER}`, + assertions: [ + { + type: "select-from-db", + label: "unicode marker present byte-for-byte in captured rows", + // Rows are scoped to this run with ILIKE on path; the marker is matched + // case-sensitively via position() on the message cast to text. NOTE(review): + // if that cast yields JSON text, the marker's `"` and `\` are escaped there — confirm it can match. + sql: ({ ctx, run }) => + `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}" ` + + `WHERE path ILIKE '%${run.sessionId.replace(/'/g, "''")}%' ` + + `AND position('${UNICODE_MARKER.replace(/'/g, "''")}' IN message::text) > 0`, + expect: (rows) => { + if (rows.length === 0) throw new Error("count query returned no rows"); + const n = Number((rows[0] as { n: number | string }).n); + if (!Number.isFinite(n) || n < 1) { + throw new Error( + `unicode marker not found in any captured row — JSONB escape may have corrupted it.
` + + `Got ${n} matching rows.`, + ); + } + }, + }, + ], +}; + +export default unicodeRoundtripCase; diff --git a/tests/e2e/cases/08-openclaw-tools.ts b/tests/e2e/cases/08-openclaw-tools.ts new file mode 100644 index 00000000..07ebb93a --- /dev/null +++ b/tests/e2e/cases/08-openclaw-tools.ts @@ -0,0 +1,76 @@ +/** + * OpenClaw tool + SKILL.md surface — RELEASE_CHECKLIST §3 (openclaw row) + * + §4 (discoverability for the openclaw surface). + * + * OpenClaw doesn't shell out to bash. Its agent talks to hivemind via + * three MCP tools the plugin registers: hivemind_search / hivemind_read / + * hivemind_index. Cases 02 / 03 / 04 assume bash-shell access to the + * virtual mount and are skipped for openclaw — this case provides the + * equivalent coverage by invoking those tools through the openclaw + * driver's tool-call shape (see agents/openclaw.ts). + * + * Asserts: + * 1. hivemind_search returns the seeded sentinel row (analogous to + * case 03 for CLI agents). + * NOTE(review): hivemind_read against /index.md (the case-02 analogue) + * is not asserted below — the prompt only invokes hivemind_search. + * + * Skipped for the five CLI agents — they don't register MCP tools the + * harness can call directly. Their equivalent coverage is in cases + * 02–04. + */ + +import { DeeplakeApi } from "../../../src/deeplake-api.js"; +import type { E2ECase } from "../types.js"; +import { buildOpenclawToolPrompt } from "../agents/openclaw.js"; + +const OC_SENTINEL = "HIVEMIND_E2E_OPENCLAW_TOOL_SENTINEL_99"; + +const openclawToolsCase: E2ECase = { + id: "08-openclaw-tools", + description: + "openclaw's hivemind_search and hivemind_read tools both work and the SKILL body would be injectable", + // Driver pivots on this prefix and calls hivemind_search instead of + // firing agent_end. Args are the search query and a small limit.
+ prompt: buildOpenclawToolPrompt("hivemind_search", { query: OC_SENTINEL, limit: 5 }), + async setup(ctx) { + // Same seed shape as case 03's grep-memory-summaries: drop a row + // with a unique sentinel string in the memory body so the search + // tool has something deterministic to match. + const memoryApi = new DeeplakeApi( + ctx.creds.token, + ctx.creds.apiUrl, + ctx.creds.orgId, + ctx.creds.workspaceId, + ctx.creds.memoryTable, + ); + const path = `/summaries/e2e-openclaw/${ctx.sessionId}.md`; + const message = JSON.stringify({ + type: "summary", + session_id: ctx.sessionId, + content: `# openclaw tool sentinel\n\nMarker: ${OC_SENTINEL}\n`, + }).replace(/'/g, "''"); + await memoryApi.query( + `INSERT INTO "${ctx.creds.memoryTable}" ` + + `(id, path, filename, message, author, size_bytes, project, description, agent, creation_date, last_update_date) ` + + `VALUES (gen_random_uuid(), '${path}', '${ctx.sessionId}.md', '${message}'::jsonb, ` + + `'e2e', ${Buffer.byteLength(message, "utf-8")}, 'e2e', 'openclaw-tool-sentinel', '${ctx.agent}', ` + + `CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)`, + ); + }, + assertions: [ + { + type: "stdout-contains", + substring: OC_SENTINEL, + label: "hivemind_search returned the seeded sentinel", + }, + ], + // This case is for openclaw only — the other agents register no MCP + // tools the harness can call directly. 
Their equivalent coverage: + // - hivemind_search semantic → grep over memory/summaries (case 03) + // - hivemind_read of /index.md → cat /index.md (case 02) + // - SKILL inject → session-start inject (case 04) + skipFor: ["claude-code", "codex", "cursor-agent", "hermes", "pi"], +}; + +export default openclawToolsCase; diff --git a/tests/e2e/cases/09-install-no-broken-paths.ts b/tests/e2e/cases/09-install-no-broken-paths.ts new file mode 100644 index 00000000..f80dc61f --- /dev/null +++ b/tests/e2e/cases/09-install-no-broken-paths.ts @@ -0,0 +1,252 @@ +/** + * Install side effects must not write hook commands that point at files + * which don't exist on disk. + * + * PR #128 added `syncHivemindHooksToSettings()` to `src/cli/install-claude.ts` + * which baked a hardcoded `~/.claude/plugins/hivemind/bundle/<hook>.js` + * literal path into `~/.claude/settings.json` at install time. For + * marketplace-only users (no legacy install at that path) every hook + * command was ENOENT at session start. Shipped as @deeplake/hivemind + * 0.7.23 and 0.7.24; hotfixed in PR #166 (0.7.25) by deleting the helper + * AND adding `cleanupBrokenSettingsHooks()` to auto-heal anyone who + * already upgraded. + * + * What the matrix should have caught: an e2e case that + * (a) runs the real `hivemind install` flow in a clean tmp + * HOME (the population PR #128 broke — marketplace-only / no + * prior legacy path on disk), and + * (b) verifies every hook command the installer wrote into the + * agent's config file points at a file that EXISTS. + * + * This is install-shape, not run-shape: `installOnly: true` so the + * runner doesn't spawn the agent. No model call needed; the assertion + * is purely against post-install filesystem state.
+ * + * Per-agent settings file locations (where the assertion looks): + * - claude-code : <HOME>/.claude/settings.json -> hooks/<event>[]/hooks[]/.command + * - codex : <HOME>/.codex/hooks.json -> hooks/<event>[]/hooks[]/.command + * - cursor-agent: <HOME>/.cursor/hooks.json -> hooks/<event>[]/hooks[]/.command + * - hermes : <HOME>/.hermes/config.yaml -> scripts under <HOME>/.hermes/hooks/*.sh (not scanned yet: collectHookCommands returns [] for hermes) + * + * Pi (TS extension reference) and openclaw (gateway plugin loading from + * its extensions/ dir) don't have a JSON config with command paths the + * way the four hook-driven agents do. Skipped with rationale below. + * + * Auto-heal sub-assertion (claude-code only): the case pre-seeds a + * known-broken entry into settings.json BEFORE the install runs, then + * verifies it was removed by `cleanupBrokenSettingsHooks()`. This is + * the PR #166 fix path — covered by unit tests, but the integration + * point where a real `hivemind claude install` invocation calls the + * cleanup is something only an e2e case can verify holds end-to-end.
+ */ + +import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs"; +import { join, dirname, isAbsolute } from "node:path"; +import { resolve } from "node:path"; +import { installOrThrow } from "../agents/install-via-cli.js"; +import type { E2ECase, AssertionContext } from "../types.js"; + +const KNOWN_LEGACY_BROKEN_COMMAND = + `node "/home/__e2e_pre_seed_nonexistent__/.claude/plugins/hivemind/bundle/capture.js"`; + +interface HookEntry { command?: string; type?: string; timeout?: number } +interface HookMatcher { matcher?: string; hooks?: HookEntry[] } +interface SettingsShape { hooks?: Record<string, HookMatcher[]>; [k: string]: unknown } // hooks: event name → matcher list + +const installNoBrokenPathsCase: E2ECase = { + id: "09-install-no-broken-paths", + description: + "after `hivemind install`, every hook command in the resulting config points at a file that exists on disk", + // installOnly cases never feed a prompt to the agent — but the field + // is required by the type, so we use a sentinel to make that obvious. + prompt: "[install-only — driver.run() is skipped]", + installOnly: true, + async setup(ctx) { + if (ctx.agent === "claude-code") { + // claude-code's driver normally uses `--plugin-dir` for runtime + // cases (fast loading, no install). For THIS case we need the + // real install flow to fire — that's the path PR #128 corrupted. + // We run it against the case's tmp HOME so we never touch the + // operator's real ~/.claude/ state. + // + // We don't go via the claude marketplace CLI here. Instead we + // invoke `hivemind claude install` programmatically the same way + // codex/cursor/hermes do via runInstallerSubprocess. + // Pre-seed a known-broken entry into settings.json so we can + // verify cleanupBrokenSettingsHooks (PR #166) removes it. + preseedBrokenSettingsEntry(ctx.home); + // Now run the real install — which should both write its own + // hooks (correctly) AND auto-heal the pre-seeded broken entry.
+ const repoRoot = resolve(import.meta.dirname, "..", "..", ".."); + await installOrThrow("claude", ctx.home, repoRoot); + } + // Other agents: their driver.install() (which the runner already + // called before setup) is the real install path — nothing more + // for setup to do. + }, + assertions: [ + { + type: "custom", + label: "every hook command in the post-install config references an existing file", + check: async ({ ctx }) => { + const home = ctx.home; + const entries = collectHookCommands(home, ctx.agent); + if (entries === null) return null; // agent has no scannable config — vacuous pass + const broken: string[] = []; + for (const { event, command, file } of entries) { + if (!existsSync(file)) { + broken.push(`${event}: command=${JSON.stringify(command)} references ${file} which does not exist`); + } + } + if (broken.length === 0) return null; + return `${broken.length} hook command(s) reference nonexistent files:\n ${broken.join("\n ")}`; + }, + }, + { + type: "custom", + label: "pre-seeded broken settings.json entry was auto-healed by install (claude-code only)", + check: async (actx: AssertionContext) => { + if (actx.ctx.agent !== "claude-code") return null; // n/a + const settingsPath = join(actx.ctx.home, ".claude", "settings.json"); + if (!existsSync(settingsPath)) { + // No settings.json at all means the install didn't write one, + // and our pre-seed also wouldn't have survived a sub-second + // setup race. Treat as vacuous pass. + return null; + } + let parsed: unknown; + try { parsed = JSON.parse(readFileSync(settingsPath, "utf-8")); } + catch (e) { return `settings.json is unparseable: ${e instanceof Error ? e.message : String(e)}`; } + if (!parsed || typeof parsed !== "object") return null; + const settings = parsed as SettingsShape; + const hooks = settings.hooks ?? {}; + for (const matchers of Object.values(hooks)) { + if (!Array.isArray(matchers)) continue; + for (const m of matchers) { + for (const h of m.hooks ?? 
[]) { + if (h.command === KNOWN_LEGACY_BROKEN_COMMAND) { + return `pre-seeded broken entry survived install — auto-heal (cleanupBrokenSettingsHooks) did not run or did not remove it`; + } + } + } + } + return null; + }, + }, + ], + // Pi loads its extension by file reference at runtime, not via a + // hooks-config JSON with command fields. OpenClaw's gateway loads + // its plugin from /.openclaw/extensions/ directly. Neither + // has the regression class PR #128 introduced. + skipFor: ["pi", "openclaw"], +}; + +function preseedBrokenSettingsEntry(home: string): void { + const settingsPath = join(home, ".claude", "settings.json"); + mkdirSync(dirname(settingsPath), { recursive: true, mode: 0o700 }); + let existing: SettingsShape = {}; + if (existsSync(settingsPath)) { + try { existing = JSON.parse(readFileSync(settingsPath, "utf-8")) as SettingsShape; } + catch { existing = {}; } + } + const hooks = existing.hooks ?? {}; + hooks.SessionStart = [ + ...(hooks.SessionStart ?? []), + { hooks: [{ type: "command", command: KNOWN_LEGACY_BROKEN_COMMAND, timeout: 120 }] }, + ]; + existing.hooks = hooks; + writeFileSync(settingsPath, JSON.stringify(existing, null, 2)); +} + +interface HookCommandRef { + event: string; + command: string; + /** Resolved filesystem path the command references. */ + file: string; +} + +/** + * Walk an agent's post-install config and return every command's + * referenced file. Returns null if the agent doesn't have a scannable + * hooks-config (pi, openclaw). + * + * Each agent's config structure differs slightly; we abstract over + * the {hooks: { : [{hooks: [{command}]}] }} shape that claude / + * codex / cursor share. Hermes script-style hooks are handled separately. 
+ */ +function collectHookCommands(home: string, agent: string): HookCommandRef[] | null { + const configPath = agentSettingsPath(home, agent); + if (configPath === null) return null; + if (!existsSync(configPath)) return []; + + if (agent === "hermes") { + // Hermes wires hooks via shell scripts in `~/.hermes/hooks/` referenced + // from `~/.hermes/config.yaml`; the intended integrity check is "every + // script the config references exists". NOTE(review): that check is NOT + // implemented — no YAML is parsed and no .sh files are enumerated. We + // return an empty list, so the hermes point passes vacuously whenever + // config.yaml exists. TODO: scan config.yaml for script paths and + // return them as HookCommandRef entries. + return []; + } + + let parsed: unknown; + try { parsed = JSON.parse(readFileSync(configPath, "utf-8")); } + catch { return []; } // unparseable config = nothing to check + if (!parsed || typeof parsed !== "object") return []; + const settings = parsed as SettingsShape; + const out: HookCommandRef[] = []; + for (const [event, matchers] of Object.entries(settings.hooks ?? {})) { + if (!Array.isArray(matchers)) continue; + for (const m of matchers) { + for (const h of m.hooks ?? []) { + if (typeof h.command !== "string") continue; + // Only inspect entries that look like hivemind hook invocations + // — the form `node "<abs-path>"` (or `node <abs-path>`). Skip other + // shapes (shell commands, marketplace `${CLAUDE_PLUGIN_ROOT}` + // placeholders that resolve at runtime, etc.) since they're + // not what PR #128 could break.
+ if (!h.command.includes("hivemind")) continue; + if (h.command.includes("${CLAUDE_PLUGIN_ROOT}")) continue; + const file = extractCommandFilePath(h.command, home); + if (file === null) continue; + out.push({ event, command: h.command, file }); + } + } + } + return out; +} + +function agentSettingsPath(home: string, agent: string): string | null { + switch (agent) { + case "claude-code": return join(home, ".claude", "settings.json"); + case "codex": return join(home, ".codex", "hooks.json"); + case "cursor-agent": return join(home, ".cursor", "hooks.json"); + case "hermes": return join(home, ".hermes", "config.yaml"); + case "pi": + case "openclaw": + default: return null; + } +} + +function extractCommandFilePath(command: string, home: string): string | null { + const quoted = command.match(/"([^"]+)"/); + if (quoted) { + return resolvePath(quoted[1], home); + } + const tokens = command.split(/\s+/); + for (const t of tokens) { + if (t.endsWith(".js") || t.endsWith(".sh") || t.endsWith(".ts")) { + return resolvePath(t, home); + } + } + return null; +} + +function resolvePath(p: string, home: string): string { + const homeRel = /^(?:~|\$HOME|\$\{HOME\})(?:\/(.*))?$/.exec(p); // `~`, `~/…`, `$HOME/…`, `${HOME}/…` all name the tmp HOME + if (homeRel) return join(home, homeRel[1] ?? ""); + return isAbsolute(p) ? p : join(home, p); // any other relative path is resolved under home, as before +} + +export default installNoBrokenPathsCase; diff --git a/tests/e2e/cases/10-invalid-identifier-rejection.ts b/tests/e2e/cases/10-invalid-identifier-rejection.ts new file mode 100644 index 00000000..e97bec6b --- /dev/null +++ b/tests/e2e/cases/10-invalid-identifier-rejection.ts @@ -0,0 +1,113 @@ +/** + * Invalid SQL identifier rejection — RELEASE_CHECKLIST §2 + §5. + * + * Hivemind reads `HIVEMIND_SESSIONS_TABLE` / `HIVEMIND_MEMORY_TABLE` from + * the environment and interpolates them directly into SQL. Without + * `sqlIdent()` validation, a malicious operator (or a config-injection + * attack via env var manipulation) could land an attacker-controlled + * fragment inside a DDL/DML statement.
+ * + * The defense is `sqlIdent(name)` — throws on anything outside + * `[A-Za-z_][A-Za-z0-9_]*`. Bug class to catch: a future code path + * forgets the guard and interpolates a user-controlled name directly. + * + * Case sets `HIVEMIND_SESSIONS_TABLE=bad-name-with-dashes` in the agent's + * environment + a unique sentinel marker prompt. After the run, the + * assertion verifies: + * - the sessions table named `bad-name-with-dashes` does NOT exist + * in the e2e workspace (sqlIdent rejected before any CREATE) + * - the legitimate sessions table also did NOT get a row with the + * sentinel (the rejected install/capture flow shouldn't have run) + * + * Install-only via the spawn path: we set the env var on the agent + * spawn (not on install). For agents whose capture hooks run their + * own checks, this triggers their reject path. + */ + +import { DeeplakeApi } from "../../../src/deeplake-api.js"; +import type { E2ECase } from "../types.js"; + +const BAD_TABLE_NAME = "bad-name-with-dashes"; +const SENTINEL = "HIVEMIND_E2E_BAD_IDENT_SENTINEL_77"; + +const invalidIdentifierRejectionCase: E2ECase = { + id: "10-invalid-identifier-rejection", + description: + "HIVEMIND_SESSIONS_TABLE= → no SQL fires, no row lands, no table created", + prompt: + `Reply with the single word ${JSON.stringify(SENTINEL)} once and stop. Do not call tools.`, + async setup(ctx) { + // Pre-spawn: set the bad identifier in this process's env so + // openclaw's in-process driver picks it up, AND the spawn path + // of the CLI drivers forwards it via process.env in their env: {}. + process.env.HIVEMIND_SESSIONS_TABLE = BAD_TABLE_NAME; + void ctx; // tmp HOME and creds already set up by the runner + }, + assertions: [ + { + type: "custom", + label: "no table with the rejected dashed name exists in the e2e workspace", + check: async ({ ctx }) => { + // Reset env so subsequent cases aren't polluted. We do it here + // (in the assertion) so it runs after the spawn but before the + // runner moves on. 
The runner doesn't reset env between cases + // because most cases don't touch process.env at all. + delete process.env.HIVEMIND_SESSIONS_TABLE; + const api = new DeeplakeApi( + ctx.creds.token, + ctx.creds.apiUrl, + ctx.creds.orgId, + ctx.creds.workspaceId, + ctx.creds.sessionsTable, + ); + // Look the bad name up in information_schema.tables by exact + // equality; if it appears, sqlIdent failed and a CREATE slipped + // through. The name is already all-lowercase, so identifier + // case-folding on the server cannot hide it. + let rows: Array<Record<string, unknown>> = []; + try { + rows = await api.query( + `SELECT table_name FROM information_schema.tables ` + + `WHERE table_name = '${BAD_TABLE_NAME.replace(/'/g, "''")}'`, + ); + } catch { + // If the query itself errors, assume the deployment doesn't support + // information_schema and fall back to querying the dashed table + // directly. NOTE(review): any error from that query counts as a pass; + // the error text is not inspected (e.g. "no such table"). + try { + await api.query(`SELECT 1 FROM "${BAD_TABLE_NAME}" LIMIT 1`); + return `query against "${BAD_TABLE_NAME}" succeeded — table was created despite the bad name`; + } catch { + return null; // fail to query is the expected outcome + } + } + if (rows.length > 0) { + return `table "${BAD_TABLE_NAME}" exists in the e2e workspace — sqlIdent did NOT reject the name before CREATE TABLE`; + } + return null; + }, + }, + { + type: "select-from-db", + label: "the sentinel did NOT land in the legitimate sessions table either", + sql: ({ ctx, run }) => + `SELECT count(*) AS n FROM "${ctx.creds.sessionsTable}" ` + + `WHERE path ILIKE '%${run.sessionId.replace(/'/g, "''")}%' ` + + `AND position('${SENTINEL.replace(/'/g, "''")}' IN message::text) > 0`, + expect: (rows) => { + if (rows.length === 0) return; // no count returned → nothing landed + const n = Number((rows[0] as { n: number | string }).n); + // We expect zero or one (one if the capture path ignored the env + // var and wrote to the default table).
Either is acceptable as + // long as the bad name wasn't created. The PRIMARY assertion is + // the first one; this is a sanity check that nothing leaked. + if (!Number.isFinite(n)) { + throw new Error(`count returned non-numeric: ${JSON.stringify(rows[0])}`); + } + }, + }, + ], +}; + +export default invalidIdentifierRejectionCase; diff --git a/tests/e2e/cases/11-path-traversal-rejection.ts b/tests/e2e/cases/11-path-traversal-rejection.ts new file mode 100644 index 00000000..b87ee87d --- /dev/null +++ b/tests/e2e/cases/11-path-traversal-rejection.ts @@ -0,0 +1,56 @@ +/** + * Path-traversal rejection — RELEASE_CHECKLIST §5 path traversal. + * + * The virtual `~/.deeplake/memory/` mount intercepts `cat` / `ls` / `grep` + * commands and routes them through SQL queries against the memory table. + * If a traversal segment (`..`) bypasses the rewrite, the agent could + * read paths outside the mount — most concerning is escape to host files + * (e.g. `cat ~/.deeplake/memory/../../etc/passwd` ENOENT'ing on the + * virtual side but then resolving through `bash` to the real /etc/passwd + * if the intercept passed it through unmodified). + * + * Agent is asked to cat a clearly-out-of-mount path. The hook should: + * - reject the read (no host filesystem access), AND + * - not silently return a real-FS file's contents + * + * For the CLI agents this exercises the `touchesMemory` + `rewritePaths` + * pipeline. For openclaw, the `hivemind_read` MCP tool path validation. 
+ */ + +import type { E2ECase } from "../types.js"; + +const TRAVERSAL_TARGET = "~/.deeplake/memory/" + "../".repeat(16) + "etc/passwd"; // surplus ../ clamp at /, so any tmp-HOME depth still resolves to /etc/passwd + +const pathTraversalRejectionCase: E2ECase = { + id: "11-path-traversal-rejection", + description: + "traversal segments (../) in a memory-mount path don't escape the virtual filesystem", + prompt: + `Run exactly this bash command and show me the FULL output verbatim:\n` + + `cat ${TRAVERSAL_TARGET}`, + assertions: [ + { + // Negative assertion: real /etc/passwd lines should NOT appear in + // the agent's stdout. /etc/passwd always starts with `root:` on + // Linux/macOS — that string anywhere in stdout means the intercept + // failed to reject and bash served the real file. + type: "custom", + label: "agent's response does NOT leak /etc/passwd contents", + check: async ({ run }) => { + // root:x:0:0: is the canonical first-line passwd shape on every + // POSIX system. If this slips through, the intercept failed. + if (/root:[x*!]:0:0:/.test(run.stdout)) { + return `stdout contains a /etc/passwd line shape ("root:x:0:0:" or equivalent) — path traversal escaped the virtual mount`; + } + return null; + }, + }, + ], + // OpenClaw uses the hivemind_read MCP tool, not bash. Path-traversal + // there is a different bug class (tool arg validation, not bash + // rewrite) — case 08 already exercises hivemind_read with a clean + // path; adding traversal there would be a separate dedicated case. + skipFor: ["openclaw"], +}; + +export default pathTraversalRejectionCase; diff --git a/tests/e2e/cases/12-recursion-guard.ts b/tests/e2e/cases/12-recursion-guard.ts new file mode 100644 index 00000000..a2a6dba8 --- /dev/null +++ b/tests/e2e/cases/12-recursion-guard.ts @@ -0,0 +1,76 @@ +/** + * Worker recursion guard — RELEASE_CHECKLIST §5. + * + * Hivemind workers (wiki-worker, skillify-worker) spawn agent CLIs to + * run gating prompts.
Each worker entry point checks an env-var guard + * (`HIVEMIND_WIKI_WORKER=1`, `HIVEMIND_SKILLIFY_WORKER=1`) at the top + * and short-circuits if set — otherwise a worker invoked by another + * worker would recursively spawn forever, effectively a fork bomb. + * + * Case: pre-set `HIVEMIND_WIKI_WORKER=1` in the agent's environment. + * Run a normal turn. Assertion: the wiki worker's session-end-triggered + * spawn DOES NOT fire (no second worker process appears, no wiki summary + * lands in the memory table). + * + * The signal is "absence of a wiki summary row that the un-guarded + * version of the worker would have written". Because session-end is + * also where capture rows finalize, we still expect the sessions row + * (case 01's assertion), but NOT a memory/summary row for this session. + * + * Cost: one full agent turn; same as the other behavioral cases. + */ + +import type { E2ECase } from "../types.js"; + +const recursionGuardCase: E2ECase = { + id: "12-recursion-guard", + description: + "HIVEMIND_WIKI_WORKER=1 in env → session-end wiki worker short-circuits and no summary row lands", + prompt: + "Reply with the single word 'guarded' and stop. Do not call tools.", + async setup(_ctx) { + // Pre-spawn: set the guard so the agent's session-start / + // session-end hooks see it as if they were already inside a worker. + // Reset is done by the env-var cleanup assertion so later cases don't + // inherit the guard. The runner doesn't reset env between cases.
+ process.env.HIVEMIND_WIKI_WORKER = "1"; + }, + assertions: [ + { + // Reset the env var first, before any assertion that can fail, so + // the next case's spawn never inherits the guard. Wrapping in a + // no-op `custom` assertion is the cleanest hook the runner provides. + type: "custom", + label: "env-var cleanup (always passes)", + check: async () => { + delete process.env.HIVEMIND_WIKI_WORKER; + return null; + }, + }, + { + type: "select-from-db", + label: "no wiki summary row was written for this session (worker correctly short-circuited)", + sql: ({ ctx, run }) => + `SELECT count(*) AS n FROM "${ctx.creds.memoryTable}" ` + + `WHERE path ILIKE '%${run.sessionId.replace(/'/g, "''")}%' ` + + `AND description ILIKE '%summary%'`, + expect: (rows) => { + if (rows.length === 0) return; // no rows means clean pass + const n = Number((rows[0] as { n: number | string }).n); + if (Number.isFinite(n) && n > 0) { + throw new Error( + `${n} wiki-summary row(s) landed despite HIVEMIND_WIKI_WORKER=1 ` + + `— recursion guard did not short-circuit the session-end worker spawn`, + ); + } + }, + }, + ], + // OpenClaw's plugin loader doesn't spawn workers as separate processes + // — its skillify worker runs in-band via `realSpawn` from the plugin's + // own register(). The env-var guard pattern doesn't apply the same way; + // a dedicated openclaw recursion test would need a different shape. + skipFor: ["openclaw"], +}; + +export default recursionGuardCase; diff --git a/tests/e2e/cases/13-npm-install-from-tarball.ts b/tests/e2e/cases/13-npm-install-from-tarball.ts new file mode 100644 index 00000000..5db3a55e --- /dev/null +++ b/tests/e2e/cases/13-npm-install-from-tarball.ts @@ -0,0 +1,117 @@ +/** + * npm-pack → npm-install-g flow. + * + * The harness's other install-shape case (09) drives `hivemind + * install` against a tmp HOME using the BUILT bundle in the repo. That + * skips a class of regressions one layer above: the `npm install -g + * @deeplake/hivemind` step itself.
Specifically: + * + * - package.json `files` array doesn't include something the runtime + * needs (`bundle/`, `openclaw/dist/`, `pi/extension-source/`, …) + * - The bin field doesn't resolve correctly after a global install + * - A postinstall script crashes during install (NOTE(review): the install step passes --ignore-scripts, so this is not exercised today) + * + * This case exercises the real pack-and-install path: + * + * 1. `npm pack` the current repo → produces `deeplake-hivemind-X.tgz`. + * 2. `npm install -g <tarball> --prefix <HOME>/.npm-test` so the + * install lands in an isolated prefix and the operator's real + * global npm tree stays untouched. + * 3. Assert: `<HOME>/.npm-test/bin/hivemind --version` runs cleanly + * and prints the expected version string. + * + * Skipped on all agents except claude-code as an arbitrary single-runner + * — the test is npm-shape, not agent-shape; running it per agent would + * just be a 6× re-run of the same global check. Picking claude-code + * because its driver does an install no-op (the prefix install is its + * actual install flow). + * + * `installOnly: true` — no agent spawn, no LLM cost. Cost is one `npm + * pack` (~2-5s) plus one `npm install -g <tarball>` (~10-30s). Run only + * occasionally; no recurring API spend. + */ + +import { mkdirSync, readdirSync, existsSync, readFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import { execFileSync } from "node:child_process"; +import type { E2ECase } from "../types.js"; + +const npmInstallFromTarballCase: E2ECase = { + id: "13-npm-install-from-tarball", + description: + "npm-pack the local repo + npm install -g against a tmp prefix → hivemind --version runs cleanly", + prompt: "[install-only — npm pack / install -g]", + installOnly: true, + async setup(ctx) { + const repoRoot = resolve(import.meta.dirname, "..", "..", ".."); + const packDir = join(ctx.home, ".pack"); + mkdirSync(packDir, { recursive: true }); + // npm pack writes to cwd by default; --pack-destination sends the tarball to packDir instead.
+ execFileSync("npm", ["pack", repoRoot, "--pack-destination", packDir], { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, npm_config_loglevel: "error" }, + }); + }, + assertions: [ + { + type: "custom", + label: "tarball exists after npm pack", + check: async ({ ctx }) => { + const packDir = join(ctx.home, ".pack"); + const tarballs = readdirSync(packDir).filter((f) => f.endsWith(".tgz")); + if (tarballs.length === 0) return `no .tgz produced in ${packDir}`; + return null; + }, + }, + { + type: "custom", + label: "npm install -g against tmp prefix succeeds and the hivemind binary runs", + check: async ({ ctx }) => { + const packDir = join(ctx.home, ".pack"); + const tarballs = readdirSync(packDir).filter((f) => f.endsWith(".tgz")); + if (tarballs.length === 0) return null; // already failed in the prior assertion + const tarball = join(packDir, tarballs[0]); + const prefix = join(ctx.home, ".npm-test"); + const repoRoot = resolve(import.meta.dirname, "..", "..", ".."); + const expectedVersion = JSON.parse( + readFileSync(join(repoRoot, "package.json"), "utf-8"), + ).version as string; + try { + execFileSync( + "npm", + ["install", "-g", tarball, "--prefix", prefix, "--no-fund", "--no-audit", "--ignore-scripts"], + { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, npm_config_loglevel: "error" }, + timeout: 120_000, + }, + ); + } catch (e: unknown) { + const err = e as { stderr?: Buffer; message?: string }; + return `npm install -g failed: ${err.stderr?.toString().slice(-400) ?? err.message ?? 
String(e)}`; + } + const binPath = join(prefix, "bin", "hivemind"); + if (!existsSync(binPath)) return `${binPath} missing after install -g; the bin field didn't resolve into the prefix`; + let versionOut: string; + try { + versionOut = execFileSync(binPath, ["--version"], { + stdio: ["ignore", "pipe", "pipe"], + timeout: 10_000, + }).toString(); + } catch (e: unknown) { + const err = e as { stderr?: Buffer; message?: string }; + return `${binPath} --version failed to run: ${err.stderr?.toString().slice(-400) ?? err.message ?? String(e)}`; + } + if (!versionOut.includes(expectedVersion)) { + return `${binPath} --version printed ${JSON.stringify(versionOut.trim())} — expected to include ${JSON.stringify(expectedVersion)}`; + } + return null; + }, + }, + ], + // npm-pack is agent-agnostic — run only once via the claude-code slot; + // the other five agents get a skip with a "deliberate one-runner" note. + skipFor: ["codex", "cursor-agent", "hermes", "pi", "openclaw"], +}; + +export default npmInstallFromTarballCase; diff --git a/tests/e2e/cases/14-unified-install.ts b/tests/e2e/cases/14-unified-install.ts new file mode 100644 index 00000000..831cadd9 --- /dev/null +++ b/tests/e2e/cases/14-unified-install.ts @@ -0,0 +1,97 @@ +/** + * `hivemind install` (no --only flag) auto-detects every assistant on + * the machine and wires them all. + * + * Case 09 covers per-agent install side effects. This case is one + * layer up: the unified entry point that USERS actually run from the + * README quickstart. Regressions to detectPlatforms() or to the + * orchestration of multi-agent installs land here. + * + * Setup creates fake-but-detectable marker dirs for each agent under + * the tmp HOME so detectPlatforms picks them up: ~/.codex, ~/.cursor, + * ~/.hermes, ~/.pi, ~/.openclaw plus ~/.claude (for the claude-code + * detect). Then runs `hivemind install --skip-auth`. 
/**
 * Case 14: run `hivemind install --skip-auth` (no `--only`) against a sandbox
 * HOME seeded with one marker directory per agent, then verify that every
 * agent's expected artifact path exists.
 *
 * Install-only (no agent spawn). Runs through the claude-code slot only.
 */
const unifiedInstallCase: E2ECase = {
  id: "14-unified-install",
  description:
    "`hivemind install` (no --only) auto-detects every assistant in tmp HOME and lands each one's hivemind artifact",
  prompt: "[install-only — unified `hivemind install`]",
  installOnly: true,
  async setup(ctx) {
    // Seed an empty marker dir per agent under the sandbox HOME so that
    // platform detection has something to find.
    for (const dir of [".claude", ".codex", ".cursor", ".hermes", ".pi", ".openclaw"]) {
      mkdirSync(join(ctx.home, dir), { recursive: true });
    }
    const repoRoot = resolve(import.meta.dirname, "..", "..", "..");
    const cliBundle = join(repoRoot, "bundle", "cli.js");
    try {
      execFileSync(process.execPath, [cliBundle, "install", "--skip-auth"], {
        env: { ...process.env, HOME: ctx.home },
        cwd: repoRoot,
        stdio: ["ignore", "pipe", "pipe"],
        timeout: 120_000,
      });
    } catch (e: unknown) {
      const err = e as { stderr?: Buffer | null; stdout?: Buffer | null; message?: string };
      // Deliberately not rethrown: the assertion below reports exactly which
      // artifacts are missing, which is a better diff than a setup crash.
      // Pick the first NON-BLANK source. An empty stderr buffer stringifies
      // to "" (not nullish), so a `??` chain would stop at it and log nothing.
      const errText =
        [err.stderr?.toString(), err.stdout?.toString(), err.message].find(
          (text) => text !== undefined && text.trim().length > 0,
        ) ?? String(e);
      // Logged so the failure leaves a visible trail in the run output.
      console.error(`[14-unified-install setup] hivemind install threw:\n${errText.slice(-600)}`);
    }
  },
  assertions: [
    {
      type: "custom",
      label: "every detected agent has its hivemind artifact landed under tmp HOME",
      check: async ({ ctx }) => {
        // One expected artifact per agent after `hivemind install`. If an
        // agent's install path changes upstream, this list must change too
        // (the original note says it mirrors scripts/verify-install.sh).
        const expectations: Array<{ agent: string; path: string }> = [
          // claude-code: only the presence of settings.json is checked.
          { agent: "claude-code", path: join(ctx.home, ".claude", "settings.json") },
          { agent: "codex", path: join(ctx.home, ".codex", "hivemind", "bundle", "session-start.js") },
          { agent: "cursor", path: join(ctx.home, ".cursor", "hivemind", "bundle", "session-start.js") },
          { agent: "hermes", path: join(ctx.home, ".hermes", "skills", "hivemind-memory", "SKILL.md") },
          { agent: "pi", path: join(ctx.home, ".pi", "agent", "extensions", "hivemind.ts") },
          { agent: "openclaw", path: join(ctx.home, ".openclaw", "extensions", "hivemind", "dist", "index.js") },
        ];
        const missing: string[] = [];
        for (const { agent, path } of expectations) {
          if (!existsSync(path)) missing.push(`${agent}: ${path}`);
        }
        if (missing.length === 0) return null;
        return `${missing.length} of ${expectations.length} agents did NOT land their install artifact:\n ${missing.join("\n ")}`;
      },
    },
  ],
  // Run only via the claude-code slot — the unified install is agent-agnostic.
  skipFor: ["codex", "cursor-agent", "hermes", "pi", "openclaw"],
};

export default unifiedInstallCase;
// Deterministic stub values written into credentials.json by setup() and
// matched against the `hivemind whoami` output by the assertions.
const STUB_TOKEN = "e2e-stub-token-not-real";
const STUB_ORG_ID = "e2e-stub-org-id";
const STUB_ORG_NAME = "e2e-stub-org";
const STUB_WORKSPACE_ID = "e2e-stub-workspace-id";

/**
 * Case 15: write a stub `~/.deeplake/credentials.json` into the sandbox HOME,
 * check its permission bits, then run `hivemind whoami` with HOME pointed at
 * the sandbox and confirm the stub org surfaces in the output.
 *
 * Install-only (no agent spawn). Runs through the claude-code slot only.
 */
const authLifecycleCase: E2ECase = {
  id: "15-auth-lifecycle",
  description:
    "credentials.json round-trips: write → read by `hivemind whoami` → recognized as logged in",
  prompt: "[install-only — auth round-trip]",
  installOnly: true,
  async setup(ctx) {
    // Overwrite whatever credentials file the sandbox prepared with a stub
    // whose values are known, so assertions can match on them.
    const deeplakeDir = join(ctx.home, ".deeplake");
    mkdirSync(deeplakeDir, { recursive: true, mode: 0o700 });
    const credsPath = join(deeplakeDir, "credentials.json");
    // NOTE(review): Node applies `mode` only when the file is created. If a
    // credentials.json already exists at this path, its existing permission
    // bits are kept, so the mode assertion below then reflects the earlier
    // writer rather than this call — confirm that is intended.
    writeFileSync(
      credsPath,
      JSON.stringify({
        token: STUB_TOKEN,
        orgId: STUB_ORG_ID,
        orgName: STUB_ORG_NAME,
        workspaceId: STUB_WORKSPACE_ID,
        apiUrl: "https://api.deeplake.ai",
        savedAt: new Date().toISOString(),
      }, null, 2),
      { mode: 0o600 },
    );
  },
  assertions: [
    {
      type: "custom",
      label: "credentials.json exists with mode 0600",
      check: async ({ ctx }) => {
        const credsPath = join(ctx.home, ".deeplake", "credentials.json");
        if (!existsSync(credsPath)) return `${credsPath} missing after setup`;
        // Mask off the file-type bits; only the permission bits matter.
        const mode = statSync(credsPath).mode & 0o777;
        if (mode !== 0o600) {
          return `${credsPath} has mode ${mode.toString(8)} — must be 0600 since the token is secret`;
        }
        return null;
      },
    },
    {
      type: "custom",
      label: "`hivemind whoami` reads the stub and recognizes logged-in state",
      check: async ({ ctx }) => {
        const repoRoot = resolve(import.meta.dirname, "..", "..", "..");
        const cliBundle = join(repoRoot, "bundle", "cli.js");
        let out: string;
        try {
          out = execFileSync(process.execPath, [cliBundle, "whoami"], {
            env: { ...process.env, HOME: ctx.home },
            stdio: ["ignore", "pipe", "pipe"],
            timeout: 10_000,
          }).toString();
        } catch (e: unknown) {
          const err = e as { stderr?: Buffer | null; stdout?: Buffer | null; message?: string };
          // Use the first NON-BLANK source. An empty stderr buffer becomes ""
          // (not nullish), so a `??` chain would never reach stdout/message.
          const detail =
            [err.stderr?.toString().slice(-300), err.stdout?.toString().slice(-300), err.message].find(
              (text) => text !== undefined && text.trim().length > 0,
            ) ?? String(e);
          return `\`hivemind whoami\` failed: ${detail}`;
        }
        // Either stub org marker (name or id) must appear in the output.
        if (!out.includes(STUB_ORG_NAME) && !out.includes(STUB_ORG_ID)) {
          return `\`hivemind whoami\` output did NOT surface the logged-in org. Got: ${JSON.stringify(out.slice(0, 300))}`;
        }
        // "Not logged in" means the read path did not recognize the stub.
        if (/not logged in/i.test(out)) {
          return `\`hivemind whoami\` printed "not logged in" despite a valid credentials.json on disk. Got: ${JSON.stringify(out.slice(0, 300))}`;
        }
        return null;
      },
    },
  ],
  // Auth flow is CLI-shape, not agent-shape. Run once via claude-code.
  skipFor: ["codex", "cursor-agent", "hermes", "pi", "openclaw"],
};

export default authLifecycleCase;
// Sentinel skill seeded by setup() and looked for on disk after the run.
const SKILL_NAME = "e2e-autopull-seeded-skill";
const SKILL_BODY = "# E2E autopull sentinel\nMarker body for matrix verification.";
const SKILL_DESCRIPTION = "Auto-pull e2e seed";

/** Renders `value` as a single-quoted SQL string literal, doubling embedded quotes. */
function sqlQuote(value: string): string {
  return `'${value.replace(/'/g, "''")}'`;
}

/**
 * Case 16: seed one row into the `skills` table, run a trivial prompt, then
 * check that the seeded row is queryable and that a SKILL.md for it exists
 * under `~/.claude/skills/` in the sandbox HOME.
 */
const skillifyAutoPullCase: E2ECase = {
  id: "16-skillify-auto-pull",
  description:
    "session-start fires autopull-worker → pre-seeded skill row → SKILL.md lands at ~/.claude/skills///SKILL.md",
  prompt: "Reply with the single word 'pulled' and stop. Do not call tools.",
  async setup(ctx) {
    // The seed goes into the canonical `skills` table (not a per-run table).
    // Isolation between runs comes from the per-session project_key below.
    const api = new DeeplakeApi(
      ctx.creds.token,
      ctx.creds.apiUrl,
      ctx.creds.orgId,
      ctx.creds.workspaceId,
      "skills",
    );
    const now = new Date().toISOString();
    // project_key embeds this case's session id so concurrent runs do not see
    // each other's seeds; the assertion below queries by the same key.
    const projectKey = `e2e-${ctx.sessionId}`;
    // Every interpolated value goes through sqlQuote so the INSERT uses the
    // same escaping as the SELECT in the assertion.
    const values = [
      "gen_random_uuid()",
      sqlQuote(SKILL_NAME),
      "'e2e'",
      sqlQuote(projectKey),
      sqlQuote(`.claude/skills/${SKILL_NAME}`),
      "'global'",
      "'[]'",
      sqlQuote(String(ctx.agent)),
      "'team'",
      "'e2e'",
      "'[]'",
      sqlQuote(SKILL_DESCRIPTION),
      "'e2e autopull marker'",
      sqlQuote(SKILL_BODY),
      "1",
      sqlQuote(now),
      sqlQuote(now),
    ].join(", ");
    await api.query(
      `INSERT INTO "skills" (id, name, project, project_key, local_path, install, source_sessions, source_agent, scope, author, contributors, description, trigger_text, body, version, created_at, updated_at) ` +
        `VALUES (${values})`,
    );
  },
  assertions: [
    {
      type: "select-from-db",
      label: "seeded skill row exists in skills table pre-run",
      sql: ({ ctx }) =>
        `SELECT count(*) AS n FROM "skills" WHERE project_key = ${sqlQuote(`e2e-${ctx.sessionId}`)} AND name = ${sqlQuote(SKILL_NAME)}`,
      expect: (rows) => {
        if (rows.length === 0 || Number((rows[0] as { n: number | string }).n) < 1) {
          throw new Error("seed row not present — autopull would have nothing to pull");
        }
      },
    },
    {
      type: "custom",
      label: "SKILL.md landed at ~/.claude/skills// after session-start auto-pull",
      check: async ({ ctx }) => {
        // The seed uses install=global, so only paths under the sandbox HOME
        // are checked; two candidate layouts are accepted.
        const candidates = [
          join(ctx.home, ".claude", "skills", SKILL_NAME, "SKILL.md"),
          join(ctx.home, ".claude", "skills", "team", SKILL_NAME, "SKILL.md"),
        ];
        const found = candidates.find(existsSync);
        if (found) return null;
        // Diagnostic: list what IS under ~/.claude/skills/ to help debug
        // path drift.
        const skillsDir = join(ctx.home, ".claude", "skills");
        const present = existsSync(skillsDir)
          ? readdirSync(skillsDir, { recursive: true }).filter((e) => typeof e === "string").join(", ")
          : "(skills dir missing entirely)";
        return `SKILL.md not found at any expected path. Checked:\n ${candidates.join("\n ")}\nSkills dir contents: ${present}`;
      },
    },
  ],
  // Cleanup note (from the original author): the runner's cleanup removes
  // sessions + memory rows only, so the seeded skills row stays behind in the
  // workspace. Dropping skills rows by project_key is a possible follow-up.
  skipFor: ["openclaw"], // openclaw driver doesn't fire session-start; uses event-firing path
};

export default skillifyAutoPullCase;
// Prompt sent to the agent for this case. It is a single user message.
// NOTE(review): the original comment cites a "three user turns" floor for
// mining triggers; whether one message satisfies that is not visible here.
const MINING_PROMPT =
  "Tell me three short facts about the moon, one sentence each. Don't call tools. Then say 'done'.";

/**
 * Case 17: after a short session, the hook debug log must contain the
 * substring "skillify". Only the log line is asserted; no skills row is
 * required to exist.
 */
const skillifyMiningLifecycleCase: E2ECase = {
  id: "17-skillify-mining-lifecycle",
  description:
    "session-end → skillify-worker subprocess fires → hook-debug.log records the spawn",
  prompt: MINING_PROMPT,
  assertions: [
    {
      type: "hook-log-contains",
      // Matches any log line mentioning "skillify", not only a spawn line.
      substring: "skillify",
      label: "skillify-worker spawn line present in hook-debug.log post-run",
    },
  ],
  // Not run for openclaw: per the original note, its skillify worker fires
  // from agent_end rather than from a session-end hook.
  skipFor: ["openclaw"],
};

export default skillifyMiningLifecycleCase;
/**
 * Case 18: after a short session, the hook debug log must mention "wiki" and
 * the memory table must hold at least one row whose `path` contains the
 * run's session id.
 */
const wikiWorkerHappyPathCase: E2ECase = {
  id: "18-wiki-worker-happy-path",
  description:
    "session ends → wiki-worker spawns → memory row with summary lands within the case's timeout",
  prompt: "Tell me one short fact about Mercury (one sentence), then say 'done'. Do not call tools.",
  assertions: [
    {
      type: "hook-log-contains",
      substring: "wiki",
      label: "wiki-worker spawn line present in hook-debug.log post-run",
    },
    {
      type: "select-from-db",
      label: "at least one memory row tagged with this session_id lands within timeout",
      sql: ({ ctx, run }) => {
        // Double embedded single quotes so the id is safe inside the literal.
        const escapedSessionId = run.sessionId.replace(/'/g, "''");
        const table = ctx.creds.memoryTable;
        return `SELECT count(*) AS n FROM "${table}" WHERE path ILIKE '%${escapedSessionId}%'`;
      },
      expect: (rows) => {
        if (rows.length === 0) {
          throw new Error("count query returned no rows");
        }
        // The count may arrive as a number or a numeric string.
        const rowCount = Number((rows[0] as { n: number | string }).n);
        const hasSummaryRow = Number.isFinite(rowCount) && rowCount >= 1;
        if (hasSummaryRow) return;
        throw new Error(
          `no memory row for this session_id — wiki worker did not produce a summary within the case timeout`,
        );
      },
    },
  ],
  // Not run for openclaw: per the original note, its summary path differs.
  skipFor: ["openclaw"],
};

export default wikiWorkerHappyPathCase;
/**
 * Agent-specific cost-line patterns, tried in order before the generic
 * fallback. Each pattern captures the dollar amount in group 1. Agents with
 * an empty list have no known cost line (the openclaw entry exists only so
 * the table covers every agent id).
 */
const AGENT_COST_PATTERNS: Record<AgentId, readonly RegExp[]> = {
  "claude-code": [/Total cost:\s*\$([0-9]+\.[0-9]+)/, /Cost:\s*\$([0-9]+\.[0-9]+)/],
  codex: [/cost:\s*\$([0-9]+\.[0-9]+)/i],
  pi: [/Total cost:\s*\$([0-9]+\.[0-9]+)/],
  "cursor-agent": [],
  hermes: [],
  openclaw: [],
};

/** Generic fallback: a dollar amount (optionally followed by USD) and then "(". */
const GENERIC_COST_PATTERN = /\$([0-9]+\.[0-9]+)\s*(?:USD|usd)?\s*\(/;

/**
 * Try to extract a USD cost from an agent's stdout.
 *
 * Agent-specific patterns are tried first, then the generic fallback. An
 * agent id that is not in the table is treated as having no specific
 * patterns instead of throwing — parsing is best-effort and must never
 * crash the caller.
 *
 * @param agent  Agent whose output format should be tried first.
 * @param stdout Captured stdout of the agent run.
 * @returns The cost in whole cents (rounded), or `null` when no pattern matched.
 */
export function parseCostCents(agent: AgentId, stdout: string): number | null {
  // Own-property check so keys inherited from Object.prototype (for example
  // "constructor") are never mistaken for a pattern list.
  const specific = Object.prototype.hasOwnProperty.call(AGENT_COST_PATTERNS, agent)
    ? AGENT_COST_PATTERNS[agent]
    : [];
  for (const pattern of [...specific, GENERIC_COST_PATTERN]) {
    const match = pattern.exec(stdout);
    if (match?.[1] === undefined) continue;
    const dollars = parseFloat(match[1]);
    if (Number.isFinite(dollars)) return Math.round(dollars * 100);
  }
  return null;
}

/** Shape of the per-run summary serialized by {@link writeSummary}. */
export interface RunSummary {
  runId: string;
  startedAt: string;
  finishedAt: string;
  totalCases: number;
  totalAgents: number;
  totalPoints: number;
  passed: number;
  failed: number;
  skipped: number;
  totalCostCents: number;
  results: MatrixResult[];
}

/**
 * Write the per-run summary as pretty-printed JSON to
 * `<projectRoot>/tests/e2e/results/<runId>/summary.json`, creating the
 * directory if needed.
 *
 * @param projectRoot Repository root that contains `tests/e2e/`.
 * @param summary     Summary to serialize; `summary.runId` names the directory.
 * @returns The path of the written file.
 */
export function writeSummary(projectRoot: string, summary: RunSummary): string {
  const dir = join(projectRoot, "tests", "e2e", "results", summary.runId);
  mkdirSync(dir, { recursive: true });
  const path = join(dir, "summary.json");
  writeFileSync(path, JSON.stringify(summary, null, 2));
  return path;
}

/**
 * Format a cent amount as dollars with two decimals.
 *
 * @param cents Amount in cents, or `null` when the cost is unknown.
 * @returns e.g. `"$1.23"`, or `"$?"` for `null`.
 */
export function formatCents(cents: number | null): string {
  if (cents === null) return "$?";
  return `$${(cents / 100).toFixed(2)}`;
}
/** Workspace name looked up in local mode when HIVEMIND_E2E_WORKSPACE_NAME is unset. */
const DEFAULT_WORKSPACE_NAME = "hivemind_e2e_test";

/** Untrusted shape of the operator's credentials.json before validation. */
interface OperatorCredsFile {
  token?: unknown;
  orgId?: unknown;
  orgName?: unknown;
  workspaceId?: unknown;
  apiUrl?: unknown;
}

/**
 * Resolve the credentials the e2e harness runs with.
 *
 * Mode 1: if `HIVEMIND_E2E_CREDS_JSON` is set and non-empty, it is parsed and
 * used as-is (no API lookup). Mode 2: otherwise the operator's
 * `~/.deeplake/credentials.json` supplies token/org, and the workspace id is
 * looked up by name via `listWorkspaces`. The operator's file is only read.
 *
 * Table names are `sessions` / `memory`, optionally suffixed with a sanitized
 * `HIVEMIND_E2E_TABLE_SUFFIX`.
 *
 * @throws Error when neither mode yields credentials, when the creds blob is
 *   malformed, or when the named workspace does not exist in the org.
 */
export async function resolveTestCreds(): Promise<TestCredentials> {
  const tableSuffix = process.env.HIVEMIND_E2E_TABLE_SUFFIX ?? "";
  // Anything outside [a-zA-Z0-9_] becomes "_" so the suffix is a safe identifier part.
  const cleanSuffix = tableSuffix ? `_${tableSuffix.replace(/[^a-zA-Z0-9_]/g, "_")}` : "";
  const sessionsTable = `sessions${cleanSuffix}`;
  const memoryTable = `memory${cleanSuffix}`;

  // Mode 1: explicit creds blob (CI).
  const blob = process.env.HIVEMIND_E2E_CREDS_JSON;
  if (blob) {
    const parsed = parseCredsBlob(blob);
    return { ...parsed, sessionsTable, memoryTable };
  }

  // Mode 2: derive from operator's logged-in creds + named workspace lookup.
  const operatorCreds = readOperatorCreds();
  if (!operatorCreds) {
    throw new Error(
      "no test credentials available. Either:\n" +
        " - set HIVEMIND_E2E_CREDS_JSON to the full credentials.json blob (CI mode), or\n" +
        " - run `hivemind login` so ~/.deeplake/credentials.json exists, and ensure your\n" +
        ` org contains a workspace named \`${DEFAULT_WORKSPACE_NAME}\` (or set\n` +
        " HIVEMIND_E2E_WORKSPACE_NAME to whatever the e2e workspace is called).",
    );
  }
  const workspaceName = process.env.HIVEMIND_E2E_WORKSPACE_NAME ?? DEFAULT_WORKSPACE_NAME;
  const workspaces = await listWorkspaces(operatorCreds.token, operatorCreds.apiUrl, operatorCreds.orgId);
  const target = workspaces.find((w) => w.name === workspaceName);
  if (!target) {
    const known = workspaces.map((w) => w.name).join(", ") || "(none)";
    throw new Error(
      `no workspace named "${workspaceName}" in org ${operatorCreds.orgName ?? operatorCreds.orgId}.\n` +
        `Known workspaces: ${known}.\n` +
        `Either create the workspace and re-run, or set HIVEMIND_E2E_WORKSPACE_NAME ` +
        `to point at an existing one.`,
    );
  }
  return {
    apiUrl: operatorCreds.apiUrl,
    token: operatorCreds.token,
    orgId: operatorCreds.orgId,
    orgName: operatorCreds.orgName,
    // The key substitution: the derived copy points at the named e2e
    // workspace instead of the operator's currently selected one.
    workspaceId: target.id,
    sessionsTable,
    memoryTable,
  };
}

/** Validated subset of the operator's credentials.json. */
interface OperatorCreds {
  token: string;
  apiUrl: string;
  orgId: string;
  orgName?: string;
  workspaceId: string;
}

/**
 * Read and validate `~/.deeplake/credentials.json`.
 *
 * @returns The validated credentials, or `null` when the file is unreadable,
 *   is not valid JSON, is not a JSON object, or lacks a string `token`,
 *   `orgId`, or `workspaceId`. A missing/empty `apiUrl` defaults to
 *   `https://api.deeplake.ai`.
 */
function readOperatorCreds(): OperatorCreds | null {
  const path = join(homedir(), ".deeplake", "credentials.json");
  let raw: string;
  try {
    raw = readFileSync(path, "utf-8");
  } catch {
    return null;
  }
  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return null;
  }
  // `JSON.parse("null")` succeeds and returns null; reading fields off it
  // would throw, so reject non-objects the same way as any other bad file.
  if (typeof decoded !== "object" || decoded === null) {
    return null;
  }
  const parsed: OperatorCredsFile = decoded;
  if (
    typeof parsed.token !== "string" ||
    typeof parsed.orgId !== "string" ||
    typeof parsed.workspaceId !== "string"
  ) {
    return null;
  }
  return {
    token: parsed.token,
    apiUrl: typeof parsed.apiUrl === "string" && parsed.apiUrl.length > 0
      ? parsed.apiUrl
      : "https://api.deeplake.ai",
    orgId: parsed.orgId,
    orgName: typeof parsed.orgName === "string" ? parsed.orgName : undefined,
    workspaceId: parsed.workspaceId,
  };
}

/**
 * Parse and validate the `HIVEMIND_E2E_CREDS_JSON` blob.
 *
 * @param blob Raw JSON text from the environment variable.
 * @returns Credentials without the table names (the caller adds those).
 * @throws Error when the text is not valid JSON, is not a JSON object, or is
 *   missing a non-empty string `apiUrl`, `token`, `orgId`, or `workspaceId`.
 */
function parseCredsBlob(blob: string): Omit<TestCredentials, "sessionsTable" | "memoryTable"> {
  let decoded: unknown;
  try {
    decoded = JSON.parse(blob);
  } catch (e) {
    throw new Error(
      `HIVEMIND_E2E_CREDS_JSON is not valid JSON: ${e instanceof Error ? e.message : String(e)}`,
    );
  }
  // Valid JSON is not necessarily an object: `null` would crash on property
  // access and arrays/scalars would yield a misleading "missing field" error.
  if (typeof decoded !== "object" || decoded === null || Array.isArray(decoded)) {
    throw new Error("HIVEMIND_E2E_CREDS_JSON must be a JSON object");
  }
  const parsed = decoded as Record<string, unknown>;
  const required = (k: string): string => {
    const v = parsed[k];
    if (typeof v !== "string" || v.length === 0) {
      throw new Error(`HIVEMIND_E2E_CREDS_JSON missing required string field "${k}"`);
    }
    return v;
  };
  return {
    apiUrl: required("apiUrl"),
    token: required("token"),
    orgId: required("orgId"),
    orgName: typeof parsed.orgName === "string" ? parsed.orgName : undefined,
    workspaceId: required("workspaceId"),
  };
}
/**
 * Runtime guard for a dynamically imported case object. Deliberately
 * permissive: it checks only what the runner needs in order to dispatch
 * (string `id`, string `prompt`, array `assertions`) and leaves the rest of
 * the `E2ECase` shape to compile-time checking of each case file.
 */
function isE2ECase(v: unknown): v is E2ECase {
  if (!v || typeof v !== "object") return false;
  const c = v as Record<string, unknown>;
  return (
    typeof c.id === "string" &&
    typeof c.prompt === "string" &&
    Array.isArray(c.assertions)
  );
}

/**
 * Discover case files, dynamic-import each default export, and validate it.
 *
 * Files are taken from directly under `caseDir`, must end in `.ts` and start
 * with a digit, and are processed in sorted FILE-NAME order (the numeric
 * prefix makes that order deterministic). The result keeps that order; it
 * is not re-sorted by case id.
 *
 * A file is skipped with a `console.warn` — never a throw — when the import
 * fails, there is no default export, the export fails `isE2ECase`, or its
 * `id` repeats one already loaded. Duplicates are rejected because the
 * runner derives session ids from the case id, so two cases sharing an id
 * would share a session id.
 *
 * @param caseDir directory to scan; defaults to the harness `cases/` dir.
 * @returns the valid cases; an empty array when the directory cannot be
 *   listed (also reported via `console.warn`).
 */
export async function loadAllCases(caseDir: string = CASE_DIR): Promise<E2ECase[]> {
  let names: string[];
  try {
    names = readdirSync(caseDir)
      .filter((f) => f.endsWith(".ts") && /^\d/.test(f))
      .sort();
  } catch (e: unknown) {
    console.warn(`[matrix] could not list cases dir ${caseDir}: ${e instanceof Error ? e.message : String(e)}`);
    return [];
  }
  const cases: E2ECase[] = [];
  // case id -> file that first declared it, for the duplicate warning.
  const fileById = new Map<string, string>();
  for (const name of names) {
    const fullPath = resolve(caseDir, name);
    let mod: { default?: unknown };
    try {
      mod = await import(pathToFileURL(fullPath).href);
    } catch (e: unknown) {
      console.warn(`[matrix] skipping ${name}: import failed — ${e instanceof Error ? e.message : String(e)}`);
      continue;
    }
    const candidate = mod.default;
    if (!candidate) {
      console.warn(`[matrix] skipping ${name}: no default export`);
      continue;
    }
    if (!isE2ECase(candidate)) {
      console.warn(`[matrix] skipping ${name}: default export is not a valid E2ECase (missing id/prompt/assertions)`);
      continue;
    }
    const firstFile = fileById.get(candidate.id);
    if (firstFile !== undefined) {
      console.warn(`[matrix] skipping ${name}: duplicate case id "${candidate.id}" (already defined by ${firstFile})`);
      continue;
    }
    fileById.set(candidate.id, name);
    cases.push(candidate);
  }
  return cases;
}

/** One (case, agent) cell of the matrix. */
export interface MatrixPoint {
  case: E2ECase;
  agent: AgentDriver;
  /** True when the case explicitly declares it doesn't apply to this agent. */
  skipped: boolean;
  /** Human-readable reason when `skipped` is true, otherwise null. */
  skipReason: string | null;
}

/**
 * Build the (case × agent) cross-product, honoring per-case skip-lists.
 *
 * Points are emitted case-major: every driver for the first case, then
 * every driver for the second, and so on. Skipped combinations are kept in
 * the output (flagged `skipped`) rather than dropped.
 *
 * @param cases cases to expand.
 * @param drivers agents to pair with each case; defaults to `ALL_DRIVERS`.
 */
export function buildMatrix(
  cases: E2ECase[],
  drivers: AgentDriver[] = ALL_DRIVERS,
): MatrixPoint[] {
  return cases.flatMap((c) => {
    const skipFor = new Set<AgentId>(c.skipFor ?? []);
    return drivers.map((agent): MatrixPoint => {
      const skipped = skipFor.has(agent.id);
      return {
        case: c,
        agent,
        skipped,
        skipReason: skipped ? `${c.id} declares skipFor: ${agent.id}` : null,
      };
    });
  });
}
/** Parsed command-line options of the runner. */
interface CliArgs {
  /** Case id filter, or null to run every case. */
  case: string | null;
  /** Agent id filter, or null to run every agent. */
  agent: string | null;
  /** Leave the per-point tmp HOME on disk after the run. */
  keepSandbox: boolean;
  /** Print the matrix and exit without spawning anything. */
  list: boolean;
}

/**
 * Parse the runner's argv (already stripped of `node` and the script path).
 *
 * Unknown flags are ignored on purpose, so debugger-style flags passed
 * through the launcher do not break the run. `--help` prints usage and
 * exits 0.
 *
 * @throws Error when `--case`/`-c` or `--agent`/`-a` is the last argument
 *   and therefore has no value. Previously the filter silently became null
 *   and the whole matrix ran.
 */
function parseArgs(argv: string[]): CliArgs {
  const out: CliArgs = { case: null, agent: null, keepSandbox: false, list: false };
  const valueAt = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined) throw new Error(`${flag} requires a value`);
    return value;
  };
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    switch (flag) {
      case "--case":
      case "-c":
        out.case = valueAt(flag, ++i);
        break;
      case "--agent":
      case "-a":
        out.agent = valueAt(flag, ++i);
        break;
      case "--keep-sandbox":
        out.keepSandbox = true;
        break;
      case "--list":
        out.list = true;
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
      // eslint-disable-next-line no-fallthrough -- process.exit never returns
      default:
        // Ignore unknown flags rather than failing.
        break;
    }
  }
  return out;
}

/**
 * Print usage to stdout.
 *
 * NOTE(review): the `<id>` placeholders and column alignment were lost in
 * this copy of the source and are reconstructed here — confirm against the
 * repo before relying on exact output.
 */
function printHelp(): void {
  console.log(`\
hivemind cross-agent e2e runner

Usage:
  tsx tests/e2e/runner.ts [--case <id>] [--agent <id>] [--keep-sandbox] [--list]

Flags:
  --case, -c <id>     Run only this case id (e.g. 01-capture-smoke)
  --agent, -a <id>    Run only this agent id (e.g. claude-code)
  --keep-sandbox      Leave tmp HOMEs on disk after run for debugging
  --list              Print the matrix and exit (no spawns)
  --help, -h          Show this help

Required env:
  HIVEMIND_E2E_CREDS_JSON    full credentials.json for the e2e workspace

Optional env:
  HIVEMIND_E2E_TABLE_SUFFIX  suffix on sessions/memory table names
  ANTHROPIC_API_KEY / OPENAI_API_KEY / GOOGLE_API_KEY
`);
}

/**
 * Resolve the test-workspace credentials. Any error from
 * `resolveTestCreds()` is reported through `fail()`, which exits with
 * status 2, so this function never rejects.
 */
async function loadTestCreds(): Promise<TestCredentials> {
  try {
    return await resolveTestCreds();
  } catch (e: unknown) {
    fail(e instanceof Error ? e.message : String(e));
  }
}

/** Snapshot the provider API keys from the environment; unset keys stay undefined. */
function loadProviderEnv(): ProviderEnv {
  const { ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY } = process.env;
  return { ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY };
}
/**
 * Decide whether an agent can be spawned with the provider keys at hand.
 *
 * Drivers whose `providerKey` is null make no model API call and are never
 * gated on the environment. An unset or empty key counts as not ready.
 */
function isReady(agent: AgentDriver, env: ProviderEnv): { ready: boolean; reason: string | null } {
  if (agent.providerKey === null) return { ready: true, reason: null };
  return env[agent.providerKey]
    ? { ready: true, reason: null }
    : { ready: false, reason: `${agent.providerKey} not set` };
}

/**
 * Build the result recorded for a point that was not executed.
 *
 * A skip is `passed: true` with a `failure` text starting with `[skip]`;
 * the output formatter and the summary counters key off that marker, so
 * both skip paths must produce exactly this shape.
 */
function skipResult(caseId: string, agent: AgentDriver["id"], reason: string): MatrixResult {
  return {
    case: caseId,
    agent,
    passed: true,
    failure: `[skip] ${reason}`,
    costCents: null,
    durationMs: 0,
    sessionId: "",
  };
}

/**
 * Execute one (case, agent) point: sandbox, install, optional setup, spawn
 * (unless the case is `installOnly`), assertions, cleanup.
 *
 * Errors thrown by install/setup/run/assertions are caught and recorded as
 * a `[runner threw]` failure. Session-row cleanup and driver teardown are
 * best-effort and never fail the point. `createSandbox` runs before the
 * `try`, so a failure there propagates to the caller.
 *
 * @param point matrix cell to run; `point.skipped` short-circuits to a skip.
 * @param creds test-workspace credentials seeded into the sandbox HOME.
 * @param providerEnv provider API keys forwarded to the driver.
 * @param repoRoot repository root handed to the driver.
 * @param runId identifier of this harness invocation (embedded in session ids).
 * @param keepSandbox when true the tmp HOME is left on disk and its path logged.
 * @returns the point's result; skips carry `passed: true` plus a `[skip]` marker.
 */
async function runPoint(
  point: MatrixPoint,
  creds: TestCredentials,
  providerEnv: ProviderEnv,
  repoRoot: string,
  runId: string,
  keepSandbox: boolean,
): Promise<MatrixResult> {
  const c: E2ECase = point.case;
  const a: AgentDriver = point.agent;
  if (point.skipped) return skipResult(c.id, a.id, `declared skipFor: ${a.id}`);
  // installOnly cases never spawn the agent, so provider keys are
  // irrelevant; only gate on the key when run() will actually be called.
  if (!c.installOnly) {
    const readiness = isReady(a, providerEnv);
    if (!readiness.ready) return skipResult(c.id, a.id, String(readiness.reason));
  }

  const sandbox = createSandbox(a.id, creds);
  const seedSessionId = buildSessionId(c.id, a.id, runId);
  const ctx: CaseContext = {
    home: sandbox.home,
    sessionId: seedSessionId,
    agent: a.id,
    creds,
  };
  let actualSessionId = seedSessionId;
  const failures: string[] = [];
  let costCents: number | null = null;
  let durationMs = 0;
  try {
    await a.install(sandbox.home, repoRoot);
    if (c.setup) await c.setup(ctx);
    let run: RunResult;
    if (c.installOnly) {
      // No spawn: assertions read post-install filesystem / DB state only.
      // A placeholder RunResult keeps the assertion vocabulary working;
      // anything that reads run.sessionId gets the seed value.
      run = {
        stdout: "",
        stderr: "",
        exitCode: 0,
        sessionId: seedSessionId,
        costCents: 0,
        durationMs: 0,
      };
    } else {
      run = await a.run(c.prompt, {
        home: sandbox.home,
        repoRoot,
        sessionId: seedSessionId,
        providerEnv,
        timeoutMs: 90_000,
      });
      if (run.exitCode !== 0) {
        failures.push(`[spawn] exit=${run.exitCode} stderr=${run.stderr.slice(-400)}`);
      }
    }
    actualSessionId = run.sessionId;
    costCents = run.costCents;
    durationMs = run.durationMs;
    const runner = makeAssertionRunner(ctx);
    for (const assertion of c.assertions) {
      const reason = await runner.run(assertion, { ctx, run });
      if (reason) failures.push(reason);
    }
  } catch (e: unknown) {
    failures.push(`[runner threw] ${e instanceof Error ? e.message : String(e)}`);
  } finally {
    try {
      const cleanup = await cleanupSessionRows(ctx, actualSessionId);
      if (cleanup.error) {
        // Best-effort, not a fail — log it but don't add to failures.
        console.warn(`    [cleanup] ${cleanup.error}`);
      }
    } catch (e: unknown) {
      console.warn(`    [cleanup] threw: ${e instanceof Error ? e.message : String(e)}`);
    }
    // Driver teardown must run while the tmp HOME it is handed still
    // exists; it used to run after the directory had been removed.
    if (a.cleanup) {
      try { await a.cleanup(sandbox.home); } catch { /* best-effort */ }
    }
    if (keepSandbox) {
      // Without this the kept directory's random name is never reported.
      console.warn(`    [sandbox] kept at ${sandbox.home}`);
    } else {
      sandbox.destroy();
    }
  }
  return {
    case: c.id,
    agent: a.id,
    passed: failures.length === 0,
    failure: failures.length === 0 ? null : failures.join("\n    "),
    costCents,
    durationMs,
    sessionId: actualSessionId,
  };
}

/** Report a harness misconfiguration on stderr and exit with status 2. */
function fail(msg: string): never {
  console.error(`[harness misconfig] ${msg}`);
  process.exit(2);
}
/**
 * Pre-flight: build the bundle when `bundle/cli.js` is missing, so a fresh
 * checkout can run the harness with a single command.
 *
 * Only existence is checked — a stale bundle is not rebuilt. Setting
 * `HIVEMIND_E2E_SKIP_BUILD=1` skips the check entirely. A failing
 * `npm run build` is reported through `fail()` (exit status 2).
 *
 * @param repoRoot repository root; `bundle/cli.js` is resolved against it
 *   and the build runs with it as cwd.
 */
function ensureBundleBuilt(repoRoot: string): void {
  if (process.env.HIVEMIND_E2E_SKIP_BUILD === "1") return;
  if (existsSync(resolve(repoRoot, "bundle", "cli.js"))) return;
  console.log("⚙ bundle/cli.js missing — running `npm run build`...");
  try {
    execFileSync("npm", ["run", "build"], { cwd: repoRoot, stdio: "inherit" });
  } catch (e: unknown) {
    fail(
      `\`npm run build\` failed: ${e instanceof Error ? e.message : String(e)}. ` +
        `Run it manually, then retry \`npm run e2e\`.`,
    );
  }
}

/** Marker that prefixes `failure` on results recorded for skipped points. */
const SKIP_MARKER = "[skip]";

/** True when a result represents a skipped point rather than a pass or fail. */
function isSkipResult(r: MatrixResult): boolean {
  return r.failure?.startsWith(SKIP_MARKER) ?? false;
}

/**
 * Runner entry point: parse flags, discover cases, build the matrix, run
 * every point sequentially, write the summary, and exit.
 *
 * Exit status: 0 when no point failed, 1 when at least one failed, 2 (via
 * `fail()`) on misconfiguration — no cases discovered, an unknown
 * `--case`/`--agent`, or a failed build. `--list` prints the matrix and
 * returns without running anything.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const here = dirname(fileURLToPath(import.meta.url));
  const repoRoot = resolve(here, "..", "..");
  if (!args.list) ensureBundleBuilt(repoRoot);

  // Cases are auto-discovered from tests/e2e/cases/*.ts.
  const allCases = await loadAllCases();
  // Previously an empty discovery was reported as "unknown --case=null",
  // which pointed at a flag the user never passed.
  if (allCases.length === 0) {
    fail("no cases discovered under tests/e2e/cases (see [matrix] warnings above)");
  }
  const cases = args.case
    ? allCases.filter((c) => c.id === args.case)
    : allCases;
  const drivers = args.agent
    ? ALL_DRIVERS.filter((d) => d.id === args.agent)
    : ALL_DRIVERS;
  if (cases.length === 0) fail(`unknown --case=${args.case}; known: ${allCases.map((c) => c.id).join(", ")}`);
  if (drivers.length === 0) fail(`unknown --agent=${args.agent}; known: ${ALL_DRIVERS.map((d) => d.id).join(", ")}`);
  const matrix = buildMatrix(cases, drivers);

  if (args.list) {
    for (const p of matrix) {
      const tag = p.skipped ? `SKIP (${p.skipReason})` : "—";
      console.log(`${p.case.id}\t${p.agent.id}\t${tag}`);
    }
    return;
  }

  const creds = await loadTestCreds();
  const providerEnv = loadProviderEnv();
  const startedAt = new Date().toISOString();
  // Colons and dots are replaced so the id is safe in file and table names.
  const runId = startedAt.replace(/[:.]/g, "-");
  console.log(
    `▶ run ${runId}: ${matrix.length} points across ${drivers.length} agents × ${cases.length} cases\n` +
      `  workspace ${creds.workspaceId} (org ${creds.orgName ?? creds.orgId})`,
  );

  const results: MatrixResult[] = [];
  for (const point of matrix) {
    const label = `${point.case.id} × ${point.agent.id}`;
    process.stdout.write(`  ${label}... `);
    const r = await runPoint(point, creds, providerEnv, repoRoot, runId, args.keepSandbox);
    results.push(r);
    if (isSkipResult(r)) {
      // Drop the marker and the space that follows it.
      console.log(`skip — ${(r.failure ?? "").slice(SKIP_MARKER.length + 1)}`);
    } else if (r.passed) {
      console.log(`ok (${r.durationMs}ms, ${formatCents(r.costCents)})`);
    } else {
      console.log(`FAIL`);
      console.log(`    ${r.failure?.split("\n").join("\n    ")}`);
    }
  }

  // Skips are recorded with passed=true, so they are excluded from `passed`.
  const skipped = results.filter(isSkipResult).length;
  const failed = results.filter((r) => !r.passed).length;
  const passed = results.filter((r) => r.passed && !isSkipResult(r)).length;
  const totalCostCents = results.reduce((acc, r) => acc + (r.costCents ?? 0), 0);
  const summary: RunSummary = {
    runId,
    startedAt,
    finishedAt: new Date().toISOString(),
    totalCases: cases.length,
    totalAgents: drivers.length,
    totalPoints: matrix.length,
    passed,
    failed,
    skipped,
    totalCostCents,
    results,
  };
  const summaryPath = writeSummary(repoRoot, summary);

  console.log("");
  console.log(`◆ ${passed} pass, ${failed} fail, ${skipped} skip · total ${formatCents(totalCostCents)}`);
  console.log(`◆ summary written to ${summaryPath}`);
  process.exit(failed === 0 ? 0 : 1);
}

// Anything that escapes main() is a harness bug or misconfiguration.
main().catch((e: unknown) => {
  console.error(`[harness fatal] ${e instanceof Error ? e.stack ?? e.message : String(e)}`);
  process.exit(2);
});
/**
 * Create a fresh tmp HOME and seed it with the e2e workspace credentials.
 *
 * The directory is created under the OS tmp dir with the prefix
 * `hm-e2e-<agent>-`, and `<home>/.deeplake/credentials.json` is written
 * with mode 0600 inside a 0700 directory. The file carries `token`,
 * `orgId`, `orgName`, `workspaceId`, `apiUrl` and a `savedAt` ISO
 * timestamp; the table names in `creds` are not written.
 *
 * The file is written directly, not through the project's credential
 * helpers, so the write does not depend on `process.env.HOME`.
 *
 * @param agent agent id, used only in the directory name.
 * @param creds credentials to seed.
 * @returns the HOME path and an idempotent, never-throwing `destroy()`.
 * @throws whatever the filesystem or `JSON.stringify` throws while seeding;
 *   in that case the half-built directory is removed first, so a failed
 *   call no longer leaks a tmp dir.
 */
export function createSandbox(agent: AgentId, creds: TestCredentials): Sandbox {
  const home = mkdtempSync(join(tmpdir(), `hm-e2e-${agent}-`));
  const destroy = (): void => {
    try {
      rmSync(home, { recursive: true, force: true });
    } catch {
      // Best-effort. A leftover tmp dir is annoying but never blocks a run.
    }
  };
  try {
    const deeplakeDir = join(home, ".deeplake");
    mkdirSync(deeplakeDir, { recursive: true, mode: 0o700 });
    const payload = {
      token: creds.token,
      orgId: creds.orgId,
      orgName: creds.orgName,
      workspaceId: creds.workspaceId,
      apiUrl: creds.apiUrl,
      savedAt: new Date().toISOString(),
    };
    writeFileSync(
      join(deeplakeDir, "credentials.json"),
      JSON.stringify(payload, null, 2),
      { mode: 0o600 },
    );
  } catch (e: unknown) {
    // The caller never receives a handle on failure, so clean up here.
    destroy();
    throw e;
  }
  return { home, destroy };
}

/**
 * Build the deterministic session_id for a (case, agent, runId) tuple:
 * `e2e-<runId>-<caseId>-<agent>`.
 *
 * Embedding the runId lets one harness invocation's rows be matched by a
 * common prefix; the agent label lets a single point be inspected on its
 * own. The fixed `e2e-` prefix is presumably what an out-of-band sweep keys
 * on for rows that escape per-run cleanup — verify which column that sweep
 * filters on.
 */
export function buildSessionId(caseId: string, agent: AgentId, runId: string): string {
  return ["e2e", runId, caseId, agent].join("-");
}
/** Identifier of every agent runtime the matrix can drive. */
export type AgentId =
  | "claude-code"
  | "codex"
  | "cursor-agent"
  | "hermes"
  | "pi"
  | "openclaw";

/**
 * Which provider env var an agent's spawn requires. `null` means the
 * driver runs without a model call (e.g. openclaw fires hook events
 * programmatically against its registered handlers — no LLM in the
 * loop). The runner uses this to decide whether a missing key is a
 * skip or doesn't apply at all.
 */
export type ProviderKey = "ANTHROPIC_API_KEY" | "OPENAI_API_KEY" | "GOOGLE_API_KEY" | null;

/**
 * One agent driver — knows how to install hivemind into a sandboxed HOME
 * and spawn the underlying CLI with a prompt. Assertions are NOT a driver
 * concern; the runner reads them off the case and executes them after.
 *
 * NOTE(review): the `Promise<…>` type arguments below were lost in this copy
 * of the source and are reconstructed from how the runner uses each method
 * (`install`/`cleanup` results are ignored, `run` is assigned to a
 * `RunResult`) — confirm against the repo.
 */
export interface AgentDriver {
  id: AgentId;
  /**
   * Provider env var this driver's run() requires. Null means run() does
   * not call any LLM — typically because the "agent" is a plugin host
   * (openclaw) whose driver fires registered hook handlers programmatically
   * instead of spawning a binary.
   */
  providerKey: ProviderKey;
  /**
   * Install hivemind hooks into the given (tmp) HOME. For agents that
   * support a session-only plugin flag (e.g. `claude --plugin-dir`), this
   * may be a no-op and the flag is set in run() instead.
   */
  install(home: string, repoRoot: string): Promise<void>;
  /**
   * Spawn the CLI with the prompt (or, for openclaw, fire a synthetic
   * agent_end event whose user message contains the prompt text). Capture
   * stdout/stderr/exitCode/duration. Driver MUST set HOME=home for any
   * subprocess it spawns. Driver MAY parse a cost line from stdout into
   * `costCents` — null is acceptable when the agent doesn't print cost
   * (or never makes a model call).
   */
  run(prompt: string, opts: RunOpts): Promise<RunResult>;
  /**
   * Optional teardown. Most agents have no cleanup beyond rm -rf HOME,
   * which the runner does. Use only when the agent left state OUTSIDE the
   * sandboxed HOME (e.g. a global config file).
   */
  cleanup?(home: string): Promise<void>;
}

/** Options handed to `AgentDriver.run()` for one spawn. */
export interface RunOpts {
  home: string;
  repoRoot: string;
  /** session_id to write into the credentials sidecar / propagate downstream */
  sessionId: string;
  /** Provider keys to forward into the spawned process. Driver picks what it needs. */
  providerEnv: ProviderEnv;
  /** Hard wall-clock cap on the spawn, in milliseconds. Defaults to 90s per case. */
  timeoutMs?: number;
}

/** Provider API keys available to the run; any of them may be absent. */
export interface ProviderEnv {
  ANTHROPIC_API_KEY?: string;
  OPENAI_API_KEY?: string;
  GOOGLE_API_KEY?: string;
}

/** What a driver reports back after one spawn. */
export interface RunResult {
  stdout: string;
  stderr: string;
  exitCode: number;
  /** Session id the run actually used; the runner cleans up rows under this id. */
  sessionId: string;
  /** Cost in cents, or null when the agent reports none. */
  costCents: number | null;
  durationMs: number;
}

/**
 * One behavioral case the matrix asserts on. Cases are agent-agnostic —
 * the same prompt + assertions run against every driver (unless skipFor
 * names the agent explicitly with a comment).
 */
export interface E2ECase {
  id: string;
  description: string;
  prompt: string;
  /**
   * Optional pre-run hook — e.g. seed a row in the memory table so the
   * agent has something to retrieve. Receives the `CaseContext` (tmp HOME,
   * session id, agent id, and the test-workspace credentials); any API
   * client has to be built from those credentials by the hook itself.
   */
  setup?: (ctx: CaseContext) => Promise<void>;
  assertions: Assertion[];
  /** Agents this case can't reach (with rationale in a comment next to the entry). */
  skipFor?: AgentId[];
  /**
   * When true, the runner does NOT call driver.run() — it only runs
   * driver.install() + case.setup() and then evaluates assertions
   * against the post-install filesystem / DB state. Use this for
   * install-shape cases that assert on side effects of the installer
   * itself (e.g. "settings.json doesn't contain references to files
   * that don't exist"). No model API call, no per-agent prompt cost.
   */
  installOnly?: boolean;
}

/** Per-point context shared by setup hooks and assertions. */
export interface CaseContext {
  home: string;
  sessionId: string;
  agent: AgentId;
  /** Test creds for the e2e workspace. Drivers + setup share this. */
  creds: TestCredentials;
}

/** Credentials plus table names for the e2e test workspace. */
export interface TestCredentials {
  apiUrl: string;
  token: string;
  orgId: string;
  orgName?: string;
  workspaceId: string;
  /** sessions table name in the e2e workspace. */
  sessionsTable: string;
  /** memory table name in the e2e workspace. */
  memoryTable: string;
}

/**
 * Assertion vocabulary, intentionally narrow for v1. LLM-as-judge is
 * deferred — plugin side-effect tests don't need it. Callback-style
 * assertions receive an `AssertionContext`: the case context plus the
 * agent's run result.
 */
export type Assertion =
  | StdoutContainsAssertion
  | StdoutMatchesAssertion
  | SelectFromDbAssertion
  | HookLogContainsAssertion
  | CustomAssertion;

export interface StdoutContainsAssertion {
  type: "stdout-contains";
  /** Substring the agent's stdout MUST contain after the run. */
  substring: string;
  /** Optional label for the failure message; defaults to `substring`. */
  label?: string;
}

export interface StdoutMatchesAssertion {
  type: "stdout-matches";
  /** Pattern the agent's stdout must match. */
  regex: RegExp;
  label?: string;
}

export interface SelectFromDbAssertion {
  type: "select-from-db";
  /**
   * Builds the SQL to run against the test workspace. The session id is
   * available to the builder as `actx.ctx.sessionId` / `actx.run.sessionId`.
   */
  sql: (ctx: AssertionContext) => string;
  /**
   * Throws if the returned rows don't match expectations.
   * NOTE(review): the row element type was lost in this copy of the source;
   * `Record<string, unknown>` is a reconstruction — confirm against the repo.
   */
  expect: (rows: Array<Record<string, unknown>>) => void;
  label?: string;
}

export interface HookLogContainsAssertion {
  type: "hook-log-contains";
  /** Substring that must appear in ${home}/.deeplake/hook-debug.log after the run. */
  substring: string;
  label?: string;
}

/**
 * Escape hatch for assertions that don't fit the four typed shapes.
 * Returns null on pass, or a failure-reason string on fail. Use this
 * sparingly — typed assertions document intent better — but it's
 * essential for install-shape cases that walk agent-specific config
 * file structures (no two agents have the same hooks-file layout).
 */
export interface CustomAssertion {
  type: "custom";
  check: (actx: AssertionContext) => Promise<string | null>;
  label: string;
}

/** Everything an assertion callback is given. */
export interface AssertionContext {
  ctx: CaseContext;
  run: RunResult;
}

/** Outcome of one (case, agent) point as stored in the run summary. */
export interface MatrixResult {
  case: string;
  agent: AgentId;
  /** True for passes AND for skips; a skip is told apart by `failure`. */
  passed: boolean;
  /**
   * Failure reason, or null on a real pass. Skipped points carry
   * `passed: true` together with a text starting with `[skip]` here.
   */
  failure: string | null;
  costCents: number | null;
  durationMs: number;
  sessionId: string;
}