Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ jobs:
# — first CI run after the openclaw skillify wiring landed).
run: npm run build

- name: Audit openclaw bundle against ClawHub static-scan rules
# PR-time gate mirroring release.yml's audit step. Catches a
# flagged bundle BEFORE the PR merges instead of only at release
# time (where it'd correctly abort the publish, but the bad code
# would already be in main). `--criticals-only` blocks on
# critical findings only; warns are advisory (the openclaw
# skillify-worker's `readFileSync` + `fetch` warn is irreducible
# without splitting the worker into multiple shipped files).
# If this step trips, run `npm run audit:openclaw` locally and
# see scripts/audit-openclaw-bundle.mjs for the rule details.
run: npm run audit:openclaw -- --criticals-only

- name: Run tests with coverage
# Per-file 80% thresholds for PR #60 files are declared in
# vitest.config.ts under `coverage.thresholds`. Vitest exits non-zero
Expand Down
143 changes: 116 additions & 27 deletions openclaw/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ import { homedir, tmpdir } from "node:os";
import {
existsSync as fsExists, mkdirSync as fsMkdir, openSync as fsOpen,
closeSync as fsClose, writeFileSync as fsWriteFile, constants as fsConstants,
readFileSync as fsReadFile, renameSync as fsRename,
readFileSync as fsReadFile, renameSync as fsRename, unlinkSync as fsUnlink,
statSync as fsStat,
} from "node:fs";
import { createHash } from "node:crypto";
// node:child_process is stubbed in the main openclaw bundle (see esbuild.config.mjs
Expand Down Expand Up @@ -374,6 +375,15 @@ let captureEnabled = true;
const capturedCounts = new Map<string, number>();
const fallbackSessionId = crypto.randomUUID();

// Per-runtime dedup of skillify worker spawns. Without this, every
// agent_end after the previous worker exits re-acquires the on-disk
// lock and spawns a fresh worker, which does one watermark-check SQL
// round-trip and exits — wasted Node cold-start + DB I/O across a long
// session. Single-spawn-per-session-per-runtime matches what the
// non-openclaw agents already do via `tryAcquireWorkerLock` semantics
// in src/skillify/state.ts. See #100.
const skillifySpawnedFor = new Set<string>();

// --- Skillify worker spawn (mirror of src/skillify/spawn-skillify-worker.ts) ---
//
// OpenClaw can't import the shared skillify TS modules — its bundle is
Expand Down Expand Up @@ -425,14 +435,61 @@ function deriveOpenclawProjectKey(channel: string): { key: string; project: stri
return { key, project };
}

// Per-project filesystem lock guarding the skillify worker spawn.
// Mirrors `tryAcquireWorkerLock` in src/skillify/state.ts: writes a ms
// timestamp into the lock file when acquired, treats locks older than
// LOCK_MAX_AGE_MS as stale (abnormal worker death, kernel kill, OOM —
// the worker's `finally`-release didn't run), unlinks and re-acquires.
// Without this, a single crashed worker halts mining for that
// project_key permanently until manual cleanup. See #110.
//
// Empty pre-existing locks (from earlier code that wrote no payload)
// parse as NaN and are treated as immediately stale — clean migration
// on first patched run.
const LOCK_MAX_AGE_MS = 10 * 60 * 1000; // 10 min, generous vs typical
// worker run (<30s + buffer)

function tryAcquireOpenclawSkillifyLock(projectKey: string): boolean {
try {
migrateOpenclawSkillifyLegacyStateDir();
fsMkdir(OPENCLAW_SKILLIFY_STATE_DIR, { recursive: true });
const lockPath = joinPath(OPENCLAW_SKILLIFY_STATE_DIR, `${projectKey}.worker.lock`);
const fd = fsOpen(lockPath, fsConstants.O_CREAT | fsConstants.O_EXCL | fsConstants.O_WRONLY);
fsClose(fd);
return true;
const acquire = (): boolean => {
const fd = fsOpen(lockPath, fsConstants.O_CREAT | fsConstants.O_EXCL | fsConstants.O_WRONLY);
try {
fsWriteFile(fd, String(Date.now()));
} finally {
fsClose(fd);
}
return true;
};
try {
return acquire();
} catch {
// O_EXCL failed → lock file already exists. Check staleness.
// There's a brief window between O_CREAT|O_EXCL and the timestamp
// write where a racing caller can see an empty body. Don't treat
// empty/NaN as immediately stale (CodeRabbit on #172) — fall back
// to the file's mtime to decide. If the FILE is fresh, the
// competitor is mid-write and we should yield; if the file is
// older than LOCK_MAX_AGE_MS, the previous holder crashed without
// writing the timestamp (or the disk lost it), and we can recycle.
try {
const body = fsReadFile(lockPath, "utf-8");
const ts = Number.parseInt(body.trim(), 10);
const ageByBody = Number.isFinite(ts) ? Date.now() - ts : Number.POSITIVE_INFINITY;
let ageByMtime = 0;
try { ageByMtime = Date.now() - fsStat(lockPath).mtimeMs; } catch { ageByMtime = 0; }
const effectiveAge = Number.isFinite(ts) ? ageByBody : ageByMtime;
if (effectiveAge > LOCK_MAX_AGE_MS) {
try { fsUnlink(lockPath); } catch { /* race; recheck below */ }
try { return acquire(); } catch { return false; }
Comment on lines +457 to +486
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Avoid treating a freshly-created empty lock file as stale.

Line 382 creates the lock file before Line 384 writes the timestamp. A racing process that hits the read path during that window will see "", parse NaN, unlink the file, and acquire the same lock. That reintroduces overlapping workers across processes.

A simple fix is to only treat NaN / empty contents as stale when the file itself is older than the stale threshold (or at least older than a short grace window), instead of immediately unlinking on parse failure.

Suggested direction
-import {
+import {
   existsSync as fsExists, mkdirSync as fsMkdir, openSync as fsOpen,
   closeSync as fsClose, writeFileSync as fsWriteFile, constants as fsConstants,
-  readFileSync as fsReadFile, renameSync as fsRename, unlinkSync as fsUnlink,
+  readFileSync as fsReadFile, renameSync as fsRename, statSync as fsStat, unlinkSync as fsUnlink,
 } from "node:fs";
...
       try {
         const body = fsReadFile(lockPath, "utf-8");
         const ts = Number.parseInt(body.trim(), 10);
-        const age = Date.now() - (Number.isFinite(ts) ? ts : 0);
-        if (!Number.isFinite(ts) || age > LOCK_MAX_AGE_MS) {
+        const age = Number.isFinite(ts)
+          ? Date.now() - ts
+          : Date.now() - fsStat(lockPath).mtimeMs;
+        if (age > LOCK_MAX_AGE_MS) {
           try { fsUnlink(lockPath); } catch { /* race; recheck below */ }
           try { return acquire(); } catch { return false; }
         }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const acquire = (): boolean => {
const fd = fsOpen(lockPath, fsConstants.O_CREAT | fsConstants.O_EXCL | fsConstants.O_WRONLY);
try {
fsWriteFile(fd, String(Date.now()));
} finally {
fsClose(fd);
}
return true;
};
try {
return acquire();
} catch {
// O_EXCL failed → lock file already exists. Check staleness.
try {
const body = fsReadFile(lockPath, "utf-8");
const ts = Number.parseInt(body.trim(), 10);
const age = Date.now() - (Number.isFinite(ts) ? ts : 0);
if (!Number.isFinite(ts) || age > LOCK_MAX_AGE_MS) {
try { fsUnlink(lockPath); } catch { /* race; recheck below */ }
try { return acquire(); } catch { return false; }
const acquire = (): boolean => {
const fd = fsOpen(lockPath, fsConstants.O_CREAT | fsConstants.O_EXCL | fsConstants.O_WRONLY);
try {
fsWriteFile(fd, String(Date.now()));
} finally {
fsClose(fd);
}
return true;
};
try {
return acquire();
} catch {
// O_EXCL failed → lock file already exists. Check staleness.
try {
const body = fsReadFile(lockPath, "utf-8");
const ts = Number.parseInt(body.trim(), 10);
const age = Number.isFinite(ts)
? Date.now() - ts
: Date.now() - fsStat(lockPath).mtimeMs;
if (age > LOCK_MAX_AGE_MS) {
try { fsUnlink(lockPath); } catch { /* race; recheck below */ }
try { return acquire(); } catch { return false; }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@openclaw/src/index.ts` around lines 381 - 400, The race occurs because
acquire() creates an empty lock file before writing the timestamp, so a reader
can see an empty string and treat it as stale; update the staleness check in the
read path (the code that reads lockPath and interprets ts) to not immediately
unlink on Number.isNaN(ts): call fsStat(lockPath) and use the file's mtime (or
birthtime) to compute fileAge, and only treat NaN/empty contents as stale if
fileAge > min(LOCK_MAX_AGE_MS, GRACE_MS) (pick a short GRACE_MS like a few
seconds) or if the fileAge > LOCK_MAX_AGE_MS; alternatively, avoid the window
entirely by replacing acquire() with an atomic create+write (e.g., write with
'wx'/'O_EXCL|O_WRONLY' so timestamp is written on creation), but if you keep the
current acquire(), implement the mtime check before unlinking to prevent
unlinking freshly-created empty files.

}
return false; // fresh lock held by a live worker — skip spawn
} catch {
return false; // couldn't stat/read; safer to skip than double-spawn
}
}
} catch { return false; }
}

Expand Down Expand Up @@ -489,26 +546,36 @@ function detectOpenclawGateAgent(): GateAgent | null {
return null;
}

function spawnOpenclawSkillifyWorker(a: OpenclawSpawnArgs): void {
/**
* Returns true when the worker was actually spawned (the caller can
* record the session in the per-runtime dedup set). Returns false on
* any "didn't spawn" outcome — missing worker, no delegate gate CLI,
* lock not acquired, mkdir/config write failure, or spawn() throw —
* so the caller can let a future agent_end retry. CodeRabbit on #172
* caught the previous flow that recorded the session before knowing
* whether spawn succeeded, suppressing retries forever within the
* runtime.
*/
function spawnOpenclawSkillifyWorker(a: OpenclawSpawnArgs): boolean {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we have the specific function?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@efenocchi yes — openclaw has its own spawn function instead of reusing the shared src/skillify/spawn-skillify-worker.ts. The rationale lives in the comment block right above the openclaw spawn helper (openclaw/src/index.ts:300-310):

OpenClaw can't import the shared skillify TS modules — its bundle is stubbed for child_process and code-splits the gateway. Inline the spawn shape here, keyed off the bundled sibling skillify-worker.js.

Two specific constraints driving the duplication:

  1. node:child_process is stubbed in the openclaw main bundle by the stub-unused-child-process esbuild plugin (esbuild.config.mjs:280-301). That stub replaces spawn/execSync/execFileSync with no-op exports to drop CC-specific dead code (e.g. browser-opening for SSO). The shared spawn helper would inherit the stub and silently fail. Openclaw works around it by using createRequire to bypass esbuild and pull the real spawn at runtime (openclaw/src/index.ts:78-80).

  2. openclaw/src/index.ts uses code splitting (chunks/), and the shared spawn helper transitively imports modules the openclaw chunk graph deliberately avoids. Inlining the spawn keeps the openclaw bundle's chunk topology under control.

It IS code duplication, deliberately. The two functions stay in sync via the (slightly-pun-aware) header comment mirror of src/skillify/spawn-skillify-worker.ts plus the shared test (tests/claude-code/skillify-session-start-injection.test.ts asserts both surfaces have the same wiring).

If we ever drop the child_process stub on the openclaw main bundle, we can DRY this — but the stub itself is also load-bearing for the ClawHub static scan (PR #170), so it'd be a meaningful architectural shift, not a quick refactor.

if (!fsExists(OPENCLAW_SKILLIFY_WORKER_PATH)) {
a.loggerWarn?.(`skillify worker missing at ${OPENCLAW_SKILLIFY_WORKER_PATH} — reinstall openclaw plugin`);
return;
return false;
}
const gateAgent = detectOpenclawGateAgent();
if (!gateAgent) {
a.loggerWarn?.(`skillify spawn: no delegate gate CLI found on PATH (need one of: claude, codex, cursor-agent, hermes, pi). Mining skipped.`);
return;
return false;
}
const { key: projectKey, project } = deriveOpenclawProjectKey(a.channel);
if (!tryAcquireOpenclawSkillifyLock(projectKey)) {
// A worker is already running for this project — skip (next agent_end may
// re-fire after the worker releases the lock, or the worker watermark
// advance makes the re-fire a no-op).
return;
return false;
}
const tmpDir = joinPath(tmpdir(), `deeplake-skillify-openclaw-${projectKey}-${Date.now()}`);
try { fsMkdir(tmpDir, { recursive: true, mode: 0o700 }); }
catch (e: any) { a.loggerWarn?.(`skillify spawn: mkdir failed: ${e?.message ?? e}`); return; }
catch (e: any) { a.loggerWarn?.(`skillify spawn: mkdir failed: ${e?.message ?? e}`); return false; }
const configPath = joinPath(tmpDir, "config.json");

// install: "global" — openclaw has no per-project filesystem cwd, so written
Expand Down Expand Up @@ -549,16 +616,18 @@ function spawnOpenclawSkillifyWorker(a: OpenclawSpawnArgs): void {
},
};
try { fsWriteFile(configPath, JSON.stringify(config), { mode: 0o600 }); }
catch (e: any) { a.loggerWarn?.(`skillify spawn: config write failed: ${e?.message ?? e}`); return; }
catch (e: any) { a.loggerWarn?.(`skillify spawn: config write failed: ${e?.message ?? e}`); return false; }

try {
realSpawn(process.execPath, [OPENCLAW_SKILLIFY_WORKER_PATH, configPath], {
detached: true,
stdio: "ignore",
env: { ...inheritedEnv.env, HIVEMIND_SKILLIFY_WORKER: "1", HIVEMIND_CAPTURE: "false" },
}).unref();
return true;
} catch (e: any) {
a.loggerWarn?.(`skillify spawn: spawn failed: ${e?.message ?? e}`);
return false;
}
}

Expand Down Expand Up @@ -1258,23 +1327,43 @@ export default definePluginEntry({
// never blocks the agent. Worker reads from the sessions table we
// just wrote to. Non-fatal: a spawn failure here only loses one
// mining attempt, never breaks capture.
try {
spawnOpenclawSkillifyWorker({
apiUrl: cfg.apiUrl,
token: cfg.token,
orgId: cfg.orgId,
workspaceId: cfg.workspaceId,
userName: cfg.userName,
channel: ev.channel || "openclaw",
sessionId: sid,
loggerWarn: (msg) => logger.error(`Skillify spawn: ${msg}`),
// Pass the same tuning dispatch the plugin populated at
// register-time. The worker will repopulate its own
// globalThis from this.
tuning: (globalThis as Record<string, unknown>).__hivemind_tuning__ as Record<string, string | undefined> | undefined,
});
} catch (e: any) {
logger.error(`Skillify spawn threw: ${e?.message ?? e}`);
//
// Per-runtime dedup (see #100): on long sessions, agent_end fires
// many times, and the previous worker has typically finished by
// the second or third turn — releasing the on-disk lock. Without
// this guard, every subsequent agent_end re-acquires the lock and
// spawns a fresh worker that does one watermark-check SQL roundtrip
// and exits. The on-disk lock is still authoritative across
// processes (e.g. multiple gateway restarts); this Set only
// suppresses redundant spawns within the same runtime.
if (!skillifySpawnedFor.has(sid)) {
// Only record the session as deduped on SUCCESSFUL spawn.
// spawnOpenclawSkillifyWorker has multiple non-exception
// failure paths (no delegate CLI, lock held by a fresh
// worker, mkdir/config write failure, spawn throw). If we
// add to the set before knowing the outcome, one transient
// failure suppresses every retry for the rest of the
// runtime. CodeRabbit on #172.
try {
if (spawnOpenclawSkillifyWorker({
apiUrl: cfg.apiUrl,
token: cfg.token,
orgId: cfg.orgId,
workspaceId: cfg.workspaceId,
userName: cfg.userName,
channel: ev.channel || "openclaw",
sessionId: sid,
loggerWarn: (msg) => logger.error(`Skillify spawn: ${msg}`),
// Pass the same tuning dispatch the plugin populated at
// register-time. The worker will repopulate its own
// globalThis from this.
tuning: (globalThis as Record<string, unknown>).__hivemind_tuning__ as Record<string, string | undefined> | undefined,
})) {
skillifySpawnedFor.add(sid);
}
} catch (e: any) {
logger.error(`Skillify spawn threw: ${e?.message ?? e}`);
}
Comment on lines +1339 to +1366
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Record the session only after a worker was actually launched.

Lines 1208-1209 mark the session as deduped before spawnOpenclawSkillifyWorker(...) can tell you whether anything was spawned. The helper has several non-exception failure paths (no delegate CLI, lock not acquired, config write failure, missing worker file), so one transient failure on the first turn suppresses every later retry for that session in this runtime.

Have spawnOpenclawSkillifyWorker return a boolean and add sid to skillifySpawnedFor only on success.

Suggested fix
-function spawnOpenclawSkillifyWorker(a: OpenclawSpawnArgs): void {
+function spawnOpenclawSkillifyWorker(a: OpenclawSpawnArgs): boolean {
   if (!fsExists(OPENCLAW_SKILLIFY_WORKER_PATH)) {
     a.loggerWarn?.(`skillify worker missing at ${OPENCLAW_SKILLIFY_WORKER_PATH} — reinstall openclaw plugin`);
-    return;
+    return false;
   }
   const gateAgent = detectOpenclawGateAgent();
   if (!gateAgent) {
     a.loggerWarn?.(`skillify spawn: no delegate gate CLI found on PATH (need one of: claude, codex, cursor-agent, hermes, pi). Mining skipped.`);
-    return;
+    return false;
   }
   const { key: projectKey, project } = deriveOpenclawProjectKey(a.channel);
   if (!tryAcquireOpenclawSkillifyLock(projectKey)) {
-    return;
+    return false;
   }
   ...
-  try { fsWriteFile(configPath, JSON.stringify(config), { mode: 0o600 }); }
-  catch (e: any) { a.loggerWarn?.(`skillify spawn: config write failed: ${e?.message ?? e}`); return; }
+  try { fsWriteFile(configPath, JSON.stringify(config), { mode: 0o600 }); }
+  catch (e: any) { a.loggerWarn?.(`skillify spawn: config write failed: ${e?.message ?? e}`); return false; }

   try {
     realSpawn(process.execPath, [OPENCLAW_SKILLIFY_WORKER_PATH, configPath], {
       detached: true,
       stdio: "ignore",
       env: { ...process.env, HIVEMIND_SKILLIFY_WORKER: "1", HIVEMIND_CAPTURE: "false" },
     }).unref();
+    return true;
   } catch (e: any) {
     a.loggerWarn?.(`skillify spawn: spawn failed: ${e?.message ?? e}`);
+    return false;
   }
 }
...
           if (!skillifySpawnedFor.has(sid)) {
-            skillifySpawnedFor.add(sid);
             try {
-              spawnOpenclawSkillifyWorker({
+              if (spawnOpenclawSkillifyWorker({
                 apiUrl: cfg.apiUrl,
                 token: cfg.token,
                 orgId: cfg.orgId,
                 workspaceId: cfg.workspaceId,
                 userName: cfg.userName,
                 channel: ev.channel || "openclaw",
                 sessionId: sid,
                 loggerWarn: (msg) => logger.error(`Skillify spawn: ${msg}`),
-              });
+              })) {
+                skillifySpawnedFor.add(sid);
+              }
             } catch (e: any) {
               logger.error(`Skillify spawn threw: ${e?.message ?? e}`);
             }
           }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@openclaw/src/index.ts` around lines 1208 - 1223, The current code marks a
session as deduped by adding sid to skillifySpawnedFor before
spawnOpenclawSkillifyWorker can indicate success, which blocks retries on
non-exception failure paths; change spawnOpenclawSkillifyWorker to return a
boolean (true on worker actually launched, false on non-exception early exits),
update the caller to await the result (if async) and only call
skillifySpawnedFor.add(sid) when the function returns true, while preserving the
existing try/catch to log thrown errors (logger.error) and still not add sid on
failure.

}
} catch (err) {
logger.error(`Auto-capture failed: ${err instanceof Error ? err.message : String(err)}`);
Expand Down
12 changes: 10 additions & 2 deletions tests/claude-code/skillify-session-start-injection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,16 @@ describe("OpenClaw skillify worker (mining) wiring", () => {
expect(src).toMatch(/OPENCLAW_SKILLIFY_WORKER_PATH\s*=\s*joinPath\(__openclaw_dirname,\s*"skillify-worker\.js"\)/);
// HIVEMIND_SKILLIFY_WORKER=1 recursion guard set on spawn env
expect(src).toMatch(/HIVEMIND_SKILLIFY_WORKER:\s*"1"/);
// agent_end hook calls it after the capture loop
expect(src).toMatch(/agent_end[\s\S]{0,3500}Auto-captured[\s\S]{0,500}spawnOpenclawSkillifyWorker/);
// Per-runtime dedup guard (#100) — explicit assertions so a future
// refactor that drops the Set or its has/add usage fails this test
// (CodeRabbit on #172).
expect(src).toMatch(/const skillifySpawnedFor = new Set<string>\(\)/);
expect(src).toMatch(/if \(!skillifySpawnedFor\.has\(sid\)\)/);
expect(src).toMatch(/skillifySpawnedFor\.add\(sid\)/);
// agent_end hook calls it after the capture loop. Distance bumped
// from 500→1500 to accommodate the per-runtime spawn-dedup comment
// block landed for #100 between `Auto-captured` and the spawn site.
expect(src).toMatch(/agent_end[\s\S]{0,3500}Auto-captured[\s\S]{0,1500}spawnOpenclawSkillifyWorker/);
// install: "global" — no per-project cwd, skills land under ~/.claude/skills/
expect(src).toMatch(/install:\s*"global"/);
});
Expand Down
Loading