Skip to content

Commit 3439a89

Browse files
authored
fix(libnpmexec): fix lock compromise logic (#8733)
Fix a race condition in `withLock` where a slow `fs.stat` call could result in an ECOMPROMISED false positive. Due to the usage of `setInterval`, one callback could mutate `mtime` just before an overlapping callback's `fs.stat` promise has resolved, causing a mismatch. By switching to `setTimeout`, we ensure that we don't have overlapping callbacks and incorrect values. Additionally bump the stale threshold higher, to reduce the likelihood of another caller taking over a seemingly-stale-but-actually-active lock. Under Windows in particular, `fs.stat` [has been observed](#8710 (comment)) to sometimes take over 20 seconds, so we should err on the side of a higher threshold before we judge a lock as stale. The minor potential downside is that we might wait longer before taking over a stale lock, but lock takeover is already a very exceptional case (i.e. it would typically only happen if another process was SIGKILLed while holding the same lock) ## Testing Notes - Added a new test to cover this scenario - Verified [the failure](https://github.com/jenseng/cli/actions/runs/19373681768/job/55435674539) and [the fix](https://github.com/jenseng/cli/actions/runs/19373765497/job/55435952370) via one-off GHA workflow that does `npx --yes jest --version` ## References Fixes #8710
1 parent c6242d9 commit 3439a89

File tree

2 files changed

+33
-11
lines changed

2 files changed

+33
-11
lines changed

workspaces/libnpmexec/lib/with-lock.js

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ const { onExit } = require('signal-exit')
1818
// - more ergonomic compromised lock handling (i.e. withLock will reject, and callbacks have access to an AbortSignal)
1919
// - uses a more recent version of signal-exit
2020

21+
// mtime precision is platform dependent, so deal in seconds
2122
const touchInterval = 1_000
22-
// mtime precision is platform dependent, so use a reasonably large threshold
23-
const staleThreshold = 5_000
23+
// use a reasonably large threshold, in case stat calls take a while
24+
const staleThreshold = 60_000
2425

2526
// track current locks and their cleanup functions
2627
const currentLocks = new Map()
@@ -144,6 +145,7 @@ async function maintainLock (lockPath) {
144145
let mtime = Math.round(stats.mtimeMs / 1000)
145146
const signal = controller.signal
146147

148+
let timeout
147149
async function touchLock () {
148150
try {
149151
const currentStats = (await fs.stat(lockPath))
@@ -156,16 +158,16 @@ async function maintainLock (lockPath) {
156158
if (currentLocks.has(lockPath)) {
157159
await fs.utimes(lockPath, mtime, mtime)
158160
}
161+
timeout = setTimeout(touchLock, touchInterval).unref()
159162
} catch (err) {
160163
// stats mismatch or other fs error means the lock was compromised
161164
controller.abort()
162165
}
163166
}
164167

165-
const timeout = setInterval(touchLock, touchInterval)
166-
timeout.unref()
168+
timeout = setTimeout(touchLock, touchInterval).unref()
167169
function cleanup () {
168-
clearInterval(timeout)
170+
clearTimeout(timeout)
169171
deleteLock(lockPath)
170172
}
171173
currentLocks.set(lockPath, cleanup)

workspaces/libnpmexec/test/with-lock.js

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ t.test('stale lock takeover', async (t) => {
105105
const mtimeMs = Math.round(Date.now() / 1000) * 1000
106106
mockStat = async () => {
107107
if (++statCalls === 1) {
108-
return { mtimeMs: mtimeMs - 10_000 }
108+
return { mtimeMs: mtimeMs - 120_000 }
109109
} else {
110110
return { mtimeMs, ino: 1 }
111111
}
@@ -146,7 +146,7 @@ t.test('EBUSY during stale lock takeover', async (t) => {
146146
const mtimeMs = Math.round(Date.now() / 1000) * 1000
147147
mockStat = async () => {
148148
if (++statCalls === 1) {
149-
return { mtimeMs: mtimeMs - 10_000 }
149+
return { mtimeMs: mtimeMs - 120_000 }
150150
} else {
151151
return { mtimeMs, ino: 1 }
152152
}
@@ -168,7 +168,7 @@ t.test('concurrent stale lock takeover', async (t) => {
168168
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
169169
// make a stale lock
170170
await fs.promises.mkdir(lockPath)
171-
await fs.promises.utimes(lockPath, new Date(Date.now() - 10_000), new Date(Date.now() - 10_000))
171+
await fs.promises.utimes(lockPath, new Date(Date.now() - 120_000), new Date(Date.now() - 120_000))
172172

173173
const results = await Promise.allSettled([
174174
withLock(lockPath, () => 'lock1'),
@@ -225,6 +225,26 @@ t.test('mtime floating point mismatch', async (t) => {
225225
}), 'should handle mtime floating point mismatches')
226226
})
227227

228+
t.test('slow fs.stat calls shouldn\'t cause a lock compromise false positive', async (t) => {
229+
let mtimeMs = Math.round(Date.now() / 1000) * 1000
230+
let statCalls = 0
231+
mockStat = async () => {
232+
const result = mtimeMs
233+
if (++statCalls === 2) { // make it slow in the first touchLock callback
234+
await setTimeout(2000)
235+
}
236+
return { mtimeMs: result, ino: 1 }
237+
}
238+
mockUtimes = async (_, nextMtimeSeconds) => {
239+
mtimeMs = nextMtimeSeconds * 1000
240+
}
241+
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
242+
t.ok(await withLock(lockPath, async () => {
243+
await setTimeout(4000)
244+
return true
245+
}), 'should handle slow fs.stat calls')
246+
})
247+
228248
t.test('unexpected errors', async (t) => {
229249
t.test('can\'t create lock', async (t) => {
230250
const lockPath = '/these/parent/directories/do/not/exist/so/it/should/fail.lock'
@@ -259,7 +279,7 @@ t.test('unexpected errors', async (t) => {
259279
}
260280
// it's stale
261281
mockStat = async () => {
262-
return { mtimeMs: Date.now() - 10_000 }
282+
return { mtimeMs: Date.now() - 120_000 }
263283
}
264284
// but we can't release it
265285
mockRmdirSync = () => {
@@ -301,7 +321,7 @@ t.test('lock released during maintenance', async (t) => {
301321
mockStat = async (...args) => {
302322
const value = await fs.promises.stat(...args)
303323
if (++statCalls > 1) {
304-
// this runs during the setInterval; release the lock so that we no longer hold it
324+
// this runs during the setTimeout callback; release the lock so that we no longer hold it
305325
await releaseLock('test value')
306326
await setTimeout()
307327
}
@@ -314,7 +334,7 @@ t.test('lock released during maintenance', async (t) => {
314334
}
315335

316336
const lockPromise = withLock(lockPath, () => releaseLockPromise)
317-
// since we unref the interval timeout, we need to wait to ensure it actually runs
337+
// since we unref the timeout, we need to wait to ensure it actually runs
318338
await setTimeout(2000)
319339
t.equal(await lockPromise, 'test value', 'should acquire the lock')
320340
t.equal(utimesCalls, 0, 'should never call utimes')

0 commit comments

Comments
 (0)