
Commit d9eb132

fix(crashMonitoring): handle sleep/wake appropriately
Problem: When a user puts their computer to sleep and then wakes it, our heartbeats are not up to date, because heartbeats cannot be sent while the computer is asleep. This creates a race condition between the next heartbeat being sent and the check that decides whether existing heartbeats are stale (i.e. a crash).

Solution: Add a TimeLag class that asserts there is no lag in our sense of time. It works by updating a piece of state every second; if the next update to that state took longer than a second, we conclude there was a lag. In that case we simply skip the next crash check, giving a fresh heartbeat time to be sent.

Signed-off-by: nkomonen-amazon <[email protected]>
1 parent 17e3b6b commit d9eb132
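For context, the lag-detection idea in this commit can be illustrated outside the extension: a repeating timer records when it last fired, and if the gap between fires is much larger than the scheduled interval, the machine was probably asleep, so the next staleness check should be skipped. The sketch below is a minimal, standalone illustration of that pattern using plain setInterval and Date.now(); the names LagDetector and shouldSkipCheck are illustrative and are not part of the toolkit's API.

```typescript
// Minimal sketch of the sleep/wake "time lag" detection idea.
// Hypothetical names; not the toolkit's actual TimeLag/CrashChecker classes.

class LagDetector {
    private lastTick = Date.now()
    private accumulatedLagMs = 0
    private timer: ReturnType<typeof setInterval> | undefined

    constructor(
        private readonly tickMs = 1_000, // how often we expect the timer to fire
        private readonly toleranceMs = 5_000 // ignore ordinary timer jitter
    ) {}

    start(): void {
        this.timer = setInterval(() => {
            const now = Date.now()
            const lag = now - (this.lastTick + this.tickMs)
            if (lag > this.toleranceMs) {
                // The interval fired much later than scheduled: the machine
                // was probably asleep (intervals do not run during sleep).
                this.accumulatedLagMs += lag
            }
            this.lastTick = now
        }, this.tickMs)
    }

    /** True if a sleep/wake gap was observed since the last reset(). */
    didLag(): boolean {
        return this.accumulatedLagMs > 0
    }

    /** Call after the caller has handled the lag (e.g. skipped one check). */
    reset(): void {
        this.accumulatedLagMs = 0
        this.lastTick = Date.now()
    }

    stop(): void {
        if (this.timer) {
            clearInterval(this.timer)
        }
    }
}

// Usage: a staleness check consults the detector and skips one round after a
// wake, giving the heartbeat sender time to write a fresh heartbeat first.
function shouldSkipCheck(detector: LagDetector): boolean {
    if (detector.didLag()) {
        detector.reset()
        return true
    }
    return false
}
```

Skipping a single check is enough in this commit because heartbeats are sent at half the check interval (see `heartbeatInterval = checkInterval / 2` in the diff below), so a fresh heartbeat should land before the following check runs.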

File tree

1 file changed: +131 -15 lines changed

packages/core/src/shared/crashMonitoring.ts

Lines changed: 131 additions & 15 deletions
@@ -61,6 +61,7 @@ const className = 'CrashMonitoring'
  */
 export class CrashMonitoring {
     protected heartbeat: Heartbeat | undefined
+    protected timeLag: TimeLag | undefined
     protected crashChecker: CrashChecker | undefined
 
     constructor(
@@ -111,16 +112,27 @@
         }
 
         try {
+            this.timeLag = new TimeLag()
+
             this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
-            this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            this.heartbeat.onFailure(() => this.cleanup())
+
+            this.crashChecker = new CrashChecker(
+                this.state,
+                this.checkInterval,
+                this.isDevMode,
+                this.devLogger,
+                this.timeLag
+            )
+            this.crashChecker.onFailure(() => this.cleanup())
 
+            this.timeLag.start()
             await this.heartbeat.start()
             await this.crashChecker.start()
         } catch (error) {
             emitFailure({ functionName: 'start', error })
             try {
-                this.crashChecker?.cleanup()
-                await this.heartbeat?.cleanup()
+                await this.cleanup()
             } catch {}
 
             // Surface errors during development, otherwise it can be missed.
@@ -135,6 +147,7 @@
         try {
             this.crashChecker?.cleanup()
             await this.heartbeat?.shutdown()
+            this.timeLag?.cleanup()
         } catch (error) {
             try {
                 // This probably wont emit in time before shutdown, but may be written to the logs
@@ -146,6 +159,12 @@
             }
         }
     }
+
+    public async cleanup() {
+        this.crashChecker?.cleanup()
+        await this.heartbeat?.cleanup()
+        this.timeLag?.cleanup()
+    }
 }
 
 /**
@@ -154,15 +173,19 @@
  */
 class Heartbeat {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure: vscode.Event<void> = this._onFailure.event
+    private readonly heartbeatInterval: number
+
     constructor(
         private readonly state: FileSystemState,
-        private readonly checkInterval: number,
+        checkInterval: number,
         private readonly isDevMode: boolean
-    ) {}
+    ) {
+        this.heartbeatInterval = checkInterval / 2
+    }
 
     public async start() {
-        const heartbeatInterval = this.checkInterval / 2
-
         // Send an initial heartbeat immediately
         await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())
 
@@ -179,14 +202,15 @@
                 if (this.isDevMode) {
                     throw e
                 }
+                this._onFailure.fire()
             }
-        }, heartbeatInterval)
+        }, this.heartbeatInterval)
     }
 
     /** Stops everything, signifying a graceful shutdown */
     public async shutdown() {
         globals.clock.clearInterval(this.intervalRef)
-        return this.state.indicateGracefulShutdown()
+        await this.state.indicateGracefulShutdown()
     }
 
     /**
@@ -217,12 +241,15 @@
  */
 class CrashChecker {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure = this._onFailure.event
 
     constructor(
         private readonly state: FileSystemState,
         private readonly checkInterval: number,
         private readonly isDevMode: boolean,
-        private readonly devLogger: Logger | undefined
+        private readonly devLogger: Logger | undefined,
+        private readonly timeLag: TimeLag
     ) {}
 
     public async start() {
@@ -231,20 +258,21 @@
 
         // do an initial check
         await withFailCtx('initialCrashCheck', () =>
-            tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
         )
 
         // check on an interval
         this.intervalRef = globals.clock.setInterval(async () => {
             try {
-                await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
             } catch (e) {
                 emitFailure({ functionName: 'checkCrashInterval', error: e })
-                this.cleanup()
 
                 if (this.isDevMode) {
                     throw e
                 }
+
+                this._onFailure.fire()
             }
         }, this.checkInterval)
     }
@@ -255,8 +283,15 @@
             state: FileSystemState,
             checkInterval: number,
             isDevMode: boolean,
-            devLogger: Logger | undefined
+            devLogger: Logger | undefined,
+            timeLag: TimeLag
         ) {
+            if (await timeLag.didLag()) {
+                timeLag.reset()
+                devLogger?.warn('crashMonitoring: SKIPPED check crash due to time lag')
+                return
+            }
+
             // Iterate all known extensions and for each check if they have crashed
             const knownExts = await state.getAllExts()
             const runningExts: ExtInstanceHeartbeat[] = []
@@ -324,6 +359,84 @@
 
     /** Mimics a crash, only for testing */
     public testCrash() {
+        this.cleanup()
+    }
+}
+
+/**
+ * This class is required for the following edge case:
+ * 1. Heartbeat is sent
+ * 2. Computer goes to sleep for X minutes
+ * 3. Wake up computer. But before a new heartbeat can be sent, a crash checker (can be from another ext instance) runs
+ *    and sees a stale heartbeat. It assumes a crash.
+ *
+ * Why? Intervals do not run while the computer is asleep, so the latest heartbeat has a "lag" since it wasn't able to send
+ * a new heartbeat.
+ * Then on wake, there is a race condition for the next heartbeat to be sent before the next crash check. If the crash checker
+ * runs first it will incorrectly conclude a crash.
+ *
+ * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
+ * next heartbeat to be sent.
+ */
+class TimeLag {
+    private intervalRef: NodeJS.Timer | undefined
+    private isCompleted: Promise<void> | undefined
+    /** Resolves {@link isCompleted} when the next interval of lag checking is completed */
+    private setCompleted: (() => void) | undefined
+
+    /** The last timestamp we remember. If this is more than 1 {@link lagCheckInterval} we probably did a sleep+wake */
+    private latestTimestamp: number = 0
+    /** The accumulation of lag before the next crash checker interval, since the user can sleep+wake multiple times in between */
+    private totalLag: number = 0
+    private readonly lagCheckInterval = 1000
+
+    public start() {
+        this.reset() // initialize
+
+        // Every interval calculate the lag
+        this.intervalRef = globals.clock.setInterval(() => {
+            const expectedNow = this.latestTimestamp + this.lagCheckInterval
+            const actualNow = globals.clock.Date.now()
+            const lag = actualNow - expectedNow
+
+            // interval callback execution is not exact, so this buffer avoids micro lag from being
+            // considered actual lag
+            if (lag > 5000) {
+                this.totalLag += lag
+            }
+
+            this.latestTimestamp = Date.now()
+
+            // race condition between crash checker and lag checker on computer wake. So we have
+            // the crash checker block until this is completed.
+            this.setCompleted!()
+            this.setInProgress()
+        }, this.lagCheckInterval)
+    }
+
+    /** True if there is a time lag */
+    public async didLag() {
+        await this.isCompleted!
+        return this.totalLag > 0
+    }
+
+    /** Indicates the next time lag interval check is not completed */
+    private setInProgress() {
+        this.isCompleted = new Promise((resolve) => {
+            this.setCompleted = resolve
+        })
+    }
+
+    /**
+     * Call this once the user of this instance has handled the lag. If not done, {@link didLag} can
+     * be permanently true.
+     */
+    public reset() {
+        this.totalLag = 0
+        this.latestTimestamp = globals.clock.Date.now()
+        this.setInProgress()
+    }
+    public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
@@ -617,7 +730,10 @@ export type ExtInstance = {
     isDebug?: boolean
 }
 
-type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat: number }
+type ExtInstanceHeartbeat = ExtInstance & {
+    /** Timestamp of the last heartbeat in milliseconds */
+    lastHeartbeat: number
+}
 
 function isExtHeartbeat(ext: unknown): ext is ExtInstanceHeartbeat {
     return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext.lastHeartbeat !== undefined
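One detail worth calling out from the TimeLag class above is the promise gate: didLag() awaits a promise that the interval callback resolves on every tick, so a crash check that runs right after wake-up waits for the in-flight lag measurement to finish before deciding whether to skip. A stripped-down sketch of that gate, with hypothetical names rather than the toolkit's API, might look like this:

```typescript
// Sketch of the "wait for the current tick" gate used by TimeLag.didLag().
// Hypothetical standalone version; names are illustrative.

class TickGate {
    private completed!: Promise<void>
    private resolveCompleted!: () => void

    constructor() {
        this.arm()
    }

    /** Start waiting for the next tick to finish. */
    private arm(): void {
        this.completed = new Promise<void>((resolve) => {
            this.resolveCompleted = resolve
        })
    }

    /** Called by the interval callback once it has finished measuring lag. */
    tickDone(): void {
        this.resolveCompleted()
        this.arm()
    }

    /** Callers (e.g. a crash check) block here until the in-flight tick completes. */
    async waitForTick(): Promise<void> {
        await this.completed
    }
}
```

In the commit, tryCheckCrash awaits this gate (via didLag) before reading the accumulated lag, which closes the window where a check could run between wake-up and the next lag measurement.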
