Commit 8340d18

fix(crashMonitoring): handle sleep/wake appropriately
Problem: When a user does a sleep then wake, our heartbeats are not up to date since they cannot be sent while the user's computer is asleep. Due to this there is a race condition between the heartbeats being sent and when we check the heartbeats to determine if they are stale (crash).

Solution: Keep a TimeLag class that helps to assert that there is no lag in our time. It works by updating a state every second; if the next update to that state took longer than a second, we determine that there was a lag. Then we simply skip the next crash check, allowing a fresh heartbeat to be sent.

Signed-off-by: nkomonen-amazon <[email protected]>
1 parent 3b6197b commit 8340d18
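To make the race concrete, here is a minimal TypeScript sketch with hypothetical numbers (the real interval values live in crashMonitoring.ts and are not shown on this page): a sleep longer than the check interval makes the last heartbeat look stale even though nothing crashed.

// Illustration only: hypothetical values, not taken from the commit.
const checkIntervalMs = 10 * 60_000 // how often a crash check runs (assumed)
const heartbeatIntervalMs = checkIntervalMs / 2 // heartbeats are written at half that cadence

const lastHeartbeatMs = Date.now() // a heartbeat lands just before the machine sleeps
const sleepMs = 60 * 60_000 // the machine sleeps for an hour; no intervals fire
const wakeMs = lastHeartbeatMs + sleepMs

// On wake there is a race: if a crash checker (possibly from another extension instance)
// runs before the next heartbeat is written, the heartbeat looks stale.
const heartbeatAgeMs = wakeMs - lastHeartbeatMs // roughly one hour
const looksCrashed = heartbeatAgeMs > checkIntervalMs // true, but it is only lag

// The commit's fix: a 1-second "time lag" ticker notices the wall-clock jump and the next
// crash check is skipped, giving the heartbeat (sent every heartbeatIntervalMs) time to land first.
console.log({ heartbeatAgeMs, looksCrashed, heartbeatIntervalMs })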

2 files changed: 127 additions, 15 deletions


packages/core/src/shared/crashMonitoring.ts

Lines changed: 60 additions & 15 deletions
@@ -18,6 +18,7 @@ import { getLogger } from './logger/logger'
 import { crashMonitoringDirNames } from './constants'
 import { throwOnUnstableFileSystem } from './filesystemUtilities'
 import { withRetries } from './utilities/functionUtils'
+import { TimeLag } from './utilities/timeoutUtils'
 
 const className = 'CrashMonitoring'
 
@@ -112,15 +113,17 @@ export class CrashMonitoring {
 
         try {
             this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
+            this.heartbeat.onFailure(() => this.cleanup())
+
             this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            this.crashChecker.onFailure(() => this.cleanup())
 
             await this.heartbeat.start()
             await this.crashChecker.start()
         } catch (error) {
             emitFailure({ functionName: 'start', error })
             try {
-                this.crashChecker?.cleanup()
-                await this.heartbeat?.cleanup()
+                await this.cleanup()
             } catch {}
 
             // Surface errors during development, otherwise it can be missed.
@@ -146,6 +149,11 @@ export class CrashMonitoring {
             }
         }
     }
+
+    public async cleanup() {
+        this.crashChecker?.cleanup()
+        await this.heartbeat?.cleanup()
+    }
 }
 
 /**
@@ -154,15 +162,19 @@ export class CrashMonitoring {
  */
 class Heartbeat {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure: vscode.Event<void> = this._onFailure.event
+    private readonly heartbeatInterval: number
+
     constructor(
         private readonly state: FileSystemState,
-        private readonly checkInterval: number,
+        checkInterval: number,
         private readonly isDevMode: boolean
-    ) {}
+    ) {
+        this.heartbeatInterval = checkInterval / 2
+    }
 
     public async start() {
-        const heartbeatInterval = this.checkInterval / 2
-
         // Send an initial heartbeat immediately
         await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())
 
@@ -179,14 +191,15 @@ class Heartbeat {
                 if (this.isDevMode) {
                     throw e
                 }
+                this._onFailure.fire()
             }
-        }, heartbeatInterval)
+        }, this.heartbeatInterval)
     }
 
     /** Stops everything, signifying a graceful shutdown */
     public async shutdown() {
         globals.clock.clearInterval(this.intervalRef)
-        return this.state.indicateGracefulShutdown()
+        await this.state.indicateGracefulShutdown()
     }
 
     /**
@@ -217,34 +230,55 @@ class Heartbeat {
  */
 class CrashChecker {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure = this._onFailure.event
 
     constructor(
         private readonly state: FileSystemState,
         private readonly checkInterval: number,
         private readonly isDevMode: boolean,
-        private readonly devLogger: Logger | undefined
+        private readonly devLogger: Logger | undefined,
+        /**
+         * This class is required for the following edge case:
+         * 1. Heartbeat is sent
+         * 2. Computer goes to sleep for X minutes
+         * 3. Wake up computer. But before a new heartbeat can be sent, a crash checker (can be from another ext instance) runs
+         *    and sees a stale heartbeat. It assumes a crash.
+         *
+         * Why? Intervals do not run while the computer is asleep, so the latest heartbeat has a "lag" since it wasn't able to send
+         * a new heartbeat.
+         * Then on wake, there is a racecondition for the next heartbeat to be sent before the next crash check. If the crash checker
+         * runs first it will incorrectly conclude a crash.
+         *
+         * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
+         * next heartbeat to be sent.
+         */
+        private readonly timeLag: TimeLag = new TimeLag()
     ) {}
 
     public async start() {
         {
             this.devLogger?.debug(`crashMonitoring: checkInterval ${this.checkInterval}`)
 
+            this.timeLag.start()
+
             // do an initial check
             await withFailCtx('initialCrashCheck', () =>
-                tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
             )
 
             // check on an interval
             this.intervalRef = globals.clock.setInterval(async () => {
                 try {
-                    await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                    await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
                 } catch (e) {
                     emitFailure({ functionName: 'checkCrashInterval', error: e })
-                    this.cleanup()
 
                     if (this.isDevMode) {
                         throw e
                     }
+
+                    this._onFailure.fire()
                 }
             }, this.checkInterval)
         }
@@ -255,8 +289,15 @@ class CrashChecker {
         state: FileSystemState,
         checkInterval: number,
         isDevMode: boolean,
-        devLogger: Logger | undefined
+        devLogger: Logger | undefined,
+        timeLag: TimeLag
     ) {
+        if (await timeLag.didLag()) {
+            timeLag.reset()
+            devLogger?.warn('crashMonitoring: SKIPPED check crash due to time lag')
+            return
+        }
+
         // Iterate all known extensions and for each check if they have crashed
         const knownExts = await state.getAllExts()
         const runningExts: ExtInstanceHeartbeat[] = []
@@ -320,11 +361,12 @@ class CrashChecker {
     /** Use this on failures to terminate the crash checker */
     public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
+        this.timeLag.cleanup()
     }
 
     /** Mimics a crash, only for testing */
     public testCrash() {
-        globals.clock.clearInterval(this.intervalRef)
+        this.cleanup()
    }
 }
 
@@ -617,7 +659,10 @@ export type ExtInstance = {
     isDebug?: boolean
 }
 
-type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat: number }
+type ExtInstanceHeartbeat = ExtInstance & {
+    /** Timestamp of the last heartbeat in milliseconds */
+    lastHeartbeat: number
+}
 
 function isExtHeartbeat(ext: unknown): ext is ExtInstanceHeartbeat {
     return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext.lastHeartbeat !== undefined
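The failure handling above is spread across several hunks; the following condensed sketch (simplified stand-in classes, not the toolkit's actual ones) shows the intent of the new onFailure wiring: a failure in either component now tears down both through the parent's cleanup(), instead of each component only cleaning up itself.

import * as vscode from 'vscode'

// Stand-in for Heartbeat/CrashChecker: each exposes an onFailure event and fires it
// from its interval's catch block rather than cleaning up alone.
class Worker {
    private readonly _onFailure = new vscode.EventEmitter<void>()
    public readonly onFailure = this._onFailure.event

    public reportFailure() {
        this._onFailure.fire()
    }

    public cleanup() {
        // clear timers, release resources, etc.
    }
}

// Stand-in for CrashMonitoring: start() subscribes both workers to the shared cleanup().
class Monitor {
    private readonly heartbeat = new Worker()
    private readonly crashChecker = new Worker()

    public start() {
        this.heartbeat.onFailure(() => this.cleanup())
        this.crashChecker.onFailure(() => this.cleanup())
    }

    public cleanup() {
        this.crashChecker.cleanup()
        this.heartbeat.cleanup()
    }
}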

packages/core/src/shared/utilities/timeoutUtils.ts

Lines changed: 67 additions & 0 deletions
@@ -292,3 +292,70 @@ export function sleep(duration: number = 0): Promise<void> {
     const schedule = globals?.clock?.setTimeout ?? setTimeout
     return new Promise((r) => schedule(r, Math.max(duration, 0)))
 }
+
+/**
+ * Time lag occurs when the computer goes to sleep, and intervals cannot run on the expected
+ * cadence. This keeps track of that lag in cadence.
+ */
+export class TimeLag {
+    private intervalRef: NodeJS.Timer | undefined
+    private isCompleted: Promise<void> | undefined
+    /** Resolves {@link isCompleted} when the next interval of lag checking is completed */
+    private setCompleted: (() => void) | undefined
+
+    /** The last timestamp we remember. If this is more than 1 {@link lagCheckInterval} we probably did a sleep+wake */
+    private latestTimestamp: number = 0
+    /** The accumulation of lag before the next crash checker interval, since the user can sleep+wake multiple times in between */
+    private totalLag: number = 0
+    private readonly lagCheckInterval = 1000
+
+    public start() {
+        this.reset() // initialize
+
+        // Every interval calculate the lag
+        this.intervalRef = globals.clock.setInterval(() => {
+            const expectedNow = this.latestTimestamp + this.lagCheckInterval
+            const actualNow = globals.clock.Date.now()
+            const lag = actualNow - expectedNow
+
+            // interval callback execution is not exact, so this buffer avoids micro lag from being
+            // considered actual lag
+            if (lag > 5000) {
+                this.totalLag += lag
+            }
+
+            this.latestTimestamp = Date.now()
+
+            // race condition between crash checker and lag checker on computer wake. So we have
+            // the crash checker block until this is completed.
+            this.setCompleted!()
+            this.setInProgress()
+        }, this.lagCheckInterval)
+    }
+
+    /** True if there is a time lag */
+    public async didLag() {
+        await this.isCompleted!
+        return this.totalLag > 0
+    }
+
+    /** Indicates the next time lag interval check is not completed */
+    private setInProgress() {
+        this.isCompleted = new Promise((resolve) => {
+            this.setCompleted = resolve
+        })
+    }
+
+    /**
+     * Call this once the user of this instance has handled the lag. If not done, {@link didLag} can
+     * be permanently true.
+     */
+    public reset() {
+        this.totalLag = 0
+        this.latestTimestamp = globals.clock.Date.now()
+        this.setInProgress()
+    }
+    public cleanup() {
+        globals.clock.clearInterval(this.intervalRef)
+    }
+}
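A hypothetical consumer (not part of this commit) would drive TimeLag the way the crash checker does: start it alongside the periodic work, ask didLag() before each run, reset() once the lag has been handled, and cleanup() on shutdown.

import { TimeLag } from './utilities/timeoutUtils'

// runCheck is whatever periodic work should be skipped right after a sleep/wake,
// e.g. the crash check in crashMonitoring.ts.
async function runUnlessLagged(timeLag: TimeLag, runCheck: () => Promise<void>): Promise<void> {
    // didLag() waits for the in-flight 1-second lag tick to finish, so this does not
    // race the lag tracker right after the machine wakes.
    if (await timeLag.didLag()) {
        timeLag.reset() // mark the lag as handled, otherwise didLag() stays true
        return // skip this round so a fresh heartbeat can be written first
    }
    await runCheck()
}

const timeLag = new TimeLag()
timeLag.start()
// ... call runUnlessLagged(timeLag, check) on every checking interval ...
// ... and timeLag.cleanup() once monitoring stops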
