@@ -61,6 +61,7 @@ const className = 'CrashMonitoring'
  */
 export class CrashMonitoring {
     protected heartbeat: Heartbeat | undefined
+    protected timeLag: TimeLag | undefined
     protected crashChecker: CrashChecker | undefined
 
     constructor(
@@ -111,16 +112,27 @@ export class CrashMonitoring {
         }
 
         try {
+            this.timeLag = new TimeLag()
+
             this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
-            this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            this.heartbeat.onFailure(() => this.cleanup())
+
+            this.crashChecker = new CrashChecker(
+                this.state,
+                this.checkInterval,
+                this.isDevMode,
+                this.devLogger,
+                this.timeLag
+            )
+            this.crashChecker.onFailure(() => this.cleanup())
 
+            this.timeLag.start()
             await this.heartbeat.start()
             await this.crashChecker.start()
         } catch (error) {
             emitFailure({ functionName: 'start', error })
             try {
-                this.crashChecker?.cleanup()
-                await this.heartbeat?.cleanup()
+                await this.cleanup()
             } catch {}
 
             // Surface errors during development, otherwise they can be missed.
@@ -135,6 +147,7 @@ export class CrashMonitoring {
         try {
             this.crashChecker?.cleanup()
             await this.heartbeat?.shutdown()
+            this.timeLag?.cleanup()
         } catch (error) {
             try {
                 // This probably won't emit in time before shutdown, but may be written to the logs
@@ -146,6 +159,12 @@ export class CrashMonitoring {
             }
         }
     }
+
+    public async cleanup() {
+        this.crashChecker?.cleanup()
+        await this.heartbeat?.cleanup()
+        this.timeLag?.cleanup()
+    }
 }
 
 /**
@@ -154,15 +173,19 @@ export class CrashMonitoring {
  */
 class Heartbeat {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure: vscode.Event<void> = this._onFailure.event
+    private readonly heartbeatInterval: number
+
     constructor(
         private readonly state: FileSystemState,
-        private readonly checkInterval: number,
+        checkInterval: number,
         private readonly isDevMode: boolean
-    ) {}
+    ) {
+        this.heartbeatInterval = checkInterval / 2
+    }
 
     public async start() {
-        const heartbeatInterval = this.checkInterval / 2
-
         // Send an initial heartbeat immediately
         await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())
 
@@ -179,14 +202,15 @@ class Heartbeat {
                 if (this.isDevMode) {
                     throw e
                 }
+                this._onFailure.fire()
             }
-        }, heartbeatInterval)
+        }, this.heartbeatInterval)
     }
 
     /** Stops everything, signifying a graceful shutdown */
     public async shutdown() {
         globals.clock.clearInterval(this.intervalRef)
-        return this.state.indicateGracefulShutdown()
+        await this.state.indicateGracefulShutdown()
     }
 
     /**
@@ -217,12 +241,15 @@ class Heartbeat {
  */
 class CrashChecker {
     private intervalRef: NodeJS.Timer | undefined
+    private _onFailure = new vscode.EventEmitter<void>()
+    public onFailure = this._onFailure.event
 
     constructor(
         private readonly state: FileSystemState,
         private readonly checkInterval: number,
         private readonly isDevMode: boolean,
-        private readonly devLogger: Logger | undefined
+        private readonly devLogger: Logger | undefined,
+        private readonly timeLag: TimeLag
     ) {}
 
     public async start() {
@@ -231,20 +258,21 @@ class CrashChecker {
 
         // do an initial check
         await withFailCtx('initialCrashCheck', () =>
-            tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+            tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
         )
 
         // check on an interval
         this.intervalRef = globals.clock.setInterval(async () => {
             try {
-                await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+                await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
             } catch (e) {
                 emitFailure({ functionName: 'checkCrashInterval', error: e })
-                this.cleanup()
 
                 if (this.isDevMode) {
                     throw e
                 }
+
+                this._onFailure.fire()
             }
         }, this.checkInterval)
     }
@@ -255,8 +283,15 @@ class CrashChecker {
             state: FileSystemState,
             checkInterval: number,
             isDevMode: boolean,
-            devLogger: Logger | undefined
+            devLogger: Logger | undefined,
+            timeLag: TimeLag
         ) {
+            if (await timeLag.didLag()) {
+                timeLag.reset()
+                devLogger?.warn('crashMonitoring: SKIPPED check crash due to time lag')
+                return
+            }
+
             // Iterate all known extensions and for each check if they have crashed
             const knownExts = await state.getAllExts()
             const runningExts: ExtInstanceHeartbeat[] = []
@@ -324,6 +359,84 @@ class CrashChecker {
 
     /** Mimics a crash, only for testing */
     public testCrash() {
+        this.cleanup()
+    }
+}
+
+/**
+ * This class is required for the following edge case:
+ * 1. A heartbeat is sent.
+ * 2. The computer goes to sleep for X minutes.
+ * 3. The computer wakes up, but before a new heartbeat can be sent, a crash checker (possibly from another ext instance)
+ *    runs and sees a stale heartbeat. It assumes a crash.
+ *
+ * Why? Intervals do not run while the computer is asleep, so the latest heartbeat has a "lag" since a new heartbeat
+ * could not be sent.
+ * Then on wake, there is a race between the next heartbeat being sent and the next crash check. If the crash checker
+ * runs first, it will incorrectly conclude a crash.
+ *
+ * Solution: Keep track of the lag, and skip the next crash check if there was a lag. This gives time for the
+ * next heartbeat to be sent.
+ */
+class TimeLag {
+    private intervalRef: NodeJS.Timer | undefined
+    private isCompleted: Promise<void> | undefined
+    /** Resolves {@link isCompleted} when the next interval of lag checking is completed */
+    private setCompleted: (() => void) | undefined
+
+    /** The last timestamp we remember. If it is more than 1 {@link lagCheckInterval} old, we probably did a sleep+wake */
+    private latestTimestamp: number = 0
+    /** The accumulated lag before the next crash checker interval, since the user can sleep+wake multiple times in between */
+    private totalLag: number = 0
+    private readonly lagCheckInterval = 1000
+
+    public start() {
+        this.reset() // initialize
+
+        // Every interval calculate the lag
+        this.intervalRef = globals.clock.setInterval(() => {
+            const expectedNow = this.latestTimestamp + this.lagCheckInterval
+            const actualNow = globals.clock.Date.now()
+            const lag = actualNow - expectedNow
+
+            // interval callback execution is not exact, so this buffer avoids micro lag from being
+            // considered actual lag
+            if (lag > 5000) {
+                this.totalLag += lag
+            }
+
+            this.latestTimestamp = Date.now()
+
+            // race condition between crash checker and lag checker on computer wake. So we have
+            // the crash checker block until this is completed.
+            this.setCompleted!()
+            this.setInProgress()
+        }, this.lagCheckInterval)
+    }
+
+    /** True if there is a time lag */
+    public async didLag() {
+        await this.isCompleted!
+        return this.totalLag > 0
+    }
+
+    /** Indicates the next time lag interval check is not completed */
+    private setInProgress() {
+        this.isCompleted = new Promise((resolve) => {
+            this.setCompleted = resolve
+        })
+    }
+
+    /**
+     * Call this once the user of this instance has handled the lag. If not done, {@link didLag} can
+     * be permanently true.
+     */
+    public reset() {
+        this.totalLag = 0
+        this.latestTimestamp = globals.clock.Date.now()
+        this.setInProgress()
+    }
+    public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
@@ -617,7 +730,10 @@ export type ExtInstance = {
     isDebug?: boolean
 }
 
-type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat: number }
+type ExtInstanceHeartbeat = ExtInstance & {
+    /** Timestamp of the last heartbeat in milliseconds */
+    lastHeartbeat: number
+}
 
 function isExtHeartbeat(ext: unknown): ext is ExtInstanceHeartbeat {
     return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext.lastHeartbeat !== undefined
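
Note: the core technique the new TimeLag class relies on is simple to reproduce outside the extension: an interval remembers its last tick time, and an unusually large gap between ticks is treated as accumulated lag (e.g. from sleep/wake), which a caller can check and then reset. Below is a minimal standalone sketch of that idea, assuming plain setInterval/Date.now() instead of the extension's globals.clock test clock; the names (SleepLagDetector, CHECK_INTERVAL_MS, LAG_BUFFER_MS) are illustrative and this is not code from the commit.

```ts
// Minimal sketch (assumed names): detect sleep/wake pauses by comparing the
// expected tick time against the actual time, as TimeLag does in the diff above.
const CHECK_INTERVAL_MS = 1000
const LAG_BUFFER_MS = 5000 // small scheduling jitter is not treated as lag

class SleepLagDetector {
    private timer: ReturnType<typeof setInterval> | undefined
    private lastTick = Date.now()
    private totalLag = 0

    start() {
        this.timer = setInterval(() => {
            const expectedNow = this.lastTick + CHECK_INTERVAL_MS
            const lag = Date.now() - expectedNow
            if (lag > LAG_BUFFER_MS) {
                // The interval could not run for a while, e.g. the machine was asleep.
                this.totalLag += lag
            }
            this.lastTick = Date.now()
        }, CHECK_INTERVAL_MS)
    }

    /** True if a sleep/wake (or similar pause) was observed since the last reset. */
    didLag(): boolean {
        return this.totalLag > 0
    }

    /** Call once the caller has handled the lag (e.g. skipped one crash check). */
    reset() {
        this.totalLag = 0
        this.lastTick = Date.now()
    }

    stop() {
        clearInterval(this.timer)
    }
}

// Usage: a periodic checker would call didLag() before each check and, if true,
// skip that check and call reset(), mirroring tryCheckCrash() in the diff.
```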