@@ -18,6 +18,7 @@ import { getLogger } from './logger/logger'
1818import { crashMonitoringDirNames } from './constants'
1919import { throwOnUnstableFileSystem } from './filesystemUtilities'
2020import { withRetries } from './utilities/functionUtils'
21+ import { TimeLag } from './utilities/timeoutUtils'
2122
2223const className = 'CrashMonitoring'
2324
@@ -112,15 +113,17 @@ export class CrashMonitoring {
112113
113114 try {
114115 this . heartbeat = new Heartbeat ( this . state , this . checkInterval , this . isDevMode )
116+ this . heartbeat . onFailure ( ( ) => this . cleanup ( ) )
117+
115118 this . crashChecker = new CrashChecker ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
119+ this . crashChecker . onFailure ( ( ) => this . cleanup ( ) )
116120
117121 await this . heartbeat . start ( )
118122 await this . crashChecker . start ( )
119123 } catch ( error ) {
120124 emitFailure ( { functionName : 'start' , error } )
121125 try {
122- this . crashChecker ?. cleanup ( )
123- await this . heartbeat ?. cleanup ( )
126+ await this . cleanup ( )
124127 } catch { }
125128
126129 // Surface errors during development, otherwise it can be missed.
@@ -146,6 +149,11 @@ export class CrashMonitoring {
146149 }
147150 }
148151 }
152+
153+ public async cleanup ( ) {
154+ this . crashChecker ?. cleanup ( )
155+ await this . heartbeat ?. cleanup ( )
156+ }
149157}
150158
151159/**
@@ -154,15 +162,19 @@ export class CrashMonitoring {
154162 */
155163class Heartbeat {
156164 private intervalRef : NodeJS . Timer | undefined
165+ private _onFailure = new vscode . EventEmitter < void > ( )
166+ public onFailure : vscode . Event < void > = this . _onFailure . event
167+ private readonly heartbeatInterval : number
168+
157169 constructor (
158170 private readonly state : FileSystemState ,
159- private readonly checkInterval : number ,
171+ checkInterval : number ,
160172 private readonly isDevMode : boolean
161- ) { }
173+ ) {
174+ this . heartbeatInterval = checkInterval / 2
175+ }
162176
163177 public async start ( ) {
164- const heartbeatInterval = this . checkInterval / 2
165-
166178 // Send an initial heartbeat immediately
167179 await withFailCtx ( 'initialSendHeartbeat' , ( ) => this . state . sendHeartbeat ( ) )
168180
@@ -179,14 +191,15 @@ class Heartbeat {
179191 if ( this . isDevMode ) {
180192 throw e
181193 }
194+ this . _onFailure . fire ( )
182195 }
183- } , heartbeatInterval )
196+ } , this . heartbeatInterval )
184197 }
185198
186199 /** Stops everything, signifying a graceful shutdown */
187200 public async shutdown ( ) {
188201 globals . clock . clearInterval ( this . intervalRef )
189- return this . state . indicateGracefulShutdown ( )
202+ await this . state . indicateGracefulShutdown ( )
190203 }
191204
192205 /**
@@ -217,34 +230,55 @@ class Heartbeat {
217230 */
218231class CrashChecker {
219232 private intervalRef : NodeJS . Timer | undefined
233+ private _onFailure = new vscode . EventEmitter < void > ( )
234+ public onFailure = this . _onFailure . event
220235
221236 constructor (
222237 private readonly state : FileSystemState ,
223238 private readonly checkInterval : number ,
224239 private readonly isDevMode : boolean ,
225- private readonly devLogger : Logger | undefined
240+ private readonly devLogger : Logger | undefined ,
241+ /**
242+ * This class is required for the following edge case:
243+ * 1. Heartbeat is sent
244+ * 2. Computer goes to sleep for X minutes
245+ * 3. Wake up computer. But before a new heartbeat can be sent, a crash checker (can be from another ext instance) runs
246+ * and sees a stale heartbeat. It assumes a crash.
247+ *
248+ * Why? Intervals do not run while the computer is asleep, so the latest heartbeat has a "lag" since it wasn't able to send
249+ * a new heartbeat.
250+ * Then on wake, there is a race condition for the next heartbeat to be sent before the next crash check. If the crash checker
251+ * runs first it will incorrectly conclude a crash.
252+ *
253+ * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
254+ * next heartbeat to be sent.
255+ */
256+ private readonly timeLag : TimeLag = new TimeLag ( )
226257 ) { }
227258
228259 public async start ( ) {
229260 {
230261 this . devLogger ?. debug ( `crashMonitoring: checkInterval ${ this . checkInterval } ` )
231262
263+ this . timeLag . start ( )
264+
232265 // do an initial check
233266 await withFailCtx ( 'initialCrashCheck' , ( ) =>
234- tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
267+ tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger , this . timeLag )
235268 )
236269
237270 // check on an interval
238271 this . intervalRef = globals . clock . setInterval ( async ( ) => {
239272 try {
240- await tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
273+ await tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger , this . timeLag )
241274 } catch ( e ) {
242275 emitFailure ( { functionName : 'checkCrashInterval' , error : e } )
243- this . cleanup ( )
244276
245277 if ( this . isDevMode ) {
246278 throw e
247279 }
280+
281+ this . _onFailure . fire ( )
248282 }
249283 } , this . checkInterval )
250284 }
@@ -255,8 +289,15 @@ class CrashChecker {
255289 state : FileSystemState ,
256290 checkInterval : number ,
257291 isDevMode : boolean ,
258- devLogger : Logger | undefined
292+ devLogger : Logger | undefined ,
293+ timeLag : TimeLag
259294 ) {
295+ if ( await timeLag . didLag ( ) ) {
296+ timeLag . reset ( )
297+ devLogger ?. warn ( 'crashMonitoring: SKIPPED check crash due to time lag' )
298+ return
299+ }
300+
260301 // Iterate all known extensions and for each check if they have crashed
261302 const knownExts = await state . getAllExts ( )
262303 const runningExts : ExtInstanceHeartbeat [ ] = [ ]
@@ -320,11 +361,12 @@ class CrashChecker {
320361 /** Use this on failures to terminate the crash checker */
321362 public cleanup ( ) {
322363 globals . clock . clearInterval ( this . intervalRef )
364+ this . timeLag . cleanup ( )
323365 }
324366
325367 /** Mimics a crash, only for testing */
326368 public testCrash ( ) {
327- globals . clock . clearInterval ( this . intervalRef )
369+ this . cleanup ( )
328370 }
329371}
330372
@@ -617,7 +659,10 @@ export type ExtInstance = {
617659 isDebug ?: boolean
618660}
619661
620- type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat : number }
662+ type ExtInstanceHeartbeat = ExtInstance & {
663+ /** Timestamp of the last heartbeat in milliseconds */
664+ lastHeartbeat : number
665+ }
621666
622667function isExtHeartbeat ( ext : unknown ) : ext is ExtInstanceHeartbeat {
623668 return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext . lastHeartbeat !== undefined
0 commit comments