@@ -28,14 +28,14 @@ const className = 'CrashMonitoring'
2828 *
2929 * - If an extension crashes it cannot report that it crashed.
3030 * - The ExtensionHost is a separate process from the main VS Code editor process where all extensions run in
31- * - Read about the [`deactivate()` behavior](../../../../../ docs/vscode_behaviors.md)
31+ * - Read about the [`deactivate()` behavior](../../../../docs/vscode_behaviors.md)
3232 * - An IDE instance is one instance of VS Code, and Extension Instance is 1 instance of our extension. These are 1:1.
3333 *
3434 * ### How it works at a high level:
3535 *
36- * - Each IDE instance will start its own crash reporting process on startup
37- * - The crash reporting process works with each instance sending heartbeats to a centralized state. Separately each instance
38- * has a "Checker" the each entry in the centralized to see if it is not running anymore, and appropriately handles when needed.
36+ * - Each IDE instance will start its own crash monitoring process on startup
37+ * - The crash monitoring process has each instance send heartbeats to a centralized state. Separately, each instance
38+ * has a "Checker" that inspects each heartbeat to detect instances that are no longer running, and handles them appropriately.
3939 *
4040 * - On a crash we will emit a `session_end` metric with `{ result: 'Failed', reason: 'ExtHostCrashed', crashedSessionId: '...' }`
4141 * - On successful shutdown a `session_end` with a successful result is already emitted elsewhere.
@@ -44,14 +44,20 @@ const className = 'CrashMonitoring'
4444 *
4545 * - To get the most verbose debug logs, configure the devmode setting: `crashReportInterval`
4646 *
47+ * - This entire feature is non-critical and should not impede extension usage if something goes wrong. As a result, we
48+ * swallow all errors and only log or emit telemetry for issues. This is the reason for all of the try/catch statements.
49+ *
4750 * ### Limitations
4851 * - We will never truly know if we are the cause of the crash
4952 * - Since all extensions run in the same Ext Host process, any one of them could cause it to crash and we wouldn't be
5053 * able to differentiate
5154 * - If the IDE itself crashes, unrelated to the extensions, it will still be seen as a crash in our telemetry
5255 * - We are not able to explicitly determine if we were the cause of the crash
53- * - If the user shuts down their computer after a crash before the next interval of the Primary can run, that info is lost
56+ * - If the user shuts down their computer after a crash before the next crash check can run, that info is lost
5457 * - We cannot persist crash information on computer restart
58+ * - We use the user's filesystem to maintain the state of running extension instances, but the
59+ * filesystem is not always reliable and can lead to incorrect crash reports
60+ * - To mitigate this, we do not run crash monitoring on machines where we detect a flaky filesystem
5561 */
5662export class CrashMonitoring {
5763 protected heartbeat : Heartbeat | undefined
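To make the heartbeat/checker flow described above concrete, here is a minimal sketch of the idea using a hypothetical in-memory store and illustrative names only; the real implementation persists heartbeats on the filesystem via `FileSystemState` further down.

```ts
// Hypothetical in-memory stand-in for the centralized state (the real code uses the filesystem).
type HeartbeatRecord = { sessionId: string; lastHeartbeat: number }

class SharedHeartbeatState {
    private readonly records = new Map<string, HeartbeatRecord>()

    /** Each running instance periodically records "I am alive" with a timestamp. */
    public sendHeartbeat(sessionId: string): void {
        this.records.set(sessionId, { sessionId, lastHeartbeat: Date.now() })
    }

    /** The checker in every instance scans all records for stale heartbeats. */
    public findStopped(checkInterval: number): HeartbeatRecord[] {
        const now = Date.now()
        return Array.from(this.records.values()).filter((r) => now - r.lastHeartbeat >= checkInterval)
    }

    /** Stale entries are removed once handled (crash reported or graceful shutdown observed). */
    public remove(sessionId: string): void {
        this.records.delete(sessionId)
    }
}
```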
@@ -99,12 +105,11 @@ export class CrashMonitoring {
99105 return
100106 }
101107
102- // In the Prod code this runs by default and interferes as it reports its own heartbeats.
108+ // During tests, the Prod code also runs this function. It interferes with telemetry assertions since it reports additional heartbeats.
103109 if ( this . isAutomation ) {
104110 return
105111 }
106112
107- // Dont throw since this feature is not critical and shouldn't prevent extension execution
108113 try {
109114 this . heartbeat = new Heartbeat ( this . state , this . checkInterval , this . isDevMode )
110115 this . crashChecker = new CrashChecker ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
@@ -113,42 +118,34 @@ export class CrashMonitoring {
113118 await this . crashChecker . start ( )
114119 } catch ( error ) {
115120 emitFailure ( { functionName : 'start' , error } )
116- this . cleanup ( )
121+ try {
122+ this . crashChecker ?. cleanup ( )
123+ await this . heartbeat ?. cleanup ( )
124+ } catch { }
117125
118- // In development this gives us a useful stacktrace
126+ // Surface errors during development, otherwise they can be missed.
119127 if ( this . isDevMode ) {
120128 throw error
121129 }
122130 }
123131 }
124132
125133 /** Stop the Crash Monitoring process, signifying a graceful shutdown */
126- public async stop ( ) {
127- // Dont throw since this feature is not critical and shouldn't prevent extension shutdown
134+ public async shutdown ( ) {
128135 try {
129- this . crashChecker ?. stop ( )
130- await this . heartbeat ?. stop ( )
136+ this . crashChecker ?. cleanup ( )
137+ await this . heartbeat ?. shutdown ( )
131138 } catch ( error ) {
132139 try {
133140 // This probably won't emit in time before shutdown, but may be written to the logs
134141 emitFailure ( { functionName : 'stop' , error } )
135- } catch ( e ) {
136- // In case emit fails, do nothing
137- }
142+ } catch { }
143+
138144 if ( this . isDevMode ) {
139145 throw error
140146 }
141147 }
142148 }
143-
144- /**
145- * Mimic a crash of the extension, or can just be used as cleanup.
146- * Only use this for tests.
147- */
148- protected cleanup ( ) {
149- this . crashChecker ?. stop ( )
150- this . heartbeat ?. stopInterval ( )
151- }
152149}
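`start()` and `shutdown()` above both follow the same "non-critical feature" error-handling shape: swallow and report the failure, but rethrow in dev mode so it is not missed. A generic sketch of that pattern, with illustrative names (this is not a helper that exists in the codebase):

```ts
async function runNonCritical<T>(
    fn: () => Promise<T>,
    reportError: (error: unknown) => void,
    isDevMode: boolean
): Promise<T | undefined> {
    try {
        return await fn()
    } catch (error) {
        try {
            // e.g. emitFailure({ functionName: '...', error }) in the real code
            reportError(error)
        } catch {
            // reporting itself must never break the extension
        }
        if (isDevMode) {
            throw error // surface loudly during development
        }
        return undefined
    }
}
```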
153150
154151/**
@@ -164,7 +161,6 @@ class Heartbeat {
164161 ) { }
165162
166163 public async start ( ) {
167- // heartbeat 2 times per check
168164 const heartbeatInterval = this . checkInterval / 2
169165
170166 // Send an initial heartbeat immediately
@@ -175,10 +171,11 @@ class Heartbeat {
175171 try {
176172 await this . state . sendHeartbeat ( )
177173 } catch ( e ) {
178- this . stopInterval ( )
179- emitFailure ( { functionName : 'sendHeartbeat' , error : e } )
174+ try {
175+ await this . cleanup ( )
176+ emitFailure ( { functionName : 'sendHeartbeatInterval' , error : e } )
177+ } catch { }
180178
181- // During development we are fine with impacting extension execution, so throw
182179 if ( this . isDevMode ) {
183180 throw e
184181 }
@@ -187,31 +184,31 @@ class Heartbeat {
187184 }
188185
189186 /** Stops everything, signifying a graceful shutdown */
190- public async stop ( ) {
191- this . stopInterval ( )
187+ public async shutdown ( ) {
188+ globals . clock . clearInterval ( this . intervalRef )
192189 return this . state . indicateGracefulShutdown ( )
193190 }
194191
195- /** Stops the heartbeat interval */
196- public stopInterval ( ) {
197- globals . clock . clearInterval ( this . intervalRef )
192+ /**
193+ * Safely attempts to clean up this heartbeat from the state to avoid
194+ * an incorrectly reported crash. Use this on failures.
195+ *
196+ * ---
197+ *
198+ * IMPORTANT: This function must not throw, since it is run within a catch block.
199+ */
200+ public async cleanup ( ) {
201+ try {
202+ await this . shutdown ( )
203+ } catch { }
204+ try {
205+ await this . state . clearHeartbeat ( )
206+ } catch { }
198207 }
199208}
200209
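A condensed sketch of the cadence that `Heartbeat.start()` sets up: heartbeats fire at half the check interval, so a healthy instance always lands at least one heartbeat inside every check window. Names are simplified and the failure cleanup is omitted:

```ts
// Simplified: the real start() also cleans up and emits telemetry when sendHeartbeat fails.
function startHeartbeatLoop(sendHeartbeat: () => Promise<void>, checkInterval: number): NodeJS.Timer {
    const heartbeatInterval = checkInterval / 2 // beat twice per check window
    void sendHeartbeat() // send an initial heartbeat immediately
    return setInterval(() => {
        void sendHeartbeat()
    }, heartbeatInterval)
}
```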
201210/**
202- * This checks for if an extension has crashed and handles that result appropriately.
203- * It listens to heartbeats sent by {@link Heartbeat}, and then handles appropriately when the heartbeats
204- * stop.
205- *
206- * ---
207- *
208- * This follows the Primary/Secondary design where one of the extension instances is the Primary checker
209- * and all others are Secondary.
210- *
211- * The Primary actually reads the state and reports crashes if detected.
212- *
213- * The Secondary continuously attempts to become the Primary if the previous Primary is no longer responsive.
214- * This helps to reduce raceconditions for operations on the state.
211+ * Checks the heartbeat of each known extension instance to determine whether it has crashed, and handles the result appropriately.
215212 */
216213class CrashChecker {
217214 private intervalRef : NodeJS . Timer | undefined
@@ -232,17 +229,14 @@ class CrashChecker {
232229 tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
233230 )
234231
232+ // check on an interval
235233 this . intervalRef = globals . clock . setInterval ( async ( ) => {
236234 try {
237235 await tryCheckCrash ( this . state , this . checkInterval , this . isDevMode , this . devLogger )
238236 } catch ( e ) {
239237 emitFailure ( { functionName : 'checkCrashInterval' , error : e } )
238+ this . cleanup ( )
240239
241- // Since there was an error there is point to try checking for crashed instances.
242- // We will need to monitor telemetry to see if we can determine widespread issues.
243- this . stop ( )
244-
245- // During development we are fine with impacting extension execution, so throw
246240 if ( this . isDevMode ) {
247241 throw e
248242 }
@@ -269,13 +263,13 @@ class CrashChecker {
269263
270264 // Ext is not running anymore, handle appropriately depending on why it stopped running
271265 await state . handleExtNotRunning ( ext , {
272- shutdown : async ( ) => {
266+ onShutdown : async ( ) => {
273267 // Nothing to do, just log info if necessary
274268 devLogger ?. debug (
275269 `crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ ext . extHostPid } + sessionId: ${ ext . sessionId } `
276270 )
277271 } ,
278- crash : async ( ) => {
272+ onCrash : async ( ) => {
279273 // Debugger instances may incorrectly look like they crashed, so don't emit.
280274 // For example, hitting the red stop square in the debug menu is a non-graceful shutdown, but the regular
281275 // 'x' button of the Debug IDE instance is a graceful shutdown.
@@ -314,16 +308,12 @@ class CrashChecker {
314308
315309 function isStoppedHeartbeats ( ext : ExtInstanceHeartbeat , checkInterval : number ) {
316310 const millisSinceLastHeartbeat = globals . clock . Date . now ( ) - ext . lastHeartbeat
317- // since heartbeats happen 2 times per check interval it will have occured
318- // at least once in the timespan of the check interval.
319- //
320- // But if we want to be more flexible this condition can be modified since
321- // something like global state taking time to sync can return the incorrect last heartbeat value.
322311 return millisSinceLastHeartbeat >= checkInterval
323312 }
324313 }
325314
326- public stop ( ) {
315+ /** Stops the crash check interval. Use this on failures. */
316+ public cleanup ( ) {
327317 globals . clock . clearInterval ( this . intervalRef )
328318 }
329319}
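For intuition on the `isStoppedHeartbeats` threshold above: heartbeats are sent every `checkInterval / 2`, so a live instance can never fall a full check interval behind, and a gap of `checkInterval` or more means the instance stopped. A quick worked example, assuming an illustrative 10-minute check interval (the real interval is configured elsewhere):

```ts
const checkInterval = 10 * 60 * 1000 // assumed 10 minutes, for illustration only

// Healthy instance: last heartbeat is at most ~5 minutes old -> not stopped
const healthyGap = 5 * 60 * 1000
console.log(healthyGap >= checkInterval) // false

// Stopped instance: no heartbeat for 11 minutes -> flagged as crashed or gracefully shut down
const staleGap = 11 * 60 * 1000
console.log(staleGap >= checkInterval) // true
```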
@@ -356,8 +346,7 @@ function getDefaultDependencies(): MementoStateDependencies {
356346 }
357347}
358348/**
359- * Factory to create an instance of the state. Do not create an instance of the
360- * class manually since there are required setup steps.
349+ * Factory to create an instance of the state.
361350 *
362351 * @throws if the filesystem state cannot be confirmed to be stable, i.e. flaky fs operations
363352 */
@@ -368,14 +357,16 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
368357}
369358
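The factory exists because the state needs async setup, namely the `init()` filesystem validation shown below, which a constructor cannot perform. Its rough shape is likely along these lines; a sketch only, not the exact implementation:

```ts
// Sketch only: the real crashMonitoringStateFactory may differ in details.
async function crashMonitoringStateFactorySketch(
    deps: MementoStateDependencies = getDefaultDependencies()
): Promise<FileSystemState> {
    const state = new FileSystemState(deps)
    await state.init() // throws if the filesystem cannot be confirmed to be stable
    return state
}
```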
370359/**
371- * The state of all running extensions. This state is globally shared with all other extension instances.
372- * This state specifically uses the File System.
360+ * The state of all running extensions.
361+ * - is globally shared with all other extension instances.
362+ * - uses the File System
363+ * - is not fully reliable, since filesystem operations can be flaky
373364 */
374365export class FileSystemState {
375366 private readonly stateDirPath : string
376367
377368 /**
378- * Use {@link crashMonitoringStateFactory} to make an instance
378+ * IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
379370 */
380371 constructor ( protected readonly deps : MementoStateDependencies ) {
381372 this . stateDirPath = path . join ( this . deps . workDirPath , crashMonitoringDirNames . root )
@@ -393,8 +384,8 @@ export class FileSystemState {
393384 */
394385 public async init ( ) {
395386 // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
396- // NOTE: We want to get a diff between clients that succeeded vs failed this check,
397- // so emit a metric to track that
387+ //
388+ // NOTE: Emits a metric so we can track how many clients we skipped
398389 await telemetry . function_call . run ( async ( span ) => {
399390 span . record ( { className, functionName : 'FileSystemStateValidation' } )
400391 await withFailCtx ( 'validateFileSystemStability' , ( ) => throwOnUnstableFileSystem ( ) )
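`throwOnUnstableFileSystem()` is defined elsewhere in the codebase and is not shown in this diff. Purely as an illustration of the kind of probe such a check might perform (a guess, not the real implementation):

```ts
import * as crypto from 'crypto'
import * as os from 'os'
import * as path from 'path'
import { promises as fsp } from 'fs'

// Hypothetical stability probe: write, read back, and delete a temp file, throwing if any step misbehaves.
async function throwOnUnstableFsSketch(): Promise<void> {
    const probePath = path.join(os.tmpdir(), `fs-probe-${crypto.randomUUID()}`)
    const expected = 'probe'
    await fsp.writeFile(probePath, expected)
    const actual = await fsp.readFile(probePath, 'utf-8')
    await fsp.rm(probePath, { force: true })
    if (actual !== expected) {
        throw new Error('Filesystem returned unexpected contents, skipping crash monitoring')
    }
}
```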
@@ -423,15 +414,17 @@ export class FileSystemState {
423414 }
424415 const funcWithCtx = ( ) => withFailCtx ( 'sendHeartbeatState' , func )
425416 const funcWithRetries = withRetries ( funcWithCtx , { maxRetries : 8 , delay : 100 , backoff : 2 } )
426-
427- // retry a failed heartbeat to avoid incorrectly being reported as a crash
428417 return await funcWithRetries
429418 } catch ( e ) {
430419 // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
431- await withFailCtx ( 'sendHeartbeatStateCleanup ' , ( ) => this . deleteRunningFile ( extId ) )
420+ await withFailCtx ( 'sendHeartbeatFailureCleanup ' , ( ) => this . clearHeartbeat ( ) )
432421 throw e
433422 }
434423 }
424+ /** Clears this extension's heartbeat from the state */
425+ public async clearHeartbeat ( ) {
426+ await this . deleteHeartbeatFile ( this . extId )
427+ }
435428
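`withRetries` and its options come from the codebase's shared utilities and are not shown here. The sketch below only illustrates the intended semantics of `{ maxRetries: 8, delay: 100, backoff: 2 }` used in `sendHeartbeat()` above, i.e. retry with exponential backoff; it is not the actual implementation:

```ts
async function retryWithBackoff<T>(
    fn: () => Promise<T>,
    opts: { maxRetries: number; delay: number; backoff: number }
): Promise<T> {
    let wait = opts.delay
    let lastError: unknown
    for (let attempt = 1; attempt <= opts.maxRetries; attempt++) {
        try {
            return await fn()
        } catch (e) {
            lastError = e
            if (attempt === opts.maxRetries) {
                break
            }
            await new Promise((resolve) => setTimeout(resolve, wait)) // 100ms, 200ms, 400ms, ...
            wait *= opts.backoff
        }
    }
    throw lastError
}
```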
436429 /**
437430 * Signal that this extension is gracefully shutting down. This will prevent the IDE from thinking it crashed.
@@ -460,31 +453,29 @@ export class FileSystemState {
460453 */
461454 public async handleExtNotRunning (
462455 ext : ExtInstance ,
463- opts : { shutdown : ( ) => Promise < void > ; crash : ( ) => Promise < void > }
456+ opts : { onShutdown : ( ) => Promise < void > ; onCrash : ( ) => Promise < void > }
464457 ) : Promise < void > {
465458 const extId = this . createExtId ( ext )
466459 const shutdownFilePath = path . join ( await this . shutdownExtsDir ( ) , extId )
467460
468461 if ( await withFailCtx ( 'existsShutdownFile' , ( ) => fs . exists ( shutdownFilePath ) ) ) {
469- await opts . shutdown ( )
462+ await opts . onShutdown ( )
470463 // We intentionally do not clean up the file on shutdown since another extension
471464 // may be doing the same check in parallel and would read this extension as
472465 // crashed since the file was missing. The file will be cleared on computer restart though.
473466
474467 // TODO: Be smart and clean up the file after some time.
475468 } else {
476- await opts . crash ( )
469+ await opts . onCrash ( )
477470 }
478471
479472 // Clean up the running-extension file since the extension is no longer running
480- await this . deleteRunningFile ( extId )
473+ await this . deleteHeartbeatFile ( extId )
481474 }
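Taken together with `indicateGracefulShutdown()`, the crash-vs-shutdown decision in `handleExtNotRunning()` boils down to a marker-file check: a gracefully shutting down instance drops a file, and a stale heartbeat with no such file is treated as a crash. A simplified sketch with illustrative paths and names:

```ts
import * as path from 'path'
import { promises as fsp } from 'fs'

// Simplified sketch of the shutdown-marker decision: the presence of a marker file
// written during graceful shutdown decides "shutdown" vs "crash" for a stale instance.
async function classifyStoppedInstance(
    shutdownDir: string,
    extId: string,
    handlers: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
): Promise<void> {
    const markerPath = path.join(shutdownDir, extId)
    const markerExists = await fsp
        .access(markerPath)
        .then(() => true)
        .catch(() => false)
    if (markerExists) {
        await handlers.onShutdown()
    } else {
        await handlers.onCrash()
    }
}
```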
482- private async deleteRunningFile ( extId : ExtInstanceId ) {
483- // Clean up the running extension file since it is no longer exists
475+ public async deleteHeartbeatFile ( extId : ExtInstanceId ) {
484476 const dir = await this . runningExtsDir ( )
485- // Retry on failure since failing to delete this file will result in incorrectly reported crashes.
486- // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation
487- // to this https://github.com/aws/aws-toolkit-vscode/pull/5335
477+ // Retry file deletion to prevent incorrect crash reports. Common Windows errors seen in telemetry: EPERM/EBUSY.
478+ // See: https://github.com/aws/aws-toolkit-vscode/pull/5335
488479 await withRetries ( ( ) => withFailCtx ( 'deleteStaleRunningFile' , ( ) => fs . delete ( path . join ( dir , extId ) ) ) , {
489480 maxRetries : 8 ,
490481 delay : 100 ,