@@ -54,8 +54,6 @@ const className = 'CrashMonitoring'
  * - We cannot persist crash information on computer restart
  */
 export class CrashMonitoring {
-    private isStarted: boolean = false
-
     protected heartbeat: Heartbeat | undefined
     protected crashChecker: CrashChecker | undefined
 
@@ -67,20 +65,21 @@ export class CrashMonitoring {
         private readonly devLogger: Logger | undefined
     ) {}
 
-    static #didTryInstance = false
+    static #didTryCreate = false
     static #instance: CrashMonitoring | undefined
+    /** Returns an instance of this class or undefined if any initial validation fails. */
     public static async instance(): Promise<CrashMonitoring | undefined> {
         // Since the first attempt to create an instance may have failed, we do not
         // attempt to create an instance again and return whatever we have
-        if (this.#didTryInstance === true) {
+        if (this.#didTryCreate === true) {
             return this.#instance
         }
 
         try {
-            this.#didTryInstance = true
+            this.#didTryCreate = true
             const isDevMode = getIsDevMode()
             const devModeLogger: Logger | undefined = isDevMode ? getLogger() : undefined
-            const state = await crashMonitoringStateFactory()
+            const state = await crashMonitoringStateFactory() // can throw
             return (this.#instance ??= new CrashMonitoring(
                 state,
                 DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes
@@ -89,7 +88,6 @@ export class CrashMonitoring {
                 devModeLogger
             ))
         } catch (error) {
-            // Creating the crash monitoring state can throw/fail
             emitFailure({ functionName: 'instance', error })
             return undefined
         }
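
The guarded construction above is a "try once, then memoize" async singleton: `#didTryCreate` ensures the fallible setup runs at most once, and every later call returns whatever that first attempt produced, even if it was `undefined`. A minimal standalone sketch of the same pattern (the names and the `loadDependencies` helper below are illustrative, not part of the toolkit):

// Illustrative sketch only: attempt creation once, cache the outcome forever.
class LazyService {
    static #didTryCreate = false
    static #instance: LazyService | undefined

    public static async instance(): Promise<LazyService | undefined> {
        if (this.#didTryCreate) {
            return this.#instance
        }
        this.#didTryCreate = true
        try {
            const deps = await loadDependencies() // fallible async setup, like crashMonitoringStateFactory()
            return (this.#instance ??= new LazyService(deps))
        } catch {
            return undefined // callers treat a missing instance as "feature unavailable"
        }
    }

    private constructor(private readonly deps: unknown) {}
}

// Hypothetical stand-in for the real async setup step.
async function loadDependencies(): Promise<unknown> {
    return {}
}
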
@@ -113,10 +111,10 @@ export class CrashMonitoring {
 
             await this.heartbeat.start()
             await this.crashChecker.start()
-
-            this.isStarted = true
         } catch (error) {
             emitFailure({ functionName: 'start', error })
+            this.cleanup()
+
             // In development this gives us a useful stacktrace
             if (this.isDevMode) {
                 throw error
@@ -126,10 +124,6 @@ export class CrashMonitoring {
 
     /** Stop the Crash Monitoring process, signifying a graceful shutdown */
     public async stop() {
-        if (!this.isStarted) {
-            return
-        }
-
         // Don't throw since this feature is not critical and shouldn't prevent extension shutdown
         try {
             this.crashChecker?.stop()
@@ -151,13 +145,9 @@ export class CrashMonitoring {
      * Mimic a crash of the extension, or can just be used as cleanup.
      * Only use this for tests.
      */
-    protected crash() {
-        if (!this.isStarted) {
-            return
-        }
-
+    protected cleanup() {
         this.crashChecker?.stop()
-        this.heartbeat?.crash()
+        this.heartbeat?.stopInterval()
     }
 }
 
@@ -166,8 +156,6 @@ export class CrashMonitoring {
  * {@link CrashChecker} listens for these.
  */
 class Heartbeat {
-    private didEmitFailure: boolean = false
-    private isRunning: boolean = false
     private intervalRef: NodeJS.Timer | undefined
     constructor(
         private readonly state: FileSystemState,
@@ -187,33 +175,25 @@ class Heartbeat {
             try {
                 await this.state.sendHeartbeat()
             } catch (e) {
-                // only emit a failure once so we do not spam telemetry on repeated errors
-                if (!this.didEmitFailure) {
-                    this.didEmitFailure = true
-                    emitFailure({ functionName: 'sendHeartbeat', error: e })
-                }
+                this.stopInterval()
+                emitFailure({ functionName: 'sendHeartbeat', error: e })
 
                 // During development we are fine with impacting extension execution, so throw
                 if (this.isDevMode) {
                     throw e
                 }
             }
         }, heartbeatInterval)
-
-        this.isRunning = true
     }
 
+    /** Stops everything, signifying a graceful shutdown */
     public async stop() {
-        // non-happy path where heartbeats were never started.
-        if (!this.isRunning) {
-            return
-        }
-
-        globals.clock.clearInterval(this.intervalRef)
+        this.stopInterval()
         return this.state.indicateGracefulShutdown()
     }
 
-    public crash() {
+    /** Stops the heartbeat interval */
+    public stopInterval() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
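
Heartbeat is one half of a simple liveness protocol: it periodically rewrites a lastHeartbeat timestamp for this extension instance, and a graceful shutdown clears the timer and records the clean exit, so the CrashChecker can treat an instance whose heartbeat has gone stale without a shutdown marker as crashed. A sketch of that decision follows; the staleness rule here is an assumption, not the toolkit's exact formula:

// Sketch of the checker's decision; the exact staleness threshold is an assumption.
interface HeartbeatRecord {
    lastHeartbeat: number // epoch millis, rewritten by Heartbeat on every tick
}

function looksCrashed(
    record: HeartbeatRecord,
    nowMs: number,
    checkIntervalMs: number,
    shutdownGracefully: boolean
): boolean {
    if (shutdownGracefully) {
        return false // a clean exit is never a crash, no matter how old the heartbeat is
    }
    // Assumed rule: a heartbeat older than one full check interval means the instance died.
    return nowMs - record.lastHeartbeat > checkIntervalMs
}
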
@@ -258,7 +238,7 @@ class CrashChecker {
             } catch (e) {
                 emitFailure({ functionName: 'checkCrashInterval', error: e })
 
-                // Since there was an error we want to stop crash monitoring since it is pointless.
+                // Since there was an error, there is no point in continuing to check for crashed instances.
                 // We will need to monitor telemetry to see if we can determine widespread issues.
                 this.stop()
 
@@ -379,7 +359,7 @@ function getDefaultDependencies(): MementoStateDependencies {
  * Factory to create an instance of the state. Do not create an instance of the
  * class manually since there are required setup steps.
  *
- * @throws if the filesystem state cannot get in to a good state
+ * @throws if the filesystem state cannot be confirmed to be stable, i.e. fs operations are flaky
  */
 export async function crashMonitoringStateFactory(deps = getDefaultDependencies()): Promise<FileSystemState> {
     const state: FileSystemState = new FileSystemState(deps)
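
The factory exists because a FileSystemState is only usable after its async init() has run (the filesystem validation shown in the next hunk); constructing it directly would skip that step. The rest of the factory body is not shown in this diff, but its assumed shape is simply construct, initialize, return:

// Assumed continuation of the factory: construct, run the mandatory async setup, hand back the instance.
class StateSketch {
    constructor(private readonly deps: unknown) {}
    public async init(): Promise<void> {
        // validate filesystem stability, clear stale state, etc.
    }
}

async function stateFactorySketch(deps: unknown): Promise<StateSketch> {
    const state = new StateSketch(deps)
    await state.init() // can throw; callers then skip crash monitoring entirely
    return state
}
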
@@ -409,15 +389,15 @@ export class FileSystemState {
      * Does the required initialization steps; this must always be run after
      * creation of the instance.
      *
-     * @throws if the filesystem state cannot get in to a good state
+     * @throws if the filesystem state cannot be confirmed to be stable, i.e. fs operations are flaky
      */
     public async init() {
         // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
         // NOTE: We want to get a diff between clients that succeeded vs failed this check,
         // so emit a metric to track that
         await telemetry.function_call.run(async (span) => {
             span.record({ className, functionName: 'FileSystemStateValidation' })
-            await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem(this.deps.workDirPath))
+            await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem())
         })
 
         // Clear the state if the user did something like a computer restart
@@ -428,21 +408,29 @@ export class FileSystemState {
 
     // ------------------ Heartbeat methods ------------------
     public async sendHeartbeat() {
-        const _sendHeartbeat = () =>
-            withFailCtx('sendHeartbeatState', async () => {
+        const extId = this.createExtId(this.ext)
+
+        try {
+            const func = async () => {
                 const dir = await this.runningExtsDir()
-                const extId = this.createExtId(this.ext)
                 await fs.writeFile(
                     path.join(dir, extId),
                     JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
                 )
                 this.deps.devLogger?.debug(
                     `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
                 )
-            })
-        // retry a failed heartbeat to avoid incorrectly being reported as a crash
-        const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 7, delay: 100, backoff: 2 })
-        return _sendHeartbeatWithRetries
+            }
+            const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
+            const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 })
+
+            // retry a failed heartbeat to avoid incorrectly being reported as a crash
+            return await funcWithRetries
+        } catch (e) {
+            // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
+            await withFailCtx('sendHeartbeatStateCleanup', () => this.deleteRunningFile(extId))
+            throw e
+        }
     }
 
     /**
@@ -488,13 +476,17 @@ export class FileSystemState {
             await opts.crash()
         }
 
+        // Clean up the running extension file since it no longer exists
+        await this.deleteRunningFile(extId)
+    }
+    private async deleteRunningFile(extId: ExtInstanceId) {
         // Clean up the running extension file since it no longer exists
         const dir = await this.runningExtsDir()
         // Retry on failure since failing to delete this file will result in incorrectly reported crashes.
         // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation
         // to this https://github.com/aws/aws-toolkit-vscode/pull/5335
         await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), {
-            maxRetries: 7,
+            maxRetries: 8,
             delay: 100,
             backoff: 2,
         })
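
Several of the hunks above wrap flaky filesystem operations in withRetries(fn, { maxRetries, delay, backoff }). The helper itself is not part of this diff, but the option names imply a standard exponential-backoff loop, roughly like the sketch below; with maxRetries: 8, delay: 100, backoff: 2 the waits grow 100 ms, 200 ms, 400 ms, and so on:

// Sketch of an exponential-backoff retry wrapper matching the { maxRetries, delay, backoff } options;
// the real withRetries helper may differ in details such as attempt counting or error filtering.
async function withRetriesSketch<T>(
    fn: () => Promise<T>,
    opts: { maxRetries: number; delay: number; backoff: number }
): Promise<T> {
    let waitMs = opts.delay
    let lastError: unknown
    for (let attempt = 0; attempt < opts.maxRetries; attempt++) {
        try {
            return await fn()
        } catch (e) {
            lastError = e
            await new Promise((resolve) => setTimeout(resolve, waitMs))
            waitMs *= opts.backoff
        }
    }
    throw lastError
}
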
@@ -525,7 +517,7 @@ export class FileSystemState {
     private async runningExtsDir(): Promise<string> {
         const p = path.join(this.stateDirPath, crashMonitoringDirNames.running)
         // ensure the dir exists
-        await withFailCtx('ensureRunningExtsDir', () => fs.mkdir(p))
+        await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
         return p
     }
     private async shutdownExtsDir() {
@@ -551,17 +543,22 @@ export class FileSystemState {
         const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
             // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since
             // we will assume that other instance handled its termination appropriately.
-            const ext = await withFailCtx('parseRunningExtFile', async () =>
-                ignoreBadFileError(async () => {
-                    const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
-
-                    if (!text) {
-                        return undefined
-                    }
-
-                    // This was sometimes throwing SyntaxError
-                    return JSON.parse(text) as ExtInstanceHeartbeat
-                })
+            // NOTE: On Windows we were failing on EBUSY, so we retry on failure.
+            const ext: ExtInstanceHeartbeat | undefined = await withRetries(
+                () =>
+                    withFailCtx('parseRunningExtFile', async () =>
+                        ignoreBadFileError(async () => {
+                            const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
+
+                            if (!text) {
+                                return undefined
+                            }
+
+                            // This was sometimes throwing SyntaxError
+                            return JSON.parse(text) as ExtInstanceHeartbeat
+                        })
+                    ),
+                { maxRetries: 6, delay: 100, backoff: 2 }
             )
 
             if (ext === undefined) {
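
Taken together, the intended lifecycle is: resolve the optional singleton, start monitoring when the extension activates, and stop it on a graceful shutdown so the run is not later counted as a crash. The actual wiring into the toolkit's activation code is outside this diff, so the following is only an assumed usage shape:

// Assumed wiring, not shown in this diff.
async function onActivate(): Promise<CrashMonitoring | undefined> {
    const monitoring = await CrashMonitoring.instance() // undefined if initial validation failed
    await monitoring?.start()
    return monitoring
}

async function onDeactivate(monitoring: CrashMonitoring | undefined): Promise<void> {
    await monitoring?.stop() // records the graceful shutdown so this run is not reported as a crash
}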