
Commit 545508a

minor refactors
Signed-off-by: nkomonen-amazon <[email protected]>
1 parent ad2e297 commit 545508a

4 files changed: +81, -89 lines changed


packages/amazonq/src/extensionNode.ts

Lines changed: 1 addition & 1 deletion
@@ -97,5 +97,5 @@ async function setupDevMode(context: vscode.ExtensionContext) {
 
 export async function deactivate() {
     // Run concurrently to speed up execution. stop() does not throw so it is safe
-    await Promise.all([(await CrashMonitoring.instance())?.stop(), deactivateCommon()])
+    await Promise.all([(await CrashMonitoring.instance())?.shutdown(), deactivateCommon()])
 }

packages/core/src/extensionNode.ts

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ export async function activate(context: vscode.ExtensionContext) {
 
 export async function deactivate() {
     // Run concurrently to speed up execution. stop() does not throw so it is safe
-    await Promise.all([await (await CrashMonitoring.instance())?.stop(), deactivateCommon()])
+    await Promise.all([await (await CrashMonitoring.instance())?.shutdown(), deactivateCommon()])
     await globals.resourceManager.dispose()
 }
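A note on the pattern in both deactivate() hooks above: the crash-monitor shutdown and the common deactivation run concurrently through Promise.all, so total shutdown time is bounded by the slower of the two tasks rather than their sum. A minimal sketch of the difference, with purely illustrative task names and timings:

// Illustrative only: two independent async shutdown tasks.
async function stopCrashMonitor(): Promise<void> {
    await new Promise((r) => setTimeout(r, 200)) // pretend work
}
async function deactivateCommonTasks(): Promise<void> {
    await new Promise((r) => setTimeout(r, 300)) // pretend work
}

export async function deactivateSketch(): Promise<void> {
    // Sequential would take ~500ms:
    //   await stopCrashMonitor(); await deactivateCommonTasks()

    // Concurrent takes ~300ms, bounded by the slowest task:
    await Promise.all([stopCrashMonitor(), deactivateCommonTasks()])
}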

packages/core/src/shared/crashMonitoring.ts

Lines changed: 70 additions & 79 deletions
@@ -28,14 +28,14 @@ const className = 'CrashMonitoring'
  *
  * - If an extension crashes it cannot report that it crashed.
  * - The ExtensionHost is a separate process from the main VS Code editor process where all extensions run in
- * - Read about the [`deactivate()` behavior](../../../../../docs/vscode_behaviors.md)
+ * - Read about the [`deactivate()` behavior](../../../../docs/vscode_behaviors.md)
  * - An IDE instance is one instance of VS Code, and Extension Instance is 1 instance of our extension. These are 1:1.
  *
  * ### How it works at a high level:
  *
- * - Each IDE instance will start its own crash reporting process on startup
- * - The crash reporting process works with each instance sending heartbeats to a centralized state. Separately each instance
- *   has a "Checker" the each entry in the centralized to see if it is not running anymore, and appropriately handles when needed.
+ * - Each IDE instance will start its own crash monitoring process on startup
+ * - The crash monitoring process works with each instance sending heartbeats to a centralized state. Separately each instance
+ *   has a "Checker" that checks each heartbeat to see if it is not running anymore, and appropriately handles when needed.
  *
  * - On a crash we will emit a `session_end` metrics with `{ result: 'Failed', reason: 'ExtHostCrashed', crashedSessionId: '...' }`
  * - On successful shutdown a `session_end` with a successful result is already emitted elsewhere.
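The heartbeat/checker design described in the comment above can be reduced to a small sketch. This is not the extension's implementation; SharedState, CHECK_INTERVAL, and reportCrash are placeholders, and the real code persists state on the filesystem with retries, graceful-shutdown markers, and telemetry:

// Hypothetical shared state: maps an extension instance id to its last heartbeat time.
interface SharedState {
    sendHeartbeat(id: string, now: number): Promise<void>
    getAll(): Promise<Array<{ id: string; lastHeartbeat: number }>>
    markShutdown(id: string): Promise<void> // written on graceful shutdown
}

const CHECK_INTERVAL = 60_000
const HEARTBEAT_INTERVAL = CHECK_INTERVAL / 2 // heartbeat twice per check window

function startSketch(state: SharedState, id: string, reportCrash: (id: string) => void) {
    // Heartbeat: this instance periodically announces it is still alive.
    setInterval(() => void state.sendHeartbeat(id, Date.now()), HEARTBEAT_INTERVAL)

    // Checker: every instance scans all known heartbeats and flags stale ones.
    setInterval(async () => {
        for (const ext of await state.getAll()) {
            if (Date.now() - ext.lastHeartbeat >= CHECK_INTERVAL) {
                // The real checker additionally verifies there is no graceful-shutdown marker.
                reportCrash(ext.id)
            }
        }
    }, CHECK_INTERVAL)
}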
@@ -44,14 +44,20 @@ const className = 'CrashMonitoring'
  *
  * - To get the most verbose debug logs, configure the devmode setting: `crashReportInterval`
  *
+ * - This entire feature is non-critical and should not impede extension usage if something goes wrong. As a result, we
+ *   swallow all errors and only log/telemetry issues. This is the reason for all the try/catch statements.
+ *
  * ### Limitations
  * - We will never truly know if we are the cause of the crash
  *   - Since all extensions run in the same Ext Host process, any one of them could cause it to crash and we wouldn't be
  *     able to differentiate
  * - If the IDE itself crashes, unrelated to the extensions, it will still be seen as a crash in our telemetry
  *   - We are not able to explicitly determine if we were the cause of the crash
- * - If the user shuts down their computer after a crash before the next interval of the Primary can run, that info is lost
+ * - If the user shuts down their computer after a crash before the next crash check can run, that info is lost
  *   - We cannot persist crash information on computer restart
+ * - We use the user's filesystem to maintain the state of running extension instances, but the
+ *   filesystem is not reliable and can lead to incorrect crash reports
+ *   - To mitigate this we do not run crash reporting on machines that we detect have a flaky filesystem
  */
 export class CrashMonitoring {
     protected heartbeat: Heartbeat | undefined
@@ -99,12 +105,11 @@ export class CrashMonitoring {
             return
         }
 
-        // In the Prod code this runs by default and interferes as it reports its own heartbeats.
+        // During tests, the Prod code also runs this function. It interferes with telemetry assertions since it reports additional heartbeats.
        if (this.isAutomation) {
             return
         }
 
-        // Dont throw since this feature is not critical and shouldn't prevent extension execution
         try {
             this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
             this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
@@ -113,42 +118,34 @@
             await this.crashChecker.start()
         } catch (error) {
             emitFailure({ functionName: 'start', error })
-            this.cleanup()
+            try {
+                this.crashChecker?.cleanup()
+                await this.heartbeat?.cleanup()
+            } catch {}
 
-            // In development this gives us a useful stacktrace
+            // Surface errors during development, otherwise they can be missed.
             if (this.isDevMode) {
                 throw error
             }
         }
     }
 
     /** Stop the Crash Monitoring process, signifying a graceful shutdown */
-    public async stop() {
-        // Dont throw since this feature is not critical and shouldn't prevent extension shutdown
+    public async shutdown() {
         try {
-            this.crashChecker?.stop()
-            await this.heartbeat?.stop()
+            this.crashChecker?.cleanup()
+            await this.heartbeat?.shutdown()
         } catch (error) {
             try {
                 // This probably wont emit in time before shutdown, but may be written to the logs
                 emitFailure({ functionName: 'stop', error })
-            } catch (e) {
-                // In case emit fails, do nothing
-            }
+            } catch {}
+
             if (this.isDevMode) {
                 throw error
             }
         }
     }
-
-    /**
-     * Mimic a crash of the extension, or can just be used as cleanup.
-     * Only use this for tests.
-     */
-    protected cleanup() {
-        this.crashChecker?.stop()
-        this.heartbeat?.stopInterval()
-    }
 }
 
 /**
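The shutdown() path above follows the file's stated rule that crash monitoring must never break the extension: every failure is caught and at most reported, while dev mode rethrows so problems surface during development. A generic sketch of that pattern; emitFailureStub is a stand-in for the real telemetry helper:

// Illustrative wrapper: swallow errors in production, rethrow in dev mode.
async function runNonCritical(task: () => Promise<void>, isDevMode: boolean): Promise<void> {
    try {
        await task()
    } catch (error) {
        try {
            emitFailureStub({ functionName: 'runNonCritical', error }) // best-effort telemetry
        } catch {
            // even telemetry failures must not escape
        }
        if (isDevMode) {
            throw error
        }
    }
}

// Stand-in for the real emitFailure helper in crashMonitoring.ts.
function emitFailureStub(_: { functionName: string; error: unknown }): void {}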
@@ -164,7 +161,6 @@ class Heartbeat {
     ) {}
 
     public async start() {
-        // heartbeat 2 times per check
         const heartbeatInterval = this.checkInterval / 2
 
         // Send an initial heartbeat immediately
@@ -175,10 +171,11 @@
             try {
                 await this.state.sendHeartbeat()
             } catch (e) {
-                this.stopInterval()
-                emitFailure({ functionName: 'sendHeartbeat', error: e })
+                try {
+                    await this.cleanup()
+                    emitFailure({ functionName: 'sendHeartbeatInterval', error: e })
+                } catch {}
 
-                // During development we are fine with impacting extension execution, so throw
                 if (this.isDevMode) {
                     throw e
                 }
@@ -187,31 +184,31 @@
     }
 
     /** Stops everything, signifying a graceful shutdown */
-    public async stop() {
-        this.stopInterval()
+    public async shutdown() {
+        globals.clock.clearInterval(this.intervalRef)
         return this.state.indicateGracefulShutdown()
     }
 
-    /** Stops the heartbeat interval */
-    public stopInterval() {
-        globals.clock.clearInterval(this.intervalRef)
+    /**
+     * Safely attempts to clean up this heartbeat from the state to try and avoid
+     * an incorrectly indicated crash. Use this on failures.
+     *
+     * ---
+     *
+     * IMPORTANT: This function must not throw as it is run within a catch
+     */
+    public async cleanup() {
+        try {
+            await this.shutdown()
+        } catch {}
+        try {
+            await this.state.clearHeartbeat()
+        } catch {}
     }
 }
 
 /**
- * This checks for if an extension has crashed and handles that result appropriately.
- * It listens to heartbeats sent by {@link Heartbeat}, and then handles appropriately when the heartbeats
- * stop.
- *
- * ---
- *
- * This follows the Primary/Secondary design where one of the extension instances is the Primary checker
- * and all others are Secondary.
- *
- * The Primary actually reads the state and reports crashes if detected.
- *
- * The Secondary continuously attempts to become the Primary if the previous Primary is no longer responsive.
- * This helps to reduce raceconditions for operations on the state.
+ * This checks the heartbeats of each known extension to see if it has crashed and handles that result appropriately.
  */
 class CrashChecker {
     private intervalRef: NodeJS.Timer | undefined
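Heartbeat.cleanup() above is documented as safe to call from inside a catch block: each step is individually wrapped so one failure cannot mask the original error or throw again. A reduced sketch of that contract, where the steps are placeholders for shutdown() and clearHeartbeat():

// Each cleanup step is isolated so a failure in one cannot abort the others,
// and nothing can throw out of a catch block that calls cleanupSteps().
async function cleanupSteps(steps: Array<() => Promise<void>>): Promise<void> {
    for (const step of steps) {
        try {
            await step()
        } catch {
            // intentionally swallowed: cleanup is best-effort
        }
    }
}

// Usage roughly mirroring Heartbeat.cleanup():
//   await cleanupSteps([() => heartbeat.shutdown(), () => state.clearHeartbeat()])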
@@ -232,17 +229,14 @@
                 tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
             )
 
+            // check on an interval
             this.intervalRef = globals.clock.setInterval(async () => {
                 try {
                     await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
                 } catch (e) {
                     emitFailure({ functionName: 'checkCrashInterval', error: e })
+                    this.cleanup()
 
-                    // Since there was an error there is point to try checking for crashed instances.
-                    // We will need to monitor telemetry to see if we can determine widespread issues.
-                    this.stop()
-
-                    // During development we are fine with impacting extension execution, so throw
                     if (this.isDevMode) {
                         throw e
                     }
@@ -269,13 +263,13 @@
 
             // Ext is not running anymore, handle appropriately depending on why it stopped running
             await state.handleExtNotRunning(ext, {
-                shutdown: async () => {
+                onShutdown: async () => {
                     // Nothing to do, just log info if necessary
                     devLogger?.debug(
                         `crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
                     )
                 },
-                crash: async () => {
+                onCrash: async () => {
                     // Debugger instances may incorrectly look like they crashed, so don't emit.
                     // Example is if I hit the red square in the debug menu, it is a non-graceful shutdown. But the regular
                     // 'x' button in the Debug IDE instance is a graceful shutdown.
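The renamed onShutdown/onCrash callbacks make the two outcomes explicit: a graceful shutdown leaves a marker file behind, while a crash does not, so a stale heartbeat plus a missing marker is what gets reported. A simplified sketch of that decision rule; the paths and helpers here are illustrative, not the real FileSystemState internals:

import * as path from 'path'
import { promises as fsp } from 'fs'

// Illustrative decision rule: marker file present => graceful shutdown, absent => crash.
async function classifyNotRunning(
    shutdownDir: string,
    extId: string,
    opts: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
): Promise<void> {
    const marker = path.join(shutdownDir, extId)
    const markerExists = await fsp
        .access(marker)
        .then(() => true)
        .catch(() => false)
    await (markerExists ? opts.onShutdown() : opts.onCrash())
}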
@@ -314,16 +308,12 @@
 
         function isStoppedHeartbeats(ext: ExtInstanceHeartbeat, checkInterval: number) {
             const millisSinceLastHeartbeat = globals.clock.Date.now() - ext.lastHeartbeat
-            // since heartbeats happen 2 times per check interval it will have occured
-            // at least once in the timespan of the check interval.
-            //
-            // But if we want to be more flexible this condition can be modified since
-            // something like global state taking time to sync can return the incorrect last heartbeat value.
             return millisSinceLastHeartbeat >= checkInterval
         }
     }
 
-    public stop() {
+    /** Use this on failures */
+    public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
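The isStoppedHeartbeats check relies on the heartbeat interval being half of the check interval: a live instance writes at least one heartbeat per check window, so a gap of a full checkInterval or more means the heartbeats stopped. A standalone sketch of that arithmetic with illustrative numbers:

// Illustrative numbers: check every 60s, heartbeat every 30s (checkInterval / 2).
const exampleCheckInterval = 60_000

function isStaleHeartbeat(lastHeartbeat: number, now: number): boolean {
    return now - lastHeartbeat >= exampleCheckInterval
}

// A healthy instance heartbeated at most ~30s ago, so the gap stays below 60s:
console.log(isStaleHeartbeat(Date.now() - 30_000, Date.now())) // false
// An instance whose last heartbeat was 90s ago is flagged:
console.log(isStaleHeartbeat(Date.now() - 90_000, Date.now())) // true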
@@ -356,8 +346,7 @@ function getDefaultDependencies(): MementoStateDependencies {
     }
 }
 /**
- * Factory to create an instance of the state. Do not create an instance of the
- * class manually since there are required setup steps.
+ * Factory to create an instance of the state.
  *
  * @throws if the filesystem state cannot be confirmed to be stable, i.e flaky fs operations
  */
@@ -368,14 +357,16 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
 }
 
 /**
- * The state of all running extensions. This state is globally shared with all other extension instances.
- * This state specifically uses the File System.
+ * The state of all running extensions.
+ * - is globally shared with all other extension instances.
+ * - uses the File System
+ * - is not truly reliable since filesystems are not reliable
  */
 export class FileSystemState {
     private readonly stateDirPath: string
 
     /**
-     * Use {@link crashMonitoringStateFactory} to make an instance
+     * IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
      */
     constructor(protected readonly deps: MementoStateDependencies) {
         this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirNames.root)
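Per the IMPORTANT note above, instances should come from the factory rather than the constructor, since the factory also runs the required setup, including the filesystem-stability validation in init(). A minimal usage sketch; the import path and calling module are assumptions for illustration:

// Hypothetical consumer of the exported factory; init()/validation happens inside the factory.
import { crashMonitoringStateFactory } from './crashMonitoring'

async function getCrashState() {
    // Throws if the filesystem looks flaky, in which case crash monitoring should be skipped.
    return crashMonitoringStateFactory()
}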
@@ -393,8 +384,8 @@ export class FileSystemState {
      */
     public async init() {
         // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
-        // NOTE: We want to get a diff between clients that succeeded vs failed this check,
-        // so emit a metric to track that
+        //
+        // NOTE: Emits a metric to know how many clients we skipped
         await telemetry.function_call.run(async (span) => {
             span.record({ className, functionName: 'FileSystemStateValidation' })
             await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem())
@@ -423,15 +414,17 @@
             }
             const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
             const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 })
-
-            // retry a failed heartbeat to avoid incorrectly being reported as a crash
             return await funcWithRetries
         } catch (e) {
             // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
-            await withFailCtx('sendHeartbeatStateCleanup', () => this.deleteRunningFile(extId))
+            await withFailCtx('sendHeartbeatFailureCleanup', () => this.clearHeartbeat())
             throw e
         }
     }
+    /** Clears this extension's heartbeat from the state */
+    public async clearHeartbeat() {
+        await this.deleteHeartbeatFile(this.extId)
+    }
 
     /**
      * Signal that this extension is gracefully shutting down. This will prevent the IDE from thinking it crashed.
@@ -460,31 +453,29 @@
      */
     public async handleExtNotRunning(
         ext: ExtInstance,
-        opts: { shutdown: () => Promise<void>; crash: () => Promise<void> }
+        opts: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
     ): Promise<void> {
         const extId = this.createExtId(ext)
         const shutdownFilePath = path.join(await this.shutdownExtsDir(), extId)
 
         if (await withFailCtx('existsShutdownFile', () => fs.exists(shutdownFilePath))) {
-            await opts.shutdown()
+            await opts.onShutdown()
             // We intentionally do not clean up the file in shutdown since there may be another
             // extension may be doing the same thing in parallel, and would read the extension as
             // crashed since the file was missing. The file will be cleared on computer restart though.
 
             // TODO: Be smart and clean up the file after some time.
         } else {
-            await opts.crash()
+            await opts.onCrash()
         }
 
         // Clean up the running extension file since it no longer exists
-        await this.deleteRunningFile(extId)
+        await this.deleteHeartbeatFile(extId)
     }
-    private async deleteRunningFile(extId: ExtInstanceId) {
-        // Clean up the running extension file since it is no longer exists
+    public async deleteHeartbeatFile(extId: ExtInstanceId) {
         const dir = await this.runningExtsDir()
-        // Retry on failure since failing to delete this file will result in incorrectly reported crashes.
-        // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation
-        // to this https://github.com/aws/aws-toolkit-vscode/pull/5335
+        // Retry file deletion to prevent incorrect crash reports. Common Windows errors seen in telemetry: EPERM/EBUSY.
+        // See: https://github.com/aws/aws-toolkit-vscode/pull/5335
         await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), {
             maxRetries: 8,
             delay: 100,
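Both the heartbeat write and the stale-file deletion above go through withRetries with { maxRetries: 8, delay: 100, backoff: 2 }. The utility's internals are not shown in this diff; a sketch of what those options conventionally mean, where each failed attempt waits delay * backoff^attempt milliseconds, could look like this:

// Assumed semantics only; the real withRetries helper lives elsewhere in the toolkit.
async function withRetriesSketch<T>(
    fn: () => Promise<T>,
    opts: { maxRetries: number; delay: number; backoff: number }
): Promise<T> {
    let lastError: unknown
    for (let attempt = 0; attempt < opts.maxRetries; attempt++) {
        try {
            return await fn()
        } catch (e) {
            lastError = e
            // With delay = 100 and backoff = 2: 100ms, 200ms, 400ms, ...
            await new Promise((r) => setTimeout(r, opts.delay * opts.backoff ** attempt))
        }
    }
    throw lastError
}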
