
Commit ad2e297

various fixes

- removed some unnecessary code
- fixed some comments
- if a heartbeat fails, stop sending future heartbeats and remove the existing one from the state so it cannot be mistaken for a crash
- increased the total retry window from ~12 seconds to ~25 seconds (1 extra interval)
- switched a vscode fs mkdir call to the node one, since telemetry suggests it may be flaky
- retry when loading the extension state from disk, since it sometimes fails due to transient issues

Signed-off-by: nkomonen-amazon <[email protected]>
1 parent dd1db97 commit ad2e297

File tree

packages/core/src/shared/crashMonitoring.ts
packages/core/src/shared/filesystemUtilities.ts
packages/core/src/test/shared/crashMonitoring.test.ts

3 files changed: +71 −64 lines changed

packages/core/src/shared/crashMonitoring.ts

Lines changed: 56 additions & 59 deletions
@@ -54,8 +54,6 @@ const className = 'CrashMonitoring'
  * - We cannot persist crash information on computer restart
  */
 export class CrashMonitoring {
-    private isStarted: boolean = false
-
     protected heartbeat: Heartbeat | undefined
     protected crashChecker: CrashChecker | undefined

@@ -67,20 +65,21 @@
         private readonly devLogger: Logger | undefined
     ) {}

-    static #didTryInstance = false
+    static #didTryCreate = false
     static #instance: CrashMonitoring | undefined
+    /** Returns an instance of this class or undefined if any initial validation fails. */
     public static async instance(): Promise<CrashMonitoring | undefined> {
         // Since the first attempt to create an instance may have failed, we do not
         // attempt to create an instance again and return whatever we have
-        if (this.#didTryInstance === true) {
+        if (this.#didTryCreate === true) {
             return this.#instance
         }

         try {
-            this.#didTryInstance = true
+            this.#didTryCreate = true
             const isDevMode = getIsDevMode()
             const devModeLogger: Logger | undefined = isDevMode ? getLogger() : undefined
-            const state = await crashMonitoringStateFactory()
+            const state = await crashMonitoringStateFactory() // can throw
             return (this.#instance ??= new CrashMonitoring(
                 state,
                 DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes

@@ -89,7 +88,6 @@
                 devModeLogger
             ))
         } catch (error) {
-            // Creating the crash monitoring state can throw/fail
             emitFailure({ functionName: 'instance', error })
             return undefined
         }

@@ -113,10 +111,10 @@

             await this.heartbeat.start()
             await this.crashChecker.start()
-
-            this.isStarted = true
         } catch (error) {
             emitFailure({ functionName: 'start', error })
+            this.cleanup()
+
             // In development this gives us a useful stacktrace
             if (this.isDevMode) {
                 throw error
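
The new cleanup() call makes a failed start() roll back whatever was already running, a common pattern for partially initialized resources. A tiny sketch of the idea, with placeholder names that are not from this diff:

    // Sketch only: start subsystems in order; on failure, stop the ones that started.
    async function startAll(parts: Array<{ start(): Promise<void>; stop(): void }>): Promise<void> {
        const started: Array<{ stop(): void }> = []
        try {
            for (const part of parts) {
                await part.start()
                started.push(part)
            }
        } catch (e) {
            // Roll back in reverse order so dependents stop before dependencies.
            for (const part of started.reverse()) {
                part.stop()
            }
            throw e
        }
    }
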
@@ -126,10 +124,6 @@

     /** Stop the Crash Monitoring process, signifying a graceful shutdown */
     public async stop() {
-        if (!this.isStarted) {
-            return
-        }
-
         // Don't throw since this feature is not critical and shouldn't prevent extension shutdown
         try {
             this.crashChecker?.stop()

@@ -151,13 +145,9 @@
      * Mimic a crash of the extension, or can just be used as cleanup.
      * Only use this for tests.
      */
-    protected crash() {
-        if (!this.isStarted) {
-            return
-        }
-
+    protected cleanup() {
         this.crashChecker?.stop()
-        this.heartbeat?.crash()
+        this.heartbeat?.stopInterval()
     }
 }

@@ -166,8 +156,6 @@
  * {@link CrashChecker} listens for these.
  */
 class Heartbeat {
-    private didEmitFailure: boolean = false
-    private isRunning: boolean = false
     private intervalRef: NodeJS.Timer | undefined
     constructor(
         private readonly state: FileSystemState,

@@ -187,33 +175,25 @@
             try {
                 await this.state.sendHeartbeat()
             } catch (e) {
-                // only emit a failure once so we do not spam telemetry on repeated errors
-                if (!this.didEmitFailure) {
-                    this.didEmitFailure = true
-                    emitFailure({ functionName: 'sendHeartbeat', error: e })
-                }
+                this.stopInterval()
+                emitFailure({ functionName: 'sendHeartbeat', error: e })

                 // During development we are fine with impacting extension execution, so throw
                 if (this.isDevMode) {
                     throw e
                 }
             }
         }, heartbeatInterval)
-
-        this.isRunning = true
     }

+    /** Stops everything, signifying a graceful shutdown */
     public async stop() {
-        // non-happy path where heartbeats were never started.
-        if (!this.isRunning) {
-            return
-        }
-
-        globals.clock.clearInterval(this.intervalRef)
+        this.stopInterval()
         return this.state.indicateGracefulShutdown()
     }

-    public crash() {
+    /** Stops the heartbeat interval */
+    public stopInterval() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
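
In short, the failure path no longer tracks a didEmitFailure flag: the first failed heartbeat stops the interval outright, so telemetry fires once and no half-alive heartbeat lingers to be misread as a crash. A minimal sketch of that pattern, using bare Node timers instead of the toolkit's globals.clock wrapper and console.error standing in for emitFailure:

    class HeartbeatSketch {
        private intervalRef: NodeJS.Timeout | undefined

        public start(sendHeartbeat: () => Promise<void>, intervalMs: number): void {
            this.intervalRef = setInterval(() => {
                sendHeartbeat().catch((e) => {
                    // The first failure stops all future heartbeats, so it is
                    // reported once instead of on every interval.
                    this.stopInterval()
                    console.error('sendHeartbeat failed', e)
                })
            }, intervalMs)
        }

        public stopInterval(): void {
            if (this.intervalRef !== undefined) {
                clearInterval(this.intervalRef)
            }
        }
    }
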
@@ -258,7 +238,7 @@ class CrashChecker {
             } catch (e) {
                 emitFailure({ functionName: 'checkCrashInterval', error: e })

-                // Since there was an error we want to stop crash monitoring since it is pointless.
+                // Since there was an error, there is no point in trying to check for crashed instances.
                 // We will need to monitor telemetry to see if we can determine widespread issues.
                 this.stop()

@@ -379,7 +359,7 @@ function getDefaultDependencies(): MementoStateDependencies {
  * Factory to create an instance of the state. Do not create an instance of the
  * class manually since there are required setup steps.
  *
- * @throws if the filesystem state cannot get in to a good state
+ * @throws if the filesystem state cannot be confirmed to be stable, e.g. flaky fs operations
  */
 export async function crashMonitoringStateFactory(deps = getDefaultDependencies()): Promise<FileSystemState> {
     const state: FileSystemState = new FileSystemState(deps)

@@ -409,15 +389,15 @@ export class FileSystemState {
      * Does the required initialization steps, this must always be run after
      * creation of the instance.
      *
-     * @throws if the filesystem state cannot get in to a good state
+     * @throws if the filesystem state cannot be confirmed to be stable, e.g. flaky fs operations
      */
     public async init() {
         // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
         // NOTE: We want to get a diff between clients that succeeded vs failed this check,
         //       so emit a metric to track that
         await telemetry.function_call.run(async (span) => {
             span.record({ className, functionName: 'FileSystemStateValidation' })
-            await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem(this.deps.workDirPath))
+            await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem())
         })

         // Clear the state if the user did something like a computer restart

@@ -428,21 +408,29 @@

     // ------------------ Heartbeat methods ------------------
     public async sendHeartbeat() {
-        const _sendHeartbeat = () =>
-            withFailCtx('sendHeartbeatState', async () => {
+        const extId = this.createExtId(this.ext)
+
+        try {
+            const func = async () => {
                 const dir = await this.runningExtsDir()
-                const extId = this.createExtId(this.ext)
                 await fs.writeFile(
                     path.join(dir, extId),
                     JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
                 )
                 this.deps.devLogger?.debug(
                     `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
                 )
-            })
-        // retry a failed heartbeat to avoid incorrectly being reported as a crash
-        const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 7, delay: 100, backoff: 2 })
-        return _sendHeartbeatWithRetries
+            }
+            const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
+            const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 })
+
+            // retry a failed heartbeat to avoid incorrectly being reported as a crash
+            return await funcWithRetries
+        } catch (e) {
+            // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
+            await withFailCtx('sendHeartbeatStateCleanup', () => this.deleteRunningFile(extId))
+            throw e
+        }
     }

     /**
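
The maxRetries bump from 7 to 8 here is what the commit message means by going from roughly 12 to 25 seconds of total retry time: with a 100 ms starting delay and a backoff factor of 2, each extra attempt adds a new largest interval. A sketch of the arithmetic, assuming maxRetries counts attempts with a growing wait between consecutive ones (not the toolkit's actual withRetries implementation):

    // Total time spent waiting between attempts under exponential backoff.
    function totalBackoffMs(maxRetries: number, delayMs: number, backoff: number): number {
        let total = 0
        let wait = delayMs
        for (let attempt = 1; attempt < maxRetries; attempt++) {
            total += wait
            wait *= backoff
        }
        return total
    }

    console.log(totalBackoffMs(7, 100, 2)) // 12700 ms, ~12 seconds
    console.log(totalBackoffMs(8, 100, 2)) // 25500 ms, ~25 seconds (one extra, largest interval)
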
@@ -488,13 +476,17 @@
             await opts.crash()
         }

+        // Clean up the running extension file since it no longer exists
+        await this.deleteRunningFile(extId)
+    }
+    private async deleteRunningFile(extId: ExtInstanceId) {
         // Clean up the running extension file since it no longer exists
         const dir = await this.runningExtsDir()
         // Retry on failure since failing to delete this file will result in incorrectly reported crashes.
         // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation
         // to this https://github.com/aws/aws-toolkit-vscode/pull/5335
         await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), {
-            maxRetries: 7,
+            maxRetries: 8,
             delay: 100,
             backoff: 2,
         })

@@ -525,7 +517,7 @@
     private async runningExtsDir(): Promise<string> {
         const p = path.join(this.stateDirPath, crashMonitoringDirNames.running)
         // ensure the dir exists
-        await withFailCtx('ensureRunningExtsDir', () => fs.mkdir(p))
+        await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
         return p
     }
     private async shutdownExtsDir() {

@@ -551,17 +543,22 @@
         const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
             // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since
             // we will assume that other instance handled its termination appropriately.
-            const ext = await withFailCtx('parseRunningExtFile', async () =>
-                ignoreBadFileError(async () => {
-                    const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
-
-                    if (!text) {
-                        return undefined
-                    }
-
-                    // This was sometimes throwing SyntaxError
-                    return JSON.parse(text) as ExtInstanceHeartbeat
-                })
+            // NOTE: On Windows we were failing on EBUSY, so we retry on failure.
+            const ext: ExtInstanceHeartbeat | undefined = await withRetries(
+                () =>
+                    withFailCtx('parseRunningExtFile', async () =>
+                        ignoreBadFileError(async () => {
+                            const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
+
+                            if (!text) {
+                                return undefined
+                            }
+
+                            // This was sometimes throwing SyntaxError
+                            return JSON.parse(text) as ExtInstanceHeartbeat
+                        })
+                    ),
+                { maxRetries: 6, delay: 100, backoff: 2 }
             )

             if (ext === undefined) {
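
For reference, a hedged sketch of one attempt of this read-and-parse path using plain Node APIs; the ENOENT case stands in for whatever ignoreBadFileError actually swallows, and the real code wraps the whole attempt in withRetries to ride out transient Windows EBUSY errors:

    import * as nodeFs from 'fs/promises'

    async function tryReadHeartbeatFile(filePath: string): Promise<unknown | undefined> {
        let text: string
        try {
            text = await nodeFs.readFile(filePath, 'utf-8')
        } catch (e) {
            if ((e as NodeJS.ErrnoException).code === 'ENOENT') {
                // Another extension instance already removed the file; assume it
                // handled that termination appropriately.
                return undefined
            }
            throw e // transient errors (e.g. EBUSY) propagate to the retry wrapper
        }
        if (!text) {
            return undefined
        }
        return JSON.parse(text) // can throw SyntaxError on a partially written file
    }
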

packages/core/src/shared/filesystemUtilities.ts

Lines changed: 12 additions & 3 deletions
@@ -12,6 +12,7 @@ import * as pathutils from './utilities/pathUtils'
 import globals from '../shared/extensionGlobals'
 import fs from '../shared/fs/fs'
 import { ToolkitError } from './errors'
+import * as nodeFs from 'fs/promises'

 export const tempDirPath = path.join(
     // https://github.com/aws/aws-toolkit-vscode/issues/240

@@ -261,7 +262,7 @@ const FileSystemStabilityException = ToolkitError.named(FileSystemStabilityExcep
  *
  * @throws a {@link FileSystemStabilityException} which wraps the underlying failed fs operation error.
  */
-export async function throwOnUnstableFileSystem(tmpRoot: string) {
+export async function throwOnUnstableFileSystem(tmpRoot: string = tempDirPath) {
     const tmpFolder = path.join(tmpRoot, `validateStableFS-${crypto.randomBytes(4).toString('hex')}`)
     const tmpFile = path.join(tmpFolder, 'file.txt')

@@ -270,8 +271,14 @@
     await withFailCtx('mkdirInitial', () => fs.mkdir(tmpFolder))
     // Verifies that we do not throw if the dir exists and we try to make it again
     await withFailCtx('mkdirButAlreadyExists', () => fs.mkdir(tmpFolder))
-    await withFailCtx('mkdirSubfolder', () => fs.mkdir(path.join(tmpFolder, 'subfolder')))
-    await withFailCtx('rmdirInitial', () => fs.delete(tmpFolder, { recursive: true }))
+    // Test subfolder creation. Based on telemetry it looks like the vsc mkdir may be flaky
+    // when creating subfolders. Hopefully this gives us some useful information.
+    const subfolderRoot = path.join(tmpFolder, 'a')
+    const subfolderPath = path.join(subfolderRoot, 'b/c/d/e')
+    await withFailCtx('mkdirSubfolderNode', () => nodeFs.mkdir(subfolderPath, { recursive: true }))
+    await withFailCtx('rmdirInitialForNode', () => fs.delete(subfolderRoot, { recursive: true }))
+    await withFailCtx('mkdirSubfolderVsc', () => fs.mkdir(subfolderPath))
+    await withFailCtx('rmdirInitialForVsc', () => fs.delete(subfolderRoot, { recursive: true }))

     // test basic file operations
     await withFailCtx('mkdirForFileOpsTest', () => fs.mkdir(tmpFolder))
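
The behavioral difference being probed above: Node's fs.promises.mkdir with { recursive: true } creates every missing parent and is a no-op when the directory already exists, which is also what runningExtsDir() in crashMonitoring.ts now relies on. A self-contained sketch:

    import * as nodeFs from 'fs/promises'
    import * as os from 'os'
    import * as path from 'path'

    async function demo(): Promise<void> {
        const deep = path.join(os.tmpdir(), 'mkdir-demo', 'a', 'b', 'c')
        await nodeFs.mkdir(deep, { recursive: true }) // creates the whole chain
        await nodeFs.mkdir(deep, { recursive: true }) // no error on re-create
    }

    void demo()
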
@@ -293,6 +300,8 @@
             throw new Error(`Unexpected file contents after multiple writes: "${text}"`)
         }
     })
+    // write a large file, ensuring we are not near a space limit
+    await withFailCtx('writeFileLarge', () => fs.writeFile(tmpFile, 'a'.repeat(1000)))
     // test concurrent reads on a file
     await withFailCtx('writeFileConcurrencyTest', () => fs.writeFile(tmpFile, 'concurrencyTest'))
     const result = await Promise.all([
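
The concurrency check reads the same file several times in parallel and expects identical contents; a stripped-down sketch of that probe with plain Node APIs (helper name is illustrative, not from the diff):

    import * as nodeFs from 'fs/promises'

    async function checkConcurrentReads(file: string): Promise<void> {
        const results = await Promise.all([
            nodeFs.readFile(file, 'utf-8'),
            nodeFs.readFile(file, 'utf-8'),
            nodeFs.readFile(file, 'utf-8'),
        ])
        if (!results.every((text) => text === results[0])) {
            throw new Error('concurrent reads disagreed')
        }
    }
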

packages/core/src/test/shared/crashMonitoring.test.ts

Lines changed: 3 additions & 2 deletions
@@ -14,8 +14,9 @@ class TestCrashMonitoring extends CrashMonitoring {
     public constructor(...deps: ConstructorParameters<typeof CrashMonitoring>) {
         super(...deps)
     }
-    public override crash() {
-        super.crash()
+    /** Imitates an extension crash */
+    public crash() {
+        super.cleanup()
     }
 }
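
A hypothetical usage sketch (the setup around TestCrashMonitoring is assumed, not shown in this diff): a test "crashes" one instance by stopping its heartbeats without writing the graceful-shutdown marker, so a second instance's CrashChecker should later flag the stale heartbeat as a crash.

    // Hypothetical test flow; constructor args and assertions are placeholders.
    const primary = new TestCrashMonitoring(/* state, checkInterval, isDevMode, isStateStale, devLogger */)
    await primary.start()
    primary.crash() // heartbeats stop, but no graceful shutdown is recorded
    // ...a second instance's CrashChecker eventually sees the stale heartbeat
    // and emits a crash metric.
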
