From a425c97a0ba2c6905f36ae546d6e43fe81a2930b Mon Sep 17 00:00:00 2001
From: nkomonen-amazon
Date: Fri, 4 Oct 2024 12:35:49 -0400
Subject: [PATCH 1/9] fix(crash): crash monitoring dir not being cleaned up

During development the underlying implementation was changed to get the
OS uptime. The initial implementation returned uptime in minutes, but
the new one returned it in seconds and this was not accounted for.

As a result the crash monitoring folder was not being cleaned up when
expected on computer restart.

Signed-off-by: nkomonen-amazon
---
 packages/core/src/shared/crashMonitoring.ts   |  6 ++-
 packages/core/src/shared/utilities/osUtils.ts |  9 ++--
 .../src/test/shared/utilities/osUtils.test.ts | 46 +++++++++----------
 3 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts
index bfd470a7150..4238ce9f1fc 100644
--- a/packages/core/src/shared/crashMonitoring.ts
+++ b/packages/core/src/shared/crashMonitoring.ts
@@ -492,7 +492,11 @@ export class FileSystemState {
         return p
     }
     public async clearState(): Promise {
-        await withFailCtx('clearState', async () => fs.delete(this.stateDirPath, { force: true }))
+        this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
+        await withFailCtx('clearState', async () => {
+            await fs.delete(this.stateDirPath, { force: true, recursive: true })
+            this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Succeeded')
+        })
     }
     public async getAllExts(): Promise {
         const res = await withFailCtx('getAllExts', async () => {
diff --git a/packages/core/src/shared/utilities/osUtils.ts b/packages/core/src/shared/utilities/osUtils.ts
index b0266f73e1a..2ed7b976998 100644
--- a/packages/core/src/shared/utilities/osUtils.ts
+++ b/packages/core/src/shared/utilities/osUtils.ts
@@ -15,12 +15,13 @@ import * as os from 'os'
  * Use this function to perform one-time initialization tasks that should only happen
  * once per OS session, regardless of how many extension instances are running.
  */
-export async function isNewOsSession(now = () => globals.clock.Date.now(), uptime = () => os.uptime()) {
+export async function isNewOsSession(now = () => globals.clock.Date.now(), uptimeMillis = () => os.uptime() * 1000) {
     // Windows does not have an ephemeral /tmp/ folder that deletes on shutdown, while unix-like os's do.
     // So in Windows we calculate the start time and see if it changed from the previous known start time.
     const lastStartTime = globals.globalState.tryGet('lastOsStartTime', Number)
-    // uptime() returns seconds, convert to ms
-    const currentOsStartTime = now() - uptime() * 1000 * 60
+
+    const uptime = uptimeMillis()
+    const currentOsStartTime = now() - uptime
 
     if (lastStartTime === undefined) {
         await globals.globalState.update('lastOsStartTime', currentOsStartTime)
@@ -28,7 +29,7 @@ export async function isNewOsSession(now = () => globals.clock.Date.now(), uptim
     }
 
     // If the current start time is later than the last, it means we are in a new session since they should be the same value.
-    // But to account for small differences in how the current time is calculate, we add in a 5 second buffer.
+    // But to account for minor millisecond differences, we add in a 5 second buffer.
     if (currentOsStartTime - 1000 * 5 > lastStartTime) {
         await globals.globalState.update('lastOsStartTime', currentOsStartTime)
         return true
diff --git a/packages/core/src/test/shared/utilities/osUtils.test.ts b/packages/core/src/test/shared/utilities/osUtils.test.ts
index 1654aca8cc8..bd6e4489ac0 100644
--- a/packages/core/src/test/shared/utilities/osUtils.test.ts
+++ b/packages/core/src/test/shared/utilities/osUtils.test.ts
@@ -22,32 +22,32 @@ describe('isNewOsSession', () => {
     })
 
     it('unix-like: returns true when expected', async () => {
-        const uptimeStub = sandbox.stub()
+        const uptimeMillisStub = sandbox.stub()
         const now = sandbox.stub()
 
-        // We started our computer at 2 minutes since epoch (time - pc uptime)
-        // and the comptuer has been on for 1 minute. So the OS started 1 minute since epoch.
-        now.returns(60_000 + 60_000)
-        uptimeStub.returns(1)
+        // We started our computer at the epoch and the computer uptime is 0.
+        // So the OS started at the epoch (time - uptime).
+        now.returns(0) // the epoch time
+        uptimeMillisStub.returns(0) // this session has just started
 
         // On a brand new session the first caller will get true
-        assert.strictEqual(await isNewOsSession(now, uptimeStub), true)
+        assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), true)
         // Subsequent callers will get false
-        assert.strictEqual(await isNewOsSession(now, uptimeStub), false)
-
-        // Start a computer session 10 minutes from epoch
-        uptimeStub.returns(0)
-        now.returns(60_000 * 10)
-        assert.strictEqual(await isNewOsSession(now, uptimeStub), true)
-        // Anything that is within a 5 second threshold of the last session time, is considered the same session
-        now.returns(60_000 * 10 + 5000)
-        assert.strictEqual(await isNewOsSession(now, uptimeStub), false)
-        now.returns(60_000 * 10 + 5000 + 1)
-        assert.strictEqual(await isNewOsSession(now, uptimeStub), true)
-
-        // A non-zero uptime
-        uptimeStub.returns(5) // The computer has been running for 5 minutes already, so the start time is relative to this.
- now.returns(60_000 * 10 + 5000 + 60_000 * 10) // 5 minutes since last session - // Nothing changes since the diff between uptime and the last start has not changed - assert.strictEqual(await isNewOsSession(now, uptimeStub), true) + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + + // 10 minutes later, same session + now.returns(1000 * 60 * 10) + uptimeMillisStub.returns(1000 * 60 * 10) // This scales proportionately with the current time + // This is still the same session, so we get false + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + + // Test the lowerbound of what is considered a new session + // Pretend we started a new computer session 5 seconds after the initial session + uptimeMillisStub.returns(0) + now.returns(5000) + // Anything that is within a 5 second threshold of the last session time, is considered the SAME session + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + // This is 1 millisecond after the threshold, it is considered a NEW session + now.returns(5000 + 1) + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), true) }) }) From 5f0829c9dfd3faaacddeb3ff389db7c163b3be2b Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Mon, 7 Oct 2024 20:27:37 -0400 Subject: [PATCH 2/9] do not crash monitor on unstable fs Signed-off-by: nkomonen-amazon --- packages/amazonq/src/extensionNode.ts | 4 +- packages/core/src/extensionNode.ts | 4 +- packages/core/src/shared/crashMonitoring.ts | 54 ++++++-- .../core/src/shared/filesystemUtilities.ts | 115 ++++++++++++++++++ .../test/shared/filesystemUtilities.test.ts | 9 ++ packages/core/src/test/testUtil.ts | 6 +- 6 files changed, 172 insertions(+), 20 deletions(-) diff --git a/packages/amazonq/src/extensionNode.ts b/packages/amazonq/src/extensionNode.ts index b203e541a84..b6cc7923a04 100644 --- a/packages/amazonq/src/extensionNode.ts +++ b/packages/amazonq/src/extensionNode.ts @@ -32,7 +32,7 @@ export async function activate(context: vscode.ExtensionContext) { * the code compatible with web and move it to {@link activateAmazonQCommon}. */ async function activateAmazonQNode(context: vscode.ExtensionContext) { - await (await CrashMonitoring.instance()).start() + await (await CrashMonitoring.instance())?.start() const extContext = { extensionContext: context, @@ -96,5 +96,5 @@ async function setupDevMode(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. stop() does not throw so it is safe - await Promise.all([(await CrashMonitoring.instance()).stop(), deactivateCommon()]) + await Promise.all([(await CrashMonitoring.instance())?.stop(), deactivateCommon()]) } diff --git a/packages/core/src/extensionNode.ts b/packages/core/src/extensionNode.ts index f91fa321f59..53185112375 100644 --- a/packages/core/src/extensionNode.ts +++ b/packages/core/src/extensionNode.ts @@ -78,7 +78,7 @@ export async function activate(context: vscode.ExtensionContext) { // IMPORTANT: If you are doing setup that should also work in web mode (browser), it should be done in the function below const extContext = await activateCommon(context, contextPrefix, false) - await (await CrashMonitoring.instance()).start() + await (await CrashMonitoring.instance())?.start() initializeCredentialsProviderManager() @@ -254,7 +254,7 @@ export async function activate(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. 
stop() does not throw so it is safe - await Promise.all([await (await CrashMonitoring.instance()).stop(), deactivateCommon()]) + await Promise.all([await (await CrashMonitoring.instance())?.stop(), deactivateCommon()]) await globals.resourceManager.dispose() } diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index 4238ce9f1fc..21b6a333083 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -16,6 +16,7 @@ import nodeFs from 'fs/promises' import fs from './fs/fs' import { getLogger } from './logger/logger' import { crashMonitoringDirNames } from './constants' +import { throwOnUnstableFileSystem } from './filesystemUtilities' const className = 'CrashMonitoring' @@ -65,17 +66,32 @@ export class CrashMonitoring { private readonly devLogger: Logger | undefined ) {} + static #didTryInstance = false static #instance: CrashMonitoring | undefined - public static async instance(): Promise { - const isDevMode = getIsDevMode() - const devModeLogger: Logger | undefined = isDevMode ? getLogger() : undefined - return (this.#instance ??= new CrashMonitoring( - await crashMonitoringStateFactory(), - DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 3), - isDevMode, - isAutomation(), - devModeLogger - )) + public static async instance(): Promise { + // Since the first attempt to create an instance may have failed, we do not + // attempt to create an instance again and return whatever we have + if (this.#didTryInstance === true) { + return this.#instance + } + + try { + this.#didTryInstance = true + const isDevMode = getIsDevMode() + const devModeLogger: Logger | undefined = isDevMode ? getLogger() : undefined + const state = await crashMonitoringStateFactory() + return (this.#instance ??= new CrashMonitoring( + state, + DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 3), + isDevMode, + isAutomation(), + devModeLogger + )) + } catch (error) { + // Creating the crash monitoring state can throw/fail + emitFailure({ functionName: 'instance', error }) + return undefined + } } /** Start the Crash Monitoring process */ @@ -358,6 +374,12 @@ function getDefaultDependencies(): MementoStateDependencies { devLogger: getIsDevMode() ? getLogger() : undefined, } } +/** + * Factory to create an instance of the state. Do not create an instance of the + * class manually since there are required setup steps. + * + * @throws if the filesystem state cannot get in to a good state + */ export async function crashMonitoringStateFactory(deps = getDefaultDependencies()): Promise { const state: FileSystemState = new FileSystemState(deps) await state.init() @@ -385,8 +407,18 @@ export class FileSystemState { /** * Does the required initialization steps, this must always be run after * creation of the instance. 
+ * + * @throws if the filesystem state cannot get in to a good state */ public async init() { + // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data + // NOTE: We want to get a diff between clients that succeeded vs failed this check, + // so emit a metric to track that + await telemetry.function_call.run(async (span) => { + span.record({ className, functionName: 'FileSystemStateValidation' }) + await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem(this.deps.workDirPath)) + }) + // Clear the state if the user did something like a computer restart if (await this.deps.isStateStale()) { await this.clearState() @@ -595,7 +627,7 @@ async function withFailCtx(ctx: string, fn: () => Promise): Promise { // make sure we await the function so it actually executes within the try/catch return await fn() } catch (err) { - throw CrashMonitoringError.chain(err, `Failed "${ctx}"`, { code: className }) + throw CrashMonitoringError.chain(err, `Context: "${ctx}"`, { code: className }) } } diff --git a/packages/core/src/shared/filesystemUtilities.ts b/packages/core/src/shared/filesystemUtilities.ts index 351cd7028b5..d1e214de388 100644 --- a/packages/core/src/shared/filesystemUtilities.ts +++ b/packages/core/src/shared/filesystemUtilities.ts @@ -11,6 +11,7 @@ import { getLogger } from './logger' import * as pathutils from './utilities/pathUtils' import globals from '../shared/extensionGlobals' import fs from '../shared/fs/fs' +import { ToolkitError } from './errors' export const tempDirPath = path.join( // https://github.com/aws/aws-toolkit-vscode/issues/240 @@ -246,3 +247,117 @@ export async function setDefaultDownloadPath(downloadPath: string) { getLogger().error('Error while setting "aws.downloadPath"', err as Error) } } + +const FileSystemStabilityExceptionId = 'FileSystemStabilityException' +const FileSystemStabilityException = ToolkitError.named(FileSystemStabilityExceptionId) + +/** + * Run this function to validate common file system calls/flows, ensuring they work + * as expected. + * + * The intent of this function is to catch potential fs issues early, + * testing common cases of fs usage. We need this since each machine can behave + * differently depending on things like OS, permissions, disk speed, ... + * + * @throws a {@link FileSystemStabilityException} which wraps the underlying failed fs operation error. 
+ */ +export async function throwOnUnstableFileSystem(tmpRoot: string) { + const tmpFolder = path.join(tmpRoot, `validateStableFS-${crypto.randomBytes(4).toString('hex')}`) + const tmpFile = path.join(tmpFolder, 'file.txt') + + try { + // test basic folder operations + await withFailCtx('mkdirInitial', () => fs.mkdir(tmpFolder)) + // Verifies that we do not throw if the dir exists and we try to make it again + await withFailCtx('mkdirButAlreadyExists', () => fs.mkdir(tmpFolder)) + await withFailCtx('mkdirSubfolder', () => fs.mkdir(path.join(tmpFolder, 'subfolder'))) + await withFailCtx('rmdirInitial', () => fs.delete(tmpFolder, { recursive: true })) + + // test basic file operations + await withFailCtx('mkdirForFileOpsTest', () => fs.mkdir(tmpFolder)) + await withFailCtx('writeFile1', () => fs.writeFile(tmpFile, 'test1')) + await withFailCtx('readFile1', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'test1') { + throw new Error(`Unexpected file contents: "${text}"`) + } + }) + // overwrite the file content multiple times + await withFailCtx('writeFile2', () => fs.writeFile(tmpFile, 'test2')) + await withFailCtx('writeFile3', () => fs.writeFile(tmpFile, 'test3')) + await withFailCtx('writeFile4', () => fs.writeFile(tmpFile, 'test4')) + await withFailCtx('writeFile5', () => fs.writeFile(tmpFile, 'test5')) + await withFailCtx('readFile5', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'test5') { + throw new Error(`Unexpected file contents after multiple writes: "${text}"`) + } + }) + // test concurrent reads on a file + await withFailCtx('writeFileConcurrencyTest', () => fs.writeFile(tmpFile, 'concurrencyTest')) + const result = await Promise.all([ + withFailCtx('readFileConcurrent1', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent2', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent3', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent4', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent5', () => fs.readFileText(tmpFile)), + ]) + if (result.some((text) => text !== 'concurrencyTest')) { + throw new Error(`Unexpected concurrent file reads: ${result}`) + } + // test deleting a file + await withFailCtx('deleteFileInitial', () => fs.delete(tmpFile)) + await withFailCtx('writeFileAfterDelete', () => fs.writeFile(tmpFile, 'afterDelete')) + await withFailCtx('readNewFileAfterDelete', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'afterDelete') { + throw new Error(`Unexpected file content after writing to deleted file: "${text}"`) + } + }) + await withFailCtx('deleteFileFully', () => fs.delete(tmpFile)) + await withFailCtx('notExistsFileAfterDelete', async () => { + const res = await fs.exists(tmpFile) + if (res) { + throw new Error(`Expected file to NOT exist: "${tmpFile}"`) + } + }) + + // test rename + await withFailCtx('writeFileForRename', () => fs.writeFile(tmpFile, 'TestingRename')) + const tmpFileRenamed = tmpFile + '.renamed' + await withFailCtx('renameFile', () => fs.rename(tmpFile, tmpFileRenamed)) + await withFailCtx('existsRenamedFile', async () => { + const res = await fs.exists(tmpFileRenamed) + if (!res) { + throw new Error(`Expected RENAMED file to exist: "${tmpFileRenamed}"`) + } + }) + await withFailCtx('writeToRenamedFile', async () => fs.writeFile(tmpFileRenamed, 'hello')) + await withFailCtx('readFromRenamedFile', async () => { + const res = await fs.readFileText(tmpFileRenamed) + if (res !== 'hello') { + throw new 
Error(`Expected RENAMED file to be writable: "${tmpFileRenamed}"`) + } + }) + await withFailCtx('renameFileReset', () => fs.rename(tmpFileRenamed, tmpFile)) + await withFailCtx('renameFileResetExists', async () => { + const res = await fs.exists(tmpFile) + if (!res) { + throw new Error(`Expected reverted renamed file to exist: "${tmpFile}"`) + } + }) + } finally { + await fs.delete(tmpFolder, { recursive: true, force: true }) + } + + async function withFailCtx(ctx: string, fn: () => Promise): Promise { + try { + return await fn() + } catch (e) { + if (!(e instanceof Error)) { + throw e + } + throw FileSystemStabilityException.chain(e, `context: "${ctx}"`, { code: FileSystemStabilityExceptionId }) + } + } +} diff --git a/packages/core/src/test/shared/filesystemUtilities.test.ts b/packages/core/src/test/shared/filesystemUtilities.test.ts index b99e8f4b0bf..3bb3a018b37 100644 --- a/packages/core/src/test/shared/filesystemUtilities.test.ts +++ b/packages/core/src/test/shared/filesystemUtilities.test.ts @@ -13,8 +13,10 @@ import { isInDirectory, makeTemporaryToolkitFolder, tempDirPath, + throwOnUnstableFileSystem, } from '../../shared/filesystemUtilities' import { fs } from '../../shared' +import { TestFolder } from '../testUtil' describe('filesystemUtilities', function () { const targetFilename = 'findThisFile12345.txt' @@ -191,4 +193,11 @@ describe('filesystemUtilities', function () { assert.strictEqual(actual, 3) }) }) + + describe('throwOnUnstableFileSystem', async function () { + it('does not throw on stable filesystem', async function () { + const testFolder = await TestFolder.create() + await throwOnUnstableFileSystem(testFolder.path) + }) + }) }) diff --git a/packages/core/src/test/testUtil.ts b/packages/core/src/test/testUtil.ts index c09fbfe6ccc..c54f4690628 100644 --- a/packages/core/src/test/testUtil.ts +++ b/packages/core/src/test/testUtil.ts @@ -75,7 +75,7 @@ export function getWorkspaceFolder(dir: string): vscode.WorkspaceFolder { * But if the day comes that we need it for web, we should be able to add some agnostic FS methods in here. */ export class TestFolder { - protected constructor(private readonly rootFolder: string) {} + protected constructor(public readonly path: string) {} /** Creates a folder that deletes itself once all tests are done running. */ static async create() { @@ -118,10 +118,6 @@ export class TestFolder { pathFrom(relativePath: string): string { return path.join(this.path, relativePath) } - - get path(): string { - return path.join(this.rootFolder) - } } /** From a130ae56984e6aa972d112ead08f644a83028aa9 Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Mon, 7 Oct 2024 23:53:01 -0400 Subject: [PATCH 3/9] retry heartbeats on fail Signed-off-by: nkomonen-amazon --- packages/amazonq/src/extensionNode.ts | 3 +- packages/core/src/extensionNode.ts | 3 +- packages/core/src/shared/crashMonitoring.ts | 27 +++--- .../src/shared/utilities/functionUtils.ts | 33 +++++++- .../shared/utilities/functionUtils.test.ts | 83 ++++++++++++++++++- 5 files changed, 134 insertions(+), 15 deletions(-) diff --git a/packages/amazonq/src/extensionNode.ts b/packages/amazonq/src/extensionNode.ts index b6cc7923a04..379bcd9df1c 100644 --- a/packages/amazonq/src/extensionNode.ts +++ b/packages/amazonq/src/extensionNode.ts @@ -32,7 +32,8 @@ export async function activate(context: vscode.ExtensionContext) { * the code compatible with web and move it to {@link activateAmazonQCommon}. 
*/ async function activateAmazonQNode(context: vscode.ExtensionContext) { - await (await CrashMonitoring.instance())?.start() + // Intentionally do not await since this is slow and non-critical + void (await CrashMonitoring.instance())?.start() const extContext = { extensionContext: context, diff --git a/packages/core/src/extensionNode.ts b/packages/core/src/extensionNode.ts index 53185112375..146e3c44b33 100644 --- a/packages/core/src/extensionNode.ts +++ b/packages/core/src/extensionNode.ts @@ -78,7 +78,8 @@ export async function activate(context: vscode.ExtensionContext) { // IMPORTANT: If you are doing setup that should also work in web mode (browser), it should be done in the function below const extContext = await activateCommon(context, contextPrefix, false) - await (await CrashMonitoring.instance())?.start() + // Intentionally do not await since this can be slow and non-critical + void (await CrashMonitoring.instance())?.start() initializeCredentialsProviderManager() diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index 21b6a333083..a3bcc8a17bd 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -17,6 +17,7 @@ import fs from './fs/fs' import { getLogger } from './logger/logger' import { crashMonitoringDirNames } from './constants' import { throwOnUnstableFileSystem } from './filesystemUtilities' +import { withRetries } from './utilities/functionUtils' const className = 'CrashMonitoring' @@ -427,17 +428,21 @@ export class FileSystemState { // ------------------ Heartbeat methods ------------------ public async sendHeartbeat() { - await withFailCtx('sendHeartbeatState', async () => { - const dir = await this.runningExtsDir() - const extId = this.createExtId(this.ext) - await fs.writeFile( - path.join(dir, extId), - JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4) - ) - this.deps.devLogger?.debug( - `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...` - ) - }) + const _sendHeartbeat = () => + withFailCtx('sendHeartbeatState', async () => { + const dir = await this.runningExtsDir() + const extId = this.createExtId(this.ext) + await fs.writeFile( + path.join(dir, extId), + JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4) + ) + this.deps.devLogger?.debug( + `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...` + ) + }) + // retry a failed heartbeat to avoid incorrectly being reported as a crash + const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 8, delay: 100, backoff: 2 }) + return _sendHeartbeatWithRetries } /** diff --git a/packages/core/src/shared/utilities/functionUtils.ts b/packages/core/src/shared/utilities/functionUtils.ts index d21727bac1d..f61a3abde34 100644 --- a/packages/core/src/shared/utilities/functionUtils.ts +++ b/packages/core/src/shared/utilities/functionUtils.ts @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { Timeout } from './timeoutUtils' +import { sleep, Timeout } from './timeoutUtils' /** * Creates a function that always returns a 'shared' Promise. @@ -145,3 +145,34 @@ export function cancellableDebounce( cancel: cancel, } } + +/** + * Executes the given function, retrying if it throws. 
+ * + * @param opts - if no opts given, defaults are used + */ +export async function withRetries( + fn: () => Promise, + opts?: { maxRetries?: number; delay?: number; backoff?: number } +): Promise { + const maxRetries = opts?.maxRetries ?? 3 + const delay = opts?.delay ?? 0 + const backoff = opts?.backoff ?? 1 + + let retryCount = 0 + let latestDelay = delay + while (true) { + try { + return await fn() + } catch (err) { + retryCount++ + if (retryCount >= maxRetries) { + throw err + } + if (latestDelay > 0) { + await sleep(latestDelay) + latestDelay = latestDelay * backoff + } + } + } +} diff --git a/packages/core/src/test/shared/utilities/functionUtils.test.ts b/packages/core/src/test/shared/utilities/functionUtils.test.ts index 43da4ebb619..fb87cf3501e 100644 --- a/packages/core/src/test/shared/utilities/functionUtils.test.ts +++ b/packages/core/src/test/shared/utilities/functionUtils.test.ts @@ -4,8 +4,10 @@ */ import assert from 'assert' -import { once, onceChanged, debounce } from '../../../shared/utilities/functionUtils' +import { once, onceChanged, debounce, withRetries } from '../../../shared/utilities/functionUtils' import { installFakeClock } from '../../testUtil' +import { stub, SinonStub } from 'sinon' +import { InstalledClock } from '@sinonjs/fake-timers' describe('functionUtils', function () { it('once()', function () { @@ -107,3 +109,82 @@ describe('debounce', function () { }) }) }) + +// function to test the withRetries method. It passes in a stub function as the argument and has different tests that throw on different iterations +describe('withRetries', function () { + let clock: InstalledClock + let fn: SinonStub + + beforeEach(function () { + fn = stub() + clock = installFakeClock() + }) + + afterEach(function () { + clock.uninstall() + }) + + it('retries the function until it succeeds, using defaults', async function () { + fn.onCall(0).throws() + fn.onCall(1).throws() + fn.onCall(2).resolves('success') + assert.strictEqual(await withRetries(fn), 'success') + }) + + it('retries the function until it succeeds at the final try', async function () { + fn.onCall(0).throws() + fn.onCall(1).throws() + fn.onCall(2).throws() + fn.onCall(3).resolves('success') + assert.strictEqual(await withRetries(fn, { maxRetries: 4 }), 'success') + }) + + it('throws the last error if the function always fails, using defaults', async function () { + fn.onCall(0).throws() + fn.onCall(1).throws() + fn.onCall(2).throws() + fn.onCall(3).resolves('unreachable') + await assert.rejects(async () => { + await withRetries(fn) + }) + }) + + it('throws the last error if the function always fails', async function () { + fn.onCall(0).throws() + fn.onCall(1).throws() + fn.onCall(2).throws() + fn.onCall(3).throws() + fn.onCall(4).resolves('unreachable') + await assert.rejects(async () => { + await withRetries(fn, { maxRetries: 4 }) + }) + }) + + it('honors retry delay + backoff multiplier', async function () { + fn.onCall(0).throws() // 100ms + fn.onCall(1).throws() // 200ms + fn.onCall(2).throws() // 400ms + fn.onCall(3).resolves('success') + + const res = withRetries(fn, { maxRetries: 4, delay: 100, backoff: 2 }) + + // Check the call count after each iteration, ensuring the function is called + // after the correct delay between retries. 
+ await clock.tickAsync(99) + assert.strictEqual(fn.callCount, 1) + await clock.tickAsync(1) + assert.strictEqual(fn.callCount, 2) + + await clock.tickAsync(199) + assert.strictEqual(fn.callCount, 2) + await clock.tickAsync(1) + assert.strictEqual(fn.callCount, 3) + + await clock.tickAsync(399) + assert.strictEqual(fn.callCount, 3) + await clock.tickAsync(1) + assert.strictEqual(fn.callCount, 4) + + assert.strictEqual(await res, 'success') + }) +}) From 9464dea81c8b866187663732b322cf8061d5081a Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Tue, 8 Oct 2024 01:02:35 -0400 Subject: [PATCH 4/9] keep emitting heartbeats even on failure Signed-off-by: nkomonen-amazon --- packages/core/src/shared/crashMonitoring.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index a3bcc8a17bd..f8b79881542 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -166,6 +166,7 @@ export class CrashMonitoring { * {@link CrashChecker} listens for these. */ class Heartbeat { + private didEmitFailure: boolean = false private isRunning: boolean = false private intervalRef: NodeJS.Timer | undefined constructor( @@ -175,8 +176,6 @@ class Heartbeat { ) {} public async start() { - this.isRunning = true - // heartbeat 2 times per check const heartbeatInterval = this.checkInterval / 2 @@ -188,12 +187,11 @@ class Heartbeat { try { await this.state.sendHeartbeat() } catch (e) { - emitFailure({ functionName: 'sendHeartbeat', error: e }) - - // Since there was an error we want to stop crash monitoring since it is pointless. - // We will need to monitor telemetry to see if we can determine widespread issues. - // Make sure it is signaled as a graceful shutdown to reduce noise of crashed extensions. 
- await this.stop() + // only emit a failure once so we do not spam telemetry on repeated errors + if (!this.didEmitFailure) { + this.didEmitFailure = true + emitFailure({ functionName: 'sendHeartbeat', error: e }) + } // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { @@ -201,6 +199,8 @@ class Heartbeat { } } }, heartbeatInterval) + + this.isRunning = true } public async stop() { @@ -491,7 +491,7 @@ export class FileSystemState { // Clean up the running extension file since it is no longer exists const dir = await this.runningExtsDir() // Use force since another checker may have already removed this file before this is ran - await withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId), { force: true })) + await withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))) } // ------------------ State data ------------------ From 933813bfb8b18d0918e146dd7e0f82e596ed1172 Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Tue, 8 Oct 2024 01:16:01 -0400 Subject: [PATCH 5/9] minor fs test fixes Signed-off-by: nkomonen-amazon --- packages/core/src/test/shared/fs/fs.test.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/core/src/test/shared/fs/fs.test.ts b/packages/core/src/test/shared/fs/fs.test.ts index fefec8f0484..d88ada56a5c 100644 --- a/packages/core/src/test/shared/fs/fs.test.ts +++ b/packages/core/src/test/shared/fs/fs.test.ts @@ -340,6 +340,7 @@ describe('FileSystem', function () { it('deletes directory with recursive:true', async function () { const dir = await testFolder.mkdir() + await testFolder.write('testfile.txt', 'testText') await fs.delete(dir, { recursive: true }) assert(!existsSync(dir)) }) @@ -348,7 +349,7 @@ describe('FileSystem', function () { const dir = await testFolder.mkdir() const f = path.join(dir, 'missingfile.txt') assert(!existsSync(f)) - await fs.delete(f, { recursive: true }) + await fs.delete(f) }) it('error if file *and* its parent dir not found', async function () { From dd1db977f7ac7a7cba68e4a751b0d1e282e9991a Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Tue, 8 Oct 2024 01:26:44 -0400 Subject: [PATCH 6/9] tweak retries + intervals Signed-off-by: nkomonen-amazon --- packages/core/src/shared/crashMonitoring.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index f8b79881542..74bdbbb1016 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -83,7 +83,7 @@ export class CrashMonitoring { const state = await crashMonitoringStateFactory() return (this.#instance ??= new CrashMonitoring( state, - DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 3), + DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes isDevMode, isAutomation(), devModeLogger @@ -441,7 +441,7 @@ export class FileSystemState { ) }) // retry a failed heartbeat to avoid incorrectly being reported as a crash - const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 8, delay: 100, backoff: 2 }) + const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 7, delay: 100, backoff: 2 }) return _sendHeartbeatWithRetries } @@ -490,8 +490,14 @@ export class FileSystemState { // Clean up the running extension file since it is no longer exists const dir = await this.runningExtsDir() - // Use force since another checker may have 
already removed this file before this is ran - await withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))) + // Retry on failure since failing to delete this file will result in incorrectly reported crashes. + // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation + // to this https://github.com/aws/aws-toolkit-vscode/pull/5335 + await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), { + maxRetries: 7, + delay: 100, + backoff: 2, + }) } // ------------------ State data ------------------ From ad2e297334c7088f48dbec0c5cfcfa9a8241a346 Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Tue, 8 Oct 2024 19:52:09 -0400 Subject: [PATCH 7/9] various fixes - removed some unnecessary code - fix some comments - if heartbeats fails, stop sending future heartbeats, and remove the existing one from the state so it cannot be seen as a crash. - increase the retries from 12 seconds total to 25 seconds total (1 extra interval) - switch a vscode fs mkdir call to the node one since it looks like it may be flaky based on telemetry data - retry when trying to load the extension state from disk since it seems to fail sometimes due to temporary issues Signed-off-by: nkomonen-amazon --- packages/core/src/shared/crashMonitoring.ts | 115 +++++++++--------- .../core/src/shared/filesystemUtilities.ts | 15 ++- .../src/test/shared/crashMonitoring.test.ts | 5 +- 3 files changed, 71 insertions(+), 64 deletions(-) diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index 74bdbbb1016..5a95211d5f0 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -54,8 +54,6 @@ const className = 'CrashMonitoring' * - We cannot persist crash information on computer restart */ export class CrashMonitoring { - private isStarted: boolean = false - protected heartbeat: Heartbeat | undefined protected crashChecker: CrashChecker | undefined @@ -67,20 +65,21 @@ export class CrashMonitoring { private readonly devLogger: Logger | undefined ) {} - static #didTryInstance = false + static #didTryCreate = false static #instance: CrashMonitoring | undefined + /** Returns an instance of this class or undefined if any initial validation fails. */ public static async instance(): Promise { // Since the first attempt to create an instance may have failed, we do not // attempt to create an instance again and return whatever we have - if (this.#didTryInstance === true) { + if (this.#didTryCreate === true) { return this.#instance } try { - this.#didTryInstance = true + this.#didTryCreate = true const isDevMode = getIsDevMode() const devModeLogger: Logger | undefined = isDevMode ? 
getLogger() : undefined - const state = await crashMonitoringStateFactory() + const state = await crashMonitoringStateFactory() // can throw return (this.#instance ??= new CrashMonitoring( state, DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes @@ -89,7 +88,6 @@ export class CrashMonitoring { devModeLogger )) } catch (error) { - // Creating the crash monitoring state can throw/fail emitFailure({ functionName: 'instance', error }) return undefined } @@ -113,10 +111,10 @@ export class CrashMonitoring { await this.heartbeat.start() await this.crashChecker.start() - - this.isStarted = true } catch (error) { emitFailure({ functionName: 'start', error }) + this.cleanup() + // In development this gives us a useful stacktrace if (this.isDevMode) { throw error @@ -126,10 +124,6 @@ export class CrashMonitoring { /** Stop the Crash Monitoring process, signifying a graceful shutdown */ public async stop() { - if (!this.isStarted) { - return - } - // Dont throw since this feature is not critical and shouldn't prevent extension shutdown try { this.crashChecker?.stop() @@ -151,13 +145,9 @@ export class CrashMonitoring { * Mimic a crash of the extension, or can just be used as cleanup. * Only use this for tests. */ - protected crash() { - if (!this.isStarted) { - return - } - + protected cleanup() { this.crashChecker?.stop() - this.heartbeat?.crash() + this.heartbeat?.stopInterval() } } @@ -166,8 +156,6 @@ export class CrashMonitoring { * {@link CrashChecker} listens for these. */ class Heartbeat { - private didEmitFailure: boolean = false - private isRunning: boolean = false private intervalRef: NodeJS.Timer | undefined constructor( private readonly state: FileSystemState, @@ -187,11 +175,8 @@ class Heartbeat { try { await this.state.sendHeartbeat() } catch (e) { - // only emit a failure once so we do not spam telemetry on repeated errors - if (!this.didEmitFailure) { - this.didEmitFailure = true - emitFailure({ functionName: 'sendHeartbeat', error: e }) - } + this.stopInterval() + emitFailure({ functionName: 'sendHeartbeat', error: e }) // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { @@ -199,21 +184,16 @@ class Heartbeat { } } }, heartbeatInterval) - - this.isRunning = true } + /** Stops everything, signifying a graceful shutdown */ public async stop() { - // non-happy path where heartbeats were never started. - if (!this.isRunning) { - return - } - - globals.clock.clearInterval(this.intervalRef) + this.stopInterval() return this.state.indicateGracefulShutdown() } - public crash() { + /** Stops the heartbeat interval */ + public stopInterval() { globals.clock.clearInterval(this.intervalRef) } } @@ -258,7 +238,7 @@ class CrashChecker { } catch (e) { emitFailure({ functionName: 'checkCrashInterval', error: e }) - // Since there was an error we want to stop crash monitoring since it is pointless. + // Since there was an error there is point to try checking for crashed instances. // We will need to monitor telemetry to see if we can determine widespread issues. this.stop() @@ -379,7 +359,7 @@ function getDefaultDependencies(): MementoStateDependencies { * Factory to create an instance of the state. Do not create an instance of the * class manually since there are required setup steps. 
* - * @throws if the filesystem state cannot get in to a good state + * @throws if the filesystem state cannot be confirmed to be stable, i.e flaky fs operations */ export async function crashMonitoringStateFactory(deps = getDefaultDependencies()): Promise { const state: FileSystemState = new FileSystemState(deps) @@ -409,7 +389,7 @@ export class FileSystemState { * Does the required initialization steps, this must always be run after * creation of the instance. * - * @throws if the filesystem state cannot get in to a good state + * @throws if the filesystem state cannot be confirmed to be stable, i.e flaky fs operations */ public async init() { // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data @@ -417,7 +397,7 @@ export class FileSystemState { // so emit a metric to track that await telemetry.function_call.run(async (span) => { span.record({ className, functionName: 'FileSystemStateValidation' }) - await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem(this.deps.workDirPath)) + await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem()) }) // Clear the state if the user did something like a computer restart @@ -428,10 +408,11 @@ export class FileSystemState { // ------------------ Heartbeat methods ------------------ public async sendHeartbeat() { - const _sendHeartbeat = () => - withFailCtx('sendHeartbeatState', async () => { + const extId = this.createExtId(this.ext) + + try { + const func = async () => { const dir = await this.runningExtsDir() - const extId = this.createExtId(this.ext) await fs.writeFile( path.join(dir, extId), JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4) @@ -439,10 +420,17 @@ export class FileSystemState { this.deps.devLogger?.debug( `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...` ) - }) - // retry a failed heartbeat to avoid incorrectly being reported as a crash - const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 7, delay: 100, backoff: 2 }) - return _sendHeartbeatWithRetries + } + const funcWithCtx = () => withFailCtx('sendHeartbeatState', func) + const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 }) + + // retry a failed heartbeat to avoid incorrectly being reported as a crash + return await funcWithRetries + } catch (e) { + // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat + await withFailCtx('sendHeartbeatStateCleanup', () => this.deleteRunningFile(extId)) + throw e + } } /** @@ -488,13 +476,17 @@ export class FileSystemState { await opts.crash() } + // Clean up the running extension file since it no longer exists + await this.deleteRunningFile(extId) + } + private async deleteRunningFile(extId: ExtInstanceId) { // Clean up the running extension file since it is no longer exists const dir = await this.runningExtsDir() // Retry on failure since failing to delete this file will result in incorrectly reported crashes. // The common errors we were seeing were windows EPERM/EBUSY errors. 
There may be a relation // to this https://github.com/aws/aws-toolkit-vscode/pull/5335 await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), { - maxRetries: 7, + maxRetries: 8, delay: 100, backoff: 2, }) @@ -525,7 +517,7 @@ export class FileSystemState { private async runningExtsDir(): Promise { const p = path.join(this.stateDirPath, crashMonitoringDirNames.running) // ensure the dir exists - await withFailCtx('ensureRunningExtsDir', () => fs.mkdir(p)) + await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true })) return p } private async shutdownExtsDir() { @@ -551,17 +543,22 @@ export class FileSystemState { const allExts = allExtIds.map>(async (extId: string) => { // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since // we will assume that other instance handled its termination appropriately. - const ext = await withFailCtx('parseRunningExtFile', async () => - ignoreBadFileError(async () => { - const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId)) - - if (!text) { - return undefined - } - - // This was sometimes throwing SyntaxError - return JSON.parse(text) as ExtInstanceHeartbeat - }) + // NOTE: On Windows we were failing on EBUSY, so we retry on failure. + const ext: ExtInstanceHeartbeat | undefined = await withRetries( + () => + withFailCtx('parseRunningExtFile', async () => + ignoreBadFileError(async () => { + const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId)) + + if (!text) { + return undefined + } + + // This was sometimes throwing SyntaxError + return JSON.parse(text) as ExtInstanceHeartbeat + }) + ), + { maxRetries: 6, delay: 100, backoff: 2 } ) if (ext === undefined) { diff --git a/packages/core/src/shared/filesystemUtilities.ts b/packages/core/src/shared/filesystemUtilities.ts index d1e214de388..b216427634e 100644 --- a/packages/core/src/shared/filesystemUtilities.ts +++ b/packages/core/src/shared/filesystemUtilities.ts @@ -12,6 +12,7 @@ import * as pathutils from './utilities/pathUtils' import globals from '../shared/extensionGlobals' import fs from '../shared/fs/fs' import { ToolkitError } from './errors' +import * as nodeFs from 'fs/promises' export const tempDirPath = path.join( // https://github.com/aws/aws-toolkit-vscode/issues/240 @@ -261,7 +262,7 @@ const FileSystemStabilityException = ToolkitError.named(FileSystemStabilityExcep * * @throws a {@link FileSystemStabilityException} which wraps the underlying failed fs operation error. */ -export async function throwOnUnstableFileSystem(tmpRoot: string) { +export async function throwOnUnstableFileSystem(tmpRoot: string = tempDirPath) { const tmpFolder = path.join(tmpRoot, `validateStableFS-${crypto.randomBytes(4).toString('hex')}`) const tmpFile = path.join(tmpFolder, 'file.txt') @@ -270,8 +271,14 @@ export async function throwOnUnstableFileSystem(tmpRoot: string) { await withFailCtx('mkdirInitial', () => fs.mkdir(tmpFolder)) // Verifies that we do not throw if the dir exists and we try to make it again await withFailCtx('mkdirButAlreadyExists', () => fs.mkdir(tmpFolder)) - await withFailCtx('mkdirSubfolder', () => fs.mkdir(path.join(tmpFolder, 'subfolder'))) - await withFailCtx('rmdirInitial', () => fs.delete(tmpFolder, { recursive: true })) + // Test subfolder creation. Based on telemetry it looks like the vsc mkdir may be flaky + // when creating subfolders. Hopefully this gives us some useful information. 
+ const subfolderRoot = path.join(tmpFolder, 'a') + const subfolderPath = path.join(subfolderRoot, 'b/c/d/e') + await withFailCtx('mkdirSubfolderNode', () => nodeFs.mkdir(subfolderPath, { recursive: true })) + await withFailCtx('rmdirInitialForNode', () => fs.delete(subfolderRoot, { recursive: true })) + await withFailCtx('mkdirSubfolderVsc', () => fs.mkdir(subfolderPath)) + await withFailCtx('rmdirInitialForVsc', () => fs.delete(subfolderRoot, { recursive: true })) // test basic file operations await withFailCtx('mkdirForFileOpsTest', () => fs.mkdir(tmpFolder)) @@ -293,6 +300,8 @@ export async function throwOnUnstableFileSystem(tmpRoot: string) { throw new Error(`Unexpected file contents after multiple writes: "${text}"`) } }) + // write a large file, ensuring we are not near a space limit + await withFailCtx('writeFileLarge', () => fs.writeFile(tmpFile, 'a'.repeat(1000))) // test concurrent reads on a file await withFailCtx('writeFileConcurrencyTest', () => fs.writeFile(tmpFile, 'concurrencyTest')) const result = await Promise.all([ diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts index ec61d5487f3..74e595b52d5 100644 --- a/packages/core/src/test/shared/crashMonitoring.test.ts +++ b/packages/core/src/test/shared/crashMonitoring.test.ts @@ -14,8 +14,9 @@ class TestCrashMonitoring extends CrashMonitoring { public constructor(...deps: ConstructorParameters) { super(...deps) } - public override crash() { - super.crash() + /** Immitates an extension crash */ + public crash() { + super.cleanup() } } From 545508a4588a7837e99597567f29e42e1f6732b3 Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Wed, 9 Oct 2024 12:31:36 -0400 Subject: [PATCH 8/9] minor refactors Signed-off-by: nkomonen-amazon --- packages/amazonq/src/extensionNode.ts | 2 +- packages/core/src/extensionNode.ts | 2 +- packages/core/src/shared/crashMonitoring.ts | 149 ++++++++---------- .../src/test/shared/crashMonitoring.test.ts | 17 +- 4 files changed, 81 insertions(+), 89 deletions(-) diff --git a/packages/amazonq/src/extensionNode.ts b/packages/amazonq/src/extensionNode.ts index 379bcd9df1c..cd07aeba981 100644 --- a/packages/amazonq/src/extensionNode.ts +++ b/packages/amazonq/src/extensionNode.ts @@ -97,5 +97,5 @@ async function setupDevMode(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. stop() does not throw so it is safe - await Promise.all([(await CrashMonitoring.instance())?.stop(), deactivateCommon()]) + await Promise.all([(await CrashMonitoring.instance())?.shutdown(), deactivateCommon()]) } diff --git a/packages/core/src/extensionNode.ts b/packages/core/src/extensionNode.ts index 146e3c44b33..06185f695cc 100644 --- a/packages/core/src/extensionNode.ts +++ b/packages/core/src/extensionNode.ts @@ -255,7 +255,7 @@ export async function activate(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. 
stop() does not throw so it is safe - await Promise.all([await (await CrashMonitoring.instance())?.stop(), deactivateCommon()]) + await Promise.all([await (await CrashMonitoring.instance())?.shutdown(), deactivateCommon()]) await globals.resourceManager.dispose() } diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index 5a95211d5f0..2df2af74eb5 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -28,14 +28,14 @@ const className = 'CrashMonitoring' * * - If an extension crashes it cannot report that it crashed. * - The ExtensionHost is a separate process from the main VS Code editor process where all extensions run in - * - Read about the [`deactivate()` behavior](../../../../../docs/vscode_behaviors.md) + * - Read about the [`deactivate()` behavior](../../../../docs/vscode_behaviors.md) * - An IDE instance is one instance of VS Code, and Extension Instance is 1 instance of our extension. These are 1:1. * * ### How it works at a high level: * - * - Each IDE instance will start its own crash reporting process on startup - * - The crash reporting process works with each instance sending heartbeats to a centralized state. Separately each instance - * has a "Checker" the each entry in the centralized to see if it is not running anymore, and appropriately handles when needed. + * - Each IDE instance will start its own crash monitoring process on startup + * - The crash monitoring process works with each instance sending heartbeats to a centralized state. Separately each instance + * has a "Checker" that checks each heartbeat to see if it is not running anymore, and appropriately handles when needed. * * - On a crash we will emit a `session_end` metrics with `{ result: 'Failed', reason: 'ExtHostCrashed', crashedSessionId: '...' }` * - On successful shutdown a `session_end` with a successful result is already emitted elsewhere. @@ -44,14 +44,20 @@ const className = 'CrashMonitoring' * * - To get the most verbose debug logs, configure the devmode setting: `crashReportInterval` * + * - This entire feature is non critical and should not impede extension usage if something goes wrong. As a result, we + * swallow all errors and only log/telemetry issues. 
This is the reason for all the try/catch statements + * * ### Limitations * - We will never truly know if we are the cause of the crash * - Since all extensions run in the same Ext Host process, any one of them could cause it to crash and we wouldn't be * able to differentiate * - If the IDE itself crashes, unrelated to the extensions, it will still be seen as a crash in our telemetry * - We are not able to explicitly determine if we were the cause of the crash - * - If the user shuts down their computer after a crash before the next interval of the Primary can run, that info is lost + * - If the user shuts down their computer after a crash before the next crash check can run, that info is lost * - We cannot persist crash information on computer restart + * - We use the users filesystem to maintain the state of running extension instances, but the + * filesystem is not reliable and can lead to incorrect crash reports + * - To mitigate this we do not run crash reporting on machines that we detect have a flaky filesystem */ export class CrashMonitoring { protected heartbeat: Heartbeat | undefined @@ -99,12 +105,11 @@ export class CrashMonitoring { return } - // In the Prod code this runs by default and interferes as it reports its own heartbeats. + // During tests, the Prod code also runs this function. It interferes with telemetry assertion since it reports additional heartbeats. if (this.isAutomation) { return } - // Dont throw since this feature is not critical and shouldn't prevent extension execution try { this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode) this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger) @@ -113,9 +118,12 @@ export class CrashMonitoring { await this.crashChecker.start() } catch (error) { emitFailure({ functionName: 'start', error }) - this.cleanup() + try { + this.crashChecker?.cleanup() + await this.heartbeat?.cleanup() + } catch {} - // In development this gives us a useful stacktrace + // Surface errors during development, otherwise it can be missed. if (this.isDevMode) { throw error } @@ -123,32 +131,21 @@ export class CrashMonitoring { } /** Stop the Crash Monitoring process, signifying a graceful shutdown */ - public async stop() { - // Dont throw since this feature is not critical and shouldn't prevent extension shutdown + public async shutdown() { try { - this.crashChecker?.stop() - await this.heartbeat?.stop() + this.crashChecker?.cleanup() + await this.heartbeat?.shutdown() } catch (error) { try { // This probably wont emit in time before shutdown, but may be written to the logs emitFailure({ functionName: 'stop', error }) - } catch (e) { - // In case emit fails, do nothing - } + } catch {} + if (this.isDevMode) { throw error } } } - - /** - * Mimic a crash of the extension, or can just be used as cleanup. - * Only use this for tests. 
- */ - protected cleanup() { - this.crashChecker?.stop() - this.heartbeat?.stopInterval() - } } /** @@ -164,7 +161,6 @@ class Heartbeat { ) {} public async start() { - // heartbeat 2 times per check const heartbeatInterval = this.checkInterval / 2 // Send an initial heartbeat immediately @@ -175,10 +171,11 @@ class Heartbeat { try { await this.state.sendHeartbeat() } catch (e) { - this.stopInterval() - emitFailure({ functionName: 'sendHeartbeat', error: e }) + try { + await this.cleanup() + emitFailure({ functionName: 'sendHeartbeatInterval', error: e }) + } catch {} - // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { throw e } @@ -187,31 +184,31 @@ class Heartbeat { } /** Stops everything, signifying a graceful shutdown */ - public async stop() { - this.stopInterval() + public async shutdown() { + globals.clock.clearInterval(this.intervalRef) return this.state.indicateGracefulShutdown() } - /** Stops the heartbeat interval */ - public stopInterval() { - globals.clock.clearInterval(this.intervalRef) + /** + * Safely attempts to clean up this heartbeat from the state to try and avoid + * an incorrectly indicated crash. Use this on failures. + * + * --- + * + * IMPORTANT: This function must not throw as this function is run within a catch + */ + public async cleanup() { + try { + await this.shutdown() + } catch {} + try { + await this.state.clearHeartbeat() + } catch {} } } /** - * This checks for if an extension has crashed and handles that result appropriately. - * It listens to heartbeats sent by {@link Heartbeat}, and then handles appropriately when the heartbeats - * stop. - * - * --- - * - * This follows the Primary/Secondary design where one of the extension instances is the Primary checker - * and all others are Secondary. - * - * The Primary actually reads the state and reports crashes if detected. - * - * The Secondary continuously attempts to become the Primary if the previous Primary is no longer responsive. - * This helps to reduce raceconditions for operations on the state. + * This checks the heartbeats of each known extension to see if it has crashed and handles that result appropriately. */ class CrashChecker { private intervalRef: NodeJS.Timer | undefined @@ -232,17 +229,14 @@ class CrashChecker { tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger) ) + // check on an interval this.intervalRef = globals.clock.setInterval(async () => { try { await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger) } catch (e) { emitFailure({ functionName: 'checkCrashInterval', error: e }) + this.cleanup() - // Since there was an error there is point to try checking for crashed instances. - // We will need to monitor telemetry to see if we can determine widespread issues. - this.stop() - - // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { throw e } @@ -269,13 +263,13 @@ class CrashChecker { // Ext is not running anymore, handle appropriately depending on why it stopped running await state.handleExtNotRunning(ext, { - shutdown: async () => { + onShutdown: async () => { // Nothing to do, just log info if necessary devLogger?.debug( `crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}` ) }, - crash: async () => { + onCrash: async () => { // Debugger instances may incorrectly look like they crashed, so don't emit. 
// Example is if I hit the red square in the debug menu, it is a non-graceful shutdown. But the regular
// 'x' button in the Debug IDE instance is a graceful shutdown.
@@ -314,16 +308,12 @@ class CrashChecker {
         function isStoppedHeartbeats(ext: ExtInstanceHeartbeat, checkInterval: number) {
             const millisSinceLastHeartbeat = globals.clock.Date.now() - ext.lastHeartbeat
-            // since heartbeats happen 2 times per check interval it will have occured
-            // at least once in the timespan of the check interval.
-            //
-            // But if we want to be more flexible this condition can be modified since
-            // something like global state taking time to sync can return the incorrect last heartbeat value.
             return millisSinceLastHeartbeat >= checkInterval
         }
     }

-    public stop() {
+    /** Use this on failures */
+    public cleanup() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
@@ -356,8 +346,7 @@ function getDefaultDependencies(): MementoStateDependencies {
     }
 }
 /**
- * Factory to create an instance of the state. Do not create an instance of the
- * class manually since there are required setup steps.
+ * Factory to create an instance of the state.
  *
  * @throws if the filesystem state cannot be confirmed to be stable, i.e flaky fs operations
  */
@@ -368,14 +357,16 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
 }

 /**
- * The state of all running extensions. This state is globally shared with all other extension instances.
- * This state specifically uses the File System.
+ * The state of all running extensions.
+ * - is globally shared with all other extension instances.
+ * - uses the File System
+ * - is not fully reliable since filesystem operations can be flaky
  */
 export class FileSystemState {
     private readonly stateDirPath: string

     /**
-     * Use {@link crashMonitoringStateFactory} to make an instance
+     * IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
      */
     constructor(protected readonly deps: MementoStateDependencies) {
         this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirNames.root)
@@ -393,8 +384,8 @@ export class FileSystemState {
      */
     public async init() {
         // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
-        // NOTE: We want to get a diff between clients that succeeded vs failed this check,
-        // so emit a metric to track that
+        //
+        // NOTE: Emits a metric to know how many clients we skipped
         await telemetry.function_call.run(async (span) => {
             span.record({ className, functionName: 'FileSystemStateValidation' })
             await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem())
@@ -423,15 +414,17 @@ export class FileSystemState {
             }
             const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
             const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 })
-
-            // retry a failed heartbeat to avoid incorrectly being reported as a crash
             return await funcWithRetries
         } catch (e) {
             // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
-            await withFailCtx('sendHeartbeatStateCleanup', () => this.deleteRunningFile(extId))
+            await withFailCtx('sendHeartbeatFailureCleanup', () => this.clearHeartbeat())
             throw e
         }
     }
+    /** Clears this extension's heartbeat from the state */
+    public async clearHeartbeat() {
+        await this.deleteHeartbeatFile(this.extId)
+    }

     /**
      * Signal that this extension is gracefully shutting down. This will prevent the IDE from thinking it crashed.
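For reference, the timing relationship in the hunks above boils down to: heartbeats are written every checkInterval / 2, and an instance is treated as stopped once a full checkInterval passes with no new heartbeat, i.e. two consecutive beats were missed. The standalone TypeScript sketch below only illustrates that condition; it is not the toolkit's actual classes, and the hasStoppedHeartbeats name and the 10-minute interval are assumptions for the example.

// Minimal sketch of the heartbeat staleness check (illustrative only, not the real CrashChecker).
type HeartbeatRecord = { extId: string; lastHeartbeat: number }

const checkInterval = 10 * 60 * 1000 // assumed 10 minute crash-check interval
const heartbeatInterval = checkInterval / 2 // heartbeats are sent twice per check window

function hasStoppedHeartbeats(record: HeartbeatRecord, now: number): boolean {
    // Stale once a full check interval has elapsed with no heartbeat update.
    return now - record.lastHeartbeat >= checkInterval
}

const now = Date.now()
console.log(`heartbeat every ${heartbeatInterval}ms, declared stopped after ${checkInterval}ms of silence`)
console.log(hasStoppedHeartbeats({ extId: 'ext-a', lastHeartbeat: now - 6 * 60 * 1000 }, now)) // false: one missed beat is tolerated
console.log(hasStoppedHeartbeats({ extId: 'ext-a', lastHeartbeat: now - 11 * 60 * 1000 }, now)) // true: silent for a full interval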
@@ -460,31 +453,29 @@ export class FileSystemState { */ public async handleExtNotRunning( ext: ExtInstance, - opts: { shutdown: () => Promise; crash: () => Promise } + opts: { onShutdown: () => Promise; onCrash: () => Promise } ): Promise { const extId = this.createExtId(ext) const shutdownFilePath = path.join(await this.shutdownExtsDir(), extId) if (await withFailCtx('existsShutdownFile', () => fs.exists(shutdownFilePath))) { - await opts.shutdown() + await opts.onShutdown() // We intentionally do not clean up the file in shutdown since there may be another // extension may be doing the same thing in parallel, and would read the extension as // crashed since the file was missing. The file will be cleared on computer restart though. // TODO: Be smart and clean up the file after some time. } else { - await opts.crash() + await opts.onCrash() } // Clean up the running extension file since it no longer exists - await this.deleteRunningFile(extId) + await this.deleteHeartbeatFile(extId) } - private async deleteRunningFile(extId: ExtInstanceId) { - // Clean up the running extension file since it is no longer exists + public async deleteHeartbeatFile(extId: ExtInstanceId) { const dir = await this.runningExtsDir() - // Retry on failure since failing to delete this file will result in incorrectly reported crashes. - // The common errors we were seeing were windows EPERM/EBUSY errors. There may be a relation - // to this https://github.com/aws/aws-toolkit-vscode/pull/5335 + // Retry file deletion to prevent incorrect crash reports. Common Windows errors seen in telemetry: EPERM/EBUSY. + // See: https://github.com/aws/aws-toolkit-vscode/pull/5335 await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), { maxRetries: 8, delay: 100, diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts index 74e595b52d5..6b4e4bcd571 100644 --- a/packages/core/src/test/shared/crashMonitoring.test.ts +++ b/packages/core/src/test/shared/crashMonitoring.test.ts @@ -15,8 +15,9 @@ class TestCrashMonitoring extends CrashMonitoring { super(...deps) } /** Immitates an extension crash */ - public crash() { - super.cleanup() + public async crash() { + this.crashChecker?.cleanup() + await this.heartbeat?.cleanup() } } @@ -88,7 +89,7 @@ export const crashMonitoringTest = async () => { // Ext 1 does a graceful shutdown await exts[1].ext.start() - await exts[1].ext.stop() + await exts[1].ext.shutdown() await awaitIntervals(oneInterval) // Ext 1 did a graceful shutdown so no metric emitted assertTelemetry('session_end', []) @@ -100,7 +101,7 @@ export const crashMonitoringTest = async () => { const exts = await makeTestExtensions(2) await exts[0].ext.start() - exts[0].ext.crash() + await exts[0].ext.crash() await awaitIntervals(oneInterval) // There is no other active instance to report the issue assertTelemetry('session_end', []) @@ -122,7 +123,7 @@ export const crashMonitoringTest = async () => { } for (let i = 1; i < extCount; i++) { - exts[i].ext.crash() + await exts[i].ext.crash() latestCrashedExts.push(exts[i]) } @@ -143,7 +144,7 @@ export const crashMonitoringTest = async () => { // start Ext 1 then crash it, Ext 0 finds the crash await exts[1].ext.start() - exts[1].ext.crash() + await exts[1].ext.crash() latestCrashedExts.push(exts[1]) await awaitIntervals(oneInterval * 1) @@ -151,14 +152,14 @@ export const crashMonitoringTest = async () => { // start Ext 2 and crash Ext 0, Ext 2 is promoted to Primary checker 
await exts[2].ext.start() - exts[0].ext.crash() + await exts[0].ext.crash() latestCrashedExts.push(exts[0]) await awaitIntervals(oneInterval * 1) assertCrashedExtensions(latestCrashedExts) // Ext 3 starts, then crashes. Ext 2 reports the crash since it is the Primary checker await exts[3].ext.start() - exts[3].ext.crash() + await exts[3].ext.crash() latestCrashedExts.push(exts[3]) await awaitIntervals(oneInterval * 1) assertCrashedExtensions(latestCrashedExts) From 3449e485a3bfe2233224fd700d1f2a8cc808de3a Mon Sep 17 00:00:00 2001 From: nkomonen-amazon Date: Wed, 9 Oct 2024 14:22:43 -0400 Subject: [PATCH 9/9] fix failing tests Signed-off-by: nkomonen-amazon --- packages/core/src/shared/crashMonitoring.ts | 12 +++++++++++- .../core/src/test/shared/crashMonitoring.test.ts | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index 2df2af74eb5..00fbb162862 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -205,6 +205,11 @@ class Heartbeat { await this.state.clearHeartbeat() } catch {} } + + /** Mimics a crash, only for testing */ + public testCrash() { + globals.clock.clearInterval(this.intervalRef) + } } /** @@ -312,10 +317,15 @@ class CrashChecker { } } - /** Use this on failures */ + /** Use this on failures to terminate the crash checker */ public cleanup() { globals.clock.clearInterval(this.intervalRef) } + + /** Mimics a crash, only for testing */ + public testCrash() { + globals.clock.clearInterval(this.intervalRef) + } } /** diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts index 6b4e4bcd571..5802592ea8c 100644 --- a/packages/core/src/test/shared/crashMonitoring.test.ts +++ b/packages/core/src/test/shared/crashMonitoring.test.ts @@ -14,10 +14,10 @@ class TestCrashMonitoring extends CrashMonitoring { public constructor(...deps: ConstructorParameters) { super(...deps) } - /** Immitates an extension crash */ + /** Imitates an extension crash */ public async crash() { - this.crashChecker?.cleanup() - await this.heartbeat?.cleanup() + this.crashChecker?.testCrash() + this.heartbeat?.testCrash() } }
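Both the heartbeat write and the heartbeat-file deletion in this series lean on withRetries(fn, { maxRetries: 8, delay: 100, backoff: 2 }) to ride out transient filesystem errors such as EPERM/EBUSY on Windows. The real helper lives in the toolkit's shared utilities and its exact signature may differ; the sketch below only illustrates the retry-with-exponential-backoff shape those options imply, under the assumption that maxRetries counts total attempts.

// Illustrative standalone retry helper, not the toolkit's withRetries implementation.
interface RetryOptions {
    maxRetries: number // total attempts
    delay: number // initial delay in ms
    backoff: number // multiplier applied to the delay after each failed attempt
}

async function withRetriesSketch<T>(fn: () => Promise<T>, opts: RetryOptions): Promise<T> {
    let delay = opts.delay
    let lastError: unknown
    for (let attempt = 1; attempt <= opts.maxRetries; attempt++) {
        try {
            return await fn()
        } catch (e) {
            lastError = e
            if (attempt === opts.maxRetries) {
                break // out of attempts, rethrow below
            }
            await new Promise((resolve) => setTimeout(resolve, delay))
            delay *= opts.backoff
        }
    }
    throw lastError
}

// Usage mirroring the call sites above: up to 8 attempts, waiting 100ms, 200ms, 400ms, ...
// between them (roughly 12.7s of waiting in the worst case).
void withRetriesSketch(async () => console.log('pretend heartbeat file write'), { maxRetries: 8, delay: 100, backoff: 2 })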
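The onShutdown / onCrash split rests on a simple marker-file convention: a graceful shutdown drops a file named after the extension instance into a shutdown directory, and a stale heartbeat is later classified as a crash only when no such marker exists. The sketch below is a hedged illustration of that convention; the directory layout and function names are placeholders, not the toolkit's real state layout.

// Illustrative sketch of shutdown-vs-crash classification via a marker file.
import * as fs from 'fs/promises'
import * as os from 'os'
import * as path from 'path'

const stateDir = path.join(os.tmpdir(), 'crash-monitoring-sketch') // hypothetical state root

async function indicateGracefulShutdown(extId: string): Promise<void> {
    const shutdownDir = path.join(stateDir, 'shutdown')
    await fs.mkdir(shutdownDir, { recursive: true })
    await fs.writeFile(path.join(shutdownDir, extId), '') // empty marker file is enough
}

async function classifyStoppedExt(extId: string): Promise<'shutdown' | 'crash'> {
    try {
        await fs.access(path.join(stateDir, 'shutdown', extId))
        return 'shutdown' // marker present: the instance told us it was stopping
    } catch {
        return 'crash' // no marker: heartbeats stopped without a graceful shutdown
    }
}

async function demo() {
    await indicateGracefulShutdown('ext-a_session-1')
    console.log(await classifyStoppedExt('ext-a_session-1')) // 'shutdown'
    console.log(await classifyStoppedExt('ext-b_session-2')) // 'crash' (no marker was written)
}
void demo()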