diff --git a/packages/amazonq/src/extensionNode.ts b/packages/amazonq/src/extensionNode.ts index b203e541a84..cd07aeba981 100644 --- a/packages/amazonq/src/extensionNode.ts +++ b/packages/amazonq/src/extensionNode.ts @@ -32,7 +32,8 @@ export async function activate(context: vscode.ExtensionContext) { * the code compatible with web and move it to {@link activateAmazonQCommon}. */ async function activateAmazonQNode(context: vscode.ExtensionContext) { - await (await CrashMonitoring.instance()).start() + // Intentionally do not await since this is slow and non-critical + void (await CrashMonitoring.instance())?.start() const extContext = { extensionContext: context, @@ -96,5 +97,5 @@ async function setupDevMode(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. stop() does not throw so it is safe - await Promise.all([(await CrashMonitoring.instance()).stop(), deactivateCommon()]) + await Promise.all([(await CrashMonitoring.instance())?.shutdown(), deactivateCommon()]) } diff --git a/packages/core/src/extensionNode.ts b/packages/core/src/extensionNode.ts index f91fa321f59..06185f695cc 100644 --- a/packages/core/src/extensionNode.ts +++ b/packages/core/src/extensionNode.ts @@ -78,7 +78,8 @@ export async function activate(context: vscode.ExtensionContext) { // IMPORTANT: If you are doing setup that should also work in web mode (browser), it should be done in the function below const extContext = await activateCommon(context, contextPrefix, false) - await (await CrashMonitoring.instance()).start() + // Intentionally do not await since this can be slow and non-critical + void (await CrashMonitoring.instance())?.start() initializeCredentialsProviderManager() @@ -254,7 +255,7 @@ export async function activate(context: vscode.ExtensionContext) { export async function deactivate() { // Run concurrently to speed up execution. stop() does not throw so it is safe - await Promise.all([await (await CrashMonitoring.instance()).stop(), deactivateCommon()]) + await Promise.all([await (await CrashMonitoring.instance())?.shutdown(), deactivateCommon()]) await globals.resourceManager.dispose() } diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index bfd470a7150..00fbb162862 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -16,6 +16,8 @@ import nodeFs from 'fs/promises' import fs from './fs/fs' import { getLogger } from './logger/logger' import { crashMonitoringDirNames } from './constants' +import { throwOnUnstableFileSystem } from './filesystemUtilities' +import { withRetries } from './utilities/functionUtils' const className = 'CrashMonitoring' @@ -26,14 +28,14 @@ const className = 'CrashMonitoring' * * - If an extension crashes it cannot report that it crashed. * - The ExtensionHost is a separate process from the main VS Code editor process where all extensions run in - * - Read about the [`deactivate()` behavior](../../../../../docs/vscode_behaviors.md) + * - Read about the [`deactivate()` behavior](../../../../docs/vscode_behaviors.md) * - An IDE instance is one instance of VS Code, and Extension Instance is 1 instance of our extension. These are 1:1. * * ### How it works at a high level: * - * - Each IDE instance will start its own crash reporting process on startup - * - The crash reporting process works with each instance sending heartbeats to a centralized state. 
Separately each instance
- *   has a "Checker" the each entry in the centralized to see if it is not running anymore, and appropriately handles when needed.
+ * - Each IDE instance will start its own crash monitoring process on startup
+ * - The crash monitoring process works with each instance sending heartbeats to a centralized state. Separately each instance
+ *   has a "Checker" that checks each heartbeat to see if that extension is no longer running, and handles it appropriately when needed.
  *
  * - On a crash we will emit a `session_end` metrics with `{ result: 'Failed', reason: 'ExtHostCrashed', crashedSessionId: '...' }`
  * - On successful shutdown a `session_end` with a successful result is already emitted elsewhere.
@@ -42,18 +44,22 @@ const className = 'CrashMonitoring'
  *
  * - To get the most verbose debug logs, configure the devmode setting: `crashReportInterval`
  *
+ * - This entire feature is non-critical and should not impede extension usage if something goes wrong. As a result, we
+ *   swallow all errors and only log/emit telemetry for issues. This is the reason for all the try/catch statements.
+ *
  * ### Limitations
  * - We will never truly know if we are the cause of the crash
  * - Since all extensions run in the same Ext Host process, any one of them could cause it to crash and we wouldn't be
  *   able to differentiate
  * - If the IDE itself crashes, unrelated to the extensions, it will still be seen as a crash in our telemetry
  * - We are not able to explicitly determine if we were the cause of the crash
- * - If the user shuts down their computer after a crash before the next interval of the Primary can run, that info is lost
+ * - If the user shuts down their computer after a crash before the next crash check can run, that info is lost
  * - We cannot persist crash information on computer restart
+ * - We use the user's filesystem to maintain the state of running extension instances, but the
+ *   filesystem is not always reliable and can lead to incorrect crash reports
+ *   - To mitigate this we do not run crash reporting on machines that we detect have a flaky filesystem
  */
 export class CrashMonitoring {
-    private isStarted: boolean = false
-
     protected heartbeat: Heartbeat | undefined
     protected crashChecker: CrashChecker | undefined
@@ -65,17 +71,32 @@ export class CrashMonitoring {
         private readonly devLogger: Logger | undefined
     ) {}
 
+    static #didTryCreate = false
     static #instance: CrashMonitoring | undefined
-    public static async instance(): Promise<CrashMonitoring> {
-        const isDevMode = getIsDevMode()
-        const devModeLogger: Logger | undefined = isDevMode ? getLogger() : undefined
-        return (this.#instance ??= new CrashMonitoring(
-            await crashMonitoringStateFactory(),
-            DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 3),
-            isDevMode,
-            isAutomation(),
-            devModeLogger
-        ))
+    /** Returns an instance of this class or undefined if any initial validation fails. */
+    public static async instance(): Promise<CrashMonitoring | undefined> {
+        // Since the first attempt to create an instance may have failed, we do not
+        // attempt to create an instance again and return whatever we have
+        if (this.#didTryCreate === true) {
+            return this.#instance
+        }
+
+        try {
+            this.#didTryCreate = true
+            const isDevMode = getIsDevMode()
+            const devModeLogger: Logger | undefined = isDevMode ?
getLogger() : undefined + const state = await crashMonitoringStateFactory() // can throw + return (this.#instance ??= new CrashMonitoring( + state, + DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes + isDevMode, + isAutomation(), + devModeLogger + )) + } catch (error) { + emitFailure({ functionName: 'instance', error }) + return undefined + } } /** Start the Crash Monitoring process */ @@ -84,23 +105,25 @@ export class CrashMonitoring { return } - // In the Prod code this runs by default and interferes as it reports its own heartbeats. + // During tests, the Prod code also runs this function. It interferes with telemetry assertion since it reports additional heartbeats. if (this.isAutomation) { return } - // Dont throw since this feature is not critical and shouldn't prevent extension execution try { this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode) this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger) await this.heartbeat.start() await this.crashChecker.start() - - this.isStarted = true } catch (error) { emitFailure({ functionName: 'start', error }) - // In development this gives us a useful stacktrace + try { + this.crashChecker?.cleanup() + await this.heartbeat?.cleanup() + } catch {} + + // Surface errors during development, otherwise it can be missed. if (this.isDevMode) { throw error } @@ -108,40 +131,21 @@ export class CrashMonitoring { } /** Stop the Crash Monitoring process, signifying a graceful shutdown */ - public async stop() { - if (!this.isStarted) { - return - } - - // Dont throw since this feature is not critical and shouldn't prevent extension shutdown + public async shutdown() { try { - this.crashChecker?.stop() - await this.heartbeat?.stop() + this.crashChecker?.cleanup() + await this.heartbeat?.shutdown() } catch (error) { try { // This probably wont emit in time before shutdown, but may be written to the logs emitFailure({ functionName: 'stop', error }) - } catch (e) { - // In case emit fails, do nothing - } + } catch {} + if (this.isDevMode) { throw error } } } - - /** - * Mimic a crash of the extension, or can just be used as cleanup. - * Only use this for tests. - */ - protected crash() { - if (!this.isStarted) { - return - } - - this.crashChecker?.stop() - this.heartbeat?.crash() - } } /** @@ -149,7 +153,6 @@ export class CrashMonitoring { * {@link CrashChecker} listens for these. */ class Heartbeat { - private isRunning: boolean = false private intervalRef: NodeJS.Timer | undefined constructor( private readonly state: FileSystemState, @@ -158,9 +161,6 @@ class Heartbeat { ) {} public async start() { - this.isRunning = true - - // heartbeat 2 times per check const heartbeatInterval = this.checkInterval / 2 // Send an initial heartbeat immediately @@ -171,14 +171,11 @@ class Heartbeat { try { await this.state.sendHeartbeat() } catch (e) { - emitFailure({ functionName: 'sendHeartbeat', error: e }) - - // Since there was an error we want to stop crash monitoring since it is pointless. - // We will need to monitor telemetry to see if we can determine widespread issues. - // Make sure it is signaled as a graceful shutdown to reduce noise of crashed extensions. 
- await this.stop() + try { + await this.cleanup() + emitFailure({ functionName: 'sendHeartbeatInterval', error: e }) + } catch {} - // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { throw e } @@ -186,35 +183,37 @@ class Heartbeat { }, heartbeatInterval) } - public async stop() { - // non-happy path where heartbeats were never started. - if (!this.isRunning) { - return - } - + /** Stops everything, signifying a graceful shutdown */ + public async shutdown() { globals.clock.clearInterval(this.intervalRef) return this.state.indicateGracefulShutdown() } - public crash() { + /** + * Safely attempts to clean up this heartbeat from the state to try and avoid + * an incorrectly indicated crash. Use this on failures. + * + * --- + * + * IMPORTANT: This function must not throw as this function is run within a catch + */ + public async cleanup() { + try { + await this.shutdown() + } catch {} + try { + await this.state.clearHeartbeat() + } catch {} + } + + /** Mimics a crash, only for testing */ + public testCrash() { globals.clock.clearInterval(this.intervalRef) } } /** - * This checks for if an extension has crashed and handles that result appropriately. - * It listens to heartbeats sent by {@link Heartbeat}, and then handles appropriately when the heartbeats - * stop. - * - * --- - * - * This follows the Primary/Secondary design where one of the extension instances is the Primary checker - * and all others are Secondary. - * - * The Primary actually reads the state and reports crashes if detected. - * - * The Secondary continuously attempts to become the Primary if the previous Primary is no longer responsive. - * This helps to reduce raceconditions for operations on the state. + * This checks the heartbeats of each known extension to see if it has crashed and handles that result appropriately. */ class CrashChecker { private intervalRef: NodeJS.Timer | undefined @@ -235,17 +234,14 @@ class CrashChecker { tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger) ) + // check on an interval this.intervalRef = globals.clock.setInterval(async () => { try { await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger) } catch (e) { emitFailure({ functionName: 'checkCrashInterval', error: e }) + this.cleanup() - // Since there was an error we want to stop crash monitoring since it is pointless. - // We will need to monitor telemetry to see if we can determine widespread issues. - this.stop() - - // During development we are fine with impacting extension execution, so throw if (this.isDevMode) { throw e } @@ -272,13 +268,13 @@ class CrashChecker { // Ext is not running anymore, handle appropriately depending on why it stopped running await state.handleExtNotRunning(ext, { - shutdown: async () => { + onShutdown: async () => { // Nothing to do, just log info if necessary devLogger?.debug( `crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}` ) }, - crash: async () => { + onCrash: async () => { // Debugger instances may incorrectly look like they crashed, so don't emit. // Example is if I hit the red square in the debug menu, it is a non-graceful shutdown. But the regular // 'x' button in the Debug IDE instance is a graceful shutdown. 
@@ -317,16 +313,17 @@ class CrashChecker {
         function isStoppedHeartbeats(ext: ExtInstanceHeartbeat, checkInterval: number) {
             const millisSinceLastHeartbeat = globals.clock.Date.now() - ext.lastHeartbeat
-            // since heartbeats happen 2 times per check interval it will have occured
-            // at least once in the timespan of the check interval.
-            //
-            // But if we want to be more flexible this condition can be modified since
-            // something like global state taking time to sync can return the incorrect last heartbeat value.
             return millisSinceLastHeartbeat >= checkInterval
         }
     }
 
-    public stop() {
+    /** Use this on failures to terminate the crash checker */
+    public cleanup() {
+        globals.clock.clearInterval(this.intervalRef)
+    }
+
+    /** Mimics a crash, only for testing */
+    public testCrash() {
         globals.clock.clearInterval(this.intervalRef)
     }
 }
@@ -358,6 +355,11 @@ function getDefaultDependencies(): MementoStateDependencies {
         devLogger: getIsDevMode() ? getLogger() : undefined,
     }
 }
+/**
+ * Factory to create an instance of the state.
+ *
+ * @throws if the filesystem state cannot be confirmed to be stable, i.e. flaky fs operations
+ */
 export async function crashMonitoringStateFactory(deps = getDefaultDependencies()): Promise<FileSystemState> {
     const state: FileSystemState = new FileSystemState(deps)
     await state.init()
@@ -365,14 +367,16 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
 }
 
 /**
- * The state of all running extensions. This state is globally shared with all other extension instances.
- * This state specifically uses the File System.
+ * The state of all running extensions.
+ * - is globally shared with all other extension instances.
+ * - uses the File System
+ * - is not fully reliable since filesystem operations can be flaky
  */
 export class FileSystemState {
     private readonly stateDirPath: string
 
     /**
-     * Use {@link crashMonitoringStateFactory} to make an instance
+     * IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
     */
     constructor(protected readonly deps: MementoStateDependencies) {
         this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirNames.root)
@@ -385,8 +389,18 @@ export class FileSystemState {
     /**
      * Does the required initialization steps, this must always be run after
      * creation of the instance.
+     *
+     * @throws if the filesystem state cannot be confirmed to be stable, i.e. flaky fs operations
      */
     public async init() {
+        // IMPORTANT: do not run crash reporting on unstable filesystem to reduce invalid crash data
+        //
+        // NOTE: Emits a metric to know how many clients we skipped
+        await telemetry.function_call.run(async (span) => {
+            span.record({ className, functionName: 'FileSystemStateValidation' })
+            await withFailCtx('validateFileSystemStability', () => throwOnUnstableFileSystem())
+        })
+
         // Clear the state if the user did something like a computer restart
         if (await this.deps.isStateStale()) {
             await this.clearState()
@@ -395,17 +409,31 @@ export class FileSystemState {
 
     // ------------------ Heartbeat methods ------------------
     public async sendHeartbeat() {
-        await withFailCtx('sendHeartbeatState', async () => {
-            const dir = await this.runningExtsDir()
-            const extId = this.createExtId(this.ext)
-            await fs.writeFile(
-                path.join(dir, extId),
-                JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
-            )
-            this.deps.devLogger?.debug(
-                `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
-            )
-        })
+        const extId = this.createExtId(this.ext)
+
+        try {
+            const func = async () => {
+                const dir = await this.runningExtsDir()
+                await fs.writeFile(
+                    path.join(dir, extId),
+                    JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
+                )
+                this.deps.devLogger?.debug(
+                    `crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
+                )
+            }
+            const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
+            const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 8, delay: 100, backoff: 2 })
+            return await funcWithRetries
+        } catch (e) {
+            // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
+            await withFailCtx('sendHeartbeatFailureCleanup', () => this.clearHeartbeat())
+            throw e
+        }
+    }
+    /** Clears this extension's heartbeat from the state */
+    public async clearHeartbeat() {
+        await this.deleteHeartbeatFile(this.extId)
     }
 
     /**
@@ -435,26 +463,34 @@
      */
     public async handleExtNotRunning(
         ext: ExtInstance,
-        opts: { shutdown: () => Promise<void>; crash: () => Promise<void> }
+        opts: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
     ): Promise<void> {
         const extId = this.createExtId(ext)
         const shutdownFilePath = path.join(await this.shutdownExtsDir(), extId)
 
         if (await withFailCtx('existsShutdownFile', () => fs.exists(shutdownFilePath))) {
-            await opts.shutdown()
+            await opts.onShutdown()
             // We intentionally do not clean up the file in shutdown since there may be another
             // extension may be doing the same thing in parallel, and would read the extension as
             // crashed since the file was missing. The file will be cleared on computer restart though.
             // TODO: Be smart and clean up the file after some time.
         } else {
-            await opts.crash()
+            await opts.onCrash()
         }
 
-        // Clean up the running extension file since it is no longer exists
+        // Clean up the running extension file since it no longer exists
+        await this.deleteHeartbeatFile(extId)
+    }
+    public async deleteHeartbeatFile(extId: ExtInstanceId) {
         const dir = await this.runningExtsDir()
-        // Use force since another checker may have already removed this file before this is ran
-        await withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId), { force: true }))
+        // Retry file deletion to prevent incorrect crash reports.
Common Windows errors seen in telemetry: EPERM/EBUSY. + // See: https://github.com/aws/aws-toolkit-vscode/pull/5335 + await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), { + maxRetries: 8, + delay: 100, + backoff: 2, + }) } // ------------------ State data ------------------ @@ -482,7 +518,7 @@ export class FileSystemState { private async runningExtsDir(): Promise { const p = path.join(this.stateDirPath, crashMonitoringDirNames.running) // ensure the dir exists - await withFailCtx('ensureRunningExtsDir', () => fs.mkdir(p)) + await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true })) return p } private async shutdownExtsDir() { @@ -492,7 +528,11 @@ export class FileSystemState { return p } public async clearState(): Promise { - await withFailCtx('clearState', async () => fs.delete(this.stateDirPath, { force: true })) + this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started') + await withFailCtx('clearState', async () => { + await fs.delete(this.stateDirPath, { force: true, recursive: true }) + this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Succeeded') + }) } public async getAllExts(): Promise { const res = await withFailCtx('getAllExts', async () => { @@ -504,17 +544,22 @@ export class FileSystemState { const allExts = allExtIds.map>(async (extId: string) => { // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since // we will assume that other instance handled its termination appropriately. - const ext = await withFailCtx('parseRunningExtFile', async () => - ignoreBadFileError(async () => { - const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId)) - - if (!text) { - return undefined - } - - // This was sometimes throwing SyntaxError - return JSON.parse(text) as ExtInstanceHeartbeat - }) + // NOTE: On Windows we were failing on EBUSY, so we retry on failure. 
+ const ext: ExtInstanceHeartbeat | undefined = await withRetries( + () => + withFailCtx('parseRunningExtFile', async () => + ignoreBadFileError(async () => { + const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId)) + + if (!text) { + return undefined + } + + // This was sometimes throwing SyntaxError + return JSON.parse(text) as ExtInstanceHeartbeat + }) + ), + { maxRetries: 6, delay: 100, backoff: 2 } ) if (ext === undefined) { @@ -591,7 +636,7 @@ async function withFailCtx(ctx: string, fn: () => Promise): Promise { // make sure we await the function so it actually executes within the try/catch return await fn() } catch (err) { - throw CrashMonitoringError.chain(err, `Failed "${ctx}"`, { code: className }) + throw CrashMonitoringError.chain(err, `Context: "${ctx}"`, { code: className }) } } diff --git a/packages/core/src/shared/filesystemUtilities.ts b/packages/core/src/shared/filesystemUtilities.ts index 351cd7028b5..b216427634e 100644 --- a/packages/core/src/shared/filesystemUtilities.ts +++ b/packages/core/src/shared/filesystemUtilities.ts @@ -11,6 +11,8 @@ import { getLogger } from './logger' import * as pathutils from './utilities/pathUtils' import globals from '../shared/extensionGlobals' import fs from '../shared/fs/fs' +import { ToolkitError } from './errors' +import * as nodeFs from 'fs/promises' export const tempDirPath = path.join( // https://github.com/aws/aws-toolkit-vscode/issues/240 @@ -246,3 +248,125 @@ export async function setDefaultDownloadPath(downloadPath: string) { getLogger().error('Error while setting "aws.downloadPath"', err as Error) } } + +const FileSystemStabilityExceptionId = 'FileSystemStabilityException' +const FileSystemStabilityException = ToolkitError.named(FileSystemStabilityExceptionId) + +/** + * Run this function to validate common file system calls/flows, ensuring they work + * as expected. + * + * The intent of this function is to catch potential fs issues early, + * testing common cases of fs usage. We need this since each machine can behave + * differently depending on things like OS, permissions, disk speed, ... + * + * @throws a {@link FileSystemStabilityException} which wraps the underlying failed fs operation error. + */ +export async function throwOnUnstableFileSystem(tmpRoot: string = tempDirPath) { + const tmpFolder = path.join(tmpRoot, `validateStableFS-${crypto.randomBytes(4).toString('hex')}`) + const tmpFile = path.join(tmpFolder, 'file.txt') + + try { + // test basic folder operations + await withFailCtx('mkdirInitial', () => fs.mkdir(tmpFolder)) + // Verifies that we do not throw if the dir exists and we try to make it again + await withFailCtx('mkdirButAlreadyExists', () => fs.mkdir(tmpFolder)) + // Test subfolder creation. Based on telemetry it looks like the vsc mkdir may be flaky + // when creating subfolders. Hopefully this gives us some useful information. 
+ const subfolderRoot = path.join(tmpFolder, 'a') + const subfolderPath = path.join(subfolderRoot, 'b/c/d/e') + await withFailCtx('mkdirSubfolderNode', () => nodeFs.mkdir(subfolderPath, { recursive: true })) + await withFailCtx('rmdirInitialForNode', () => fs.delete(subfolderRoot, { recursive: true })) + await withFailCtx('mkdirSubfolderVsc', () => fs.mkdir(subfolderPath)) + await withFailCtx('rmdirInitialForVsc', () => fs.delete(subfolderRoot, { recursive: true })) + + // test basic file operations + await withFailCtx('mkdirForFileOpsTest', () => fs.mkdir(tmpFolder)) + await withFailCtx('writeFile1', () => fs.writeFile(tmpFile, 'test1')) + await withFailCtx('readFile1', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'test1') { + throw new Error(`Unexpected file contents: "${text}"`) + } + }) + // overwrite the file content multiple times + await withFailCtx('writeFile2', () => fs.writeFile(tmpFile, 'test2')) + await withFailCtx('writeFile3', () => fs.writeFile(tmpFile, 'test3')) + await withFailCtx('writeFile4', () => fs.writeFile(tmpFile, 'test4')) + await withFailCtx('writeFile5', () => fs.writeFile(tmpFile, 'test5')) + await withFailCtx('readFile5', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'test5') { + throw new Error(`Unexpected file contents after multiple writes: "${text}"`) + } + }) + // write a large file, ensuring we are not near a space limit + await withFailCtx('writeFileLarge', () => fs.writeFile(tmpFile, 'a'.repeat(1000))) + // test concurrent reads on a file + await withFailCtx('writeFileConcurrencyTest', () => fs.writeFile(tmpFile, 'concurrencyTest')) + const result = await Promise.all([ + withFailCtx('readFileConcurrent1', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent2', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent3', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent4', () => fs.readFileText(tmpFile)), + withFailCtx('readFileConcurrent5', () => fs.readFileText(tmpFile)), + ]) + if (result.some((text) => text !== 'concurrencyTest')) { + throw new Error(`Unexpected concurrent file reads: ${result}`) + } + // test deleting a file + await withFailCtx('deleteFileInitial', () => fs.delete(tmpFile)) + await withFailCtx('writeFileAfterDelete', () => fs.writeFile(tmpFile, 'afterDelete')) + await withFailCtx('readNewFileAfterDelete', async () => { + const text = await fs.readFileText(tmpFile) + if (text !== 'afterDelete') { + throw new Error(`Unexpected file content after writing to deleted file: "${text}"`) + } + }) + await withFailCtx('deleteFileFully', () => fs.delete(tmpFile)) + await withFailCtx('notExistsFileAfterDelete', async () => { + const res = await fs.exists(tmpFile) + if (res) { + throw new Error(`Expected file to NOT exist: "${tmpFile}"`) + } + }) + + // test rename + await withFailCtx('writeFileForRename', () => fs.writeFile(tmpFile, 'TestingRename')) + const tmpFileRenamed = tmpFile + '.renamed' + await withFailCtx('renameFile', () => fs.rename(tmpFile, tmpFileRenamed)) + await withFailCtx('existsRenamedFile', async () => { + const res = await fs.exists(tmpFileRenamed) + if (!res) { + throw new Error(`Expected RENAMED file to exist: "${tmpFileRenamed}"`) + } + }) + await withFailCtx('writeToRenamedFile', async () => fs.writeFile(tmpFileRenamed, 'hello')) + await withFailCtx('readFromRenamedFile', async () => { + const res = await fs.readFileText(tmpFileRenamed) + if (res !== 'hello') { + throw new Error(`Expected RENAMED file to be writable: 
"${tmpFileRenamed}"`) + } + }) + await withFailCtx('renameFileReset', () => fs.rename(tmpFileRenamed, tmpFile)) + await withFailCtx('renameFileResetExists', async () => { + const res = await fs.exists(tmpFile) + if (!res) { + throw new Error(`Expected reverted renamed file to exist: "${tmpFile}"`) + } + }) + } finally { + await fs.delete(tmpFolder, { recursive: true, force: true }) + } + + async function withFailCtx(ctx: string, fn: () => Promise): Promise { + try { + return await fn() + } catch (e) { + if (!(e instanceof Error)) { + throw e + } + throw FileSystemStabilityException.chain(e, `context: "${ctx}"`, { code: FileSystemStabilityExceptionId }) + } + } +} diff --git a/packages/core/src/shared/utilities/functionUtils.ts b/packages/core/src/shared/utilities/functionUtils.ts index d21727bac1d..f61a3abde34 100644 --- a/packages/core/src/shared/utilities/functionUtils.ts +++ b/packages/core/src/shared/utilities/functionUtils.ts @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { Timeout } from './timeoutUtils' +import { sleep, Timeout } from './timeoutUtils' /** * Creates a function that always returns a 'shared' Promise. @@ -145,3 +145,34 @@ export function cancellableDebounce( cancel: cancel, } } + +/** + * Executes the given function, retrying if it throws. + * + * @param opts - if no opts given, defaults are used + */ +export async function withRetries( + fn: () => Promise, + opts?: { maxRetries?: number; delay?: number; backoff?: number } +): Promise { + const maxRetries = opts?.maxRetries ?? 3 + const delay = opts?.delay ?? 0 + const backoff = opts?.backoff ?? 1 + + let retryCount = 0 + let latestDelay = delay + while (true) { + try { + return await fn() + } catch (err) { + retryCount++ + if (retryCount >= maxRetries) { + throw err + } + if (latestDelay > 0) { + await sleep(latestDelay) + latestDelay = latestDelay * backoff + } + } + } +} diff --git a/packages/core/src/shared/utilities/osUtils.ts b/packages/core/src/shared/utilities/osUtils.ts index b0266f73e1a..2ed7b976998 100644 --- a/packages/core/src/shared/utilities/osUtils.ts +++ b/packages/core/src/shared/utilities/osUtils.ts @@ -15,12 +15,13 @@ import * as os from 'os' * Use this function to perform one-time initialization tasks that should only happen * once per OS session, regardless of how many extension instances are running. */ -export async function isNewOsSession(now = () => globals.clock.Date.now(), uptime = () => os.uptime()) { +export async function isNewOsSession(now = () => globals.clock.Date.now(), uptimeMillis = () => os.uptime() * 1000) { // Windows does not have an ephemeral /tmp/ folder that deletes on shutdown, while unix-like os's do. // So in Windows we calculate the start time and see if it changed from the previous known start time. const lastStartTime = globals.globalState.tryGet('lastOsStartTime', Number) - // uptime() returns seconds, convert to ms - const currentOsStartTime = now() - uptime() * 1000 * 60 + + const uptime = uptimeMillis() + const currentOsStartTime = now() - uptime if (lastStartTime === undefined) { await globals.globalState.update('lastOsStartTime', currentOsStartTime) @@ -28,7 +29,7 @@ export async function isNewOsSession(now = () => globals.clock.Date.now(), uptim } // If the current start time is later than the last, it means we are in a new session since they should be the same value. - // But to account for small differences in how the current time is calculate, we add in a 5 second buffer. 
+ // But to account for minor millisecond differnces, we add in a 5 second buffer. if (currentOsStartTime - 1000 * 5 > lastStartTime) { await globals.globalState.update('lastOsStartTime', currentOsStartTime) return true diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts index ec61d5487f3..5802592ea8c 100644 --- a/packages/core/src/test/shared/crashMonitoring.test.ts +++ b/packages/core/src/test/shared/crashMonitoring.test.ts @@ -14,8 +14,10 @@ class TestCrashMonitoring extends CrashMonitoring { public constructor(...deps: ConstructorParameters) { super(...deps) } - public override crash() { - super.crash() + /** Imitates an extension crash */ + public async crash() { + this.crashChecker?.testCrash() + this.heartbeat?.testCrash() } } @@ -87,7 +89,7 @@ export const crashMonitoringTest = async () => { // Ext 1 does a graceful shutdown await exts[1].ext.start() - await exts[1].ext.stop() + await exts[1].ext.shutdown() await awaitIntervals(oneInterval) // Ext 1 did a graceful shutdown so no metric emitted assertTelemetry('session_end', []) @@ -99,7 +101,7 @@ export const crashMonitoringTest = async () => { const exts = await makeTestExtensions(2) await exts[0].ext.start() - exts[0].ext.crash() + await exts[0].ext.crash() await awaitIntervals(oneInterval) // There is no other active instance to report the issue assertTelemetry('session_end', []) @@ -121,7 +123,7 @@ export const crashMonitoringTest = async () => { } for (let i = 1; i < extCount; i++) { - exts[i].ext.crash() + await exts[i].ext.crash() latestCrashedExts.push(exts[i]) } @@ -142,7 +144,7 @@ export const crashMonitoringTest = async () => { // start Ext 1 then crash it, Ext 0 finds the crash await exts[1].ext.start() - exts[1].ext.crash() + await exts[1].ext.crash() latestCrashedExts.push(exts[1]) await awaitIntervals(oneInterval * 1) @@ -150,14 +152,14 @@ export const crashMonitoringTest = async () => { // start Ext 2 and crash Ext 0, Ext 2 is promoted to Primary checker await exts[2].ext.start() - exts[0].ext.crash() + await exts[0].ext.crash() latestCrashedExts.push(exts[0]) await awaitIntervals(oneInterval * 1) assertCrashedExtensions(latestCrashedExts) // Ext 3 starts, then crashes. 
Ext 2 reports the crash since it is the Primary checker await exts[3].ext.start() - exts[3].ext.crash() + await exts[3].ext.crash() latestCrashedExts.push(exts[3]) await awaitIntervals(oneInterval * 1) assertCrashedExtensions(latestCrashedExts) diff --git a/packages/core/src/test/shared/filesystemUtilities.test.ts b/packages/core/src/test/shared/filesystemUtilities.test.ts index b99e8f4b0bf..3bb3a018b37 100644 --- a/packages/core/src/test/shared/filesystemUtilities.test.ts +++ b/packages/core/src/test/shared/filesystemUtilities.test.ts @@ -13,8 +13,10 @@ import { isInDirectory, makeTemporaryToolkitFolder, tempDirPath, + throwOnUnstableFileSystem, } from '../../shared/filesystemUtilities' import { fs } from '../../shared' +import { TestFolder } from '../testUtil' describe('filesystemUtilities', function () { const targetFilename = 'findThisFile12345.txt' @@ -191,4 +193,11 @@ describe('filesystemUtilities', function () { assert.strictEqual(actual, 3) }) }) + + describe('throwOnUnstableFileSystem', async function () { + it('does not throw on stable filesystem', async function () { + const testFolder = await TestFolder.create() + await throwOnUnstableFileSystem(testFolder.path) + }) + }) }) diff --git a/packages/core/src/test/shared/fs/fs.test.ts b/packages/core/src/test/shared/fs/fs.test.ts index fefec8f0484..d88ada56a5c 100644 --- a/packages/core/src/test/shared/fs/fs.test.ts +++ b/packages/core/src/test/shared/fs/fs.test.ts @@ -340,6 +340,7 @@ describe('FileSystem', function () { it('deletes directory with recursive:true', async function () { const dir = await testFolder.mkdir() + await testFolder.write('testfile.txt', 'testText') await fs.delete(dir, { recursive: true }) assert(!existsSync(dir)) }) @@ -348,7 +349,7 @@ describe('FileSystem', function () { const dir = await testFolder.mkdir() const f = path.join(dir, 'missingfile.txt') assert(!existsSync(f)) - await fs.delete(f, { recursive: true }) + await fs.delete(f) }) it('error if file *and* its parent dir not found', async function () { diff --git a/packages/core/src/test/shared/utilities/functionUtils.test.ts b/packages/core/src/test/shared/utilities/functionUtils.test.ts index 43da4ebb619..fb87cf3501e 100644 --- a/packages/core/src/test/shared/utilities/functionUtils.test.ts +++ b/packages/core/src/test/shared/utilities/functionUtils.test.ts @@ -4,8 +4,10 @@ */ import assert from 'assert' -import { once, onceChanged, debounce } from '../../../shared/utilities/functionUtils' +import { once, onceChanged, debounce, withRetries } from '../../../shared/utilities/functionUtils' import { installFakeClock } from '../../testUtil' +import { stub, SinonStub } from 'sinon' +import { InstalledClock } from '@sinonjs/fake-timers' describe('functionUtils', function () { it('once()', function () { @@ -107,3 +109,82 @@ describe('debounce', function () { }) }) }) + +// function to test the withRetries method. 
It passes in a stub function as the argument, and different tests throw on different iterations.
+describe('withRetries', function () {
+    let clock: InstalledClock
+    let fn: SinonStub
+
+    beforeEach(function () {
+        fn = stub()
+        clock = installFakeClock()
+    })
+
+    afterEach(function () {
+        clock.uninstall()
+    })
+
+    it('retries the function until it succeeds, using defaults', async function () {
+        fn.onCall(0).throws()
+        fn.onCall(1).throws()
+        fn.onCall(2).resolves('success')
+        assert.strictEqual(await withRetries(fn), 'success')
+    })
+
+    it('retries the function until it succeeds at the final try', async function () {
+        fn.onCall(0).throws()
+        fn.onCall(1).throws()
+        fn.onCall(2).throws()
+        fn.onCall(3).resolves('success')
+        assert.strictEqual(await withRetries(fn, { maxRetries: 4 }), 'success')
+    })
+
+    it('throws the last error if the function always fails, using defaults', async function () {
+        fn.onCall(0).throws()
+        fn.onCall(1).throws()
+        fn.onCall(2).throws()
+        fn.onCall(3).resolves('unreachable')
+        await assert.rejects(async () => {
+            await withRetries(fn)
+        })
+    })
+
+    it('throws the last error if the function always fails', async function () {
+        fn.onCall(0).throws()
+        fn.onCall(1).throws()
+        fn.onCall(2).throws()
+        fn.onCall(3).throws()
+        fn.onCall(4).resolves('unreachable')
+        await assert.rejects(async () => {
+            await withRetries(fn, { maxRetries: 4 })
+        })
+    })
+
+    it('honors retry delay + backoff multiplier', async function () {
+        fn.onCall(0).throws() // 100ms
+        fn.onCall(1).throws() // 200ms
+        fn.onCall(2).throws() // 400ms
+        fn.onCall(3).resolves('success')
+
+        const res = withRetries(fn, { maxRetries: 4, delay: 100, backoff: 2 })
+
+        // Check the call count after each iteration, ensuring the function is called
+        // after the correct delay between retries.
+        await clock.tickAsync(99)
+        assert.strictEqual(fn.callCount, 1)
+        await clock.tickAsync(1)
+        assert.strictEqual(fn.callCount, 2)
+
+        await clock.tickAsync(199)
+        assert.strictEqual(fn.callCount, 2)
+        await clock.tickAsync(1)
+        assert.strictEqual(fn.callCount, 3)
+
+        await clock.tickAsync(399)
+        assert.strictEqual(fn.callCount, 3)
+        await clock.tickAsync(1)
+        assert.strictEqual(fn.callCount, 4)
+
+        assert.strictEqual(await res, 'success')
+    })
+})
diff --git a/packages/core/src/test/shared/utilities/osUtils.test.ts b/packages/core/src/test/shared/utilities/osUtils.test.ts
index 1654aca8cc8..bd6e4489ac0 100644
--- a/packages/core/src/test/shared/utilities/osUtils.test.ts
+++ b/packages/core/src/test/shared/utilities/osUtils.test.ts
@@ -22,32 +22,32 @@ describe('isNewOsSession', () => {
     })
 
     it('unix-like: returns true when expected', async () => {
-        const uptimeStub = sandbox.stub()
+        const uptimeMillisStub = sandbox.stub()
         const now = sandbox.stub()
 
-        // We started our computer at 2 minutes since epoch (time - pc uptime)
-        // and the comptuer has been on for 1 minute. So the OS started 1 minute since epoch.
-        now.returns(60_000 + 60_000)
-        uptimeStub.returns(1)
+        // The current time is the epoch and the computer has just started (0 uptime),
+        // so the OS session started at the epoch (time - uptime).
+ now.returns(0) // the epoch time + uptimeMillisStub.returns(0) // this session has just started // On a brand new session the first caller will get true - assert.strictEqual(await isNewOsSession(now, uptimeStub), true) + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), true) // Subsequent callers will get false - assert.strictEqual(await isNewOsSession(now, uptimeStub), false) - - // Start a computer session 10 minutes from epoch - uptimeStub.returns(0) - now.returns(60_000 * 10) - assert.strictEqual(await isNewOsSession(now, uptimeStub), true) - // Anything that is within a 5 second threshold of the last session time, is considered the same session - now.returns(60_000 * 10 + 5000) - assert.strictEqual(await isNewOsSession(now, uptimeStub), false) - now.returns(60_000 * 10 + 5000 + 1) - assert.strictEqual(await isNewOsSession(now, uptimeStub), true) - - // A non-zero uptime - uptimeStub.returns(5) // The computer has been running for 5 minutes already, so the start time is relative to this. - now.returns(60_000 * 10 + 5000 + 60_000 * 10) // 5 minutes since last session - // Nothing changes since the diff between uptime and the last start has not changed - assert.strictEqual(await isNewOsSession(now, uptimeStub), true) + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + + // 10 minutes later, same session + now.returns(1000 * 60 * 10) + uptimeMillisStub.returns(1000 * 60 * 10) // This scales proportionately with the current time + // This is still the same session, so we get false + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + + // Test the lowerbound of what is considered a new session + // Pretend we started a new computer session 5 seconds after the initial session + uptimeMillisStub.returns(0) + now.returns(5000) + // Anything that is within a 5 second threshold of the last session time, is considered the SAME session + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), false) + // This is 1 millisecond after the threshold, it is considered a NEW session + now.returns(5000 + 1) + assert.strictEqual(await isNewOsSession(now, uptimeMillisStub), true) }) }) diff --git a/packages/core/src/test/testUtil.ts b/packages/core/src/test/testUtil.ts index c09fbfe6ccc..c54f4690628 100644 --- a/packages/core/src/test/testUtil.ts +++ b/packages/core/src/test/testUtil.ts @@ -75,7 +75,7 @@ export function getWorkspaceFolder(dir: string): vscode.WorkspaceFolder { * But if the day comes that we need it for web, we should be able to add some agnostic FS methods in here. */ export class TestFolder { - protected constructor(private readonly rootFolder: string) {} + protected constructor(public readonly path: string) {} /** Creates a folder that deletes itself once all tests are done running. */ static async create() { @@ -118,10 +118,6 @@ export class TestFolder { pathFrom(relativePath: string): string { return path.join(this.path, relativePath) } - - get path(): string { - return path.join(this.rootFolder) - } } /**
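For illustration only, here is a minimal sketch (not part of the diff above) of how the `withRetries` helper introduced in `functionUtils.ts` could wrap a flaky filesystem write, mirroring the retry-then-cleanup pattern the change uses in `sendHeartbeat()`. The relative import paths and the `writeHeartbeatFile` helper name are assumptions made for the example, not code from the change.

// Illustrative usage only: retry a flaky write with exponential backoff, then
// clean up on persistent failure, similar to sendHeartbeat() in the diff above.
import * as path from 'path'
import fs from '../shared/fs/fs' // assumed relative path; adjust to the caller's location
import { withRetries } from '../shared/utilities/functionUtils' // assumed relative path

export async function writeHeartbeatFile(dir: string, extId: string, payload: object): Promise<void> {
    const write = () => fs.writeFile(path.join(dir, extId), JSON.stringify(payload, undefined, 4))
    try {
        // 8 attempts spaced 100ms, 200ms, 400ms, ... apart (exponential backoff)
        await withRetries(write, { maxRetries: 8, delay: 100, backoff: 2 })
    } catch (e) {
        // If the write never succeeds, remove any stale heartbeat file so a checker
        // does not misread this instance as crashed, then surface the error.
        await fs.delete(path.join(dir, extId), { force: true })
        throw e
    }
}

With `maxRetries: 8`, `delay: 100`, and `backoff: 2`, a persistently failing call is attempted 8 times over roughly 12.7 seconds before the last error is rethrown, which matches the retry budget the diff gives `sendHeartbeat` and `deleteHeartbeatFile`.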