diff --git a/packages/core/src/shared/crashMonitoring.ts b/packages/core/src/shared/crashMonitoring.ts index b99bbef9436..3336a1e72fc 100644 --- a/packages/core/src/shared/crashMonitoring.ts +++ b/packages/core/src/shared/crashMonitoring.ts @@ -69,7 +69,8 @@ export class CrashMonitoring { private readonly checkInterval: number, private readonly isDevMode: boolean, private readonly isAutomation: boolean, - private readonly devLogger: Logger | undefined + private readonly devLogger: Logger | undefined, + private readonly timeLag: TimeLag ) {} static #didTryCreate = false @@ -92,7 +93,8 @@ export class CrashMonitoring { DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes isDevMode, isAutomation(), - devModeLogger + devModeLogger, + new TimeLag() )) } catch (error) { emitFailure({ functionName: 'instance', error }) @@ -115,7 +117,13 @@ export class CrashMonitoring { this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode) this.heartbeat.onFailure(() => this.cleanup()) - this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger) + this.crashChecker = new CrashChecker( + this.state, + this.checkInterval, + this.isDevMode, + this.devLogger, + this.timeLag + ) this.crashChecker.onFailure(() => this.cleanup()) await this.heartbeat.start() @@ -236,7 +244,7 @@ class CrashChecker { * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the * next heartbeat to be sent. */ - private readonly timeLag: TimeLag = new TimeLag() + private readonly timeLag: TimeLag ) {} public async start() { @@ -391,7 +399,7 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies( * - is not truly reliable since filesystems are not reliable */ export class FileSystemState { - private readonly stateDirPath: string + public readonly stateDirPath: string /** * IMORTANT: Use {@link crashMonitoringStateFactory} to make an instance @@ -512,9 +520,10 @@ export class FileSystemState { protected createExtId(ext: ExtInstance): ExtInstanceId { return `${ext.extHostPid}_${ext.sessionId}` } + private readonly fileSuffix = 'running' private makeStateFilePath(ext: ExtInstance | ExtInstanceId) { const extId = typeof ext === 'string' ? ext : this.createExtId(ext) - return path.join(this.stateDirPath, extId) + return path.join(this.stateDirPath, extId + `.${this.fileSuffix}`) } public async clearState(): Promise { this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started') @@ -525,10 +534,26 @@ export class FileSystemState { } public async getAllExts(): Promise { const res = await withFailCtx('getAllExts', async () => { - // The file names are intentionally the IDs for easy mapping - const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () => - (await fs.readdir(this.stateDirPath)).map((k) => k[0]) - ) + // Read all the exts from the filesystem, deserializing as needed + const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () => { + const filesInDir = await fs.readdir(this.stateDirPath) + const relevantFiles = filesInDir.filter((file: [string, vscode.FileType]) => { + const name = file[0] + const type = file[1] + if (type !== vscode.FileType.File) { + return false + } + if (path.extname(name) !== `.${this.fileSuffix}`) { + return false + } + return true + }) + const idsFromFileNames = relevantFiles.map((file: [string, vscode.FileType]) => { + const name = file[0] + return name.split('.')[0] + }) + return idsFromFileNames + }) const allExts = allExtIds.map>(async (extId: string) => { // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since diff --git a/packages/core/src/test/shared/crashMonitoring.test.ts b/packages/core/src/test/shared/crashMonitoring.test.ts index 88c448a6cf4..e231a37a400 100644 --- a/packages/core/src/test/shared/crashMonitoring.test.ts +++ b/packages/core/src/test/shared/crashMonitoring.test.ts @@ -9,6 +9,10 @@ import globals from '../../shared/extensionGlobals' import { CrashMonitoring, ExtInstance, crashMonitoringStateFactory } from '../../shared/crashMonitoring' import { isCI } from '../../shared/vscode/env' import { getLogger } from '../../shared/logger/logger' +import { TimeLag } from '../../shared/utilities/timeoutUtils' +import { SinonSandbox, createSandbox } from 'sinon' +import { fs } from '../../shared' +import path from 'path' class TestCrashMonitoring extends CrashMonitoring { public constructor(...deps: ConstructorParameters) { @@ -24,6 +28,8 @@ class TestCrashMonitoring extends CrashMonitoring { export const crashMonitoringTest = async () => { let testFolder: TestFolder let spawnedExtensions: TestCrashMonitoring[] + let timeLag: TimeLag + let sandbox: SinonSandbox // Scale down the default interval we heartbeat and check for crashes to something much short for testing. const checkInterval = 200 @@ -38,48 +44,54 @@ export const crashMonitoringTest = async () => { * 1:1 mapping between Crash Reporting instances and the Extension instances. */ async function makeTestExtensions(amount: number) { - const devLogger = getLogger() - const extensions: TestExtension[] = [] for (let i = 0; i < amount; i++) { - const sessionId = `sessionId-${i}` - const pid = Number(String(i).repeat(6)) - const state = await crashMonitoringStateFactory({ - workDirPath: testFolder.path, - isStateStale: async () => false, - pid, - sessionId: sessionId, - now: () => globals.clock.Date.now(), - memento: globals.globalState, - isDevMode: true, - devLogger, - }) - const ext = new TestCrashMonitoring(state, checkInterval, true, false, devLogger) - spawnedExtensions.push(ext) - const metadata = { - extHostPid: pid, - sessionId, - lastHeartbeat: globals.clock.Date.now(), - isDebug: undefined, - } - extensions[i] = { ext, metadata } + extensions[i] = await makeTestExtension(i, new TimeLag()) } return extensions } + async function makeTestExtension(id: number, timeLag: TimeLag, opts?: { isStateStale: () => Promise }) { + const isStateStale = opts?.isStateStale ?? (() => Promise.resolve(false)) + const sessionId = `sessionId-${id}` + const pid = Number(String(id).repeat(6)) + + const state = await crashMonitoringStateFactory({ + workDirPath: testFolder.path, + isStateStale, + pid, + sessionId: sessionId, + now: () => globals.clock.Date.now(), + memento: globals.globalState, + isDevMode: true, + devLogger: getLogger(), + }) + const ext = new TestCrashMonitoring(state, checkInterval, true, false, getLogger(), timeLag) + spawnedExtensions.push(ext) + const metadata = { + extHostPid: pid, + sessionId, + lastHeartbeat: globals.clock.Date.now(), + isDebug: undefined, + } + return { ext, metadata } + } + beforeEach(async function () { testFolder = await TestFolder.create() spawnedExtensions = [] + timeLag = new TimeLag() + sandbox = createSandbox() }) afterEach(async function () { // clean up all running instances spawnedExtensions?.forEach((e) => e.crash()) + timeLag.cleanup() + sandbox.restore() }) it('graceful shutdown no metric emitted', async function () { - // this.retries(3) - const exts = await makeTestExtensions(2) await exts[0].ext.start() @@ -95,9 +107,7 @@ export const crashMonitoringTest = async () => { assertTelemetry('session_end', []) }) - it('single running instances crashes, so nothing is reported, but a new instaces appears and reports', async function () { - // this.retries(3) - + it('single running instance crashes, so nothing is reported, but a new instaces appears and reports', async function () { const exts = await makeTestExtensions(2) await exts[0].ext.start() @@ -112,34 +122,11 @@ export const crashMonitoringTest = async () => { assertCrashedExtensions([exts[0]]) }) - it('start the first extension, then start many subsequent ones and crash them all at once', async function () { - // this.retries(3) - const latestCrashedExts: TestExtension[] = [] - - const extCount = 10 - const exts = await makeTestExtensions(extCount) - for (let i = 0; i < extCount; i++) { - await exts[i].ext.start() - } - - // Crash all exts except the 0th one - for (let i = 1; i < extCount; i++) { - await exts[i].ext.crash() - latestCrashedExts.push(exts[i]) - } - - // Give some extra time since there is a lot of file i/o - await awaitIntervals(oneInterval * 2) - - assertCrashedExtensions(latestCrashedExts) - }) - - it('the Primary checker crashes and another checker is promoted to Primary', async function () { - // this.retries(3) + it('multiple running instances start+crash at different times, but another instance always reports', async function () { const latestCrashedExts: TestExtension[] = [] const exts = await makeTestExtensions(4) - // Ext 0 is the Primary checker + await exts[0].ext.start() await awaitIntervals(oneInterval) @@ -166,6 +153,73 @@ export const crashMonitoringTest = async () => { assertCrashedExtensions(latestCrashedExts) }) + it('clears the state when a new os session is determined', async function () { + const exts = await makeTestExtensions(1) + + // Start an extension then crash it + await exts[0].ext.start() + await exts[0].ext.crash() + await awaitIntervals(oneInterval) + // There is no other active instance to report the issue + assertTelemetry('session_end', []) + + // This extension clears the state due to it being stale, not reporting the previously crashed ext + const ext1 = await makeTestExtension(1, timeLag, { isStateStale: () => Promise.resolve(true) }) + await ext1.ext.start() + await awaitIntervals(oneInterval * 1) + assertCrashedExtensions([]) + }) + + it('start the first extension, then start many subsequent ones and crash them all at once', async function () { + const latestCrashedExts: TestExtension[] = [] + + const extCount = 10 + const exts = await makeTestExtensions(extCount) + for (let i = 0; i < extCount; i++) { + await exts[i].ext.start() + } + + // Crash all exts except the 0th one + for (let i = 1; i < extCount; i++) { + await exts[i].ext.crash() + latestCrashedExts.push(exts[i]) + } + + // Give some extra time since there is a lot of file i/o + await awaitIntervals(oneInterval * 2) + + assertCrashedExtensions(latestCrashedExts) + }) + + it('does not check for crashes when there is a time lag', async function () { + // This test handles the case for a users computer doing a sleep+wake and + // then a crash was incorrectly reported since a new heartbeat could not be sent in time + + const timeLagStub = sandbox.stub(timeLag) + timeLagStub.start.resolves() + timeLagStub.didLag.resolves(false) + + // Load up a crash + const ext0 = await makeTestExtension(0, timeLagStub as unknown as TimeLag) + await ext0.ext.start() + await ext0.ext.crash() + + const ext1 = await makeTestExtension(1, timeLagStub as unknown as TimeLag) + await ext1.ext.start() + + // Indicate that we have a time lag, and until it returns false + // we will skip crash checking + timeLagStub.didLag.resolves(true) + await awaitIntervals(oneInterval) + assertCrashedExtensions([]) + await awaitIntervals(oneInterval) + assertCrashedExtensions([]) + // Now that the time lag is true, we will check for a crash + timeLagStub.didLag.resolves(false) + await awaitIntervals(oneInterval) + assertCrashedExtensions([ext0]) + }) + /** * Something like the following code can switch contexts early and the test will * finish before it has completed. Certain async functions that may take longer to run @@ -219,6 +273,31 @@ export const crashMonitoringTest = async () => { function deduplicate(array: T[], predicate: (a: T, b: T) => boolean): T[] { return array.filter((item, index, self) => index === self.findIndex((t) => predicate(item, t))) } + + describe('FileSystemState', async function () { + it('ignores irrelevant files in state', async function () { + const state = await crashMonitoringStateFactory({ + workDirPath: testFolder.path, + isStateStale: () => Promise.resolve(false), + pid: 1111, + sessionId: 'sessionId_1111', + now: () => globals.clock.Date.now(), + memento: globals.globalState, + isDevMode: true, + devLogger: getLogger(), + }) + const stateDirPath = state.stateDirPath + + assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 0) + await fs.writeFile(path.join(stateDirPath, 'ignoreMe.json'), '') + await fs.mkdir(path.join(stateDirPath, 'ignoreMe')) + await state.sendHeartbeat() // creates a relevant file in the state + assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 3) + + const result = await state.getAllExts() + assert.deepStrictEqual(result.length, 1) + }) + }) } // This test is slow, so we only want to run it locally and not in CI. It will be run in the integ CI tests though. ;(isCI() ? describe.skip : describe)('CrashReporting', crashMonitoringTest)