Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 35 additions & 10 deletions packages/core/src/shared/crashMonitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ export class CrashMonitoring {
private readonly checkInterval: number,
private readonly isDevMode: boolean,
private readonly isAutomation: boolean,
private readonly devLogger: Logger | undefined
private readonly devLogger: Logger | undefined,
private readonly timeLag: TimeLag
) {}

static #didTryCreate = false
Expand All @@ -92,7 +93,8 @@ export class CrashMonitoring {
DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes
isDevMode,
isAutomation(),
devModeLogger
devModeLogger,
new TimeLag()
))
} catch (error) {
emitFailure({ functionName: 'instance', error })
Expand All @@ -115,7 +117,13 @@ export class CrashMonitoring {
this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
this.heartbeat.onFailure(() => this.cleanup())

this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
this.crashChecker = new CrashChecker(
this.state,
this.checkInterval,
this.isDevMode,
this.devLogger,
this.timeLag
)
this.crashChecker.onFailure(() => this.cleanup())

await this.heartbeat.start()
Expand Down Expand Up @@ -236,7 +244,7 @@ class CrashChecker {
* Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
* next heartbeat to be sent.
*/
private readonly timeLag: TimeLag = new TimeLag()
private readonly timeLag: TimeLag
) {}

public async start() {
Expand Down Expand Up @@ -391,7 +399,7 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
* - is not truly reliable since filesystems are not reliable
*/
export class FileSystemState {
private readonly stateDirPath: string
public readonly stateDirPath: string

/**
* IMORTANT: Use {@link crashMonitoringStateFactory} to make an instance
Expand Down Expand Up @@ -512,9 +520,10 @@ export class FileSystemState {
protected createExtId(ext: ExtInstance): ExtInstanceId {
return `${ext.extHostPid}_${ext.sessionId}`
}
private readonly fileSuffix = 'running'
private makeStateFilePath(ext: ExtInstance | ExtInstanceId) {
const extId = typeof ext === 'string' ? ext : this.createExtId(ext)
return path.join(this.stateDirPath, extId)
return path.join(this.stateDirPath, extId + `.${this.fileSuffix}`)
}
public async clearState(): Promise<void> {
this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
Expand All @@ -525,10 +534,26 @@ export class FileSystemState {
}
public async getAllExts(): Promise<ExtInstanceHeartbeat[]> {
const res = await withFailCtx('getAllExts', async () => {
// The file names are intentionally the IDs for easy mapping
const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () =>
(await fs.readdir(this.stateDirPath)).map((k) => k[0])
)
// Read all the exts from the filesystem, deserializing as needed
const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () => {
const filesInDir = await fs.readdir(this.stateDirPath)
const relevantFiles = filesInDir.filter((file: [string, vscode.FileType]) => {
const name = file[0]
const type = file[1]
if (type !== vscode.FileType.File) {
return false
}
if (path.extname(name) !== `.${this.fileSuffix}`) {
return false
}
return true
})
const idsFromFileNames = relevantFiles.map((file: [string, vscode.FileType]) => {
const name = file[0]
return name.split('.')[0]
})
return idsFromFileNames
})

const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
// Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since
Expand Down
185 changes: 132 additions & 53 deletions packages/core/src/test/shared/crashMonitoring.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ import globals from '../../shared/extensionGlobals'
import { CrashMonitoring, ExtInstance, crashMonitoringStateFactory } from '../../shared/crashMonitoring'
import { isCI } from '../../shared/vscode/env'
import { getLogger } from '../../shared/logger/logger'
import { TimeLag } from '../../shared/utilities/timeoutUtils'
import { SinonSandbox, createSandbox } from 'sinon'
import { fs } from '../../shared'
import path from 'path'

class TestCrashMonitoring extends CrashMonitoring {
public constructor(...deps: ConstructorParameters<typeof CrashMonitoring>) {
Expand All @@ -24,6 +28,8 @@ class TestCrashMonitoring extends CrashMonitoring {
export const crashMonitoringTest = async () => {
let testFolder: TestFolder
let spawnedExtensions: TestCrashMonitoring[]
let timeLag: TimeLag
let sandbox: SinonSandbox

// Scale down the default interval we heartbeat and check for crashes to something much short for testing.
const checkInterval = 200
Expand All @@ -38,48 +44,54 @@ export const crashMonitoringTest = async () => {
* 1:1 mapping between Crash Reporting instances and the Extension instances.
*/
async function makeTestExtensions(amount: number) {
const devLogger = getLogger()

const extensions: TestExtension[] = []
for (let i = 0; i < amount; i++) {
const sessionId = `sessionId-${i}`
const pid = Number(String(i).repeat(6))
const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale: async () => false,
pid,
sessionId: sessionId,
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger,
})
const ext = new TestCrashMonitoring(state, checkInterval, true, false, devLogger)
spawnedExtensions.push(ext)
const metadata = {
extHostPid: pid,
sessionId,
lastHeartbeat: globals.clock.Date.now(),
isDebug: undefined,
}
extensions[i] = { ext, metadata }
extensions[i] = await makeTestExtension(i, new TimeLag())
}
return extensions
}

async function makeTestExtension(id: number, timeLag: TimeLag, opts?: { isStateStale: () => Promise<boolean> }) {
const isStateStale = opts?.isStateStale ?? (() => Promise.resolve(false))
const sessionId = `sessionId-${id}`
const pid = Number(String(id).repeat(6))

const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale,
pid,
sessionId: sessionId,
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger: getLogger(),
})
const ext = new TestCrashMonitoring(state, checkInterval, true, false, getLogger(), timeLag)
spawnedExtensions.push(ext)
const metadata = {
extHostPid: pid,
sessionId,
lastHeartbeat: globals.clock.Date.now(),
isDebug: undefined,
}
return { ext, metadata }
}

beforeEach(async function () {
testFolder = await TestFolder.create()
spawnedExtensions = []
timeLag = new TimeLag()
sandbox = createSandbox()
})

afterEach(async function () {
// clean up all running instances
spawnedExtensions?.forEach((e) => e.crash())
timeLag.cleanup()
sandbox.restore()
})

it('graceful shutdown no metric emitted', async function () {
// this.retries(3)

const exts = await makeTestExtensions(2)

await exts[0].ext.start()
Expand All @@ -95,9 +107,7 @@ export const crashMonitoringTest = async () => {
assertTelemetry('session_end', [])
})

it('single running instances crashes, so nothing is reported, but a new instaces appears and reports', async function () {
// this.retries(3)

it('single running instance crashes, so nothing is reported, but a new instaces appears and reports', async function () {
const exts = await makeTestExtensions(2)

await exts[0].ext.start()
Expand All @@ -112,34 +122,11 @@ export const crashMonitoringTest = async () => {
assertCrashedExtensions([exts[0]])
})

it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
// this.retries(3)
const latestCrashedExts: TestExtension[] = []

const extCount = 10
const exts = await makeTestExtensions(extCount)
for (let i = 0; i < extCount; i++) {
await exts[i].ext.start()
}

// Crash all exts except the 0th one
for (let i = 1; i < extCount; i++) {
await exts[i].ext.crash()
latestCrashedExts.push(exts[i])
}

// Give some extra time since there is a lot of file i/o
await awaitIntervals(oneInterval * 2)

assertCrashedExtensions(latestCrashedExts)
})

it('the Primary checker crashes and another checker is promoted to Primary', async function () {
// this.retries(3)
it('multiple running instances start+crash at different times, but another instance always reports', async function () {
const latestCrashedExts: TestExtension[] = []

const exts = await makeTestExtensions(4)
// Ext 0 is the Primary checker

await exts[0].ext.start()
await awaitIntervals(oneInterval)

Expand All @@ -166,6 +153,73 @@ export const crashMonitoringTest = async () => {
assertCrashedExtensions(latestCrashedExts)
})

it('clears the state when a new os session is determined', async function () {
const exts = await makeTestExtensions(1)

// Start an extension then crash it
await exts[0].ext.start()
await exts[0].ext.crash()
await awaitIntervals(oneInterval)
// There is no other active instance to report the issue
assertTelemetry('session_end', [])

// This extension clears the state due to it being stale, not reporting the previously crashed ext
const ext1 = await makeTestExtension(1, timeLag, { isStateStale: () => Promise.resolve(true) })
await ext1.ext.start()
await awaitIntervals(oneInterval * 1)
assertCrashedExtensions([])
})

it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
const latestCrashedExts: TestExtension[] = []

const extCount = 10
const exts = await makeTestExtensions(extCount)
for (let i = 0; i < extCount; i++) {
await exts[i].ext.start()
}

// Crash all exts except the 0th one
for (let i = 1; i < extCount; i++) {
await exts[i].ext.crash()
latestCrashedExts.push(exts[i])
}

// Give some extra time since there is a lot of file i/o
await awaitIntervals(oneInterval * 2)

assertCrashedExtensions(latestCrashedExts)
})

it('does not check for crashes when there is a time lag', async function () {
// This test handles the case for a users computer doing a sleep+wake and
// then a crash was incorrectly reported since a new heartbeat could not be sent in time

const timeLagStub = sandbox.stub(timeLag)
timeLagStub.start.resolves()
timeLagStub.didLag.resolves(false)

// Load up a crash
const ext0 = await makeTestExtension(0, timeLagStub as unknown as TimeLag)
await ext0.ext.start()
await ext0.ext.crash()

const ext1 = await makeTestExtension(1, timeLagStub as unknown as TimeLag)
await ext1.ext.start()

// Indicate that we have a time lag, and until it returns false
// we will skip crash checking
timeLagStub.didLag.resolves(true)
await awaitIntervals(oneInterval)
assertCrashedExtensions([])
await awaitIntervals(oneInterval)
assertCrashedExtensions([])
// Now that the time lag is true, we will check for a crash
timeLagStub.didLag.resolves(false)
await awaitIntervals(oneInterval)
assertCrashedExtensions([ext0])
})

/**
* Something like the following code can switch contexts early and the test will
* finish before it has completed. Certain async functions that may take longer to run
Expand Down Expand Up @@ -219,6 +273,31 @@ export const crashMonitoringTest = async () => {
function deduplicate<T>(array: T[], predicate: (a: T, b: T) => boolean): T[] {
return array.filter((item, index, self) => index === self.findIndex((t) => predicate(item, t)))
}

describe('FileSystemState', async function () {
it('ignores irrelevant files in state', async function () {
const state = await crashMonitoringStateFactory({
workDirPath: testFolder.path,
isStateStale: () => Promise.resolve(false),
pid: 1111,
sessionId: 'sessionId_1111',
now: () => globals.clock.Date.now(),
memento: globals.globalState,
isDevMode: true,
devLogger: getLogger(),
})
const stateDirPath = state.stateDirPath

assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 0)
await fs.writeFile(path.join(stateDirPath, 'ignoreMe.json'), '')
await fs.mkdir(path.join(stateDirPath, 'ignoreMe'))
await state.sendHeartbeat() // creates a relevant file in the state
assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 3)

const result = await state.getAllExts()
assert.deepStrictEqual(result.length, 1)
})
})
}
// This test is slow, so we only want to run it locally and not in CI. It will be run in the integ CI tests though.
;(isCI() ? describe.skip : describe)('CrashReporting', crashMonitoringTest)
Expand Down
Loading