
Commit 192d1ea

test(crash): Crash minor fixes (#5804)
See each commit message for the relevant fix.

License: I confirm that my contribution is made under the terms of the Apache 2.0 license.

Signed-off-by: nkomonen-amazon <[email protected]>
1 parent b958880 commit 192d1ea

File tree: 2 files changed, +167 −63 lines

packages/core/src/shared/crashMonitoring.ts

Lines changed: 35 additions & 10 deletions
@@ -69,7 +69,8 @@ export class CrashMonitoring {
         private readonly checkInterval: number,
         private readonly isDevMode: boolean,
         private readonly isAutomation: boolean,
-        private readonly devLogger: Logger | undefined
+        private readonly devLogger: Logger | undefined,
+        private readonly timeLag: TimeLag
     ) {}

     static #didTryCreate = false
@@ -92,7 +93,8 @@ export class CrashMonitoring {
                 DevSettings.instance.get('crashCheckInterval', 1000 * 60 * 10), // check every 10 minutes
                 isDevMode,
                 isAutomation(),
-                devModeLogger
+                devModeLogger,
+                new TimeLag()
             ))
         } catch (error) {
             emitFailure({ functionName: 'instance', error })
@@ -115,7 +117,13 @@ export class CrashMonitoring {
         this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
         this.heartbeat.onFailure(() => this.cleanup())

-        this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
+        this.crashChecker = new CrashChecker(
+            this.state,
+            this.checkInterval,
+            this.isDevMode,
+            this.devLogger,
+            this.timeLag
+        )
         this.crashChecker.onFailure(() => this.cleanup())

         await this.heartbeat.start()
@@ -236,7 +244,7 @@ class CrashChecker {
      * Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
      * next heartbeat to be sent.
      */
-        private readonly timeLag: TimeLag = new TimeLag()
+        private readonly timeLag: TimeLag
     ) {}

     public async start() {
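Note: `TimeLag` is now injected through the constructor instead of being constructed inline, so the tests below can substitute a stub. As rough context for what the checker is guarding against, here is a hypothetical sketch of an interval-based lag detector; the real `TimeLag` lives in `shared/utilities/timeoutUtils` and its implementation may differ.

```typescript
// Hypothetical sketch of an interval-based lag detector (NOT the real TimeLag;
// the actual class lives in shared/utilities/timeoutUtils and may differ).
// Idea: if far more wall-clock time passed between ticks than the interval,
// the machine probably slept, so the next crash check should be skipped.
export class LagDetectorSketch {
    private lastTick = Date.now()
    private lagged = false
    private timer?: ReturnType<typeof setInterval>

    constructor(private readonly intervalMillis: number) {}

    // async to mirror the stubbed API in the tests (timeLagStub.start.resolves())
    public async start(): Promise<void> {
        this.lastTick = Date.now()
        this.timer = setInterval(() => {
            const now = Date.now()
            // A gap much larger than the interval suggests a sleep/wake lag.
            this.lagged = now - this.lastTick > this.intervalMillis * 2
            this.lastTick = now
        }, this.intervalMillis)
    }

    public async didLag(): Promise<boolean> {
        return this.lagged
    }

    public cleanup(): void {
        if (this.timer !== undefined) {
            clearInterval(this.timer)
        }
    }
}
```

A crash checker would call `didLag()` each cycle and skip that cycle while it returns true, giving the next heartbeat time to land, which matches the comment carried in this hunk.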
@@ -391,7 +399,7 @@ export async function crashMonitoringStateFactory(deps = getDefaultDependencies(
  * - is not truly reliable since filesystems are not reliable
  */
 export class FileSystemState {
-    private readonly stateDirPath: string
+    public readonly stateDirPath: string

     /**
      * IMORTANT: Use {@link crashMonitoringStateFactory} to make an instance
@@ -512,9 +520,10 @@ export class FileSystemState {
     protected createExtId(ext: ExtInstance): ExtInstanceId {
         return `${ext.extHostPid}_${ext.sessionId}`
     }
+    private readonly fileSuffix = 'running'
     private makeStateFilePath(ext: ExtInstance | ExtInstanceId) {
         const extId = typeof ext === 'string' ? ext : this.createExtId(ext)
-        return path.join(this.stateDirPath, extId)
+        return path.join(this.stateDirPath, extId + `.${this.fileSuffix}`)
     }
     public async clearState(): Promise<void> {
         this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
@@ -525,10 +534,26 @@ export class FileSystemState {
     }
     public async getAllExts(): Promise<ExtInstanceHeartbeat[]> {
         const res = await withFailCtx('getAllExts', async () => {
-            // The file names are intentionally the IDs for easy mapping
-            const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () =>
-                (await fs.readdir(this.stateDirPath)).map((k) => k[0])
-            )
+            // Read all the exts from the filesystem, deserializing as needed
+            const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () => {
+                const filesInDir = await fs.readdir(this.stateDirPath)
+                const relevantFiles = filesInDir.filter((file: [string, vscode.FileType]) => {
+                    const name = file[0]
+                    const type = file[1]
+                    if (type !== vscode.FileType.File) {
+                        return false
+                    }
+                    if (path.extname(name) !== `.${this.fileSuffix}`) {
+                        return false
+                    }
+                    return true
+                })
+                const idsFromFileNames = relevantFiles.map((file: [string, vscode.FileType]) => {
+                    const name = file[0]
+                    return name.split('.')[0]
+                })
+                return idsFromFileNames
+            })

             const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
                 // Due to a race condition, a separate extension instance may have removed this file by this point. It is okay since
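With the `.running` suffix, the state directory no longer has to contain only heartbeat files: `getAllExts` keeps regular files ending in `.running` and recovers the extension instance ID by stripping the suffix. A minimal sketch of that name round trip, with illustrative helper names that are not part of the extension's API:

```typescript
// Sketch of the state-file naming round trip introduced above. The helper
// names (toStateFileName/toExtId) are illustrative only; the extension does
// this inside makeStateFilePath and getAllExts.
import * as path from 'path'

const fileSuffix = 'running'

function toStateFileName(extId: string): string {
    // e.g. '123456_sessionId-1' -> '123456_sessionId-1.running'
    return `${extId}.${fileSuffix}`
}

function toExtId(fileName: string): string | undefined {
    // Anything that is not an '<id>.running' regular file is ignored,
    // e.g. 'ignoreMe.json' or a subdirectory entry.
    if (path.extname(fileName) !== `.${fileSuffix}`) {
        return undefined
    }
    return fileName.split('.')[0]
}

console.log(toExtId(toStateFileName('123456_sessionId-1'))) // '123456_sessionId-1'
console.log(toExtId('ignoreMe.json')) // undefined
```

Unrelated files and directories fall through the `extname` check, which is what the new FileSystemState test below exercises.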

packages/core/src/test/shared/crashMonitoring.test.ts

Lines changed: 132 additions & 53 deletions
@@ -9,6 +9,10 @@ import globals from '../../shared/extensionGlobals'
 import { CrashMonitoring, ExtInstance, crashMonitoringStateFactory } from '../../shared/crashMonitoring'
 import { isCI } from '../../shared/vscode/env'
 import { getLogger } from '../../shared/logger/logger'
+import { TimeLag } from '../../shared/utilities/timeoutUtils'
+import { SinonSandbox, createSandbox } from 'sinon'
+import { fs } from '../../shared'
+import path from 'path'

 class TestCrashMonitoring extends CrashMonitoring {
     public constructor(...deps: ConstructorParameters<typeof CrashMonitoring>) {
@@ -24,6 +28,8 @@ class TestCrashMonitoring extends CrashMonitoring {
 export const crashMonitoringTest = async () => {
     let testFolder: TestFolder
     let spawnedExtensions: TestCrashMonitoring[]
+    let timeLag: TimeLag
+    let sandbox: SinonSandbox

     // Scale down the default interval we heartbeat and check for crashes to something much short for testing.
     const checkInterval = 200
@@ -38,48 +44,54 @@ export const crashMonitoringTest = async () => {
      * 1:1 mapping between Crash Reporting instances and the Extension instances.
      */
     async function makeTestExtensions(amount: number) {
-        const devLogger = getLogger()
-
         const extensions: TestExtension[] = []
         for (let i = 0; i < amount; i++) {
-            const sessionId = `sessionId-${i}`
-            const pid = Number(String(i).repeat(6))
-            const state = await crashMonitoringStateFactory({
-                workDirPath: testFolder.path,
-                isStateStale: async () => false,
-                pid,
-                sessionId: sessionId,
-                now: () => globals.clock.Date.now(),
-                memento: globals.globalState,
-                isDevMode: true,
-                devLogger,
-            })
-            const ext = new TestCrashMonitoring(state, checkInterval, true, false, devLogger)
-            spawnedExtensions.push(ext)
-            const metadata = {
-                extHostPid: pid,
-                sessionId,
-                lastHeartbeat: globals.clock.Date.now(),
-                isDebug: undefined,
-            }
-            extensions[i] = { ext, metadata }
+            extensions[i] = await makeTestExtension(i, new TimeLag())
         }
         return extensions
     }

+    async function makeTestExtension(id: number, timeLag: TimeLag, opts?: { isStateStale: () => Promise<boolean> }) {
+        const isStateStale = opts?.isStateStale ?? (() => Promise.resolve(false))
+        const sessionId = `sessionId-${id}`
+        const pid = Number(String(id).repeat(6))
+
+        const state = await crashMonitoringStateFactory({
+            workDirPath: testFolder.path,
+            isStateStale,
+            pid,
+            sessionId: sessionId,
+            now: () => globals.clock.Date.now(),
+            memento: globals.globalState,
+            isDevMode: true,
+            devLogger: getLogger(),
+        })
+        const ext = new TestCrashMonitoring(state, checkInterval, true, false, getLogger(), timeLag)
+        spawnedExtensions.push(ext)
+        const metadata = {
+            extHostPid: pid,
+            sessionId,
+            lastHeartbeat: globals.clock.Date.now(),
+            isDebug: undefined,
+        }
+        return { ext, metadata }
+    }
+
     beforeEach(async function () {
         testFolder = await TestFolder.create()
         spawnedExtensions = []
+        timeLag = new TimeLag()
+        sandbox = createSandbox()
     })

     afterEach(async function () {
         // clean up all running instances
         spawnedExtensions?.forEach((e) => e.crash())
+        timeLag.cleanup()
+        sandbox.restore()
     })

     it('graceful shutdown no metric emitted', async function () {
-        // this.retries(3)
-
         const exts = await makeTestExtensions(2)

         await exts[0].ext.start()
95107
assertTelemetry('session_end', [])
96108
})
97109

98-
it('single running instances crashes, so nothing is reported, but a new instaces appears and reports', async function () {
99-
// this.retries(3)
100-
110+
it('single running instance crashes, so nothing is reported, but a new instaces appears and reports', async function () {
101111
const exts = await makeTestExtensions(2)
102112

103113
await exts[0].ext.start()
@@ -112,34 +122,11 @@ export const crashMonitoringTest = async () => {
112122
assertCrashedExtensions([exts[0]])
113123
})
114124

115-
it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
116-
// this.retries(3)
117-
const latestCrashedExts: TestExtension[] = []
118-
119-
const extCount = 10
120-
const exts = await makeTestExtensions(extCount)
121-
for (let i = 0; i < extCount; i++) {
122-
await exts[i].ext.start()
123-
}
124-
125-
// Crash all exts except the 0th one
126-
for (let i = 1; i < extCount; i++) {
127-
await exts[i].ext.crash()
128-
latestCrashedExts.push(exts[i])
129-
}
130-
131-
// Give some extra time since there is a lot of file i/o
132-
await awaitIntervals(oneInterval * 2)
133-
134-
assertCrashedExtensions(latestCrashedExts)
135-
})
136-
137-
it('the Primary checker crashes and another checker is promoted to Primary', async function () {
138-
// this.retries(3)
125+
it('multiple running instances start+crash at different times, but another instance always reports', async function () {
139126
const latestCrashedExts: TestExtension[] = []
140127

141128
const exts = await makeTestExtensions(4)
142-
// Ext 0 is the Primary checker
129+
143130
await exts[0].ext.start()
144131
await awaitIntervals(oneInterval)
145132

@@ -166,6 +153,73 @@ export const crashMonitoringTest = async () => {
         assertCrashedExtensions(latestCrashedExts)
     })

+    it('clears the state when a new os session is determined', async function () {
+        const exts = await makeTestExtensions(1)
+
+        // Start an extension then crash it
+        await exts[0].ext.start()
+        await exts[0].ext.crash()
+        await awaitIntervals(oneInterval)
+        // There is no other active instance to report the issue
+        assertTelemetry('session_end', [])
+
+        // This extension clears the state due to it being stale, not reporting the previously crashed ext
+        const ext1 = await makeTestExtension(1, timeLag, { isStateStale: () => Promise.resolve(true) })
+        await ext1.ext.start()
+        await awaitIntervals(oneInterval * 1)
+        assertCrashedExtensions([])
+    })
+
+    it('start the first extension, then start many subsequent ones and crash them all at once', async function () {
+        const latestCrashedExts: TestExtension[] = []
+
+        const extCount = 10
+        const exts = await makeTestExtensions(extCount)
+        for (let i = 0; i < extCount; i++) {
+            await exts[i].ext.start()
+        }
+
+        // Crash all exts except the 0th one
+        for (let i = 1; i < extCount; i++) {
+            await exts[i].ext.crash()
+            latestCrashedExts.push(exts[i])
+        }
+
+        // Give some extra time since there is a lot of file i/o
+        await awaitIntervals(oneInterval * 2)
+
+        assertCrashedExtensions(latestCrashedExts)
+    })
+
+    it('does not check for crashes when there is a time lag', async function () {
+        // This test handles the case for a users computer doing a sleep+wake and
+        // then a crash was incorrectly reported since a new heartbeat could not be sent in time
+
+        const timeLagStub = sandbox.stub(timeLag)
+        timeLagStub.start.resolves()
+        timeLagStub.didLag.resolves(false)
+
+        // Load up a crash
+        const ext0 = await makeTestExtension(0, timeLagStub as unknown as TimeLag)
+        await ext0.ext.start()
+        await ext0.ext.crash()
+
+        const ext1 = await makeTestExtension(1, timeLagStub as unknown as TimeLag)
+        await ext1.ext.start()
+
+        // Indicate that we have a time lag, and until it returns false
+        // we will skip crash checking
+        timeLagStub.didLag.resolves(true)
+        await awaitIntervals(oneInterval)
+        assertCrashedExtensions([])
+        await awaitIntervals(oneInterval)
+        assertCrashedExtensions([])
+        // Now that the time lag is true, we will check for a crash
+        timeLagStub.didLag.resolves(false)
+        await awaitIntervals(oneInterval)
+        assertCrashedExtensions([ext0])
+    })
+
     /**
      * Something like the following code can switch contexts early and the test will
      * finish before it has completed. Certain async functions that may take longer to run
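The time-lag test above relies on sinon stubbing the injected TimeLag instance, so the test can decide per interval whether a lag is reported. A self-contained sketch of that pattern on a made-up object (the real test passes the stubbed instance through `makeTestExtension`):

```typescript
// Standalone sketch of the sinon pattern used above: stub an object's methods,
// then flip what a stubbed method resolves to partway through the test.
// The `clock` object here is hypothetical, purely for illustration.
import { createSandbox } from 'sinon'

async function demo() {
    const sandbox = createSandbox()

    const clock = {
        async didLag(): Promise<boolean> {
            return false
        },
    }

    // Replaces didLag with a stub; the original implementation is never called.
    const stubbed = sandbox.stub(clock)

    stubbed.didLag.resolves(true) // simulate a sleep/wake lag: crash checks get skipped
    console.log(await clock.didLag()) // true

    stubbed.didLag.resolves(false) // lag cleared: checks run again
    console.log(await clock.didLag()) // false

    sandbox.restore() // always restore, as in the suite's afterEach
}

void demo()
```

Because `didLag` can be re-resolved at runtime, the test holds the checker in its skip state for two intervals and then releases it, which is the sequence asserted above.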
@@ -219,6 +273,31 @@ export const crashMonitoringTest = async () => {
     function deduplicate<T>(array: T[], predicate: (a: T, b: T) => boolean): T[] {
         return array.filter((item, index, self) => index === self.findIndex((t) => predicate(item, t)))
     }
+
+    describe('FileSystemState', async function () {
+        it('ignores irrelevant files in state', async function () {
+            const state = await crashMonitoringStateFactory({
+                workDirPath: testFolder.path,
+                isStateStale: () => Promise.resolve(false),
+                pid: 1111,
+                sessionId: 'sessionId_1111',
+                now: () => globals.clock.Date.now(),
+                memento: globals.globalState,
+                isDevMode: true,
+                devLogger: getLogger(),
+            })
+            const stateDirPath = state.stateDirPath
+
+            assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 0)
+            await fs.writeFile(path.join(stateDirPath, 'ignoreMe.json'), '')
+            await fs.mkdir(path.join(stateDirPath, 'ignoreMe'))
+            await state.sendHeartbeat() // creates a relevant file in the state
+            assert.deepStrictEqual((await fs.readdir(stateDirPath)).length, 3)
+
+            const result = await state.getAllExts()
+            assert.deepStrictEqual(result.length, 1)
+        })
+    })
 }
 // This test is slow, so we only want to run it locally and not in CI. It will be run in the integ CI tests though.
 ;(isCI() ? describe.skip : describe)('CrashReporting', crashMonitoringTest)

0 commit comments