6 changes: 1 addition & 5 deletions packages/core/src/shared/constants.ts
@@ -164,10 +164,6 @@ export const amazonQVscodeMarketplace =
*
* Moved here to resolve circular dependency issues.
*/
export const crashMonitoringDirNames = {
root: 'crashMonitoring',
running: 'running',
shutdown: 'shutdown',
} as const
export const crashMonitoringDirName = 'crashMonitoring'

export const amazonQTabSuffix = '(Generated by Amazon Q)'
199 changes: 102 additions & 97 deletions packages/core/src/shared/crashMonitoring.ts
@@ -15,9 +15,10 @@ import { isNewOsSession } from './utilities/osUtils'
import nodeFs from 'fs/promises'
import fs from './fs/fs'
import { getLogger } from './logger/logger'
import { crashMonitoringDirNames } from './constants'
import { crashMonitoringDirName } from './constants'
import { throwOnUnstableFileSystem } from './filesystemUtilities'
import { withRetries } from './utilities/functionUtils'
import { TimeLag } from './utilities/timeoutUtils'

const className = 'CrashMonitoring'

@@ -112,15 +113,17 @@ export class CrashMonitoring

try {
this.heartbeat = new Heartbeat(this.state, this.checkInterval, this.isDevMode)
this.heartbeat.onFailure(() => this.cleanup())

this.crashChecker = new CrashChecker(this.state, this.checkInterval, this.isDevMode, this.devLogger)
this.crashChecker.onFailure(() => this.cleanup())

await this.heartbeat.start()
await this.crashChecker.start()
} catch (error) {
emitFailure({ functionName: 'start', error })
try {
this.crashChecker?.cleanup()
await this.heartbeat?.cleanup()
await this.cleanup()
} catch {}

// Surface errors during development, otherwise they can be missed.
@@ -146,6 +149,11 @@ export class CrashMonitoring {
}
}
}

public async cleanup() {
this.crashChecker?.cleanup()
await this.heartbeat?.cleanup()
}
}

/**
@@ -154,15 +162,19 @@
*/
class Heartbeat {
Contributor:
Heartbeat is a concept that continues to come up, e.g. we have another heartbeat for CodeCatalyst:

private tryStartActivityHeartbeat = shared(async (reauth: boolean) => {

Even if the codecatalyst heartbeat is too different to be useful here, is this new Heartbeat not amenable to generalization? It is a general concept that is needed by different features that want to emit data on a timed basis.

Do we have a backlog item to pull this out / generalize it? Are features easily able to attach data to a toolkit_heartbeat metric, or did we name the metric crashreporting_x (not intuitive)?

@nkomonen-amazon (Contributor Author), Oct 15, 2024:

There isn't much reuse between the CodeCatalyst heartbeat and this heartbeat at the moment. This one heartbeats on an interval, and the other heartbeats when the user performs actions in the IDE.

We could make an interface for the common APIs, but this doesn't deduplicate any existing code.

I've made a backlog item for unifying the heartbeats: https://issues.amazon.com/issues/IDE-15108


> Are features easily able to attach data to a toolkit_heartbeat metric, or did we name the metric crashreporting_x (not intuitive)?

We don't have a metric sent for heartbeats since it is a bit noisy.
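
A minimal sketch of what such a shared abstraction might look like (hypothetical, not from the codebase; names like HeartbeatSource are made up). Both heartbeats reduce to "record a beat"; only the trigger differs (a timer here, user activity for CodeCatalyst):

// Hypothetical sketch only, not part of this PR.
interface HeartbeatSource {
    /** Record that the owner is still alive. */
    beat(): Promise<void>
    /** Stop emitting and release resources. */
    dispose(): void
}

/** Interval-driven heartbeat, i.e. what crash monitoring needs. */
class IntervalHeartbeat implements HeartbeatSource {
    private timer: ReturnType<typeof setInterval> | undefined

    constructor(
        private readonly send: () => Promise<void>,
        private readonly intervalMs: number
    ) {}

    public async start(): Promise<void> {
        await this.beat() // send an initial beat immediately
        this.timer = setInterval(() => void this.beat(), this.intervalMs)
    }

    public beat(): Promise<void> {
        return this.send()
    }

    public dispose(): void {
        if (this.timer !== undefined) {
            clearInterval(this.timer)
        }
    }
}

An activity-based implementation could satisfy the same interface by calling beat() from IDE event listeners instead of a timer.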

private intervalRef: NodeJS.Timer | undefined
private _onFailure = new vscode.EventEmitter<void>()
public onFailure: vscode.Event<void> = this._onFailure.event
private readonly heartbeatInterval: number

constructor(
private readonly state: FileSystemState,
private readonly checkInterval: number,
checkInterval: number,
private readonly isDevMode: boolean
) {}
) {
this.heartbeatInterval = checkInterval / 2
}

public async start() {
const heartbeatInterval = this.checkInterval / 2

// Send an initial heartbeat immediately
await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())

@@ -179,14 +191,15 @@ class Heartbeat {
if (this.isDevMode) {
throw e
}
this._onFailure.fire()
}
}, heartbeatInterval)
}, this.heartbeatInterval)
}

/** Stops everything, signifying a graceful shutdown */
public async shutdown() {
globals.clock.clearInterval(this.intervalRef)
return this.state.indicateGracefulShutdown()
await this.state.indicateGracefulShutdown()
}

/**
@@ -217,34 +230,55 @@
*/
class CrashChecker {
private intervalRef: NodeJS.Timer | undefined
private _onFailure = new vscode.EventEmitter<void>()
public onFailure = this._onFailure.event

constructor(
private readonly state: FileSystemState,
private readonly checkInterval: number,
private readonly isDevMode: boolean,
private readonly devLogger: Logger | undefined
private readonly devLogger: Logger | undefined,
/**
* This class is required for the following edge case:
* 1. Heartbeat is sent
* 2. Computer goes to sleep for X minutes
* 3. Computer wakes up, but before a new heartbeat can be sent, a crash checker (possibly from another ext instance)
*    runs, sees the stale heartbeat, and assumes a crash.
*
* Why? Intervals do not run while the computer is asleep, so the latest heartbeat "lags": no new heartbeat could be
* sent during sleep.
* Then on wake there is a race condition between sending the next heartbeat and running the next crash check. If the
* crash checker runs first, it will incorrectly conclude a crash.
*
* Solution: Keep track of the lag, and then skip the next crash check if there was a lag. This will give time for the
* next heartbeat to be sent.
*/
private readonly timeLag: TimeLag = new TimeLag()
) {}

public async start() {
{
this.devLogger?.debug(`crashMonitoring: checkInterval ${this.checkInterval}`)

this.timeLag.start()

// do an initial check
await withFailCtx('initialCrashCheck', () =>
tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
)

// check on an interval
this.intervalRef = globals.clock.setInterval(async () => {
try {
await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger)
await tryCheckCrash(this.state, this.checkInterval, this.isDevMode, this.devLogger, this.timeLag)
} catch (e) {
emitFailure({ functionName: 'checkCrashInterval', error: e })
this.cleanup()

if (this.isDevMode) {
throw e
}

this._onFailure.fire()
}
}, this.checkInterval)
}
@@ -255,8 +289,15 @@ class CrashChecker {
state: FileSystemState,
checkInterval: number,
isDevMode: boolean,
devLogger: Logger | undefined
devLogger: Logger | undefined,
timeLag: TimeLag
) {
if (await timeLag.didLag()) {
timeLag.reset()
devLogger?.warn('crashMonitoring: SKIPPED check crash due to time lag')
return
}

// Iterate all known extensions and for each check if they have crashed
const knownExts = await state.getAllExts()
const runningExts: ExtInstanceHeartbeat[] = []
@@ -266,35 +307,26 @@
continue
}

// Ext is not running anymore, handle appropriately depending on why it stopped running
await state.handleExtNotRunning(ext, {
onShutdown: async () => {
// Nothing to do, just log info if necessary
devLogger?.debug(
`crashMonitoring: SHUTDOWN: following has gracefully shutdown: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
)
},
onCrash: async () => {
// Debugger instances may incorrectly look like they crashed, so don't emit.
// For example, hitting the red square in the debug toolbar is a non-graceful shutdown, but the regular
// 'x' button in the Debug IDE instance is a graceful shutdown.
if (ext.isDebug) {
devLogger?.debug(`crashMonitoring: DEBUG instance crashed: ${JSON.stringify(ext)}`)
return
}

// This is the metric to let us know the extension crashed
telemetry.session_end.emit({
result: 'Failed',
proxiedSessionId: ext.sessionId,
reason: 'ExtHostCrashed',
passive: true,
})

devLogger?.debug(
`crashMonitoring: CRASH: following has crashed: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
)
},
await state.handleCrashedExt(ext, () => {
// Debugger instances may incorrectly look like they crashed, so don't emit.
// For example, hitting the red square in the debug toolbar is a non-graceful shutdown, but the regular
// 'x' button in the Debug IDE instance is a graceful shutdown.
if (ext.isDebug) {
devLogger?.debug(`crashMonitoring: DEBUG instance crashed: ${JSON.stringify(ext)}`)
return
}

// This is the metric to let us know the extension crashed
telemetry.session_end.emit({
result: 'Failed',
proxiedSessionId: ext.sessionId,
reason: 'ExtHostCrashed',
passive: true,
})

devLogger?.debug(
`crashMonitoring: CRASH: following has crashed: pid ${ext.extHostPid} + sessionId: ${ext.sessionId}`
)
})
}

@@ -320,11 +352,12 @@
/** Use this on failures to terminate the crash checker */
public cleanup() {
globals.clock.clearInterval(this.intervalRef)
this.timeLag.cleanup()
}

/** Mimics a crash, only for testing */
public testCrash() {
globals.clock.clearInterval(this.intervalRef)
this.cleanup()
}
}
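
For context on the TimeLag helper used by CrashChecker above: this diff only shows its call sites (start(), didLag(), reset(), cleanup()). A rough sketch of how such a helper could detect a sleep-induced lag; the real implementation in utilities/timeoutUtils may differ:

// Illustrative sketch only; the actual TimeLag in utilities/timeoutUtils may differ.
// Idea: tick on a short interval and remember when the last tick ran. If the gap between
// "now" and the last tick is much larger than the tick interval, the process (or machine)
// was suspended and timers did not fire, i.e. there was a lag.
class TimeLagSketch {
    private lastTick: number = Date.now()
    private lagged = false
    private timer: ReturnType<typeof setInterval> | undefined

    constructor(private readonly tickMs: number = 10_000) {}

    public start(): void {
        this.lastTick = Date.now()
        this.timer = setInterval(() => {
            const now = Date.now()
            // Allow some jitter: a gap over 2x the tick interval means ticks were skipped.
            if (now - this.lastTick > this.tickMs * 2) {
                this.lagged = true
            }
            this.lastTick = now
        }, this.tickMs)
    }

    public async didLag(): Promise<boolean> {
        return this.lagged
    }

    public reset(): void {
        this.lagged = false
    }

    public cleanup(): void {
        if (this.timer !== undefined) {
            clearInterval(this.timer)
        }
    }
}

The crash checker can then skip one check whenever didLag() reports true, giving the next heartbeat a chance to be written first.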

@@ -379,7 +412,7 @@ export class FileSystemState {
* IMPORTANT: Use {@link crashMonitoringStateFactory} to make an instance
*/
constructor(protected readonly deps: MementoStateDependencies) {
this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirNames.root)
this.stateDirPath = path.join(this.deps.workDirPath, crashMonitoringDirName)

this.deps.devLogger?.debug(`crashMonitoring: pid: ${this.deps.pid}`)
this.deps.devLogger?.debug(`crashMonitoring: sessionId: ${this.deps.sessionId.slice(0, 8)}-...`)
@@ -405,17 +438,16 @@
if (await this.deps.isStateStale()) {
await this.clearState()
}

await withFailCtx('init', () => fs.mkdir(this.stateDirPath))
}

// ------------------ Heartbeat methods ------------------
public async sendHeartbeat() {
const extId = this.createExtId(this.ext)

try {
const func = async () => {
const dir = await this.runningExtsDir()
await fs.writeFile(
path.join(dir, extId),
this.makeStateFilePath(this.extId),
JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
)
this.deps.devLogger?.debug(
@@ -437,7 +469,7 @@
}

/**
* Signal that this extension is gracefully shutting down. This will prevent the IDE from thinking it crashed.
* Indicates that this extension instance has gracefully shut down.
*
* IMPORTANT: This code is being run in `deactivate()` where VS Code api is not available. Due to this we cannot
* easily update the state to indicate a graceful shutdown. So the next best option is to write to a file on disk,
@@ -447,46 +479,24 @@
* function touches.
*/
public async indicateGracefulShutdown(): Promise<void> {
const dir = await this.shutdownExtsDir()
await withFailCtx('writeShutdownFile', () => nodeFs.writeFile(path.join(dir, this.extId), ''))
// By removing the heartbeat entry, the crash checkers will not be able to find this entry anymore, making it
// impossible to report on since the file system is the source of truth
await withFailCtx('indicateGracefulShutdown', () => nodeFs.rm(this.makeStateFilePath(this.extId)))
}

// ------------------ Checker Methods ------------------

/**
* Signals the state that the given extension is not running, allowing the state to appropriately update
* depending on a graceful shutdown or crash.
*
* NOTE: This does NOT run in the `deactivate()` method, so it CAN reliably use the VS Code FS api
*
* @param opts - functions to run depending on why the extension stopped running
*/
public async handleExtNotRunning(
ext: ExtInstance,
opts: { onShutdown: () => Promise<void>; onCrash: () => Promise<void> }
): Promise<void> {
const extId = this.createExtId(ext)
const shutdownFilePath = path.join(await this.shutdownExtsDir(), extId)

if (await withFailCtx('existsShutdownFile', () => fs.exists(shutdownFilePath))) {
await opts.onShutdown()
// We intentionally do not clean up the file in shutdown since there may be another
// extension may be doing the same thing in parallel, and would read the extension as
// crashed since the file was missing. The file will be cleared on computer restart though.

// TODO: Be smart and clean up the file after some time.
} else {
await opts.onCrash()
}

// Clean up the running extension file since it no longer exists
await this.deleteHeartbeatFile(extId)
public async handleCrashedExt(ext: ExtInstance, fn: () => void) {
await withFailCtx('handleCrashedExt', async () => {
await this.deleteHeartbeatFile(ext)
fn()
})
}
public async deleteHeartbeatFile(extId: ExtInstanceId) {
const dir = await this.runningExtsDir()

private async deleteHeartbeatFile(ext: ExtInstanceId | ExtInstance) {
// Retry file deletion to prevent incorrect crash reports. Common Windows errors seen in telemetry: EPERM/EBUSY.
// See: https://github.com/aws/aws-toolkit-vscode/pull/5335
await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(path.join(dir, extId))), {
await withRetries(() => withFailCtx('deleteStaleRunningFile', () => fs.delete(this.makeStateFilePath(ext))), {
maxRetries: 8,
delay: 100,
backoff: 2,
@@ -515,17 +525,9 @@
protected createExtId(ext: ExtInstance): ExtInstanceId {
return `${ext.extHostPid}_${ext.sessionId}`
}
private async runningExtsDir(): Promise<string> {
const p = path.join(this.stateDirPath, crashMonitoringDirNames.running)
// ensure the dir exists
await withFailCtx('ensureRunningExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
return p
}
private async shutdownExtsDir() {
const p = path.join(this.stateDirPath, crashMonitoringDirNames.shutdown)
// Since this runs in `deactivate()` it cannot use the VS Code FS api
await withFailCtx('ensureShutdownExtsDir', () => nodeFs.mkdir(p, { recursive: true }))
return p
private makeStateFilePath(ext: ExtInstance | ExtInstanceId) {
const extId = typeof ext === 'string' ? ext : this.createExtId(ext)
return path.join(this.stateDirPath, extId)
}
public async clearState(): Promise<void> {
this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
@@ -538,7 +540,7 @@
const res = await withFailCtx('getAllExts', async () => {
// The file names are intentionally the IDs for easy mapping
const allExtIds: ExtInstanceId[] = await withFailCtx('readdir', async () =>
(await fs.readdir(await this.runningExtsDir())).map((k) => k[0])
(await fs.readdir(this.stateDirPath)).map((k) => k[0])
)

const allExts = allExtIds.map<Promise<ExtInstanceHeartbeat | undefined>>(async (extId: string) => {
Expand All @@ -549,7 +551,7 @@ export class FileSystemState {
() =>
withFailCtx('parseRunningExtFile', async () =>
ignoreBadFileError(async () => {
const text = await fs.readFileText(path.join(await this.runningExtsDir(), extId))
const text = await fs.readFileText(this.makeStateFilePath(extId))

if (!text) {
return undefined
Expand Down Expand Up @@ -617,7 +619,10 @@ export type ExtInstance = {
isDebug?: boolean
}

type ExtInstanceHeartbeat = ExtInstance & { lastHeartbeat: number }
type ExtInstanceHeartbeat = ExtInstance & {
/** Timestamp of the last heartbeat in milliseconds */
lastHeartbeat: number
}

function isExtHeartbeat(ext: unknown): ext is ExtInstanceHeartbeat {
return typeof ext === 'object' && ext !== null && 'lastHeartbeat' in ext && ext.lastHeartbeat !== undefined
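
For reference on the retry call in deleteHeartbeatFile above: withRetries comes from utilities/functionUtils, and this diff only shows its call site. A hedged sketch of a retry-with-backoff helper matching that usage (option names are taken from the call site; the real implementation may differ):

// Sketch of a retry helper matching the withRetries(fn, { maxRetries, delay, backoff }) usage above.
// This is NOT the actual functionUtils implementation; only the option names come from this diff.
async function withRetriesSketch<T>(
    fn: () => Promise<T>,
    opts: { maxRetries: number; delay: number; backoff: number }
): Promise<T> {
    let wait = opts.delay
    for (let attempt = 1; ; attempt++) {
        try {
            return await fn()
        } catch (e) {
            // e.g. transient EPERM/EBUSY on Windows file deletion
            if (attempt >= opts.maxRetries) {
                throw e
            }
            await new Promise((resolve) => setTimeout(resolve, wait))
            wait *= opts.backoff
        }
    }
}

// Usage mirroring deleteHeartbeatFile: up to 8 attempts, 100ms initial delay, doubling each time.
// await withRetriesSketch(() => fs.delete(filePath), { maxRetries: 8, delay: 100, backoff: 2 })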