Skip to content

Commit a130ae5

Browse files
retry heartbeats on fail
Signed-off-by: nkomonen-amazon <[email protected]>
1 parent 5f0829c commit a130ae5

File tree

5 files changed

+134
-15
lines changed

5 files changed

+134
-15
lines changed

packages/amazonq/src/extensionNode.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ export async function activate(context: vscode.ExtensionContext) {
3232
* the code compatible with web and move it to {@link activateAmazonQCommon}.
3333
*/
3434
async function activateAmazonQNode(context: vscode.ExtensionContext) {
35-
await (await CrashMonitoring.instance())?.start()
35+
// Intentionally do not await since this is slow and non-critical
36+
void (await CrashMonitoring.instance())?.start()
3637

3738
const extContext = {
3839
extensionContext: context,

packages/core/src/extensionNode.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ export async function activate(context: vscode.ExtensionContext) {
7878
// IMPORTANT: If you are doing setup that should also work in web mode (browser), it should be done in the function below
7979
const extContext = await activateCommon(context, contextPrefix, false)
8080

81-
await (await CrashMonitoring.instance())?.start()
81+
// Intentionally do not await since this can be slow and non-critical
82+
void (await CrashMonitoring.instance())?.start()
8283

8384
initializeCredentialsProviderManager()
8485

packages/core/src/shared/crashMonitoring.ts

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import fs from './fs/fs'
1717
import { getLogger } from './logger/logger'
1818
import { crashMonitoringDirNames } from './constants'
1919
import { throwOnUnstableFileSystem } from './filesystemUtilities'
20+
import { withRetries } from './utilities/functionUtils'
2021

2122
const className = 'CrashMonitoring'
2223

@@ -427,17 +428,21 @@ export class FileSystemState {
427428

428429
// ------------------ Heartbeat methods ------------------
429430
public async sendHeartbeat() {
430-
await withFailCtx('sendHeartbeatState', async () => {
431-
const dir = await this.runningExtsDir()
432-
const extId = this.createExtId(this.ext)
433-
await fs.writeFile(
434-
path.join(dir, extId),
435-
JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
436-
)
437-
this.deps.devLogger?.debug(
438-
`crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
439-
)
440-
})
431+
const _sendHeartbeat = () =>
432+
withFailCtx('sendHeartbeatState', async () => {
433+
const dir = await this.runningExtsDir()
434+
const extId = this.createExtId(this.ext)
435+
await fs.writeFile(
436+
path.join(dir, extId),
437+
JSON.stringify({ ...this.ext, lastHeartbeat: this.deps.now() }, undefined, 4)
438+
)
439+
this.deps.devLogger?.debug(
440+
`crashMonitoring: HEARTBEAT pid ${this.deps.pid} + sessionId: ${this.deps.sessionId.slice(0, 8)}-...`
441+
)
442+
})
443+
// retry a failed heartbeat to avoid incorrectly being reported as a crash
444+
const _sendHeartbeatWithRetries = withRetries(_sendHeartbeat, { maxRetries: 8, delay: 100, backoff: 2 })
445+
return _sendHeartbeatWithRetries
441446
}
442447

443448
/**

packages/core/src/shared/utilities/functionUtils.ts

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6-
import { Timeout } from './timeoutUtils'
6+
import { sleep, Timeout } from './timeoutUtils'
77

88
/**
99
* Creates a function that always returns a 'shared' Promise.
@@ -145,3 +145,34 @@ export function cancellableDebounce<T, U extends any[]>(
145145
cancel: cancel,
146146
}
147147
}
148+
149+
/**
150+
* Executes the given function, retrying if it throws.
151+
*
152+
* @param opts - if no opts given, defaults are used
153+
*/
154+
export async function withRetries<T>(
155+
fn: () => Promise<T>,
156+
opts?: { maxRetries?: number; delay?: number; backoff?: number }
157+
): Promise<T> {
158+
const maxRetries = opts?.maxRetries ?? 3
159+
const delay = opts?.delay ?? 0
160+
const backoff = opts?.backoff ?? 1
161+
162+
let retryCount = 0
163+
let latestDelay = delay
164+
while (true) {
165+
try {
166+
return await fn()
167+
} catch (err) {
168+
retryCount++
169+
if (retryCount >= maxRetries) {
170+
throw err
171+
}
172+
if (latestDelay > 0) {
173+
await sleep(latestDelay)
174+
latestDelay = latestDelay * backoff
175+
}
176+
}
177+
}
178+
}

packages/core/src/test/shared/utilities/functionUtils.test.ts

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
*/
55

66
import assert from 'assert'
7-
import { once, onceChanged, debounce } from '../../../shared/utilities/functionUtils'
7+
import { once, onceChanged, debounce, withRetries } from '../../../shared/utilities/functionUtils'
88
import { installFakeClock } from '../../testUtil'
9+
import { stub, SinonStub } from 'sinon'
10+
import { InstalledClock } from '@sinonjs/fake-timers'
911

1012
describe('functionUtils', function () {
1113
it('once()', function () {
@@ -107,3 +109,82 @@ describe('debounce', function () {
107109
})
108110
})
109111
})
112+
113+
// function to test the withRetries method. It passes in a stub function as the argument and has different tests that throw on different iterations
114+
describe('withRetries', function () {
115+
let clock: InstalledClock
116+
let fn: SinonStub
117+
118+
beforeEach(function () {
119+
fn = stub()
120+
clock = installFakeClock()
121+
})
122+
123+
afterEach(function () {
124+
clock.uninstall()
125+
})
126+
127+
it('retries the function until it succeeds, using defaults', async function () {
128+
fn.onCall(0).throws()
129+
fn.onCall(1).throws()
130+
fn.onCall(2).resolves('success')
131+
assert.strictEqual(await withRetries(fn), 'success')
132+
})
133+
134+
it('retries the function until it succeeds at the final try', async function () {
135+
fn.onCall(0).throws()
136+
fn.onCall(1).throws()
137+
fn.onCall(2).throws()
138+
fn.onCall(3).resolves('success')
139+
assert.strictEqual(await withRetries(fn, { maxRetries: 4 }), 'success')
140+
})
141+
142+
it('throws the last error if the function always fails, using defaults', async function () {
143+
fn.onCall(0).throws()
144+
fn.onCall(1).throws()
145+
fn.onCall(2).throws()
146+
fn.onCall(3).resolves('unreachable')
147+
await assert.rejects(async () => {
148+
await withRetries(fn)
149+
})
150+
})
151+
152+
it('throws the last error if the function always fails', async function () {
153+
fn.onCall(0).throws()
154+
fn.onCall(1).throws()
155+
fn.onCall(2).throws()
156+
fn.onCall(3).throws()
157+
fn.onCall(4).resolves('unreachable')
158+
await assert.rejects(async () => {
159+
await withRetries(fn, { maxRetries: 4 })
160+
})
161+
})
162+
163+
it('honors retry delay + backoff multiplier', async function () {
164+
fn.onCall(0).throws() // 100ms
165+
fn.onCall(1).throws() // 200ms
166+
fn.onCall(2).throws() // 400ms
167+
fn.onCall(3).resolves('success')
168+
169+
const res = withRetries(fn, { maxRetries: 4, delay: 100, backoff: 2 })
170+
171+
// Check the call count after each iteration, ensuring the function is called
172+
// after the correct delay between retries.
173+
await clock.tickAsync(99)
174+
assert.strictEqual(fn.callCount, 1)
175+
await clock.tickAsync(1)
176+
assert.strictEqual(fn.callCount, 2)
177+
178+
await clock.tickAsync(199)
179+
assert.strictEqual(fn.callCount, 2)
180+
await clock.tickAsync(1)
181+
assert.strictEqual(fn.callCount, 3)
182+
183+
await clock.tickAsync(399)
184+
assert.strictEqual(fn.callCount, 3)
185+
await clock.tickAsync(1)
186+
assert.strictEqual(fn.callCount, 4)
187+
188+
assert.strictEqual(await res, 'success')
189+
})
190+
})

0 commit comments

Comments
 (0)