Skip to content

Commit 33f73ea

Browse files
committed
fix(smus): Improve error handling when the Space takes too long to start
1 parent e1abbe6 commit 33f73ea

File tree

3 files changed

+40
-10
lines changed

3 files changed

+40
-10
lines changed

packages/core/src/awsService/sagemaker/commands.ts

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -332,7 +332,8 @@ async function handleRunningSpaceWithDisabledAccess(
332332
await client.waitForAppInService(
333333
node.spaceApp.DomainId!,
334334
spaceName,
335-
node.spaceApp.SpaceSettingsSummary!.AppType!
335+
node.spaceApp.SpaceSettingsSummary!.AppType!,
336+
progress
336337
)
337338
await tryRemoteConnection(node, ctx, progress)
338339
} catch (err: any) {
@@ -376,7 +377,8 @@ async function handleStoppedSpace(
376377
await client.waitForAppInService(
377378
node.spaceApp.DomainId!,
378379
spaceName,
379-
node.spaceApp.SpaceSettingsSummary!.AppType!
380+
node.spaceApp.SpaceSettingsSummary!.AppType!,
381+
progress
380382
)
381383
await tryRemoteConnection(node, ctx, progress)
382384
}

packages/core/src/shared/clients/sagemaker.ts

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -364,10 +364,13 @@ export class SagemakerClient extends ClientWrapper<SageMakerClient> {
364364
domainId: string,
365365
spaceName: string,
366366
appType: string,
367-
maxRetries = 30,
368-
intervalMs = 5000
367+
progress?: vscode.Progress<{ message?: string; increment?: number }>
369368
): Promise<void> {
370-
for (let attempt = 0; attempt < maxRetries; attempt++) {
369+
const softTimeoutRetries = 12 // 1 minute
370+
const hardTimeoutRetries = 120 // 10 minutes
371+
const intervalMs = 5000
372+
373+
for (let attempt = 0; attempt < hardTimeoutRetries; attempt++) {
371374
const { Status } = await this.describeApp({
372375
DomainId: domainId,
373376
SpaceName: spaceName,
@@ -383,6 +386,12 @@ export class SagemakerClient extends ClientWrapper<SageMakerClient> {
383386
throw new ToolkitError(`App failed to start. Status: ${Status}`)
384387
}
385388

389+
if (attempt === softTimeoutRetries) {
390+
progress?.report({
391+
message: `Starting the space is taking longer than usual. The space will connect when ready`,
392+
})
393+
}
394+
386395
await sleep(intervalMs)
387396
}
388397

packages/core/src/test/shared/clients/sagemakerClient.test.ts

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -249,12 +249,31 @@ describe('SagemakerClient.waitForAppInService', function () {
249249
})
250250

251251
it('times out after max retries', async function () {
252-
describeAppStub.resolves({ Status: 'Pending' })
252+
let callCount = 0
253+
describeAppStub.callsFake(async () => {
254+
callCount++
255+
return { Status: 'Pending' }
256+
})
253257

254-
await assert.rejects(
255-
client.waitForAppInService('domain1', 'space1', 'CodeEditor', 2, 10),
256-
/Timed out waiting for app/
257-
)
258+
// Stub globals.clock.setTimeout to make sleep instant
259+
const globalsModule = await import('../../../shared/extensionGlobals.js')
260+
const globals = globalsModule.default as any
261+
const originalSetTimeout = globals.clock.setTimeout
262+
globals.clock.setTimeout = ((cb: any) => {
263+
// eslint-disable-next-line @typescript-eslint/no-implied-eval
264+
setImmediate(cb)
265+
return 0 as any
266+
}) as any
267+
268+
try {
269+
await assert.rejects(
270+
client.waitForAppInService('domain1', 'space1', 'CodeEditor'),
271+
/Timed out waiting for app/
272+
)
273+
assert.strictEqual(callCount, 120)
274+
} finally {
275+
globals.clock.setTimeout = originalSetTimeout
276+
}
258277
})
259278
})
260279

0 commit comments

Comments
 (0)