Skip to content

Commit e369ff3

Browse files
authored
fix(smus): Improve error handling when the Space takes too long to start (#8277)
## Problem Occasionally, when a user clicks the Connect button for a Space that is in the Stopped status, the corresponding App that gets created eventually takes too long to become Running, so the user is shown the following error message. ``` Remote connection failed: Timed out waiting for app "default-b97e54b8-e0e1-70b7-a216-856fcbb3cc61" to reach "InService" status. | Timed out waiting for app "default-b97e54b8-e0e1-70b7-a216-856fcbb3cc61" to reach "InService" status. ``` We can't prevent this from happening as it depends on the SageMaker platform, but we can improve the user experience around this. ## Solution * Add more time to the hard timeout * Update the progress message when the App takes longer than usual to connect ## Appearance ### currently: <img width="474" height="67" alt="Screenshot 2025-11-12 at 12 13 53 PM (3)" src="https://github.com/user-attachments/assets/2b543afa-621f-4ac0-8838-3b1d0ca74276" /> (2 min 30 sec) -> <img width="469" height="161" alt="Screenshot 2025-11-12 at 12 05 16 PM" src="https://github.com/user-attachments/assets/ce4d7f3f-7d52-48ed-937d-b07a7e09175d" /> ### this change: <img width="474" height="67" alt="Screenshot 2025-11-12 at 12 13 53 PM (3)" src="https://github.com/user-attachments/assets/2b543afa-621f-4ac0-8838-3b1d0ca74276" /> (1 min) -> **Note: Based on @dylanraws' and ricokyle@'s suggestions, the exact wording is changed to "Connecting to testX: Starting the Space is taking longer than usual. The space will connect when ready"** <img width="478" height="127" alt="Screenshot 2025-11-11 at 10 02 37 AM" src="https://github.com/user-attachments/assets/b9d0c4b2-e25b-4c05-8ea5-ecb65c9b3fa5" /> (9 min) -> <img width="469" height="161" alt="Screenshot 2025-11-12 at 12 05 16 PM" src="https://github.com/user-attachments/assets/ce4d7f3f-7d52-48ed-937d-b07a7e09175d" /> --- - Treat all work as PUBLIC. Private `feature/x` branches will not be squash-merged at release time. 
- Your code changes must meet the guidelines in [CONTRIBUTING.md](https://github.com/aws/aws-toolkit-vscode/blob/master/CONTRIBUTING.md#guidelines). - License: I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent 8d1eb19 commit e369ff3

File tree

3 files changed

+32
-12
lines changed

3 files changed

+32
-12
lines changed

packages/core/src/awsService/sagemaker/commands.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,8 @@ async function handleRunningSpaceWithDisabledAccess(
341341
await client.waitForAppInService(
342342
node.spaceApp.DomainId!,
343343
spaceName,
344-
node.spaceApp.SpaceSettingsSummary!.AppType!
344+
node.spaceApp.SpaceSettingsSummary!.AppType!,
345+
progress
345346
)
346347
await tryRemoteConnection(node, ctx, progress)
347348
} catch (err: any) {
@@ -385,7 +386,8 @@ async function handleStoppedSpace(
385386
await client.waitForAppInService(
386387
node.spaceApp.DomainId!,
387388
spaceName,
388-
node.spaceApp.SpaceSettingsSummary!.AppType!
389+
node.spaceApp.SpaceSettingsSummary!.AppType!,
390+
progress
389391
)
390392
await tryRemoteConnection(node, ctx, progress)
391393
}

packages/core/src/shared/clients/sagemaker.ts

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
/*!
2-
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
1+
/*! * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
32
* SPDX-License-Identifier: Apache-2.0
43
*/
54

@@ -59,6 +58,12 @@ const appTypeSettingsMap: Record<string, string> = {
5958
[AppType.CodeEditor as string]: 'CodeEditorAppSettings',
6059
} as const
6160

61+
export const waitForAppConfig = {
62+
softTimeoutRetries: 12,
63+
hardTimeoutRetries: 120,
64+
intervalMs: 5000,
65+
}
66+
6267
export interface SagemakerSpaceApp extends SpaceDetails {
6368
App?: AppDetails
6469
DomainSpaceKey: string
@@ -364,10 +369,9 @@ export class SagemakerClient extends ClientWrapper<SageMakerClient> {
364369
domainId: string,
365370
spaceName: string,
366371
appType: string,
367-
maxRetries = 30,
368-
intervalMs = 5000
372+
progress?: vscode.Progress<{ message?: string; increment?: number }>
369373
): Promise<void> {
370-
for (let attempt = 0; attempt < maxRetries; attempt++) {
374+
for (let attempt = 0; attempt < waitForAppConfig.hardTimeoutRetries; attempt++) {
371375
const { Status } = await this.describeApp({
372376
DomainId: domainId,
373377
SpaceName: spaceName,
@@ -383,7 +387,13 @@ export class SagemakerClient extends ClientWrapper<SageMakerClient> {
383387
throw new ToolkitError(`App failed to start. Status: ${Status}`)
384388
}
385389

386-
await sleep(intervalMs)
390+
if (attempt === waitForAppConfig.softTimeoutRetries) {
391+
progress?.report({
392+
message: `Starting the space is taking longer than usual. The space will connect when ready`,
393+
})
394+
}
395+
396+
await sleep(waitForAppConfig.intervalMs)
387397
}
388398

389399
throw new ToolkitError(`Timed out waiting for app "${spaceName}" to reach "InService" status.`)

packages/core/src/test/shared/clients/sagemakerClient.test.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,18 @@ describe('SagemakerClient.waitForAppInService', function () {
251251
it('times out after max retries', async function () {
252252
describeAppStub.resolves({ Status: 'Pending' })
253253

254-
await assert.rejects(
255-
client.waitForAppInService('domain1', 'space1', 'CodeEditor', 2, 10),
256-
/Timed out waiting for app/
257-
)
254+
const sagemakerModule = await import('../../../shared/clients/sagemaker.js')
255+
const originalValue = sagemakerModule.waitForAppConfig.hardTimeoutRetries
256+
sagemakerModule.waitForAppConfig.hardTimeoutRetries = 3
257+
258+
try {
259+
await assert.rejects(
260+
client.waitForAppInService('domain1', 'space1', 'CodeEditor'),
261+
/Timed out waiting for app/
262+
)
263+
} finally {
264+
sagemakerModule.waitForAppConfig.hardTimeoutRetries = originalValue
265+
}
258266
})
259267
})
260268

0 commit comments

Comments
 (0)