Skip to content

Commit 79fc42c

Browse files
authored
fix(codecatalyst): DevEnv connect sometimes fails #3287
Problem: Toolkit may fail to connect to FAILED DevEnvs (or potentially other states). Solution: - Always try to start DevEnv, regardless of its current or previous state(s). - Improve logging. ``` … [DEBUG]: devenv not started, waiting (time: 1.641s): d8d8eecf-809d-4f35-… [FAILED/0ms] … [ERROR]: devenv failed to start (time: 15.603s, startAttempts: 3): d8d8eecf-809d-4f35-… [FAILED/13963ms] … [INFO]: Command: (not started) [ssh -G aws-devenv-test] (running processes: 0) … [DEBUG]: devenv not started, waiting (time: 1.318s): d8d8eecf-809d-4f35-… [FAILED/0ms] … [INFO]: devenv failed to start (user cancelled) (time: 8.571s, startAttempts: 2): d8d8eecf-809d-4f35-… [FAILED/7256ms] … [INFO]: Command: (not started) [ssh -G aws-devenv-test] (running processes: 0) … [DEBUG]: devenv not started, waiting (time: 1.287s): jmk-mar24 [STOPPED/0ms] … [DEBUG]: devenv not started, waiting (time: 6.904s): jmk-mar24 [STOPPED/5617ms STARTING/0ms] … [INFO]: devenv failed to start (user cancelled) (time: 13.327s): jmk-mar24 [STOPPED/5617ms STARTING/6425ms] … [INFO]: Command: (not started) [ssh -G aws-devenv-test] (running processes: 0) … [DEBUG]: devenv not started, waiting (time: 1.333s): jmk-mar24 [STARTING/1ms] … [DEBUG]: devenv not started, waiting (time: 30.994s): jmk-mar24 [STARTING/29662ms RUNNING/0ms] … [INFO]: devenv started (time: 30.995s): jmk-mar24 [STARTING/29662ms RUNNING/2187ms] ```
1 parent cd4f400 commit 79fc42c

File tree

4 files changed

+105
-63
lines changed

4 files changed

+105
-63
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "Feature",
3+
"description": "CodeCatalyst: retry connection to \"FAILED\" Dev Environments"
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "Feature",
3+
"description": "CodeCatalyst: improved messages and logging when connecting to Dev Environment"
4+
}

src/codecatalyst/model.ts

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,14 +190,11 @@ export async function prepareDevEnvConnection(
190190
{ topic, timeout }: { topic?: string; timeout?: Timeout } = {}
191191
): Promise<DevEnvConnection> {
192192
const { ssm, vsc, ssh } = (await ensureDependencies()).unwrap()
193-
const runningDevEnv = await client.startDevEnvironmentWithProgress(
194-
{
195-
id,
196-
spaceName: org.name,
197-
projectName: project.name,
198-
},
199-
'RUNNING'
200-
)
193+
const runningDevEnv = await client.startDevEnvironmentWithProgress({
194+
id,
195+
spaceName: org.name,
196+
projectName: project.name,
197+
})
201198

202199
const hostname = getHostNameFromEnv({ id, org, project })
203200
const logPrefix = topic ? `codecatalyst ${topic} (${id})` : `codecatalyst (${id})`

src/shared/clients/codecatalystClient.ts

Lines changed: 92 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ const localize = nls.loadMessageBundle()
1111
import * as AWS from 'aws-sdk'
1212
import * as logger from '../logger/logger'
1313
import { ServiceConfigurationOptions } from 'aws-sdk/lib/service'
14-
import { Timeout, waitTimeout, waitUntil } from '../utilities/timeoutUtils'
14+
import { CancellationError, Timeout, waitTimeout, waitUntil } from '../utilities/timeoutUtils'
15+
import { isUserCancelledError } from '../../shared/errors'
1516
import { showMessageWithCancel } from '../utilities/messages'
1617
import { assertHasProps, ClassToInterfaceType, isNonNullable, RequiredProps } from '../utilities/tsUtils'
1718
import { AsyncCollection, toCollection } from '../utilities/asyncCollection'
@@ -591,118 +592,154 @@ class CodeCatalystClientInternal {
591592
*/
592593
public async startDevEnvironmentWithProgress(
593594
args: CodeCatalyst.StartDevEnvironmentRequest,
594-
status: string,
595595
timeout: Timeout = new Timeout(1000 * 60 * 60)
596596
): Promise<DevEnvironment> {
597597
// Track the status changes chronologically so that we can
598598
// 1. reason about hysterisis (weird flip-flops)
599599
// 2. have visibility in the logs
600-
const lastStatus = new Array<{ status: string; start: number }>()
600+
const statuses = new Array<{ status: string; start: number }>()
601601
let alias: string | undefined
602-
603-
try {
604-
const devenv = await this.getDevEnvironment(args)
605-
alias = devenv.alias
606-
lastStatus.push({ status: devenv.status, start: Date.now() })
607-
if (status === 'RUNNING' && devenv.status === 'RUNNING') {
608-
// "Debounce" in case caller did not check if the environment was already running.
609-
return devenv
610-
}
611-
} catch {
612-
lastStatus.length = 0
613-
}
602+
let startAttempts = 0
614603

615604
function statusesToString() {
616605
let s = ''
617-
for (let i = 0; i < lastStatus.length; i++) {
618-
const item = lastStatus[i]
619-
const nextItem = i < lastStatus.length - 1 ? lastStatus[i + 1] : undefined
606+
for (let i = 0; i < statuses.length; i++) {
607+
const item = statuses[i]
608+
const nextItem = i < statuses.length - 1 ? statuses[i + 1] : undefined
620609
const nextTime = nextItem ? nextItem.start : Date.now()
621610
const elapsed = nextTime - item.start
622611
s += `${s ? ' ' : ''}${item.status}/${elapsed}ms`
623612
}
624613
return `[${s}]`
625614
}
626615

627-
const doLog = (kind: 'debug' | 'info', msg: string) => {
628-
const name = (alias ? alias : args.id).substring(0, 19)
629-
const fmt = `${msg} (time: %d s): %s %s`
616+
function getName(): string {
617+
const fullname = alias ? alias : args.id
618+
const shortname = fullname.substring(0, 19) + (fullname.length > 20 ? '…' : '')
619+
return shortname
620+
}
621+
622+
function failedStartMsg() {
623+
const lastStatus = statuses[statuses.length - 1]?.status
624+
return `Dev Environment failed to start (${lastStatus}): ${getName()}`
625+
}
626+
627+
const doLog = (kind: 'debug' | 'error' | 'info', msg: string) => {
628+
const fmt = `${msg} (time: %ds${
629+
startAttempts <= 1 ? '' : ', startAttempts: ' + startAttempts.toString()
630+
}): %s %s`
630631
if (kind === 'debug') {
631-
this.log.debug(fmt, timeout.elapsedTime / 1000, name, statusesToString())
632+
this.log.debug(fmt, timeout.elapsedTime / 1000, getName(), statusesToString())
633+
} else if (kind === 'error') {
634+
this.log.error(fmt, timeout.elapsedTime / 1000, getName(), statusesToString())
632635
} else {
633-
this.log.info(fmt, timeout.elapsedTime / 1000, name, statusesToString())
636+
this.log.info(fmt, timeout.elapsedTime / 1000, getName(), statusesToString())
634637
}
635638
}
636639

637640
const progress = await showMessageWithCancel(
638641
localize('AWS.CodeCatalyst.devenv.message', 'CodeCatalyst'),
639642
timeout
640643
)
641-
progress.report({ message: localize('AWS.CodeCatalyst.devenv.checking', 'checking status...') })
644+
progress.report({ message: localize('AWS.CodeCatalyst.devenv.checking', 'Checking status...') })
645+
646+
try {
647+
const devenv = await this.getDevEnvironment(args)
648+
alias = devenv.alias
649+
statuses.push({ status: devenv.status, start: Date.now() })
650+
if (devenv.status === 'RUNNING') {
651+
doLog('debug', 'devenv RUNNING')
652+
timeout.cancel()
653+
// "Debounce" in case caller did not check if the environment was already running.
654+
return devenv
655+
}
656+
} catch {
657+
// Continue.
658+
}
659+
660+
doLog('debug', 'devenv not started, waiting')
642661

643662
const pollDevEnv = waitUntil(
644663
async () => {
645-
doLog('debug', 'devenv not started, waiting')
646-
// technically this will continue to be called until it reaches its own timeout, need a better way to 'cancel' a `waitUntil`
647664
if (timeout.completed) {
648-
return
665+
// TODO: need a better way to "cancel" a `waitUntil`.
666+
throw new CancellationError('user')
649667
}
650668

669+
const lastStatus = statuses[statuses.length - 1]
670+
const elapsed = Date.now() - lastStatus.start
651671
const resp = await this.getDevEnvironment(args)
652-
const startingStates = lastStatus.filter(o => o.status === 'STARTING').length
672+
alias = resp.alias
673+
653674
if (
654-
startingStates > 1 &&
655-
lastStatus[0].status === 'STARTING' &&
656-
(resp.status === 'STOPPED' || resp.status === 'STOPPING')
675+
lastStatus &&
676+
['STOPPED', 'FAILED'].includes(lastStatus.status) &&
677+
['STOPPED', 'FAILED'].includes(resp.status) &&
678+
elapsed > 60000 &&
679+
startAttempts > 2
657680
) {
658-
throw new ToolkitError('Dev Environment failed to start', { code: 'BadDevEnvState' })
659-
}
660-
661-
if (resp.status === 'STOPPED') {
681+
// If still STOPPED/FAILED after 60+ seconds, don't keep retrying for 1 hour...
682+
throw new ToolkitError(failedStartMsg(), { code: 'FailedDevEnv' })
683+
} else if (['STOPPED', 'FAILED'].includes(resp.status)) {
662684
progress.report({
663-
message: localize('AWS.CodeCatalyst.devenv.stopStart', 'Resuming Dev Environment...'),
685+
message: localize('AWS.CodeCatalyst.devenv.resuming', 'Resuming Dev Environment...'),
664686
})
665687
try {
688+
startAttempts++
666689
await this.startDevEnvironment(args)
667690
} catch (e) {
668691
const err = e as AWS.AWSError
669-
// May happen after "Update Dev Environment":
670-
// ConflictException: "Cannot start Dev Environment because update process is still going on"
671-
// Continue retrying in that case.
672-
if (err.code === 'ConflictException') {
673-
doLog('debug', 'devenv not started (ConflictException), waiting')
674-
} else {
675-
throw new ToolkitError('Dev Environment failed to start', {
676-
code: 'BadDevEnvState',
677-
cause: err,
678-
})
679-
}
692+
// - ValidationException: "… creation has failed, cannot start"
693+
// - ConflictException: "Cannot start … because update process is still going on"
694+
// (can happen after "Update Dev Environment")
695+
doLog('info', `devenv not started (${err.code}), waiting`)
696+
// Continue retrying...
680697
}
681698
} else if (resp.status === 'STOPPING') {
682699
progress.report({
683-
message: localize('AWS.CodeCatalyst.devenv.resuming', 'Waiting for Dev Environment to stop...'),
700+
message: localize('AWS.CodeCatalyst.devenv.stopping', 'Waiting for Dev Environment to stop...'),
684701
})
685-
} else if (resp.status === 'FAILED') {
686-
throw new ToolkitError('Dev Environment failed to start', { code: 'FailedDevEnv' })
687702
} else {
688703
progress.report({
689704
message: localize('AWS.CodeCatalyst.devenv.starting', 'Opening Dev Environment...'),
690705
})
691706
}
692707

693-
if (lastStatus[lastStatus.length - 1].status !== resp.status) {
694-
lastStatus.push({ status: resp.status, start: Date.now() })
708+
if (lastStatus?.status !== resp.status) {
709+
statuses.push({ status: resp.status, start: Date.now() })
710+
if (resp.status !== 'RUNNING') {
711+
doLog('debug', `devenv not started, waiting`)
712+
}
695713
}
696714
return resp.status === 'RUNNING' ? resp : undefined
697715
},
698716
// note: the `waitUntil` will resolve prior to the real timeout if it is refreshed
699717
{ interval: 1000, timeout: timeout.remainingTime, truthy: true }
700718
)
701719

702-
const devenv = await waitTimeout(pollDevEnv, timeout)
720+
const devenv = await waitTimeout(pollDevEnv, timeout).catch(e => {
721+
const err = e as Error
722+
const starts = statuses.filter(o => o.status === 'STARTING').length
723+
const fails = statuses.filter(o => o.status === 'FAILED').length
724+
725+
if (!isUserCancelledError(e)) {
726+
doLog('error', 'devenv failed to start')
727+
} else {
728+
doLog('info', 'devenv failed to start (user cancelled)')
729+
err.message = failedStartMsg()
730+
throw err
731+
}
732+
733+
if (starts > 1 && fails === 0) {
734+
throw new ToolkitError(failedStartMsg(), { code: 'BadDevEnvState', cause: err })
735+
} else if (fails > 0) {
736+
throw new ToolkitError(failedStartMsg(), { code: 'FailedDevEnv', cause: err })
737+
}
738+
})
739+
703740
if (!devenv) {
704-
doLog('info', 'devenv failed to start')
705-
throw new ToolkitError('Dev Environment failed to start', { code: 'Timeout' })
741+
doLog('error', 'devenv failed to start (timeout)')
742+
throw new ToolkitError(failedStartMsg(), { code: 'Timeout' })
706743
}
707744
doLog('info', 'devenv started')
708745

0 commit comments

Comments
 (0)