Skip to content

Commit 1f68be9

Browse files
authored
fix: SSH connection is terminated when agent is marked as disconnected/timed out (#247)
For a bit of context, Coder Toolbox can establish an SSH connection when the workspace is in the RUNNING state and the agent is in the READY state. If the connection is already established but for some reason the agent does not respond to pings, the connection is terminated by signaling to Toolbox that the workspace's overall status has changed to a non-reachable state. In reality, the connection can still work. After some code research and testing, it looks like `coder ssh` waits for the agent to be connected before establishing the SSH connection. This allows us to mark the workspace as reachable as soon as it hits the RUNNING state, regardless of the agent status. The PR also changes the log level from debug to info for the lines that report the workspace and agent status. This change should help with faster debugging in the future. The logs will not be spammed every 5 seconds with this report; instead, the line will be logged only when the workspace build changes. - resolves #246
1 parent 9e14b30 commit 1f68be9

File tree

4 files changed

+100
-74
lines changed

4 files changed

+100
-74
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## Unreleased
44

5+
### Fixed
6+
7+
- improved SSH connection reliability during transient network failures
8+
59
## 0.8.3 - 2026-01-14
610

711
### Fixed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
version=0.8.3
1+
version=0.8.4
22
group=com.coder.toolbox
33
name=coder-toolbox

src/main/kotlin/com/coder/toolbox/CoderRemoteEnvironment.kt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ class CoderRemoteEnvironment(
133133
// cli takes 15 seconds to move the workspace in queueing/starting state
134134
// while the user won't see anything happening in TBX after start is clicked
135135
// During those 15 seconds we work around this by forcing a `Queued` state
136-
updateStatus(WorkspaceAndAgentStatus.QUEUED)
136+
updateStatus(WorkspaceAndAgentStatus.Queued(workspace))
137137
// force refresh of the actions list (Start should no longer be available)
138138
refreshAvailableActions()
139139
})
@@ -279,7 +279,7 @@ class CoderRemoteEnvironment(
279279
state.update {
280280
environmentStatus.toRemoteEnvironmentState(context)
281281
}
282-
context.logger.debug("Overall status for workspace $id is $environmentStatus. Workspace status: ${workspace.latestBuild.status}, agent status: ${agent.status}, agent lifecycle state: ${agent.lifecycleState}, login before ready: ${agent.loginBeforeReady}")
282+
context.logger.info("Overall status for workspace $id is $environmentStatus. Workspace status: ${workspace.latestBuild.status}, agent status: ${agent.status}, agent lifecycle state: ${agent.lifecycleState}, login before ready: ${agent.loginBeforeReady}")
283283
}
284284

285285
/**
@@ -323,14 +323,14 @@ class CoderRemoteEnvironment(
323323
// mark the env as deleting otherwise we will have to
324324
// wait for the poller to update the status in the next 5 seconds
325325
state.update {
326-
WorkspaceAndAgentStatus.DELETING.toRemoteEnvironmentState(context)
326+
WorkspaceAndAgentStatus.Deleting(workspace).toRemoteEnvironmentState(context)
327327
}
328328

329329
context.cs.launch(CoroutineName("Workspace Deletion Poller")) {
330330
withTimeout(5.minutes) {
331331
var workspaceStillExists = true
332332
while (context.cs.isActive && workspaceStillExists) {
333-
if (environmentStatus == WorkspaceAndAgentStatus.DELETING || environmentStatus == WorkspaceAndAgentStatus.DELETED) {
333+
if (environmentStatus is WorkspaceAndAgentStatus.Deleting || environmentStatus is WorkspaceAndAgentStatus.Deleted) {
334334
workspaceStillExists = false
335335
context.envPageManager.showPluginEnvironmentsPage()
336336
} else {

src/main/kotlin/com/coder/toolbox/models/WorkspaceAndAgentStatus.kt

Lines changed: 91 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -18,41 +18,60 @@ private val CircularSpinner: EnvironmentStateIcons = EnvironmentStateIcons.Conne
1818
* WorkspaceAndAgentStatus represents the combined status of a single agent and
1919
* its workspace (or just the workspace if there are no agents).
2020
*/
21-
enum class WorkspaceAndAgentStatus(val label: String, val description: String) {
21+
sealed class WorkspaceAndAgentStatus(
22+
val label: String,
23+
val workspace: Workspace
24+
) {
2225
// Workspace states.
23-
QUEUED("Queued", "The workspace is queueing to start."),
24-
STARTING("Starting", "The workspace is starting."),
25-
FAILED("Failed", "The workspace has failed to start."),
26-
DELETING("Deleting", "The workspace is being deleted."),
27-
DELETED("Deleted", "The workspace has been deleted."),
28-
STOPPING("Stopping", "The workspace is stopping."),
29-
STOPPED("Stopped", "The workspace has stopped."),
30-
CANCELING("Canceling action", "The workspace is being canceled."),
31-
CANCELED("Canceled action", "The workspace has been canceled."),
32-
RUNNING("Running", "The workspace is running, waiting for agents."),
26+
class Queued(workspace: Workspace) : WorkspaceAndAgentStatus("Queued", workspace)
27+
28+
class Starting(workspace: Workspace) : WorkspaceAndAgentStatus("Starting", workspace)
29+
30+
class Failed(workspace: Workspace) : WorkspaceAndAgentStatus("Failed", workspace)
31+
32+
class Deleting(workspace: Workspace) : WorkspaceAndAgentStatus("Deleting", workspace)
33+
34+
class Deleted(workspace: Workspace) :
35+
WorkspaceAndAgentStatus("Deleted", workspace)
36+
37+
class Stopping(workspace: Workspace) : WorkspaceAndAgentStatus("Stopping", workspace)
38+
39+
class Stopped(workspace: Workspace) : WorkspaceAndAgentStatus("Stopped", workspace)
40+
41+
class Canceling(workspace: Workspace) : WorkspaceAndAgentStatus("Canceling action", workspace)
42+
43+
class Canceled(workspace: Workspace) : WorkspaceAndAgentStatus("Canceled action", workspace)
44+
45+
class Running(workspace: Workspace) : WorkspaceAndAgentStatus("Running", workspace)
3346

3447
// Agent states.
35-
CONNECTING("Connecting", "The agent is connecting."),
36-
DISCONNECTED("Disconnected", "The agent has disconnected."),
37-
TIMEOUT("Timeout", "The agent is taking longer than expected to connect."),
38-
AGENT_STARTING("Starting", "The startup script is running."),
39-
AGENT_STARTING_READY(
40-
"Starting",
41-
"The startup script is still running but the agent is ready to accept connections.",
42-
),
43-
CREATED("Created", "The agent has been created."),
44-
START_ERROR("Started with error", "The agent is ready but the startup script errored."),
45-
START_TIMEOUT("Starting", "The startup script is taking longer than expected."),
46-
START_TIMEOUT_READY(
47-
"Starting",
48-
"The startup script is taking longer than expected but the agent is ready to accept connections.",
49-
),
50-
SHUTTING_DOWN("Shutting down", "The agent is shutting down."),
51-
SHUTDOWN_ERROR("Shutdown with error", "The agent shut down but the shutdown script errored."),
52-
SHUTDOWN_TIMEOUT("Shutting down", "The shutdown script is taking longer than expected."),
53-
OFF("Off", "The agent has shut down."),
54-
READY("Ready", "The agent is ready to accept connections."),
55-
;
48+
class Connecting(workspace: Workspace) : WorkspaceAndAgentStatus("Connecting", workspace)
49+
50+
class Disconnected(workspace: Workspace) : WorkspaceAndAgentStatus("Disconnected", workspace)
51+
52+
class Timeout(workspace: Workspace) : WorkspaceAndAgentStatus("Timeout", workspace)
53+
54+
class AgentStarting(workspace: Workspace) : WorkspaceAndAgentStatus("Starting", workspace)
55+
56+
class AgentStartingReady(workspace: Workspace) : WorkspaceAndAgentStatus("Starting", workspace)
57+
58+
class Created(workspace: Workspace) : WorkspaceAndAgentStatus("Created", workspace)
59+
60+
class StartError(workspace: Workspace) : WorkspaceAndAgentStatus("Started with error", workspace)
61+
62+
class StartTimeout(workspace: Workspace) : WorkspaceAndAgentStatus("Starting", workspace)
63+
64+
class StartTimeoutReady(workspace: Workspace) : WorkspaceAndAgentStatus("Starting", workspace)
65+
66+
class ShuttingDown(workspace: Workspace) : WorkspaceAndAgentStatus("Shutting down", workspace)
67+
68+
class ShutdownError(workspace: Workspace) : WorkspaceAndAgentStatus("Shutdown with error", workspace)
69+
70+
class ShutdownTimeout(workspace: Workspace) : WorkspaceAndAgentStatus("Shutting down", workspace)
71+
72+
class Off(workspace: Workspace) : WorkspaceAndAgentStatus("Off", workspace)
73+
74+
class Ready(workspace: Workspace) : WorkspaceAndAgentStatus("Ready", workspace)
5675

5776
/**
5877
* Return the environment state for Toolbox, which tells it the label, color
@@ -63,29 +82,29 @@ enum class WorkspaceAndAgentStatus(val label: String, val description: String) {
6382
*/
6483
fun toRemoteEnvironmentState(context: CoderToolboxContext): CustomRemoteEnvironmentStateV2 {
6584
return CustomRemoteEnvironmentStateV2(
66-
context.i18n.pnotr(label),
85+
label = context.i18n.pnotr(label),
6786
color = getStateColor(context),
68-
isReachable = ready() || unhealthy(),
87+
isReachable = this.workspace.latestBuild.status == WorkspaceStatus.RUNNING,
6988
// TODO@JB: How does this work? Would like a spinner for pending states.
7089
iconId = getStateIcon().id,
7190
isPriorityShow = true
7291
)
7392
}
7493

7594
private fun getStateColor(context: CoderToolboxContext): StateColor {
76-
return if (this == FAILED) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.FailedToStart)
77-
else if (this == DELETING) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Deleting)
78-
else if (this == DELETED) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Deleted)
95+
return if (this is Failed) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.FailedToStart)
96+
else if (this is Deleting) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Deleting)
97+
else if (this is Deleted) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Deleted)
7998
else if (ready()) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Active)
8099
else if (unhealthy()) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Unhealthy)
81-
else if (canStart() || this == STOPPING) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Hibernating)
100+
else if (canStart() || this is Stopping) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Hibernating)
82101
else if (pending()) context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Activating)
83102
else context.envStateColorPalette.getColor(StandardRemoteEnvironmentState.Unreachable)
84103
}
85104

86105
private fun getStateIcon(): EnvironmentStateIcons {
87-
return if (this == FAILED) EnvironmentStateIcons.Error
88-
else if (pending() || this == DELETING || this == DELETED || this == STOPPING) CircularSpinner
106+
return if (this is Failed) EnvironmentStateIcons.Error
107+
else if (pending() || this is Deleting || this is Deleted || this is Stopping) CircularSpinner
89108
else if (ready() || unhealthy()) EnvironmentStateIcons.Active
90109
else if (canStart()) EnvironmentStateIcons.Offline
91110
else EnvironmentStateIcons.NoIcon
@@ -94,27 +113,24 @@ enum class WorkspaceAndAgentStatus(val label: String, val description: String) {
94113
/**
95114
* Return true if the agent is in a connectable state.
96115
*/
97-
fun ready(): Boolean = this == READY
116+
fun ready(): Boolean = this is Ready
98117

99118
fun unhealthy(): Boolean {
100-
return listOf(START_ERROR, START_TIMEOUT_READY)
101-
.contains(this)
119+
return this is StartError || this is StartTimeoutReady
102120
}
103121

104122
/**
105123
* Return true if the agent might soon be in a connectable state.
106124
*/
107125
fun pending(): Boolean {
108126
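// NOTE(review): this comment used to say "See ready() for why `CREATED` is not in this list", but `Created` IS part of the check below — confirm which is intended.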
109-
return listOf(CREATED, CONNECTING, TIMEOUT, AGENT_STARTING, START_TIMEOUT, QUEUED, STARTING)
110-
.contains(this)
127+
return this is Created || this is Connecting || this is Timeout || this is AgentStarting || this is StartTimeout || this is Queued || this is Starting
111128
}
112129

113130
/**
114131
* Return true if the workspace can be started.
115132
*/
116-
fun canStart(): Boolean = listOf(STOPPED, FAILED, CANCELED)
117-
.contains(this)
133+
fun canStart(): Boolean = this is Stopped || this is Failed || this is Canceled
118134

119135
/**
120136
* Return true if the workspace can be stopped.
@@ -140,36 +156,42 @@ enum class WorkspaceAndAgentStatus(val label: String, val description: String) {
140156
workspace: Workspace,
141157
agent: WorkspaceAgent? = null,
142158
) = when (workspace.latestBuild.status) {
143-
WorkspaceStatus.PENDING -> QUEUED
144-
WorkspaceStatus.STARTING -> STARTING
159+
WorkspaceStatus.PENDING -> Queued(workspace)
160+
WorkspaceStatus.STARTING -> Starting(workspace)
145161
WorkspaceStatus.RUNNING ->
146162
when (agent?.status) {
147163
WorkspaceAgentStatus.CONNECTED ->
148164
when (agent.lifecycleState) {
149-
WorkspaceAgentLifecycleState.CREATED -> CREATED
150-
WorkspaceAgentLifecycleState.STARTING -> if (agent.loginBeforeReady == true) AGENT_STARTING_READY else AGENT_STARTING
151-
WorkspaceAgentLifecycleState.START_TIMEOUT -> if (agent.loginBeforeReady == true) START_TIMEOUT_READY else START_TIMEOUT
152-
WorkspaceAgentLifecycleState.START_ERROR -> START_ERROR
153-
WorkspaceAgentLifecycleState.READY -> READY
154-
WorkspaceAgentLifecycleState.SHUTTING_DOWN -> SHUTTING_DOWN
155-
WorkspaceAgentLifecycleState.SHUTDOWN_TIMEOUT -> SHUTDOWN_TIMEOUT
156-
WorkspaceAgentLifecycleState.SHUTDOWN_ERROR -> SHUTDOWN_ERROR
157-
WorkspaceAgentLifecycleState.OFF -> OFF
165+
WorkspaceAgentLifecycleState.CREATED -> Created(workspace)
166+
WorkspaceAgentLifecycleState.STARTING -> if (agent.loginBeforeReady == true) AgentStartingReady(
167+
workspace
168+
) else AgentStarting(workspace)
169+
170+
WorkspaceAgentLifecycleState.START_TIMEOUT -> if (agent.loginBeforeReady == true) StartTimeoutReady(
171+
workspace
172+
) else StartTimeout(workspace)
173+
174+
WorkspaceAgentLifecycleState.START_ERROR -> StartError(workspace)
175+
WorkspaceAgentLifecycleState.READY -> Ready(workspace)
176+
WorkspaceAgentLifecycleState.SHUTTING_DOWN -> ShuttingDown(workspace)
177+
WorkspaceAgentLifecycleState.SHUTDOWN_TIMEOUT -> ShutdownTimeout(workspace)
178+
WorkspaceAgentLifecycleState.SHUTDOWN_ERROR -> ShutdownError(workspace)
179+
WorkspaceAgentLifecycleState.OFF -> Off(workspace)
158180
}
159181

160-
WorkspaceAgentStatus.DISCONNECTED -> DISCONNECTED
161-
WorkspaceAgentStatus.TIMEOUT -> TIMEOUT
162-
WorkspaceAgentStatus.CONNECTING -> CONNECTING
163-
else -> RUNNING
182+
WorkspaceAgentStatus.DISCONNECTED -> Disconnected(workspace)
183+
WorkspaceAgentStatus.TIMEOUT -> Timeout(workspace)
184+
WorkspaceAgentStatus.CONNECTING -> Connecting(workspace)
185+
else -> Running(workspace)
164186
}
165187

166-
WorkspaceStatus.STOPPING -> STOPPING
167-
WorkspaceStatus.STOPPED -> STOPPED
168-
WorkspaceStatus.FAILED -> FAILED
169-
WorkspaceStatus.CANCELING -> CANCELING
170-
WorkspaceStatus.CANCELED -> CANCELED
171-
WorkspaceStatus.DELETING -> DELETING
172-
WorkspaceStatus.DELETED -> DELETED
188+
WorkspaceStatus.STOPPING -> Stopping(workspace)
189+
WorkspaceStatus.STOPPED -> Stopped(workspace)
190+
WorkspaceStatus.FAILED -> Failed(workspace)
191+
WorkspaceStatus.CANCELING -> Canceling(workspace)
192+
WorkspaceStatus.CANCELED -> Canceled(workspace)
193+
WorkspaceStatus.DELETING -> Deleting(workspace)
194+
WorkspaceStatus.DELETED -> Deleted(workspace)
173195
}
174196
}
175197
}

0 commit comments

Comments
 (0)