Skip to content

Commit 0ee58fb

Browse files
authored
Merge pull request #86 from AgentWorkforce/claude/fix-workspace-disconnections-c4F68
fix: add ping/pong keepalive to main and bridge WebSockets
2 parents 562d86a + 2ae1a7a commit 0ee58fb

File tree

6 files changed

+218
-2
lines changed

6 files changed

+218
-2
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
{
2+
"id": "traj_ub8csuv3lcv4",
3+
"version": 1,
4+
"task": {
5+
"title": "Fix WebSocket disconnections for workspace instances",
6+
"source": {
7+
"system": "plain",
8+
"id": "workspace-websocket-stability"
9+
}
10+
},
11+
"status": "completed",
12+
"startedAt": "2026-01-06T18:13:23.603Z",
13+
"agents": [
14+
{
15+
"name": "default",
16+
"role": "lead",
17+
"joinedAt": "2026-01-06T18:13:35.723Z"
18+
}
19+
],
20+
"chapters": [
21+
{
22+
"id": "chap_e4vj0j8ig7ma",
23+
"title": "Work",
24+
"agentName": "default",
25+
"startedAt": "2026-01-06T18:13:35.723Z",
26+
"events": [
27+
{
28+
"ts": 1767723215725,
29+
"type": "decision",
30+
"content": "Add ping/pong keepalive to main and bridge WebSockets: Add ping/pong keepalive to main and bridge WebSockets",
31+
"raw": {
32+
"question": "Add ping/pong keepalive to main and bridge WebSockets",
33+
"chosen": "Add ping/pong keepalive to main and bridge WebSockets",
34+
"alternatives": [],
35+
"reasoning": "Main dashboard and bridge WebSocket endpoints were missing ping/pong keepalive, while logs and presence endpoints had it. Without keepalive, TCP/proxy timeouts kill idle connections (typically 60-120s)."
36+
},
37+
"significance": "high"
38+
}
39+
],
40+
"endedAt": "2026-01-06T18:16:51.462Z"
41+
}
42+
],
43+
"commits": [],
44+
"filesChanged": [],
45+
"projectId": "/home/user/relay",
46+
"tags": [],
47+
"completedAt": "2026-01-06T18:16:51.462Z",
48+
"retrospective": {
49+
"summary": "Added ping/pong keepalive to main and bridge WebSockets to fix connection instability",
50+
"approach": "Standard approach",
51+
"confidence": 0.9
52+
}
53+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Trajectory: Fix WebSocket disconnections for workspace instances
2+
3+
> **Status:** ✅ Completed
4+
> **Task:** workspace-websocket-stability
5+
> **Confidence:** 90%
6+
> **Started:** January 6, 2026 at 06:13 PM UTC
7+
> **Completed:** January 6, 2026 at 06:16 PM UTC
8+
9+
---
10+
11+
## Summary
12+
13+
Added ping/pong keepalive to main and bridge WebSockets to fix connection instability
14+
15+
**Approach:** Standard approach
16+
17+
---
18+
19+
## Key Decisions
20+
21+
### Add ping/pong keepalive to main and bridge WebSockets
22+
- **Chose:** Add ping/pong keepalive to main and bridge WebSockets
23+
- **Reasoning:** Main dashboard and bridge WebSocket endpoints were missing ping/pong keepalive, while logs and presence endpoints had it. Without keepalive, TCP/proxy timeouts kill idle connections (typically 60-120s).
24+
25+
---
26+
27+
## Chapters
28+
29+
### 1. Work
30+
*Agent: default*
31+
32+
- Decision: Add ping/pong keepalive to main and bridge WebSockets

.trajectories/index.json

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"version": 1,
3-
"lastUpdated": "2026-01-06T17:12:56.941Z",
3+
"lastUpdated": "2026-01-06T18:16:51.488Z",
44
"trajectories": {
55
"traj_ozd98si6a7ns": {
66
"title": "Fix thinking indicator showing on all messages",
@@ -365,6 +365,13 @@
365365
"startedAt": "2026-01-06T17:11:57.504Z",
366366
"completedAt": "2026-01-06T17:12:56.919Z",
367367
"path": "/home/user/relay/.trajectories/completed/2026-01/traj_v9dkdoxylyid.json"
368+
},
369+
"traj_ub8csuv3lcv4": {
370+
"title": "Fix WebSocket disconnections for workspace instances",
371+
"status": "completed",
372+
"startedAt": "2026-01-06T18:13:23.603Z",
373+
"completedAt": "2026-01-06T18:16:51.462Z",
374+
"path": "/home/user/relay/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json"
368375
}
369376
}
370-
}
377+
}

src/daemon/api.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ export class DaemonApi extends EventEmitter {
5555
private allowedOrigins: Set<string>;
5656
private allowAllOrigins: boolean;
5757

58+
// Track alive status for ping/pong keepalive
59+
private clientAlive = new WeakMap<WS, boolean>();
60+
private pingInterval?: NodeJS.Timeout;
61+
5862
constructor(config: ApiDaemonConfig) {
5963
super();
6064
this.config = config;
@@ -127,6 +131,19 @@ export class DaemonApi extends EventEmitter {
127131
this.wss = new WebSocketServer({ server: this.server });
128132
this.wss.on('connection', (ws, req) => this.handleWebSocketConnection(ws, req));
129133

134+
// Setup ping/pong keepalive (30 second interval)
135+
this.pingInterval = setInterval(() => {
136+
this.wss?.clients.forEach((ws) => {
137+
if (this.clientAlive.get(ws) === false) {
138+
logger.info('WebSocket client unresponsive, closing');
139+
ws.terminate();
140+
return;
141+
}
142+
this.clientAlive.set(ws, false);
143+
ws.ping();
144+
});
145+
}, 30000);
146+
130147
this.server.listen(this.config.port, this.config.host, () => {
131148
logger.info('Daemon API started', { port: this.config.port, host: this.config.host });
132149
resolve();
@@ -138,6 +155,12 @@ export class DaemonApi extends EventEmitter {
138155
* Stop the API server
139156
*/
140157
async stop(): Promise<void> {
158+
// Clear ping interval
159+
if (this.pingInterval) {
160+
clearInterval(this.pingInterval);
161+
this.pingInterval = undefined;
162+
}
163+
141164
// Close all WebSocket connections
142165
if (this.wss) {
143166
for (const ws of this.wss.clients) {
@@ -576,6 +599,14 @@ export class DaemonApi extends EventEmitter {
576599
private handleWebSocketConnection(ws: WS, req: http.IncomingMessage): void {
577600
logger.info('WebSocket client connected', { url: req.url });
578601

602+
// Mark client as alive for ping/pong keepalive
603+
this.clientAlive.set(ws, true);
604+
605+
// Handle pong responses
606+
ws.on('pong', () => {
607+
this.clientAlive.set(ws, true);
608+
});
609+
579610
// Create session
580611
const session: UserSession = {
581612
userId: 'anonymous', // Would be set from auth

src/daemon/orchestrator.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ export class Orchestrator extends EventEmitter {
7070
});
7171
private workspacesFile: string;
7272

73+
// Track alive status for ping/pong keepalive
74+
private clientAlive = new WeakMap<WebSocket, boolean>();
75+
private pingInterval?: NodeJS.Timeout;
76+
7377
constructor(config: Partial<OrchestratorConfig> = {}) {
7478
super();
7579
this.config = { ...DEFAULT_CONFIG, ...config };
@@ -112,6 +116,19 @@ export class Orchestrator extends EventEmitter {
112116
this.wss = new WebSocketServer({ server: this.server });
113117
this.wss.on('connection', (ws, req) => this.handleWebSocket(ws, req));
114118

119+
// Setup ping/pong keepalive (30 second interval)
120+
this.pingInterval = setInterval(() => {
121+
this.wss?.clients.forEach((ws) => {
122+
if (this.clientAlive.get(ws) === false) {
123+
logger.info('WebSocket client unresponsive, closing');
124+
ws.terminate();
125+
return;
126+
}
127+
this.clientAlive.set(ws, false);
128+
ws.ping();
129+
});
130+
}, 30000);
131+
115132
return new Promise((resolve) => {
116133
this.server!.listen(this.config.port, this.config.host, () => {
117134
logger.info('Orchestrator started', {
@@ -128,6 +145,12 @@ export class Orchestrator extends EventEmitter {
128145
async stop(): Promise<void> {
129146
logger.info('Stopping orchestrator');
130147

148+
// Clear ping interval
149+
if (this.pingInterval) {
150+
clearInterval(this.pingInterval);
151+
this.pingInterval = undefined;
152+
}
153+
131154
// Stop all workspace daemons
132155
for (const [id] of this.workspaces) {
133156
await this.stopWorkspaceDaemon(id);
@@ -606,6 +629,14 @@ export class Orchestrator extends EventEmitter {
606629
private handleWebSocket(ws: WebSocket, _req: http.IncomingMessage): void {
607630
logger.info('WebSocket client connected');
608631

632+
// Mark client as alive for ping/pong keepalive
633+
this.clientAlive.set(ws, true);
634+
635+
// Handle pong responses
636+
ws.on('pong', () => {
637+
this.clientAlive.set(ws, true);
638+
});
639+
609640
const session: UserSession = {
610641
userId: 'anonymous',
611642
githubUsername: 'anonymous',

src/dashboard-server/server.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,52 @@ export async function startDashboard(
501501
// Track log subscriptions: agentName -> Set of WebSocket clients
502502
const logSubscriptions = new Map<string, Set<WebSocket>>();
503503

504+
// Track alive status for ping/pong keepalive on main dashboard connections
505+
// This prevents TCP/proxy timeouts from killing idle workspace connections
506+
const mainClientAlive = new WeakMap<WebSocket, boolean>();
507+
508+
// Track alive status for ping/pong keepalive on bridge connections
509+
const bridgeClientAlive = new WeakMap<WebSocket, boolean>();
510+
511+
// Ping interval for main dashboard WebSocket connections (30 seconds)
512+
// Aligns with heartbeat timeout (5s heartbeat * 6 multiplier = 30s)
513+
const MAIN_PING_INTERVAL_MS = 30000;
514+
const mainPingInterval = setInterval(() => {
515+
wss.clients.forEach((ws) => {
516+
if (mainClientAlive.get(ws) === false) {
517+
// Client didn't respond to last ping - close gracefully
518+
console.log('[dashboard] Main WebSocket client unresponsive, closing gracefully');
519+
ws.close(1000, 'unresponsive');
520+
return;
521+
}
522+
// Mark as not alive until we get a pong
523+
mainClientAlive.set(ws, false);
524+
ws.ping();
525+
});
526+
}, MAIN_PING_INTERVAL_MS);
527+
528+
// Ping interval for bridge WebSocket connections (30 seconds)
529+
const BRIDGE_PING_INTERVAL_MS = 30000;
530+
const bridgePingInterval = setInterval(() => {
531+
wssBridge.clients.forEach((ws) => {
532+
if (bridgeClientAlive.get(ws) === false) {
533+
console.log('[dashboard] Bridge WebSocket client unresponsive, closing gracefully');
534+
ws.close(1000, 'unresponsive');
535+
return;
536+
}
537+
bridgeClientAlive.set(ws, false);
538+
ws.ping();
539+
});
540+
}, BRIDGE_PING_INTERVAL_MS);
541+
542+
// Clean up ping intervals on server close
543+
wss.on('close', () => {
544+
clearInterval(mainPingInterval);
545+
});
546+
wssBridge.on('close', () => {
547+
clearInterval(bridgePingInterval);
548+
});
549+
504550
// Track online users for presence with multi-tab support
505551
// username -> { connections: Set<WebSocket>, userInfo }
506552
interface UserPresenceInfo {
@@ -1578,6 +1624,14 @@ export async function startDashboard(
15781624
wss.on('connection', async (ws, req) => {
15791625
console.log('[dashboard] WebSocket client connected from:', req.socket.remoteAddress);
15801626

1627+
// Mark client as alive initially for ping/pong keepalive
1628+
mainClientAlive.set(ws, true);
1629+
1630+
// Handle pong responses (keep connection alive)
1631+
ws.on('pong', () => {
1632+
mainClientAlive.set(ws, true);
1633+
});
1634+
15811635
// Mark as initializing to prevent broadcastData from sending before we do
15821636
initializingClients.add(ws);
15831637

@@ -1618,6 +1672,14 @@ export async function startDashboard(
16181672
wssBridge.on('connection', async (ws) => {
16191673
console.log('[dashboard] Bridge WebSocket client connected');
16201674

1675+
// Mark client as alive initially for ping/pong keepalive
1676+
bridgeClientAlive.set(ws, true);
1677+
1678+
// Handle pong responses (keep connection alive)
1679+
ws.on('pong', () => {
1680+
bridgeClientAlive.set(ws, true);
1681+
});
1682+
16211683
try {
16221684
const data = await getBridgeData();
16231685
const payload = JSON.stringify(data);

0 commit comments

Comments
 (0)