diff --git a/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json b/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json new file mode 100644 index 00000000..9f009017 --- /dev/null +++ b/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json @@ -0,0 +1,53 @@ +{ + "id": "traj_ub8csuv3lcv4", + "version": 1, + "task": { + "title": "Fix WebSocket disconnections for workspace instances", + "source": { + "system": "plain", + "id": "workspace-websocket-stability" + } + }, + "status": "completed", + "startedAt": "2026-01-06T18:13:23.603Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-01-06T18:13:35.723Z" + } + ], + "chapters": [ + { + "id": "chap_e4vj0j8ig7ma", + "title": "Work", + "agentName": "default", + "startedAt": "2026-01-06T18:13:35.723Z", + "events": [ + { + "ts": 1767723215725, + "type": "decision", + "content": "Add ping/pong keepalive to main and bridge WebSockets: Add ping/pong keepalive to main and bridge WebSockets", + "raw": { + "question": "Add ping/pong keepalive to main and bridge WebSockets", + "chosen": "Add ping/pong keepalive to main and bridge WebSockets", + "alternatives": [], + "reasoning": "Main dashboard and bridge WebSocket endpoints were missing ping/pong keepalive, while logs and presence endpoints had it. Without keepalive, TCP/proxy timeouts kill idle connections (typically 60-120s)." + }, + "significance": "high" + } + ], + "endedAt": "2026-01-06T18:16:51.462Z" + } + ], + "commits": [], + "filesChanged": [], + "projectId": "/home/user/relay", + "tags": [], + "completedAt": "2026-01-06T18:16:51.462Z", + "retrospective": { + "summary": "Added ping/pong keepalive to main and bridge WebSockets to fix connection instability", + "approach": "Standard approach", + "confidence": 0.9 + } +} \ No newline at end of file diff --git a/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.md b/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.md new file mode 100644 index 00000000..e04f795c --- /dev/null +++ b/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.md @@ -0,0 +1,32 @@ +# Trajectory: Fix WebSocket disconnections for workspace instances + +> **Status:** ✅ Completed +> **Task:** workspace-websocket-stability +> **Confidence:** 90% +> **Started:** January 6, 2026 at 06:13 PM +> **Completed:** January 6, 2026 at 06:16 PM + +--- + +## Summary + +Added ping/pong keepalive to main and bridge WebSockets to fix connection instability + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Add ping/pong keepalive to main and bridge WebSockets +- **Chose:** Add ping/pong keepalive to main and bridge WebSockets +- **Reasoning:** Main dashboard and bridge WebSocket endpoints were missing ping/pong keepalive, while logs and presence endpoints had it. Without keepalive, TCP/proxy timeouts kill idle connections (typically 60-120s). + +--- + +## Chapters + +### 1. Work +*Agent: default* + +- Add ping/pong keepalive to main and bridge WebSockets: Add ping/pong keepalive to main and bridge WebSockets diff --git a/.trajectories/index.json b/.trajectories/index.json index e4820323..51e651bc 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-01-06T17:12:56.941Z", + "lastUpdated": "2026-01-06T18:16:51.488Z", "trajectories": { "traj_ozd98si6a7ns": { "title": "Fix thinking indicator showing on all messages", @@ -365,6 +365,13 @@ "startedAt": "2026-01-06T17:11:57.504Z", "completedAt": "2026-01-06T17:12:56.919Z", "path": "/home/user/relay/.trajectories/completed/2026-01/traj_v9dkdoxylyid.json" + }, + "traj_ub8csuv3lcv4": { + "title": "Fix WebSocket disconnections for workspace instances", + "status": "completed", + "startedAt": "2026-01-06T18:13:23.603Z", + "completedAt": "2026-01-06T18:16:51.462Z", + "path": "/home/user/relay/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json" } } -} \ No newline at end of file +} diff --git a/src/daemon/api.ts b/src/daemon/api.ts index 9687daff..17be319e 100644 --- a/src/daemon/api.ts +++ b/src/daemon/api.ts @@ -55,6 +55,10 @@ export class DaemonApi extends EventEmitter { private allowedOrigins: Set; private allowAllOrigins: boolean; + // Track alive status for ping/pong keepalive + private clientAlive = new WeakMap(); + private pingInterval?: NodeJS.Timeout; + constructor(config: ApiDaemonConfig) { super(); this.config = config; @@ -127,6 +131,19 @@ export class DaemonApi extends EventEmitter { this.wss = new WebSocketServer({ server: this.server }); this.wss.on('connection', (ws, req) => this.handleWebSocketConnection(ws, req)); + // Setup ping/pong keepalive (30 second interval) + this.pingInterval = setInterval(() => { + this.wss?.clients.forEach((ws) => { + if (this.clientAlive.get(ws) === false) { + logger.info('WebSocket client unresponsive, closing'); + ws.terminate(); + return; + } + this.clientAlive.set(ws, false); + ws.ping(); + }); + }, 30000); + this.server.listen(this.config.port, this.config.host, () => { logger.info('Daemon API started', { port: this.config.port, host: this.config.host }); resolve(); @@ -138,6 +155,12 @@ export class DaemonApi extends EventEmitter { * Stop the API server */ async stop(): Promise { + // Clear ping interval + if (this.pingInterval) { + clearInterval(this.pingInterval); + this.pingInterval = undefined; + } + // Close all WebSocket connections if (this.wss) { for (const ws of this.wss.clients) { @@ -576,6 +599,14 @@ export class DaemonApi extends EventEmitter { private handleWebSocketConnection(ws: WS, req: http.IncomingMessage): void { logger.info('WebSocket client connected', { url: req.url }); + // Mark client as alive for ping/pong keepalive + this.clientAlive.set(ws, true); + + // Handle pong responses + ws.on('pong', () => { + this.clientAlive.set(ws, true); + }); + // Create session const session: UserSession = { userId: 'anonymous', // Would be set from auth diff --git a/src/daemon/orchestrator.ts b/src/daemon/orchestrator.ts index 9a920ab1..cf537c5a 100644 --- a/src/daemon/orchestrator.ts +++ b/src/daemon/orchestrator.ts @@ -70,6 +70,10 @@ export class Orchestrator extends EventEmitter { }); private workspacesFile: string; + // Track alive status for ping/pong keepalive + private clientAlive = new WeakMap(); + private pingInterval?: NodeJS.Timeout; + constructor(config: Partial = {}) { super(); this.config = { ...DEFAULT_CONFIG, ...config }; @@ -112,6 +116,19 @@ export class Orchestrator extends EventEmitter { this.wss = new WebSocketServer({ server: this.server }); this.wss.on('connection', (ws, req) => this.handleWebSocket(ws, req)); + // Setup ping/pong keepalive (30 second interval) + this.pingInterval = setInterval(() => { + this.wss?.clients.forEach((ws) => { + if (this.clientAlive.get(ws) === false) { + logger.info('WebSocket client unresponsive, closing'); + ws.terminate(); + return; + } + this.clientAlive.set(ws, false); + ws.ping(); + }); + }, 30000); + return new Promise((resolve) => { this.server!.listen(this.config.port, this.config.host, () => { logger.info('Orchestrator started', { @@ -128,6 +145,12 @@ export class Orchestrator extends EventEmitter { async stop(): Promise { logger.info('Stopping orchestrator'); + // Clear ping interval + if (this.pingInterval) { + clearInterval(this.pingInterval); + this.pingInterval = undefined; + } + // Stop all workspace daemons for (const [id] of this.workspaces) { await this.stopWorkspaceDaemon(id); @@ -606,6 +629,14 @@ export class Orchestrator extends EventEmitter { private handleWebSocket(ws: WebSocket, _req: http.IncomingMessage): void { logger.info('WebSocket client connected'); + // Mark client as alive for ping/pong keepalive + this.clientAlive.set(ws, true); + + // Handle pong responses + ws.on('pong', () => { + this.clientAlive.set(ws, true); + }); + const session: UserSession = { userId: 'anonymous', githubUsername: 'anonymous', diff --git a/src/dashboard-server/server.ts b/src/dashboard-server/server.ts index 795b82e0..725d5b14 100644 --- a/src/dashboard-server/server.ts +++ b/src/dashboard-server/server.ts @@ -501,6 +501,52 @@ export async function startDashboard( // Track log subscriptions: agentName -> Set of WebSocket clients const logSubscriptions = new Map>(); + // Track alive status for ping/pong keepalive on main dashboard connections + // This prevents TCP/proxy timeouts from killing idle workspace connections + const mainClientAlive = new WeakMap(); + + // Track alive status for ping/pong keepalive on bridge connections + const bridgeClientAlive = new WeakMap(); + + // Ping interval for main dashboard WebSocket connections (30 seconds) + // Aligns with heartbeat timeout (5s heartbeat * 6 multiplier = 30s) + const MAIN_PING_INTERVAL_MS = 30000; + const mainPingInterval = setInterval(() => { + wss.clients.forEach((ws) => { + if (mainClientAlive.get(ws) === false) { + // Client didn't respond to last ping - close gracefully + console.log('[dashboard] Main WebSocket client unresponsive, closing gracefully'); + ws.close(1000, 'unresponsive'); + return; + } + // Mark as not alive until we get a pong + mainClientAlive.set(ws, false); + ws.ping(); + }); + }, MAIN_PING_INTERVAL_MS); + + // Ping interval for bridge WebSocket connections (30 seconds) + const BRIDGE_PING_INTERVAL_MS = 30000; + const bridgePingInterval = setInterval(() => { + wssBridge.clients.forEach((ws) => { + if (bridgeClientAlive.get(ws) === false) { + console.log('[dashboard] Bridge WebSocket client unresponsive, closing gracefully'); + ws.close(1000, 'unresponsive'); + return; + } + bridgeClientAlive.set(ws, false); + ws.ping(); + }); + }, BRIDGE_PING_INTERVAL_MS); + + // Clean up ping intervals on server close + wss.on('close', () => { + clearInterval(mainPingInterval); + }); + wssBridge.on('close', () => { + clearInterval(bridgePingInterval); + }); + // Track online users for presence with multi-tab support // username -> { connections: Set, userInfo } interface UserPresenceInfo { @@ -1578,6 +1624,14 @@ export async function startDashboard( wss.on('connection', async (ws, req) => { console.log('[dashboard] WebSocket client connected from:', req.socket.remoteAddress); + // Mark client as alive initially for ping/pong keepalive + mainClientAlive.set(ws, true); + + // Handle pong responses (keep connection alive) + ws.on('pong', () => { + mainClientAlive.set(ws, true); + }); + // Mark as initializing to prevent broadcastData from sending before we do initializingClients.add(ws); @@ -1618,6 +1672,14 @@ export async function startDashboard( wssBridge.on('connection', async (ws) => { console.log('[dashboard] Bridge WebSocket client connected'); + // Mark client as alive initially for ping/pong keepalive + bridgeClientAlive.set(ws, true); + + // Handle pong responses (keep connection alive) + ws.on('pong', () => { + bridgeClientAlive.set(ws, true); + }); + try { const data = await getBridgeData(); const payload = JSON.stringify(data);