Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .trajectories/completed/2026-01/traj_ub8csuv3lcv4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"id": "traj_ub8csuv3lcv4",
"version": 1,
"task": {
"title": "Fix WebSocket disconnections for workspace instances",
"source": {
"system": "plain",
"id": "workspace-websocket-stability"
}
},
"status": "completed",
"startedAt": "2026-01-06T18:13:23.603Z",
"agents": [
{
"name": "default",
"role": "lead",
"joinedAt": "2026-01-06T18:13:35.723Z"
}
],
"chapters": [
{
"id": "chap_e4vj0j8ig7ma",
"title": "Work",
"agentName": "default",
"startedAt": "2026-01-06T18:13:35.723Z",
"events": [
{
"ts": 1767723215725,
"type": "decision",
"content": "Add ping/pong keepalive to main and bridge WebSockets: Add ping/pong keepalive to main and bridge WebSockets",
"raw": {
"question": "Add ping/pong keepalive to main and bridge WebSockets",
"chosen": "Add ping/pong keepalive to main and bridge WebSockets",
"alternatives": [],
"reasoning": "Main dashboard and bridge WebSocket endpoints were missing ping/pong keepalive, while logs and presence endpoints had it. Without keepalive, TCP/proxy timeouts kill idle connections (typically 60-120s)."
},
"significance": "high"
}
],
"endedAt": "2026-01-06T18:16:51.462Z"
}
],
"commits": [],
"filesChanged": [],
"projectId": "/home/user/relay",
"tags": [],
"completedAt": "2026-01-06T18:16:51.462Z",
"retrospective": {
"summary": "Added ping/pong keepalive to main and bridge WebSockets to fix connection instability",
"approach": "Standard approach",
"confidence": 0.9
}
}
32 changes: 32 additions & 0 deletions .trajectories/completed/2026-01/traj_ub8csuv3lcv4.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Trajectory: Fix WebSocket disconnections for workspace instances

> **Status:** ✅ Completed
> **Task:** workspace-websocket-stability
> **Confidence:** 90%
> **Started:** January 6, 2026 at 06:13 PM
> **Completed:** January 6, 2026 at 06:16 PM

---

## Summary

Added ping/pong keepalive to main and bridge WebSockets to fix connection instability

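**Approach:** Server-side ping every 30 seconds on each WebSocket server; a client that has not answered the previous ping with a pong is disconnected.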

---

## Key Decisions

### Add ping/pong keepalive to main and bridge WebSockets
- **Chose:** Add ping/pong keepalive to main and bridge WebSockets
- **Reasoning:** The main dashboard and bridge WebSocket endpoints had no ping/pong keepalive, although the logs and presence endpoints did. Without keepalive traffic, TCP or proxy idle timeouts (typically 60-120s) drop connections that sit idle.

---

## Chapters

### 1. Work
*Agent: default*

- Decision: Add ping/pong keepalive to main and bridge WebSockets
11 changes: 9 additions & 2 deletions .trajectories/index.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"version": 1,
"lastUpdated": "2026-01-06T17:12:56.941Z",
"lastUpdated": "2026-01-06T18:16:51.488Z",
"trajectories": {
"traj_ozd98si6a7ns": {
"title": "Fix thinking indicator showing on all messages",
Expand Down Expand Up @@ -365,6 +365,13 @@
"startedAt": "2026-01-06T17:11:57.504Z",
"completedAt": "2026-01-06T17:12:56.919Z",
"path": "/home/user/relay/.trajectories/completed/2026-01/traj_v9dkdoxylyid.json"
},
"traj_ub8csuv3lcv4": {
"title": "Fix WebSocket disconnections for workspace instances",
"status": "completed",
"startedAt": "2026-01-06T18:13:23.603Z",
"completedAt": "2026-01-06T18:16:51.462Z",
"path": "/home/user/relay/.trajectories/completed/2026-01/traj_ub8csuv3lcv4.json"
}
}
}
}
31 changes: 31 additions & 0 deletions src/daemon/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ export class DaemonApi extends EventEmitter {
private allowedOrigins: Set<string>;
private allowAllOrigins: boolean;

// Track alive status for ping/pong keepalive
private clientAlive = new WeakMap<WS, boolean>();
private pingInterval?: NodeJS.Timeout;

constructor(config: ApiDaemonConfig) {
super();
this.config = config;
Expand Down Expand Up @@ -127,6 +131,19 @@ export class DaemonApi extends EventEmitter {
this.wss = new WebSocketServer({ server: this.server });
this.wss.on('connection', (ws, req) => this.handleWebSocketConnection(ws, req));

// Keepalive sweep, run every 30 seconds: a client whose alive flag is still
// false from the previous sweep is terminated; every other client has its
// flag cleared and is sent a fresh ping.
this.pingInterval = setInterval(() => {
  const sockets = this.wss?.clients;
  if (!sockets) {
    return;
  }
  for (const socket of sockets) {
    const missedPong = this.clientAlive.get(socket) === false;
    if (missedPong) {
      logger.info('WebSocket client unresponsive, closing');
      socket.terminate();
    } else {
      // Cleared here; the connection's 'pong' listener sets it back to true.
      this.clientAlive.set(socket, false);
      socket.ping();
    }
  }
}, 30000);

this.server.listen(this.config.port, this.config.host, () => {
logger.info('Daemon API started', { port: this.config.port, host: this.config.host });
resolve();
Expand All @@ -138,6 +155,12 @@ export class DaemonApi extends EventEmitter {
* Stop the API server
*/
async stop(): Promise<void> {
// Clear ping interval
if (this.pingInterval) {
clearInterval(this.pingInterval);
this.pingInterval = undefined;
}

// Close all WebSocket connections
if (this.wss) {
for (const ws of this.wss.clients) {
Expand Down Expand Up @@ -576,6 +599,14 @@ export class DaemonApi extends EventEmitter {
private handleWebSocketConnection(ws: WS, req: http.IncomingMessage): void {
logger.info('WebSocket client connected', { url: req.url });

// Mark client as alive for ping/pong keepalive
this.clientAlive.set(ws, true);

// Handle pong responses
ws.on('pong', () => {
this.clientAlive.set(ws, true);
});

// Create session
const session: UserSession = {
userId: 'anonymous', // Would be set from auth
Expand Down
31 changes: 31 additions & 0 deletions src/daemon/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ export class Orchestrator extends EventEmitter {
});
private workspacesFile: string;

// Track alive status for ping/pong keepalive
private clientAlive = new WeakMap<WebSocket, boolean>();
private pingInterval?: NodeJS.Timeout;

constructor(config: Partial<OrchestratorConfig> = {}) {
super();
this.config = { ...DEFAULT_CONFIG, ...config };
Expand Down Expand Up @@ -112,6 +116,19 @@ export class Orchestrator extends EventEmitter {
this.wss = new WebSocketServer({ server: this.server });
this.wss.on('connection', (ws, req) => this.handleWebSocket(ws, req));

// Every 30 seconds, walk the connected clients: terminate any client whose
// alive flag was not restored since the previous round, otherwise clear the
// flag and ping the client again.
this.pingInterval = setInterval(() => {
  const connected = this.wss?.clients;
  if (connected === undefined) {
    return;
  }
  for (const client of connected) {
    if (this.clientAlive.get(client) !== false) {
      // Flag stays false until the client's 'pong' listener flips it back.
      this.clientAlive.set(client, false);
      client.ping();
      continue;
    }
    logger.info('WebSocket client unresponsive, closing');
    client.terminate();
  }
}, 30000);

return new Promise((resolve) => {
this.server!.listen(this.config.port, this.config.host, () => {
logger.info('Orchestrator started', {
Expand All @@ -128,6 +145,12 @@ export class Orchestrator extends EventEmitter {
async stop(): Promise<void> {
logger.info('Stopping orchestrator');

// Clear ping interval
if (this.pingInterval) {
clearInterval(this.pingInterval);
this.pingInterval = undefined;
}

// Stop all workspace daemons
for (const [id] of this.workspaces) {
await this.stopWorkspaceDaemon(id);
Expand Down Expand Up @@ -606,6 +629,14 @@ export class Orchestrator extends EventEmitter {
private handleWebSocket(ws: WebSocket, _req: http.IncomingMessage): void {
logger.info('WebSocket client connected');

// Mark client as alive for ping/pong keepalive
this.clientAlive.set(ws, true);

// Handle pong responses
ws.on('pong', () => {
this.clientAlive.set(ws, true);
});

const session: UserSession = {
userId: 'anonymous',
githubUsername: 'anonymous',
Expand Down
62 changes: 62 additions & 0 deletions src/dashboard-server/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,52 @@ export async function startDashboard(
// Track log subscriptions: agentName -> Set of WebSocket clients
const logSubscriptions = new Map<string, Set<WebSocket>>();

// Track alive status for ping/pong keepalive on main dashboard connections
// This prevents TCP/proxy timeouts from killing idle workspace connections
const mainClientAlive = new WeakMap<WebSocket, boolean>();

// Track alive status for ping/pong keepalive on bridge connections
const bridgeClientAlive = new WeakMap<WebSocket, boolean>();

// Keepalive for main dashboard WebSocket connections: every 30 seconds each
// client is pinged, and a client that has not answered the previous ping with
// a pong is dropped.
// NOTE(review): the original comment stated this aligns with the heartbeat
// timeout (5s heartbeat * 6 multiplier = 30s); that setting is not visible
// here — confirm.
const MAIN_PING_INTERVAL_MS = 30000;
const mainPingInterval = setInterval(() => {
  wss.clients.forEach((ws) => {
    if (mainClientAlive.get(ws) === false) {
      // No pong since the last ping. terminate() destroys the socket right
      // away; close() would start a closing handshake with a peer that is not
      // answering, and status 1000 would report the drop as a normal closure.
      console.log('[dashboard] Main WebSocket client unresponsive, terminating');
      ws.terminate();
      return;
    }
    // Mark as not alive until the client's 'pong' listener sets it back to true
    mainClientAlive.set(ws, false);
    ws.ping();
  });
}, MAIN_PING_INTERVAL_MS);

// Keepalive for bridge WebSocket connections: every 30 seconds each client is
// pinged, and a client that has not answered the previous ping with a pong is
// dropped.
const BRIDGE_PING_INTERVAL_MS = 30000;
const bridgePingInterval = setInterval(() => {
  wssBridge.clients.forEach((ws) => {
    if (bridgeClientAlive.get(ws) === false) {
      // No pong since the last ping. terminate() destroys the socket right
      // away; close() would start a closing handshake with a peer that is not
      // answering, and status 1000 would report the drop as a normal closure.
      console.log('[dashboard] Bridge WebSocket client unresponsive, terminating');
      ws.terminate();
      return;
    }
    // Mark as not alive until the client's 'pong' listener sets it back to true
    bridgeClientAlive.set(ws, false);
    ws.ping();
  });
}, BRIDGE_PING_INTERVAL_MS);

// Stop each keepalive timer once its WebSocket server emits 'close'.
wss.on('close', () => clearInterval(mainPingInterval));
wssBridge.on('close', () => clearInterval(bridgePingInterval));

// Track online users for presence with multi-tab support
// username -> { connections: Set<WebSocket>, userInfo }
interface UserPresenceInfo {
Expand Down Expand Up @@ -1578,6 +1624,14 @@ export async function startDashboard(
wss.on('connection', async (ws, req) => {
console.log('[dashboard] WebSocket client connected from:', req.socket.remoteAddress);

// Mark client as alive initially for ping/pong keepalive
mainClientAlive.set(ws, true);

// Handle pong responses (keep connection alive)
ws.on('pong', () => {
mainClientAlive.set(ws, true);
});

// Mark as initializing to prevent broadcastData from sending before we do
initializingClients.add(ws);

Expand Down Expand Up @@ -1618,6 +1672,14 @@ export async function startDashboard(
wssBridge.on('connection', async (ws) => {
console.log('[dashboard] Bridge WebSocket client connected');

// Mark client as alive initially for ping/pong keepalive
bridgeClientAlive.set(ws, true);

// Handle pong responses (keep connection alive)
ws.on('pong', () => {
bridgeClientAlive.set(ws, true);
});

try {
const data = await getBridgeData();
const payload = JSON.stringify(data);
Expand Down