Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 6 additions & 45 deletions cloud/packages/cloud/src/services/session/UserSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -816,51 +816,12 @@ export class UserSession {
// Mark disposed for leak detection
memoryLeakDetector.markDisposed(`UserSession:${this.userId}`);

// GC after disconnect — force garbage collection and measure how much is freed.
// Rate-limited: at most once per 10 seconds across all sessions to avoid
// thrashing GC during crash cascades (many sessions disconnecting at once).
if (this.userId && UserSession.canRunPostDisconnectGc()) {
const gcLogger = rootLogger.child({ service: "gc-after-disconnect" });
setTimeout(() => {
try {
const memBefore = process.memoryUsage();
const t0 = performance.now();
Bun.gc(true);
const gcDurationMs = performance.now() - t0;
const memAfter = process.memoryUsage();
const freedBytes = memBefore.heapUsed - memAfter.heapUsed;

gcLogger.info(
{
feature: "gc-after-disconnect",
userId: this.userId,
gcDurationMs: Math.round(gcDurationMs * 10) / 10,
heapBeforeMB: Math.round(memBefore.heapUsed / 1048576),
heapAfterMB: Math.round(memAfter.heapUsed / 1048576),
freedMB: Math.round(freedBytes / 1048576),
rssMB: Math.round(memAfter.rss / 1048576),
sessionDurationSeconds,
},
`GC after disconnect (${this.userId}): ${gcDurationMs.toFixed(1)}ms, freed ${Math.round(freedBytes / 1048576)}MB`,
);
} catch (error) {
gcLogger.error(error, "GC after disconnect failed");
}
}, 0);
}
}

// Rate limiter for post-disconnect GC: at most once per 10 seconds
private static lastPostDisconnectGc: number = 0;
private static POST_DISCONNECT_GC_COOLDOWN_MS = 10_000;

private static canRunPostDisconnectGc(): boolean {
const now = Date.now();
if (now - UserSession.lastPostDisconnectGc < UserSession.POST_DISCONNECT_GC_COOLDOWN_MS) {
return false;
}
UserSession.lastPostDisconnectGc = now;
return true;
// gc-after-disconnect REMOVED — confirmed wasteful:
// 31 calls/hour on US Central, 2,242ms total event loop blocking, freed 0 bytes
// every single time. The gc-probe in SystemVitalsLogger provides the same
// diagnostic data on a fixed 60s schedule without being triggered by user behavior.
// See: cloud/issues/066-ws-disconnect-churn/spec.md (A7)
// See: cloud/issues/067-heap-growth-investigation/spike.md
}

/**
Expand Down
27 changes: 27 additions & 0 deletions cloud/porter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,37 @@ services:
# connections because no client→server traffic flows after audio moved to UDP.
# 3600s prevents nginx from killing healthy WS connections.
# See: cloud/issues/035-nginx-ws-timeout/spike.md
# Liveness probe — "is the process alive?"
# Points to /livez which just returns "ok" with zero computation.
# Previously pointed to /health which iterates all sessions, counts
# WebSockets, updates gauges, and serializes JSON — too much work for
# an "are you alive?" check, especially under load.
# See: cloud/issues/057-cloud-observability/spec.md (1d)
livenessCheck:
enabled: true
httpPath: /livez
timeoutSeconds: 3
initialDelaySeconds: 15
# Readiness probe — "can this pod handle traffic?"
# Points to /health which returns session count, metrics, and memory info.
# If this fails, the pod is removed from the load balancer but NOT killed.
readinessCheck:
enabled: true
httpPath: /health
timeoutSeconds: 5
initialDelaySeconds: 15
# Extended timeouts for WebSocket connections (/glasses-ws, /app-ws).
# Porter sets proxy-send-timeout: 60s by default, which kills idle WS
# connections because no client→server traffic flows after audio moved to UDP.
# 3600s prevents nginx from killing healthy WS connections.
# See: cloud/issues/035-nginx-ws-timeout/spike.md
ingressAnnotations:
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
# Graceful shutdown — give WebSocket close frames time to flush
# See: cloud/issues/063-graceful-shutdown/spec.md
terminationGracePeriodSeconds: 30
additionalPorts:
- port: 8000
protocol: UDP
Expand Down
Loading