Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/porter-debug.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ on:
- hotfix/crash-diagnostics
- cloud/062-mongodb-audit
- hotfix/graceful-shutdown
- hotfix/remove-forced-gc-add-livez
paths:
- "cloud/**"
- ".github/workflows/porter-debug.yml"
Expand Down
51 changes: 6 additions & 45 deletions cloud/packages/cloud/src/services/session/UserSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -816,51 +816,12 @@ export class UserSession {
// Mark disposed for leak detection
memoryLeakDetector.markDisposed(`UserSession:${this.userId}`);

// GC after disconnect — force garbage collection and measure how much is freed.
// Rate-limited: at most once per 10 seconds across all sessions to avoid
// thrashing GC during crash cascades (many sessions disconnecting at once).
if (this.userId && UserSession.canRunPostDisconnectGc()) {
const gcLogger = rootLogger.child({ service: "gc-after-disconnect" });
setTimeout(() => {
try {
const memBefore = process.memoryUsage();
const t0 = performance.now();
Bun.gc(true);
const gcDurationMs = performance.now() - t0;
const memAfter = process.memoryUsage();
const freedBytes = memBefore.heapUsed - memAfter.heapUsed;

gcLogger.info(
{
feature: "gc-after-disconnect",
userId: this.userId,
gcDurationMs: Math.round(gcDurationMs * 10) / 10,
heapBeforeMB: Math.round(memBefore.heapUsed / 1048576),
heapAfterMB: Math.round(memAfter.heapUsed / 1048576),
freedMB: Math.round(freedBytes / 1048576),
rssMB: Math.round(memAfter.rss / 1048576),
sessionDurationSeconds,
},
`GC after disconnect (${this.userId}): ${gcDurationMs.toFixed(1)}ms, freed ${Math.round(freedBytes / 1048576)}MB`,
);
} catch (error) {
gcLogger.error(error, "GC after disconnect failed");
}
}, 0);
}
}

// Rate limiter for post-disconnect GC: at most once per 10 seconds
private static lastPostDisconnectGc: number = 0;
private static POST_DISCONNECT_GC_COOLDOWN_MS = 10_000;

private static canRunPostDisconnectGc(): boolean {
const now = Date.now();
if (now - UserSession.lastPostDisconnectGc < UserSession.POST_DISCONNECT_GC_COOLDOWN_MS) {
return false;
}
UserSession.lastPostDisconnectGc = now;
return true;
// gc-after-disconnect REMOVED — confirmed wasteful:
// 31 calls/hour on US Central, 2,242ms total event loop blocking, freed 0 bytes
// every single time. The gc-probe in SystemVitalsLogger provides the same
// diagnostic data on a fixed 60s schedule without being triggered by user behavior.
// See: cloud/issues/066-ws-disconnect-churn/spec.md (A7)
// See: cloud/issues/067-heap-growth-investigation/spike.md
}

/**
Expand Down
30 changes: 30 additions & 0 deletions cloud/porter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,25 @@ services:
SERVICE_NAME: "cloud"
RTMP_RELAY_URLS: "rtmp-relay-uscentral.mentra.glass:1935"
LOG_LEVEL: "info"
# Liveness probe — "is the process alive?"
# Points to /livez which just returns "ok" with zero computation.
# Previously pointed to /health which iterates all sessions, counts
# WebSockets, updates gauges, and serializes JSON — too much work for
# an "are you alive?" check, especially under load.
# See: cloud/issues/057-cloud-observability/spec.md (1d)
livenessCheck:
enabled: true
httpPath: /livez
timeoutSeconds: 3
initialDelaySeconds: 15
# Readiness probe — "can this pod handle traffic?"
# Points to /health which returns session count, metrics, and memory info.
# If this fails, the pod is removed from the load balancer but NOT killed.
readinessCheck:
enabled: true
httpPath: /health
timeoutSeconds: 5
initialDelaySeconds: 15
# Extended timeouts for WebSocket connections (/glasses-ws, /app-ws).
# Porter sets proxy-send-timeout: 60s by default, which kills idle WS
# connections because no client→server traffic flows after audio moved to UDP.
Expand All @@ -27,6 +46,17 @@ services:
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
# Prometheus metrics scraping — exposes mentra_user_sessions, mentra_miniapp_sessions,
# mentra_event_loop_lag_ms, mentra_udp_packets_received_total, etc. to Porter's
# built-in metrics dashboard. The /metrics endpoint is already on port 80.
metricsScraping:
enabled: true
path: /metrics
port: 80
scrapeIntervalSeconds: 30
# Graceful shutdown — give WebSocket close frames time to flush
# See: cloud/issues/063-graceful-shutdown/spec.md
terminationGracePeriodSeconds: 30
additionalPorts:
- port: 8000
protocol: UDP
Expand Down
Loading