Skip to content

Commit c4bb538

Browse files
authored
Merge pull request #2353 from Mentra-Community/hotfix/remove-forced-gc-add-livez
hotfix: remove gc-after-disconnect, add /livez liveness probe, enable Porter metrics scraping
2 parents a89372d + 24ecba5 commit c4bb538

File tree

3 files changed

+37
-45
lines changed

3 files changed

+37
-45
lines changed

.github/workflows/porter-debug.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ on:
1111
- hotfix/crash-diagnostics
1212
- cloud/062-mongodb-audit
1313
- hotfix/graceful-shutdown
14+
- hotfix/remove-forced-gc-add-livez
1415
paths:
1516
- "cloud/**"
1617
- ".github/workflows/porter-debug.yml"

cloud/packages/cloud/src/services/session/UserSession.ts

Lines changed: 6 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -816,51 +816,12 @@ export class UserSession {
816816
// Mark disposed for leak detection
817817
memoryLeakDetector.markDisposed(`UserSession:${this.userId}`);
818818

819-
// GC after disconnect — force garbage collection and measure how much is freed.
820-
// Rate-limited: at most once per 10 seconds across all sessions to avoid
821-
// thrashing GC during crash cascades (many sessions disconnecting at once).
822-
if (this.userId && UserSession.canRunPostDisconnectGc()) {
823-
const gcLogger = rootLogger.child({ service: "gc-after-disconnect" });
824-
setTimeout(() => {
825-
try {
826-
const memBefore = process.memoryUsage();
827-
const t0 = performance.now();
828-
Bun.gc(true);
829-
const gcDurationMs = performance.now() - t0;
830-
const memAfter = process.memoryUsage();
831-
const freedBytes = memBefore.heapUsed - memAfter.heapUsed;
832-
833-
gcLogger.info(
834-
{
835-
feature: "gc-after-disconnect",
836-
userId: this.userId,
837-
gcDurationMs: Math.round(gcDurationMs * 10) / 10,
838-
heapBeforeMB: Math.round(memBefore.heapUsed / 1048576),
839-
heapAfterMB: Math.round(memAfter.heapUsed / 1048576),
840-
freedMB: Math.round(freedBytes / 1048576),
841-
rssMB: Math.round(memAfter.rss / 1048576),
842-
sessionDurationSeconds,
843-
},
844-
`GC after disconnect (${this.userId}): ${gcDurationMs.toFixed(1)}ms, freed ${Math.round(freedBytes / 1048576)}MB`,
845-
);
846-
} catch (error) {
847-
gcLogger.error(error, "GC after disconnect failed");
848-
}
849-
}, 0);
850-
}
851-
}
852-
853-
// Rate limiter for post-disconnect GC: at most once per 10 seconds
854-
private static lastPostDisconnectGc: number = 0;
855-
private static POST_DISCONNECT_GC_COOLDOWN_MS = 10_000;
856-
857-
private static canRunPostDisconnectGc(): boolean {
858-
const now = Date.now();
859-
if (now - UserSession.lastPostDisconnectGc < UserSession.POST_DISCONNECT_GC_COOLDOWN_MS) {
860-
return false;
861-
}
862-
UserSession.lastPostDisconnectGc = now;
863-
return true;
819+
// gc-after-disconnect REMOVED — confirmed wasteful:
820+
// 31 calls/hour on US Central, 2,242ms total event loop blocking, freed 0 bytes
821+
// every single time. The gc-probe in SystemVitalsLogger provides the same
822+
// diagnostic data on a fixed 60s schedule without being triggered by user behavior.
823+
// See: cloud/issues/066-ws-disconnect-churn/spec.md (A7)
824+
// See: cloud/issues/067-heap-growth-investigation/spike.md
864825
}
865826

866827
/**

cloud/porter.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,25 @@ services:
1818
SERVICE_NAME: "cloud"
1919
RTMP_RELAY_URLS: "rtmp-relay-uscentral.mentra.glass:1935"
2020
LOG_LEVEL: "info"
21+
# Liveness probe — "is the process alive?"
22+
# Points to /livez which just returns "ok" with zero computation.
23+
# Previously pointed to /health which iterates all sessions, counts
24+
# WebSockets, updates gauges, and serializes JSON — too much work for
25+
# an "are you alive?" check, especially under load.
26+
# See: cloud/issues/057-cloud-observability/spec.md (1d)
27+
livenessCheck:
28+
enabled: true
29+
httpPath: /livez
30+
timeoutSeconds: 3
31+
initialDelaySeconds: 15
32+
# Readiness probe — "can this pod handle traffic?"
33+
# Points to /health which returns session count, metrics, and memory info.
34+
# If this fails, the pod is removed from the load balancer but NOT killed.
35+
readinessCheck:
36+
enabled: true
37+
httpPath: /health
38+
timeoutSeconds: 5
39+
initialDelaySeconds: 15
2140
# Extended timeouts for WebSocket connections (/glasses-ws, /app-ws).
2241
# Porter sets proxy-send-timeout: 60s by default, which kills idle WS
2342
# connections because no client→server traffic flows after audio moved to UDP.
@@ -27,6 +46,17 @@ services:
2746
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
2847
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
2948
nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
49+
# Prometheus metrics scraping — exposes mentra_user_sessions, mentra_miniapp_sessions,
50+
# mentra_event_loop_lag_ms, mentra_udp_packets_received_total, etc. to Porter's
51+
# built-in metrics dashboard. The /metrics endpoint is already on port 80.
52+
metricsScraping:
53+
enabled: true
54+
path: /metrics
55+
port: 80
56+
scrapeIntervalSeconds: 30
57+
# Graceful shutdown — give WebSocket close frames time to flush
58+
# See: cloud/issues/063-graceful-shutdown/spec.md
59+
terminationGracePeriodSeconds: 30
3060
additionalPorts:
3161
- port: 8000
3262
protocol: UDP

0 commit comments

Comments
 (0)