Skip to content

Commit c4bb538

Browse files
authored
Merge pull request #2353 from Mentra-Community/hotfix/remove-forced-gc-add-livez
hotfix: remove gc-after-disconnect, add /livez liveness probe, enable Porter metrics scraping
2 parents a89372d + 24ecba5 commit c4bb538

File tree

3 files changed

+37
-45
lines changed

3 files changed

+37
-45
lines changed

.github/workflows/porter-debug.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ on:
1111
- hotfix/crash-diagnostics
1212
- cloud/062-mongodb-audit
1313
- hotfix/graceful-shutdown
14+
- hotfix/remove-forced-gc-add-livez
1415
paths:
1516
- "cloud/**"
1617
- ".github/workflows/porter-debug.yml"

cloud/packages/cloud/src/services/session/UserSession.ts

Lines changed: 6 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -816,51 +816,12 @@ export class UserSession {
816816
// Mark disposed for leak detection
817817
memoryLeakDetector.markDisposed(`UserSession:${this.userId}`);
818818

819-
// GC after disconnect — force garbage collection and measure how much is freed.
820-
// Rate-limited: at most once per 10 seconds across all sessions to avoid
821-
// thrashing GC during crash cascades (many sessions disconnecting at once).
822-
if (this.userId && UserSession.canRunPostDisconnectGc()) {
823-
const gcLogger = rootLogger.child({ service: "gc-after-disconnect" });
824-
setTimeout(() => {
825-
try {
826-
const memBefore = process.memoryUsage();
827-
const t0 = performance.now();
828-
Bun.gc(true);
829-
const gcDurationMs = performance.now() - t0;
830-
const memAfter = process.memoryUsage();
831-
const freedBytes = memBefore.heapUsed - memAfter.heapUsed;
832-
833-
gcLogger.info(
834-
{
835-
feature: "gc-after-disconnect",
836-
userId: this.userId,
837-
gcDurationMs: Math.round(gcDurationMs * 10) / 10,
838-
heapBeforeMB: Math.round(memBefore.heapUsed / 1048576),
839-
heapAfterMB: Math.round(memAfter.heapUsed / 1048576),
840-
freedMB: Math.round(freedBytes / 1048576),
841-
rssMB: Math.round(memAfter.rss / 1048576),
842-
sessionDurationSeconds,
843-
},
844-
`GC after disconnect (${this.userId}): ${gcDurationMs.toFixed(1)}ms, freed ${Math.round(freedBytes / 1048576)}MB`,
845-
);
846-
} catch (error) {
847-
gcLogger.error(error, "GC after disconnect failed");
848-
}
849-
}, 0);
850-
}
851-
}
852-
853-
// Rate limiter for post-disconnect GC: at most once per 10 seconds
854-
private static lastPostDisconnectGc: number = 0;
855-
private static POST_DISCONNECT_GC_COOLDOWN_MS = 10_000;
856-
857-
private static canRunPostDisconnectGc(): boolean {
858-
const now = Date.now();
859-
if (now - UserSession.lastPostDisconnectGc < UserSession.POST_DISCONNECT_GC_COOLDOWN_MS) {
860-
return false;
861-
}
862-
UserSession.lastPostDisconnectGc = now;
863-
return true;
819+
// gc-after-disconnect REMOVED — confirmed wasteful:
820+
// 31 calls/hour on US Central, 2,242ms total event loop blocking, freed 0 bytes
821+
// every single time. The gc-probe in SystemVitalsLogger provides the same
822+
// diagnostic data on a fixed 60s schedule without being triggered by user behavior.
823+
// See: cloud/issues/066-ws-disconnect-churn/spec.md (A7)
824+
// See: cloud/issues/067-heap-growth-investigation/spike.md
864825
}
865826

866827
/**

cloud/porter.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,25 @@ services:
1818
SERVICE_NAME: "cloud"
1919
RTMP_RELAY_URLS: "rtmp-relay-uscentral.mentra.glass:1935"
2020
LOG_LEVEL: "info"
21+
# Liveness probe — "is the process alive?"
22+
# Points to /livez which just returns "ok" with zero computation.
23+
# Previously pointed to /health which iterates all sessions, counts
24+
# WebSockets, updates gauges, and serializes JSON — too much work for
25+
# an "are you alive?" check, especially under load.
26+
# See: cloud/issues/057-cloud-observability/spec.md (1d)
27+
livenessCheck:
28+
enabled: true
29+
httpPath: /livez
30+
timeoutSeconds: 3
31+
initialDelaySeconds: 15
32+
# Readiness probe — "can this pod handle traffic?"
33+
# Points to /health which returns session count, metrics, and memory info.
34+
# If this fails, the pod is removed from the load balancer but NOT killed.
35+
readinessCheck:
36+
enabled: true
37+
httpPath: /health
38+
timeoutSeconds: 5
39+
initialDelaySeconds: 15
2140
# Extended timeouts for WebSocket connections (/glasses-ws, /app-ws).
2241
# Porter sets proxy-send-timeout: 60s by default, which kills idle WS
2342
# connections because no client→server traffic flows after audio moved to UDP.
@@ -27,6 +46,17 @@ services:
2746
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
2847
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
2948
nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
49+
# Prometheus metrics scraping — exposes mentra_user_sessions, mentra_miniapp_sessions,
50+
# mentra_event_loop_lag_ms, mentra_udp_packets_received_total, etc. to Porter's
51+
# built-in metrics dashboard. The /metrics endpoint is already on port 80.
52+
metricsScraping:
53+
enabled: true
54+
path: /metrics
55+
port: 80
56+
scrapeIntervalSeconds: 30
57+
# Graceful shutdown — give WebSocket close frames time to flush
58+
# See: cloud/issues/063-graceful-shutdown/spec.md
59+
terminationGracePeriodSeconds: 30
3060
additionalPorts:
3161
- port: 8000
3262
protocol: UDP

0 commit comments

Comments
 (0)