From 9e05fc70e1eccc14931438190e26e9455bdda858 Mon Sep 17 00:00:00 2001 From: taskbot Date: Fri, 10 Oct 2025 09:42:46 +0200 Subject: [PATCH] avoid telemetry failures interfering with MCP responses The server was crashing with 404 errors when writing to telemetry. Add panic recovery around the telemetry calls so that failures are tolerated rather than fatal. Also implement the Flush method, which is essential for SSE/streaming. Closes: #2114 --- pkg/telemetry/middleware.go | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/pkg/telemetry/middleware.go b/pkg/telemetry/middleware.go index 19e943bc3..b438b04cf 100644 --- a/pkg/telemetry/middleware.go +++ b/pkg/telemetry/middleware.go @@ -93,6 +93,13 @@ func NewHTTPMiddleware( // to leverage the parsed MCP data. func (m *HTTPMiddleware) Handler(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Ultimate safety net - telemetry must NEVER crash the service + defer func() { + if rec := recover(); rec != nil { + logger.Errorf("Telemetry middleware panic (non-fatal): %v", rec) + } + }() + ctx := r.Context() // Handle SSE endpoints specially - they are long-lived connections @@ -123,7 +130,15 @@ func (m *HTTPMiddleware) Handler(next http.Handler) http.Handler { // Create span name based on MCP method if available, otherwise use HTTP method + path spanName := m.createSpanName(ctx, r) ctx, span := m.tracer.Start(ctx, spanName, trace.WithSpanKind(trace.SpanKindServer)) - defer span.End() + // End span with error handling - this is where OTLP export happens + defer func() { + defer func() { + if rec := recover(); rec != nil { + logger.Debugf("Telemetry span.End() panic (non-fatal): %v", rec) + } + }() + span.End() + }() // Create a response writer wrapper to capture response details rw := &responseWriter{ @@ -405,9 +420,16 @@ type responseWriter struct { bytesWritten int64 } -// WriteHeader captures the status code. 
+// WriteHeader captures the status code with panic protection. func (rw *responseWriter) WriteHeader(statusCode int) { rw.statusCode = statusCode + + // Wrap the actual WriteHeader call to catch any panics (including duplicate calls) + defer func() { + if rec := recover(); rec != nil { + logger.Debugf("WriteHeader panic recovered (non-fatal): %v", rec) + } + }() rw.ResponseWriter.WriteHeader(statusCode) } @@ -418,6 +440,13 @@ func (rw *responseWriter) Write(data []byte) (int, error) { return n, err } +// Flush implements http.Flusher if the underlying ResponseWriter supports it. +func (rw *responseWriter) Flush() { + if flusher, ok := rw.ResponseWriter.(http.Flusher); ok { + flusher.Flush() + } +} + // recordMetrics records request metrics. func (m *HTTPMiddleware) recordMetrics(ctx context.Context, r *http.Request, rw *responseWriter, duration time.Duration) { // Get MCP method from context if available