Security and error handling improvements for observability

sallyom · claude · sallyom · commit b7a8fad13691 · 2025-11-16T10:53:46.000-05:00
Addresses all blocker issues from PR review #329: Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: sallyom <somalley@redhat.com>
diff --git a/components/backend/handlers/sessions.go b/components/backend/handlers/sessions.go
@@ -770,10 +770,14 @@ func MintSessionGitHubToken(c *gin.Context) {
 	tr := &authnv1.TokenReview{Spec: authnv1.TokenReviewSpec{Token: token}}
 	rv, err := K8sClient.AuthenticationV1().TokenReviews().Create(c.Request.Context(), tr, v1.CreateOptions{})
 	if err != nil {
+		log.Printf("GitHub token mint: TokenReview API call failed for session %s/%s (token len=%d): %v",
+			project, session, len(token), err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "token review failed"})
 		return
 	}
 	if rv.Status.Error != "" || !rv.Status.Authenticated {
+		log.Printf("GitHub token mint: TokenReview authentication failed for session %s/%s: authenticated=%v, error=%q",
+			project, session, rv.Status.Authenticated, rv.Status.Error)
 		c.JSON(http.StatusUnauthorized, gin.H{"error": "unauthenticated"})
 		return
 	}
diff --git a/components/backend/server/server.go b/components/backend/server/server.go
@@ -99,6 +99,17 @@ func forwardedIdentityMiddleware() gin.HandlerFunc {
 						// Perform TokenReview to validate and extract user identity
 						tr := &authnv1.TokenReview{Spec: authnv1.TokenReviewSpec{Token: token}}
 						rv, err := K8sClient.AuthenticationV1().TokenReviews().Create(c.Request.Context(), tr, v1.CreateOptions{})
+						if err != nil {
+							// Log TokenReview API error with context for debugging
+							log.Printf("TokenReview API call failed (token len=%d): %v", len(token), err)
+						} else if !rv.Status.Authenticated {
+							// Log authentication failure with reason
+							log.Printf("TokenReview authentication failed: authenticated=false, error=%q, audiences=%v",
+								rv.Status.Error, rv.Status.Audiences)
+						} else if rv.Status.Error != "" {
+							// Log authentication error from Kubernetes
+							log.Printf("TokenReview returned error: %q (authenticated=%v)", rv.Status.Error, rv.Status.Authenticated)
+						}
 						if err == nil && rv.Status.Authenticated && rv.Status.Error == "" {
 							username := strings.TrimSpace(rv.Status.User.Username)
 							if username != "" {
diff --git a/components/runners/claude-code-runner/wrapper.py b/components/runners/claude-code-runner/wrapper.py
@@ -54,6 +54,7 @@ def __init__(self):
         self._restart_requested = False
         self._first_run = True  # Track if this is the first SDK run or a mid-session restart
         self._otel_initialized = False  # Track if OpenTelemetry has been initialized
+        self._otel_init_lock = asyncio.Lock()  # Thread-safe OTEL initialization
 
     async def initialize(self, context: RunnerContext):
         """Initialize the adapter with context."""
@@ -66,6 +67,43 @@ async def initialize(self, context: RunnerContext):
         # Validate prerequisite files exist for phase-based commands
         await self._validate_prerequisites()
 
+    @staticmethod
+    def _sanitize_user_context(user_id: str, user_name: str) -> tuple[str, str]:
+        """Validate and sanitize user context fields to prevent injection attacks.
+
+        Returns:
+            Tuple of (sanitized_user_id, sanitized_user_name)
+        """
+        # Validate user_id: alphanumeric, dash, underscore, at sign only
+        # Max 255 characters (email addresses can be up to 254 chars)
+        if user_id:
+            user_id = str(user_id).strip()
+            if len(user_id) > 255:
+                logging.warning(f"User ID exceeds max length (255), truncating: {len(user_id)} chars")
+                user_id = user_id[:255]
+            # Remove any characters that could cause injection issues
+            import re
+            sanitized_id = re.sub(r'[^a-zA-Z0-9@._-]', '', user_id)
+            if sanitized_id != user_id:
+                logging.warning(f"User ID contained invalid characters, sanitized from {len(user_id)} to {len(sanitized_id)} chars")
+            user_id = sanitized_id
+
+        # Validate user_name: printable ASCII, no control characters
+        # Max 255 characters
+        if user_name:
+            user_name = str(user_name).strip()
+            if len(user_name) > 255:
+                logging.warning(f"User name exceeds max length (255), truncating: {len(user_name)} chars")
+                user_name = user_name[:255]
+            # Remove control characters and non-printable characters
+            import re
+            sanitized_name = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', user_name)
+            if sanitized_name != user_name:
+                logging.warning(f"User name contained control characters, sanitized from {len(user_name)} to {len(sanitized_name)} chars")
+            user_name = sanitized_name
+
+        return user_id, user_name
+
     async def run(self):
         """Run the Claude Code CLI session."""
         try:
@@ -191,9 +229,11 @@ async def run(self):
     async def _run_claude_agent_sdk(self, prompt: str):
         """Execute the Claude Code SDK with the given prompt."""
         try:
-            # Extract user context for observability (used by both Langfuse and OTLP)
-            user_id = os.getenv('USER_ID', '').strip()
-            user_name = os.getenv('USER_NAME', '').strip()
+            # Extract and sanitize user context for observability (used by both Langfuse and OTLP)
+            # This prevents trace poisoning, log injection, and other security issues
+            raw_user_id = os.getenv('USER_ID', '').strip()
+            raw_user_name = os.getenv('USER_NAME', '').strip()
+            user_id, user_name = self._sanitize_user_context(raw_user_id, raw_user_name)
 
             # Initialize Langfuse for observability if configured
             langfuse_client = None
@@ -209,29 +249,39 @@ async def _run_claude_agent_sdk(self, prompt: str):
                         )
 
                         # Start a span for this session (Langfuse SDK 3.x)
+                        # Note: User tracking is done via OTLP span attributes (enduser.id, user.name)
+                        # Langfuse SDK context managers don't support update_trace()
                         langfuse_session_span = langfuse_client.start_as_current_span(
                             name="claude_agent_session",
                             input={"prompt": prompt},
                             metadata={
                                 "session_id": self.context.session_id,
                                 "namespace": self.context.get_env('AGENTIC_SESSION_NAMESPACE', 'unknown'),
+                                "user_id": user_id if user_id else None,
                                 "user_name": user_name if user_name else None,
                             },
                         )
 
-                        # Set userId on the trace (trace-level attribute, not span-level)
                         if user_id:
-                            try:
-                                langfuse_session_span.update_trace(user_id=user_id)
-                                logging.info(f"Langfuse: Tracking session for user {user_name} ({user_id})")
-                            except Exception as e:
-                                logging.warning(f"Failed to set user_id on trace: {e}")
+                            logging.info(f"Langfuse: Tracking session for user {user_name} ({user_id})")
 
                         logging.info(f"Langfuse tracing enabled for session")
                     except Exception as e:
-                        logging.warning(f"Failed to initialize Langfuse: {e}")
-                        import traceback
-                        logging.warning(traceback.format_exc())
+                        # Sanitize error message to prevent API key leakage
+                        # Replace any occurrence of public_key or secret_key with [REDACTED]
+                        error_msg = str(e)
+                        public_key = os.getenv('LANGFUSE_PUBLIC_KEY', '')
+                        secret_key = os.getenv('LANGFUSE_SECRET_KEY', '')
+                        if public_key:
+                            error_msg = error_msg.replace(public_key, '[REDACTED_PUBLIC_KEY]')
+                        if secret_key:
+                            error_msg = error_msg.replace(secret_key, '[REDACTED_SECRET_KEY]')
+
+                        # Log sanitized error without full traceback
+                        logging.error(f"Failed to initialize Langfuse observability: {error_msg}")
+                        logging.debug(f"Langfuse initialization error type: {type(e).__name__}")
+
+                        # Continue without Langfuse - don't fail the session
                         langfuse_client = None
                         langfuse_session_span = None
 
@@ -291,13 +341,20 @@ async def _run_claude_agent_sdk(self, prompt: str):
                         otel_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
 
                         # Set as global tracer provider (only on first run)
-                        # This prevents "Overriding of current TracerProvider" warnings on restarts
-                        if not self._otel_initialized:
-                            trace.set_tracer_provider(otel_provider)
-                            self._otel_initialized = True
-                            logging.info("Initialized OpenTelemetry TracerProvider")
-                        else:
-                            logging.debug("Reusing existing OpenTelemetry TracerProvider from previous run")
+                        # Use an asyncio lock so concurrent coroutines cannot race on initialization (asyncio.Lock is not thread-safe)
+                        async with self._otel_init_lock:
+                            if not self._otel_initialized:
+                                # Check if a TracerProvider is already set (prevents override warning)
+                                current_provider = trace.get_tracer_provider()
+                                # If no provider set, or it's the default no-op provider, set ours
+                                if current_provider is None or isinstance(current_provider, trace.ProxyTracerProvider):
+                                    trace.set_tracer_provider(otel_provider)
+                                    logging.info("Initialized OpenTelemetry TracerProvider")
+                                else:
+                                    logging.debug("TracerProvider already set, reusing existing provider")
+                                self._otel_initialized = True
+                            else:
+                                logging.debug("Reusing existing OpenTelemetry TracerProvider from previous run")
 
                         # Get tracer (works whether we just set it or it was already set)
                         otel_tracer = trace.get_tracer(__name__)
@@ -326,9 +383,12 @@ async def _run_claude_agent_sdk(self, prompt: str):
 
                         logging.info(f"OpenTelemetry tracing enabled (endpoint: {otel_endpoint})")
                     except Exception as e:
-                        logging.warning(f"Failed to initialize OpenTelemetry: {e}")
-                        import traceback
-                        logging.warning(traceback.format_exc())
+                        # Log OTEL initialization failure for operator visibility
+                        logging.error(f"Failed to initialize OpenTelemetry observability: {e}")
+                        logging.debug(f"OTEL initialization error type: {type(e).__name__}")
+                        logging.debug(f"OTEL endpoint was: {otel_endpoint}")
+
+                        # Continue without OTEL - don't fail the session
                         otel_tracer = None
                         otel_span = None
 
@@ -936,16 +996,36 @@ async def process_one_prompt(text: str):
             }
         except Exception as e:
             logging.error(f"Failed to run Claude Code SDK: {e}")
-            # End Langfuse session span with error if available
+
+            # Clean up observability spans on error path
+            # 1. End Langfuse session span with error if available
             if 'langfuse_session_span' in locals() and langfuse_session_span and 'langfuse_client' in locals() and langfuse_client:
                 try:
                     langfuse_session_span.end(
                         level="ERROR",
                         status_message=str(e)
                     )
                     langfuse_client.flush()
-                except Exception:
-                    pass
+                except Exception as cleanup_err:
+                    logging.debug(f"Failed to cleanup Langfuse span: {cleanup_err}")
+
+            # 2. End OTEL span with error if available
+            if 'otel_span' in locals() and otel_span:
+                try:
+                    otel_span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
+                    otel_span.end()
+                    logging.debug("OTEL span ended with error status")
+                except Exception as cleanup_err:
+                    logging.debug(f"Failed to cleanup OTEL span: {cleanup_err}")
+
+            # 3. Force flush OTEL provider if available
+            if 'otel_provider' in locals() and otel_provider:
+                try:
+                    otel_provider.force_flush(timeout_millis=5000)
+                    logging.debug("OTEL spans flushed on error path")
+                except Exception as cleanup_err:
+                    logging.debug(f"Failed to flush OTEL provider: {cleanup_err}")
+
             return {
                 "success": False,
                 "error": str(e)