1- """Custom OpenTelemetry exporter that sends spans to the existing HUD telemetry
2- HTTP endpoint (/trace/<id>/telemetry-upload).
1+ """Custom OpenTelemetry exporter for HUD telemetry backend.
32
4- The exporter groups spans by ``hud.task_run_id`` baggage / attribute so we keep
5- exactly the same semantics the old async worker in ``hud.telemetry.exporter``
6- implemented.
3+ This exporter sends spans to the HUD telemetry HTTP endpoint, grouping them
4+ by task_run_id for efficient batch uploads.
75
8- This exporter is *synchronous* (derives from :class:`SpanExporter`). We rely on
9- ``hud.shared.make_request_sync`` which already contains retry & auth logic.
6+ Performance optimizations:
7+ - Detects async contexts and runs exports in a thread pool to avoid blocking
8+ - Holds no HTTP client of its own; each upload passes only url, json and api_key to the request call
9+ - Tracks pending export futures to ensure completion during shutdown
10+
11+ The exporter derives from SpanExporter (synchronous interface) but handles
12+ async contexts intelligently to prevent event loop blocking during high-concurrency
13+ workloads.
1014"""
1115
1216from __future__ import annotations
2125from concurrent .futures import ThreadPoolExecutor
2226from datetime import UTC , datetime
2327from typing import TYPE_CHECKING , Any
24- import httpx
2528
2629from mcp .types import ClientRequest , ServerResult
2730from opentelemetry .sdk .trace .export import SpanExporter , SpanExportResult
3538
3639logger = logging .getLogger (__name__ )
3740
38- # Global thread pool for span exports to avoid blocking event loop
41+ # Global singleton thread pool for span exports
3942_export_executor : ThreadPoolExecutor | None = None
4043
44+
4145def get_export_executor () -> ThreadPoolExecutor :
42- """Get or create the global export executor."""
46+ """Get or create the global thread pool for span exports.
47+
48+ Returns a singleton ThreadPoolExecutor used for running span exports
49+ in a thread pool when called from async contexts, preventing event
50+ loop blocking during high-concurrency workloads.
51+
52+ The executor is automatically cleaned up on process exit via atexit.
53+
54+ Returns:
55+ ThreadPoolExecutor with 2 workers
56+ """
4357 global _export_executor
4458 if _export_executor is None :
4559 _export_executor = ThreadPoolExecutor (max_workers = 2 , thread_name_prefix = "span-export" )
46- # Register cleanup
60+
4761 def cleanup ():
4862 if _export_executor is not None :
4963 _export_executor .shutdown (wait = True )
64+
5065 atexit .register (cleanup )
5166 return _export_executor
5267
@@ -316,41 +331,62 @@ def _span_to_dict(span: ReadableSpan) -> dict[str, Any]:
316331
317332
318333class HudSpanExporter (SpanExporter ):
319- """Exporter that forwards spans to HUD backend using existing endpoint."""
334+ """OpenTelemetry span exporter for the HUD backend.
335+
336+ This exporter groups spans by task_run_id and sends them to the HUD
337+ telemetry endpoint. Performance optimizations include:
338+
339+ - Auto-detects async contexts and runs exports in thread pool (non-blocking)
340+ - Tracks pending export futures for proper shutdown coordination
341+
342+ Handles high-concurrency scenarios (200+ parallel tasks) by offloading
343+ synchronous HTTP operations to a thread pool when called from async
344+ contexts, preventing event loop blocking.
345+ """
320346
321347 def __init__ (self , * , telemetry_url : str , api_key : str ) -> None :
348+ """Initialize the HUD span exporter.
349+
350+ Args:
351+ telemetry_url: Base URL for the HUD telemetry backend
352+ api_key: API key for authentication
353+ """
322354 super ().__init__ ()
323355 self ._telemetry_url = telemetry_url .rstrip ("/" )
324356 self ._api_key = api_key
325- # Track pending export futures so we can force-flush on shutdown
357+
358+ # Track pending export futures for shutdown coordination
326359 self ._pending_futures : list [cf .Future [SpanExportResult ]] = []
327- # Persistent HTTP client to reuse connections
328- self ._client = httpx .Client (
329- timeout = 30.0 ,
330- limits = httpx .Limits (max_connections = 2000 , max_keepalive_connections = 512 , keepalive_expiry = 15.0 ),
331- )
332-
333- # ------------------------------------------------------------------
334- # Core API
335- # ------------------------------------------------------------------
360+
336361 def export (self , spans : list [ReadableSpan ]) -> SpanExportResult : # type: ignore[override]
362+ """Export spans to HUD backend.
363+
364+ Auto-detects async contexts: if called from an async event loop, runs
365+ the export in a thread pool to avoid blocking. Otherwise runs synchronously.
366+
367+ Args:
368+ spans: List of ReadableSpan objects to export
369+
370+ Returns:
371+ SpanExportResult.SUCCESS (returns immediately in async contexts)
372+ """
337373 if not spans :
338374 return SpanExportResult .SUCCESS
339375
340- # Group spans by hud. task_run_id attribute
376+ # Group spans by task_run_id for batched uploads
341377 grouped : dict [str , list [ReadableSpan ]] = defaultdict (list )
342378 for span in spans :
343379 run_id = span .attributes .get ("hud.task_run_id" ) if span .attributes else None
344380 if not run_id :
345- # Skip spans that are outside HUD traces
381+ # Skip spans outside HUD traces
346382 continue
347383 grouped [str (run_id )].append (span )
348384
349- # Try to run export in background if we're in an event loop
385+ # Detect async context to avoid event loop blocking
350386 import asyncio
351387 try :
352388 loop = asyncio .get_running_loop ()
353- # We're in an async context - schedule export in thread to avoid blocking
389+ # In async context - offload to thread pool
354390 executor = get_export_executor ()
355391
356392 def _sync_export ():
@@ -384,7 +420,6 @@ def _sync_export():
384420 url = url ,
385421 json = payload ,
386422 api_key = self ._api_key ,
387- client = self ._client ,
388423 )
389424 except Exception as exc :
390425 logger .exception ("HUD exporter failed to send spans for task %s: %s" , run_id , exc )
@@ -442,7 +477,6 @@ def _cleanup_done(f: cf.Future[SpanExportResult]) -> None:
442477 url = url ,
443478 json = payload ,
444479 api_key = self ._api_key ,
445- client = self ._client ,
446480 )
447481 except Exception as exc :
448482 logger .exception ("HUD exporter failed to send spans for task %s: %s" , run_id , exc )
@@ -452,39 +486,52 @@ def _cleanup_done(f: cf.Future[SpanExportResult]) -> None:
452486 return SpanExportResult .SUCCESS
453487
454488 def shutdown (self ) -> None : # type: ignore[override]
455- # Best effort: wait for pending exports to complete
489+ """Shutdown the exporter and wait for pending exports.
490+
491+ Waits up to 10 seconds for any in-flight exports to complete.
492+ """
456493 try :
457494 if self ._pending_futures :
458495 cf .wait (self ._pending_futures , timeout = 10.0 )
459496 except Exception :
460497 pass
461498 finally :
462499 self ._pending_futures .clear ()
463- # Close persistent client
464- try :
465- self ._client .close ()
466- except Exception :
467- pass
468500
469501 def force_flush (self , timeout_millis : int | None = None ) -> bool : # type: ignore[override]
470- # Wait for pending export futures
502+ """Force flush all pending span exports.
503+
504+ Waits for all pending export futures to complete before returning.
505+ This is called by the OpenTelemetry SDK during shutdown to ensure
506+ all telemetry is uploaded.
507+
508+ Args:
509+ timeout_millis: Maximum time to wait in milliseconds
510+
511+ Returns:
512+ True if all exports completed, False otherwise
513+ """
471514 try :
472515 if not self ._pending_futures :
473516 return True
517+
474518 timeout = (timeout_millis or 30000 ) / 1000.0
475519 done , not_done = cf .wait (self ._pending_futures , timeout = timeout )
476- # Consume exceptions to avoid warnings
520+
521+ # Consume exceptions to avoid "exception was never retrieved" warnings
477522 for f in list (done ):
478523 try :
479524 _ = f .exception ()
480525 except Exception :
481526 pass
482- # Remove finished futures
483- try :
484- for f in list (done ):
527+
528+ # Remove completed futures
529+ for f in list (done ):
530+ try :
485531 self ._pending_futures .remove (f )
486- except ValueError :
487- pass
532+ except ValueError :
533+ pass
534+
488535 return len (not_done ) == 0
489536 except Exception :
490537 return False
0 commit comments