hud-evals
diff --git a/‎hud/datasets/runner.py‎
Lines changed: 44 additions & 19 deletions b/‎hud/datasets/runner.py‎
Lines changed: 44 additions & 19 deletions
diff --git a/‎hud/otel/exporters.py‎
Lines changed: 88 additions & 41 deletions b/‎hud/otel/exporters.py‎
Lines changed: 88 additions & 41 deletions
diff --git a/‎hud/telemetry/__init__.py‎
Lines changed: 20 additions & 6 deletions b/‎hud/telemetry/__init__.py‎
Lines changed: 20 additions & 6 deletions
@@ -29,33 +29,40 @@ async def run_dataset(
     auto_respond: bool = False,
     custom_system_prompt: str | None = None,
 ) -> list[Any]:
-    """
-    Run all tasks in a dataset with automatic job tracking.
+    """Run all tasks in a dataset with automatic job and telemetry tracking.
+    
+    This function handles concurrent task execution with proper telemetry collection.
+    All tasks are executed in parallel up to `max_concurrent`, with full telemetry
+    automatically uploaded to the HUD platform.
 
     Args:
         name: Name for the job
         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
                 Dataset object, OR list of Task objects
         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
         agent_config: Configuration/kwargs for agent (model, etc.)
-        max_concurrent: Maximum parallel task execution
+        max_concurrent: Maximum parallel task execution. Higher values improve throughput
+                       but may increase memory usage. Recommended: 30-200 depending on
+                       task complexity and available resources.
         metadata: Optional metadata for the job
         max_steps: Maximum steps per task
         split: Dataset split to use when loading from string (default: "train")
         auto_respond: Whether to use auto-response agent
         custom_system_prompt: Override system prompt for all tasks
 
     Returns:
-        List of results from agent.run() in dataset order
+        List of results from agent.run() in dataset order. Telemetry is automatically
+        collected and uploaded for all tasks.
 
     Example:
         >>> from hud.agents import ClaudeAgent
-        >>> # Option 1: From dataset string identifier
+        >>> # Option 1: Basic usage with dataset string identifier
         >>> results = await run_dataset(
         ...     "SheetBench Eval",
         ...     "hud-evals/SheetBench-50",
         ...     ClaudeAgent,
         ...     {"model": "claude-3-5-sonnet-20241022"},
+        ...     max_concurrent=100,  # Adjust based on your needs
         ... )
         >>> # Option 2: From HuggingFace dataset object
         >>> from datasets import load_dataset
@@ -64,10 +71,13 @@ async def run_dataset(
         >>> # Option 3: From list of dicts
         >>> tasks = [{"prompt": "...", "mcp_config": {...}, ...}, ...]
         >>> results = await run_dataset("browser_eval", tasks, ClaudeAgent)
+
+    Note:
+        Telemetry collection and upload are handled automatically. Before returning, the
+        function waits up to 20 seconds per flush step for pending telemetry to finish.
     """
     # Import here to avoid circular imports
-    import hud
-    from hud.telemetry.async_context import async_job, async_trace
+    from hud.telemetry import async_job, async_trace
 
     dataset_link = None
 
@@ -126,24 +136,39 @@ async def _worker(index: int, task_dict: Any, max_steps: int = 10) -> None:
             return_exceptions=True,  # Don't fail entire batch on one error
         )
 
-    # Wait for all tracked tasks to complete (job/trace exits already tried a short wait)
-    from hud.utils.task_tracking import wait_all_tasks
-    completed = await wait_all_tasks(timeout=20.0)
-    if completed > 0:
-        logger.info(f"Waited for {completed} telemetry tasks to complete")
+    # Ensure all telemetry is uploaded before returning
+    await _flush_telemetry()
+    
+    return results
+
+
+async def _flush_telemetry() -> None:
+    """Flush all pending telemetry operations.
     
-    # Flush telemetry - this ensures BatchSpanProcessor exports pending spans
+    Ensures complete telemetry upload by:
+    1. Waiting for all async status updates to complete
+    2. Forcing OpenTelemetry span processor to export remaining spans
+    
+    This reduces telemetry loss at high concurrency (200+ tasks) by waiting up to
+    20 seconds per step for pending operations before returning to the caller.
+    """
     from hud.otel.config import is_telemetry_configured
+    from hud.utils.task_tracking import wait_all_tasks
+    
+    # Step 1: Wait for async status updates (job/trace status)
+    completed_tasks = await wait_all_tasks(timeout=20.0)
+    if completed_tasks > 0:
+        logger.debug(f"Completed {completed_tasks} pending telemetry tasks")
+    
+    # Step 2: Flush OpenTelemetry span exports
     if is_telemetry_configured():
         try:
             from opentelemetry import trace
             from opentelemetry.sdk.trace import TracerProvider
+            
             provider = trace.get_tracer_provider()
-            # Check if it's an SDK TracerProvider (not the default no-op one)
             if isinstance(provider, TracerProvider):
-                provider.force_flush(timeout_millis=20000)  # 20 second timeout
-                logger.info("Telemetry provider flushed successfully")
+                provider.force_flush(timeout_millis=20000)
+                logger.debug("OpenTelemetry spans flushed successfully")
         except Exception as e:
-            logger.warning(f"Failed to flush telemetry: {e}")
-
-    return results
+            logger.warning(f"Failed to flush OpenTelemetry: {e}")
@@ -1,12 +1,16 @@
-"""Custom OpenTelemetry exporter that sends spans to the existing HUD telemetry
-HTTP endpoint (/trace/<id>/telemetry-upload).
+"""Custom OpenTelemetry exporter for HUD telemetry backend.
 
-The exporter groups spans by ``hud.task_run_id`` baggage / attribute so we keep
-exactly the same semantics the old async worker in ``hud.telemetry.exporter``
-implemented.
+This exporter sends spans to the HUD telemetry HTTP endpoint, grouping them
+by task_run_id for efficient batch uploads.
 
-This exporter is *synchronous* (derives from :class:`SpanExporter`).  We rely on
-``hud.shared.make_request_sync`` which already contains retry & auth logic.
+Implementation notes:
+- Detects async contexts and runs exports in a thread pool to avoid blocking
+- Holds no HTTP client of its own; each upload is a separate request-helper call
+- Tracks pending export futures to ensure completion during shutdown
+
+The exporter derives from SpanExporter (synchronous interface) but handles
+async contexts intelligently to prevent event loop blocking during high-concurrency
+workloads.
 """
 
 from __future__ import annotations
@@ -21,7 +25,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
-import httpx
 
 from mcp.types import ClientRequest, ServerResult
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
@@ -35,18 +38,30 @@
 
 logger = logging.getLogger(__name__)
 
-# Global thread pool for span exports to avoid blocking event loop
+# Global singleton thread pool for span exports
 _export_executor: ThreadPoolExecutor | None = None
 
+
 def get_export_executor() -> ThreadPoolExecutor:
-    """Get or create the global export executor."""
+    """Get or create the global thread pool for span exports.
+    
+    Returns a singleton ThreadPoolExecutor used for running span exports
+    in a thread pool when called from async contexts, preventing event
+    loop blocking during high-concurrency workloads.
+    
+    The executor is automatically cleaned up on process exit via atexit.
+    
+    Returns:
+        ThreadPoolExecutor with 2 workers
+    """
     global _export_executor
     if _export_executor is None:
         _export_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="span-export")
-        # Register cleanup
+        
         def cleanup():
             if _export_executor is not None:
                 _export_executor.shutdown(wait=True)
+        
         atexit.register(cleanup)
     return _export_executor
 
@@ -316,41 +331,62 @@ def _span_to_dict(span: ReadableSpan) -> dict[str, Any]:
 
 
 class HudSpanExporter(SpanExporter):
-    """Exporter that forwards spans to HUD backend using existing endpoint."""
+    """OpenTelemetry span exporter for the HUD backend.
+    
+    This exporter groups spans by task_run_id and sends them to the HUD
+    telemetry endpoint. Performance optimizations include:
+    
+    - Auto-detects async contexts and runs exports in thread pool (non-blocking)
+    - Tracks pending export futures for proper shutdown coordination
+    
+    Handles high-concurrency scenarios (200+ parallel tasks) by offloading
+    synchronous HTTP operations to a thread pool when called from async
+    contexts, preventing event loop blocking.
+    """
 
     def __init__(self, *, telemetry_url: str, api_key: str) -> None:
+        """Initialize the HUD span exporter.
+        
+        Args:
+            telemetry_url: Base URL for the HUD telemetry backend
+            api_key: API key for authentication
+        """
         super().__init__()
         self._telemetry_url = telemetry_url.rstrip("/")
         self._api_key = api_key
-        # Track pending export futures so we can force-flush on shutdown
+        
+        # Track pending export futures for shutdown coordination
         self._pending_futures: list[cf.Future[SpanExportResult]] = []
-        # Persistent HTTP client to reuse connections
-        self._client = httpx.Client(
-            timeout=30.0,
-            limits=httpx.Limits(max_connections=2000, max_keepalive_connections=512, keepalive_expiry=15.0),
-        )
-
-    # ------------------------------------------------------------------
-    # Core API
-    # ------------------------------------------------------------------
+
     def export(self, spans: list[ReadableSpan]) -> SpanExportResult:  # type: ignore[override]
+        """Export spans to HUD backend.
+        
+        Auto-detects async contexts: if called from an async event loop, runs
+        the export in a thread pool to avoid blocking. Otherwise runs synchronously.
+        
+        Args:
+            spans: List of ReadableSpan objects to export
+            
+        Returns:
+            SpanExportResult.SUCCESS (returns immediately in async contexts)
+        """
         if not spans:
             return SpanExportResult.SUCCESS
 
-        # Group spans by hud.task_run_id attribute
+        # Group spans by task_run_id for batched uploads
         grouped: dict[str, list[ReadableSpan]] = defaultdict(list)
         for span in spans:
             run_id = span.attributes.get("hud.task_run_id") if span.attributes else None
             if not run_id:
-                # Skip spans that are outside HUD traces
+                # Skip spans outside HUD traces
                 continue
             grouped[str(run_id)].append(span)
 
-        # Try to run export in background if we're in an event loop
+        # Detect async context to avoid event loop blocking
         import asyncio
         try:
             loop = asyncio.get_running_loop()
-            # We're in an async context - schedule export in thread to avoid blocking
+            # In async context - offload to thread pool
             executor = get_export_executor()
 
             def _sync_export():
@@ -384,7 +420,6 @@ def _sync_export():
                             url=url,
                             json=payload,
                             api_key=self._api_key,
-                            client=self._client,
                         )
                     except Exception as exc:
                         logger.exception("HUD exporter failed to send spans for task %s: %s", run_id, exc)
@@ -442,7 +477,6 @@ def _cleanup_done(f: cf.Future[SpanExportResult]) -> None:
                         url=url,
                         json=payload,
                         api_key=self._api_key,
-                        client=self._client,
                     )
                 except Exception as exc:
                     logger.exception("HUD exporter failed to send spans for task %s: %s", run_id, exc)
@@ -452,39 +486,52 @@ def _cleanup_done(f: cf.Future[SpanExportResult]) -> None:
             return SpanExportResult.SUCCESS
 
     def shutdown(self) -> None:  # type: ignore[override]
-        # Best effort: wait for pending exports to complete
+        """Shutdown the exporter and wait for pending exports.
+        
+        Waits up to 10 seconds for any in-flight exports to complete.
+        """
         try:
             if self._pending_futures:
                 cf.wait(self._pending_futures, timeout=10.0)
         except Exception:
             pass
         finally:
             self._pending_futures.clear()
-        # Close persistent client
-        try:
-            self._client.close()
-        except Exception:
-            pass
 
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
-        # Wait for pending export futures
+        """Force flush all pending span exports.
+        
+        Waits for all pending export futures to complete before returning.
+        This is called by the OpenTelemetry SDK during shutdown to ensure
+        all telemetry is uploaded.
+        
+        Args:
+            timeout_millis: Maximum wait in milliseconds; None or 0 falls back to 30000
+            
+        Returns:
+            True if all exports completed, False otherwise
+        """
         try:
             if not self._pending_futures:
                 return True
+            
             timeout = (timeout_millis or 30000) / 1000.0
             done, not_done = cf.wait(self._pending_futures, timeout=timeout)
-            # Consume exceptions to avoid warnings
+            
+            # Consume exceptions to avoid "exception was never retrieved" warnings
             for f in list(done):
                 try:
                     _ = f.exception()
                 except Exception:
                     pass
-            # Remove finished futures
-            try:
-                for f in list(done):
+            
+            # Remove completed futures
+            for f in list(done):
+                try:
                     self._pending_futures.remove(f)
-            except ValueError:
-                pass
+                except ValueError:
+                    pass
+            
             return len(not_done) == 0
         except Exception:
             return False
@@ -1,20 +1,31 @@
-"""HUD Telemetry - User-facing APIs for tracing and job management.
+"""HUD Telemetry - Tracing and job management for agent execution.
 
-This module provides the main telemetry APIs that users interact with:
-- trace: Context manager for tracing code execution
-- job: Context manager and utilities for job management
-- instrument: Decorator for instrumenting functions
-- get_trace: Retrieve collected traces for replay/analysis
+This module provides telemetry APIs for tracking agent execution:
+
+Standard Usage (for most users):
+    - trace(): Context manager for tracing code execution
+    - job(): Context manager for grouping related tasks
+    - instrument(): Decorator for instrumenting functions
+    - get_trace(): Retrieve collected traces for replay/analysis
+
+High-Concurrency Usage (200+ parallel tasks):
+    - async_trace(): Async context manager for traces (prevents event loop blocking)
+    - async_job(): Async context manager for jobs (prevents event loop blocking)
+    
+The async versions are automatically used by run_dataset() and other high-concurrency
+functions. Most users don't need to use them directly.
 """
 
 from __future__ import annotations
 
+from .async_context import async_job, async_trace
 from .instrument import instrument
 from .job import Job, create_job, job
 from .replay import clear_trace, get_trace
 from .trace import Trace, trace
 
 __all__ = [
+    # Standard synchronous APIs (for typical usage)
     "Job",
     "Trace",
     "clear_trace",
@@ -23,4 +34,7 @@
     "instrument",
     "job",
     "trace",
+    # Async APIs (for high-concurrency scenarios)
+    "async_job",
+    "async_trace",
 ]