Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/inference_endpoint/commands/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def _build_config_from_cli(
client=ClientSettings(
workers=args.workers if args.workers else -1,
log_level="DEBUG" if verbose_level >= 2 else "INFO",
warmup_connections=getattr(args, "warmup_connections", True),
warmup_connections=getattr(args, "warmup_connections", -1),
max_connections=getattr(args, "max_connections", None) or -1,
),
),
Expand Down
2 changes: 1 addition & 1 deletion src/inference_endpoint/commands/probe.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ async def run_probe_command(args: argparse.Namespace) -> None:
],
api_type=api_type,
num_workers=1,
warmup_connections=False,
warmup_connections=0,
)
# Client creates its own event loop in a separate thread
client = HTTPEndpointClient(http_config, zmq_context=zmq_ctx)
Expand Down
3 changes: 2 additions & 1 deletion src/inference_endpoint/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,8 @@ class ClientSettings(BaseModel):
log_level: str = "INFO"

# Pre-establish TCP connections during init for reuse at runtime.
warmup_connections: bool = True
# Values: -1 = auto (50% of pool), 0 = disabled, >0 = explicit total count
warmup_connections: int = -1

# Maximum concurrent TCP connections per worker.
# -1 = unlimited (bound by system ephemeral port limit)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from inference_endpoint.dataset_manager.transforms import (
AddStaticColumns,
Harmonize,
Transform,
UserPromptFormatter,
)
Expand Down Expand Up @@ -48,3 +49,28 @@ def llama3_8b(
),
AddStaticColumns(chat_template),
]


def llama3_8b_sglang(
    stream: bool = True,
    max_new_tokens: int = 128,
    temperature: float = 0.0,
    top_p: float = 1.0,
    top_k: int = 1,
    tokenizer_name: str = "meta-llama/Llama-3.1-8B-Instruct",
) -> list[Transform]:
    """Build the dataset transform pipeline for Llama-3 8B against SGLang.

    The pipeline formats a summarization prompt from the "article" column and
    then tokenizes it with the model's own tokenizer (plain mode, no Harmony
    conversation rendering).

    Args:
        stream: NOTE(review): currently unused in this preset — kept for
            signature parity with other presets; wire into the request
            settings or remove. TODO confirm intended use.
        max_new_tokens: Target summary length in tokens; interpolated into the
            prompt text (it does not cap generation by itself).
        temperature: NOTE(review): currently unused — see ``stream``.
        top_p: NOTE(review): currently unused — see ``stream``.
        top_k: NOTE(review): currently unused — see ``stream``.
        tokenizer_name: Hugging Face tokenizer used by ``Harmonize`` in plain
            mode to produce ``input_tokens``.

    Returns:
        Ordered list of transforms: prompt formatting, then plain tokenization.
    """
    return [
        # Step 1: Format the prompt from "article"
        UserPromptFormatter(
            user_prompt_format=f"Summarize the following news article in {max_new_tokens} tokens. Please output the summary only, without any other text.\n\nArticle:\n{{article}}\n\nSummary:",
            output_column="prompt",
        ),
        # Step 2: Tokenize the raw prompt via Harmonize in plain mode.
        # harmonized_column=None: do not store the decoded text, only tokens.
        Harmonize(
            tokenizer_name=tokenizer_name,
            prompt_column="prompt",
            tokenized_column="input_tokens",
            harmonized_column=None,
            mode="plain",
        ),
    ]
19 changes: 18 additions & 1 deletion src/inference_endpoint/dataset_manager/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def __init__(
prompt_column: str = "prompt",
tokenized_column: str = "input_tokens",
harmonized_column: str | None = "harmonized_prompt",
mode: str = "harmony",
):
"""Initialize the Harmonize transform.

Expand All @@ -145,10 +146,14 @@ def __init__(
tokenized_column: The name of the column containing the tokenized prompt.
harmonized_column: The name of the column containing the harmonized prompt. If None,
the harmonized prompt will not be stored as text.
mode: "harmony" to render a Harmony conversation; "plain" to tokenize the raw prompt.
"""
self.prompt_column = prompt_column
self.tokenized_column = tokenized_column
self.harmonized_column = harmonized_column
self.mode = mode
if self.mode not in {"harmony", "plain"}:
raise ValueError(f"Invalid harmonize mode: {self.mode}")
Comment on lines +154 to +156
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Harmonize.__call__ still skips purely based on the presence of tokenized_column in df.columns, but process_row now skips only when the per-row value is non-null. This makes behavior differ depending on whether row processors are fused (or if fuse_row_processors=False is used). Consider aligning the dataframe-level skip logic with the row-level guard so the transform behaves consistently.

Copilot uses AI. Check for mistakes.
self.harmonizer = Harmonizer(
tokenizer_name=tokenizer_name,
encoding_name=encoding_name,
Comment on lines 157 to 159
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In mode="plain", process_row only calls self.harmonizer.to_tokens(...), but Harmonizer.__init__ still loads the Harmony encoding and constructs Harmony system content. That’s potentially expensive and unnecessary for plain tokenization. Consider a lightweight path for plain mode (e.g., defer encoding load until __call__ is used, or use the underlying tokenizer directly) to reduce init overhead.

Copilot uses AI. Check for mistakes.
Expand All @@ -171,7 +176,19 @@ def process_row(self, row: dict[str, Any]) -> dict[str, Any]:
Returns:
Row dictionary with the harmonized prompt added
"""
row[self.tokenized_column] = self.harmonizer(row[self.prompt_column])
# Guard pre-tokenized rows: the SGLang adapter adds a default Harmonize
# (GPT-OSS tokenizer + harmony mode). When row processors are fused, the
# dataframe-level skip is bypassed, so without this guard, adapter
# Harmonize would overwrite input tokens. Alternative: remove Harmonize
# from the adapter transforms and require each SGLang preset to add its
# own Harmonize with the desired tokenizer/args.
if self.tokenized_column in row and row[self.tokenized_column] is not None:
return row
Comment on lines +185 to +186
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pre-tokenized guard treats any non-None value as “already tokenized”. In pandas rows, missing values are often NaN (which is not None), so this would incorrectly skip tokenization and leave NaN in input_tokens, likely breaking downstream code that expects a list of token IDs. Consider using an explicit null check that treats NaN as missing (e.g., via pd.isna) before returning early.

Copilot uses AI. Check for mistakes.
if self.mode == "plain":
tokens = self.harmonizer.to_tokens(row[self.prompt_column])
row[self.tokenized_column] = tokens
else:
row[self.tokenized_column] = self.harmonizer(row[self.prompt_column])
Comment on lines 176 to +191
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change adds new Harmonize behavior (mode plus the overwrite-prevention guard when row processors are fused), but tests/unit/dataset_manager/test_transforms.py explicitly excludes Harmonize. Please add unit tests that cover (1) mode="plain" vs mode="harmony", and (2) fused pipelines where a second Harmonize should not overwrite existing input_tokens.

Copilot uses AI. Check for mistakes.
if self.harmonized_column is not None:
row[self.harmonized_column] = self.harmonizer.to_text(
row[self.tokenized_column]
Expand Down
5 changes: 5 additions & 0 deletions src/inference_endpoint/metrics/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,7 +1075,12 @@ def derive_TPOT(
output_sequence, reasoning_sequence = output_sequence_from_data(
data_bytes, join_chunks=False
)
if isinstance(output_sequence, str):
output_sequence = [output_sequence]
if not isinstance(output_sequence, list):
logging.warning(
f"Output sequence for sample {sample_uuid} is not a list but {type(output_sequence)}: {output_sequence}"
)
continue

all_chunks = output_sequence
Expand Down
6 changes: 3 additions & 3 deletions src/inference_endpoint/openai/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class ChatCompletionResponseMessage(msgspec.Struct, kw_only=True, omit_defaults=

role: str
content: str | None
refusal: str | None
refusal: str | None = None


class ChatCompletionChoice(msgspec.Struct, kw_only=True, omit_defaults=True): # type: ignore[call-arg]
Expand All @@ -109,5 +109,5 @@ class ChatCompletionResponse(msgspec.Struct, kw_only=True, omit_defaults=True):
created: int
model: str
choices: list[ChatCompletionChoice]
usage: CompletionUsage | None
system_fingerprint: str | None
usage: CompletionUsage | None = None
system_fingerprint: str | None = None
Comment on lines 83 to +113
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are no unit tests covering the msgspec OpenAI types / msgspec adapter decode path. Since these fields now default to None to support responses that omit them, it would be good to add a test that decodes a minimal OpenAI-compatible response missing refusal, usage, and system_fingerprint and asserts decoding succeeds and fields are None.

Copilot uses AI. Check for mistakes.
11 changes: 9 additions & 2 deletions src/inference_endpoint/utils/benchmark_httpclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import time
from dataclasses import dataclass

from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
from inference_endpoint.core.types import Query, QueryResult
from inference_endpoint.endpoint_client.config import HTTPClientConfig
from inference_endpoint.endpoint_client.cpu_affinity import compute_affinity_plan
Expand Down Expand Up @@ -399,6 +400,7 @@ def _create_client(
prompt: str,
enable_affinity: bool,
verbose: bool = True,
zmq_context: ManagedZMQContext | None = None,
) -> tuple:
"""Create an endpoint client and query data dict.

Expand All @@ -422,7 +424,7 @@ def _create_client(
endpoint_urls=[endpoint_url],
num_workers=num_workers if num_workers > 0 else -1,
max_connections=max_connections if max_connections > 0 else -1,
warmup_connections=False,
warmup_connections=0,
worker_gc_mode="relaxed",
log_level="CRITICAL",
cpu_affinity=cpu_affinity_plan,
Expand All @@ -434,7 +436,7 @@ def _create_client(
f"max_connections={config.max_connections}, stream={streaming}"
)

client = AsyncHttpEndpointClient(config)
client = AsyncHttpEndpointClient(config, zmq_context=zmq_context)
query_data = {
"prompt": prompt,
"model": "benchmark-model",
Expand Down Expand Up @@ -488,13 +490,17 @@ def run_benchmark(
except OSError:
pass

zmq_ctx_manager = ManagedZMQContext.scoped()
zmq_ctx = zmq_ctx_manager.__enter__()

client, query_data = _create_client(
endpoint_url,
num_workers,
max_connections,
streaming,
prompt,
enable_affinity,
zmq_context=zmq_ctx,
)
loop = client.loop
stats = BenchmarkStats(sse_events_per_response=sse_events_per_response)
Expand Down Expand Up @@ -613,6 +619,7 @@ async def receiver():
gc.collect()

asyncio.run_coroutine_threadsafe(client.shutdown(), loop).result(timeout=10.0)
zmq_ctx_manager.__exit__(None, None, None)

# Restore original affinity so the next sweep iteration sees all CPUs
if saved_affinity is not None:
Expand Down
8 changes: 4 additions & 4 deletions tests/integration/commands/test_benchmark_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def test_offline_benchmark_with_echo_server(
verbose=1,
model="echo-server",
timeout=None,
warmup_connections=False,
warmup_connections=0,
)

with caplog.at_level("INFO"):
Expand Down Expand Up @@ -99,7 +99,7 @@ async def test_online_benchmark_with_echo_server(
verbose=1,
model="echo-server",
timeout=None,
warmup_connections=False,
warmup_connections=0,
)
with caplog.at_level("INFO"):
await run_benchmark_command(args)
Expand Down Expand Up @@ -143,7 +143,7 @@ async def test_benchmark_with_output_file(
verbose=0,
model="echo-server",
timeout=None,
warmup_connections=False,
warmup_connections=0,
)

await run_benchmark_command(args)
Expand Down Expand Up @@ -185,7 +185,7 @@ async def test_benchmark_mode_logging(
verbose=1,
model="echo-server",
timeout=None,
warmup_connections=False,
warmup_connections=0,
)
with caplog.at_level("INFO"):
await run_benchmark_command(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/endpoint_client/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def create_futures_client(
url: str,
num_workers: int = 1,
max_connections: int = 10,
warmup_connections: bool = False,
warmup_connections: int = 0,
zmq_context=None,
) -> FuturesHttpClient:
"""Helper to create a FuturesHttpClient with specific config.
Expand All @@ -35,7 +35,7 @@ def create_futures_client(
url: The endpoint URL to connect to
num_workers: Number of worker processes (default: 1)
max_connections: Max connections per worker (default: 10 for tests)
warmup_connections: Whether to warmup connections (default: False for tests)
warmup_connections: Warmup connection count (0 = disabled, -1 = auto, >0 = explicit)
zmq_context: ManagedZMQContext when using ZMQ transport (required by default config).

Returns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _create_custom_client(
endpoint_urls=[f"{vllm_docker_server['url']}/v1/chat/completions"],
num_workers=num_workers,
max_connections=50,
warmup_connections=False,
warmup_connections=0,
)

# TODO(vir):
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/endpoint_client/test_http_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ async def test_many_workers(self, mock_http_echo_server):
num_workers=num_workers,
max_connections=num_workers
* 10, # ensure each worker has connections
warmup_connections=False,
warmup_connections=0,
zmq_context=zmq_ctx,
)

Expand Down Expand Up @@ -330,7 +330,7 @@ async def test_streaming_error_propagation(self):
# Use invalid endpoint to trigger errors
client = create_futures_client(
"http://invalid-endpoint-12345:9999/v1/chat/completions",
warmup_connections=False,
warmup_connections=0,
zmq_context=zmq_ctx,
)

Expand Down
2 changes: 1 addition & 1 deletion tests/integration/endpoint_client/test_sglang_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def sglang_futures_client():
endpoint_urls=[SGLANG_ENDPOINT],
num_workers=4,
api_type="sglang",
warmup_connections=False,
warmup_connections=0,
)

client = FuturesHttpClient(http_config)
Expand Down
8 changes: 4 additions & 4 deletions tests/integration/endpoint_client/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def worker_config(self, mock_http_echo_server):
endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
num_workers=1,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)
return http_config

Expand Down Expand Up @@ -229,7 +229,7 @@ def worker_config(self, mock_http_echo_server):
endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
num_workers=1,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)
return http_config

Expand All @@ -240,7 +240,7 @@ def error_config(self):
endpoint_urls=["http://localhost:59999/v1/chat/completions"],
num_workers=1,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)
return http_config

Expand Down Expand Up @@ -416,7 +416,7 @@ async def malformed_json_non_streaming_handler(request):
endpoint_urls=[f"http://localhost:{server.port}/malformed"],
num_workers=1,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)

worker = Worker(
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/endpoint_client/test_worker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def manager_config(self, mock_http_echo_server):
endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
num_workers=2,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)
return http_config

Expand Down Expand Up @@ -270,7 +270,7 @@ def worker_death_config(self):
endpoint_urls=["http://localhost:59999/advanced"],
num_workers=2,
max_connections=10,
warmup_connections=False,
warmup_connections=0,
)
return http_config

Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_end_to_end_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class DeepSeekR1SampleIssuer(HttpClientSampleIssuer):
def __init__(self, tmp_path: Path, url: str, zmq_context: ManagedZMQContext):
self.http_config = HTTPClientConfig(
endpoint_urls=[urljoin(url, "/v1/chat/completions")],
warmup_connections=False,
warmup_connections=0,
)
super().__init__(HTTPEndpointClient(self.http_config, zmq_context=zmq_context))

Expand Down
2 changes: 1 addition & 1 deletion tests/performance/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def http_client(perf_http_echo_server):
http_config = HTTPClientConfig(
endpoint_urls=[f"{perf_http_echo_server.url}/v1/chat/completions"],
num_workers=1,
warmup_connections=False,
warmup_connections=0,
)

client = HTTPEndpointClient(config=http_config)
Expand Down
Loading