Merge branch 'main' into feat/attafosu/sglang-openai-api-compatibility

attafosu · web-flow · commit ff66399c5790 · 2026-03-09T13:16:09.000-07:00
diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
@@ -325,7 +325,7 @@ def _build_config_from_cli(
             client=ClientSettings(
                 workers=args.workers if args.workers else -1,
                 log_level="DEBUG" if verbose_level >= 2 else "INFO",
-                warmup_connections=getattr(args, "warmup_connections", True),
+                warmup_connections=getattr(args, "warmup_connections", -1),
                 max_connections=getattr(args, "max_connections", None) or -1,
             ),
         ),
diff --git a/src/inference_endpoint/commands/probe.py b/src/inference_endpoint/commands/probe.py
@@ -72,7 +72,7 @@ async def run_probe_command(args: argparse.Namespace) -> None:
                 ],
                 api_type=api_type,
                 num_workers=1,
-                warmup_connections=False,
+                warmup_connections=0,
             )
             # Client creates its own event loop in a separate thread
             client = HTTPEndpointClient(http_config, zmq_context=zmq_ctx)
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
@@ -292,7 +292,8 @@ class ClientSettings(BaseModel):
     log_level: str = "INFO"
 
     # Pre-establish TCP connections during init for reuse at runtime.
-    warmup_connections: bool = True
+    # Values: -1 = auto (50% of pool), 0 = disabled, >0 = explicit total count
+    warmup_connections: int = -1
 
     # Maximum concurrent TCP connections per worker.
     # -1 = unlimited (bound by system ephemeral port limit)
diff --git a/src/inference_endpoint/metrics/reporter.py b/src/inference_endpoint/metrics/reporter.py
@@ -1075,7 +1075,12 @@ def derive_TPOT(
             output_sequence, reasoning_sequence = output_sequence_from_data(
                 data_bytes, join_chunks=False
             )
+            if isinstance(output_sequence, str):
+                output_sequence = [output_sequence]
             if not isinstance(output_sequence, list):
+                logging.warning(
+                    f"Output sequence for sample {sample_uuid} is not a list but {type(output_sequence)}: {output_sequence}"
+                )
                 continue
 
             all_chunks = output_sequence
diff --git a/src/inference_endpoint/utils/benchmark_httpclient.py b/src/inference_endpoint/utils/benchmark_httpclient.py
@@ -37,6 +37,7 @@
 import time
 from dataclasses import dataclass
 
+from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.core.types import Query, QueryResult
 from inference_endpoint.endpoint_client.config import HTTPClientConfig
 from inference_endpoint.endpoint_client.cpu_affinity import compute_affinity_plan
@@ -399,6 +400,7 @@ def _create_client(
     prompt: str,
     enable_affinity: bool,
     verbose: bool = True,
+    zmq_context: ManagedZMQContext | None = None,
 ) -> tuple:
     """Create an endpoint client and query data dict.
 
@@ -422,7 +424,7 @@ def _create_client(
         endpoint_urls=[endpoint_url],
         num_workers=num_workers if num_workers > 0 else -1,
         max_connections=max_connections if max_connections > 0 else -1,
-        warmup_connections=False,
+        warmup_connections=0,
         worker_gc_mode="relaxed",
         log_level="CRITICAL",
         cpu_affinity=cpu_affinity_plan,
@@ -434,7 +436,7 @@ def _create_client(
             f"max_connections={config.max_connections}, stream={streaming}"
         )
 
-    client = AsyncHttpEndpointClient(config)
+    client = AsyncHttpEndpointClient(config, zmq_context=zmq_context)
     query_data = {
         "prompt": prompt,
         "model": "benchmark-model",
@@ -488,13 +490,17 @@ def run_benchmark(
         except OSError:
             pass
 
+    zmq_ctx_manager = ManagedZMQContext.scoped()
+    zmq_ctx = zmq_ctx_manager.__enter__()
+
     client, query_data = _create_client(
         endpoint_url,
         num_workers,
         max_connections,
         streaming,
         prompt,
         enable_affinity,
+        zmq_context=zmq_ctx,
     )
     loop = client.loop
     stats = BenchmarkStats(sse_events_per_response=sse_events_per_response)
@@ -613,6 +619,7 @@ async def receiver():
     gc.collect()
 
     asyncio.run_coroutine_threadsafe(client.shutdown(), loop).result(timeout=10.0)
+    zmq_ctx_manager.__exit__(None, None, None)
 
     # Restore original affinity so the next sweep iteration sees all CPUs
     if saved_affinity is not None:
diff --git a/tests/integration/commands/test_benchmark_command.py b/tests/integration/commands/test_benchmark_command.py
@@ -59,7 +59,7 @@ async def test_offline_benchmark_with_echo_server(
             verbose=1,
             model="echo-server",
             timeout=None,
-            warmup_connections=False,
+            warmup_connections=0,
         )
 
         with caplog.at_level("INFO"):
@@ -99,7 +99,7 @@ async def test_online_benchmark_with_echo_server(
             verbose=1,
             model="echo-server",
             timeout=None,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         with caplog.at_level("INFO"):
             await run_benchmark_command(args)
@@ -143,7 +143,7 @@ async def test_benchmark_with_output_file(
             verbose=0,
             model="echo-server",
             timeout=None,
-            warmup_connections=False,
+            warmup_connections=0,
         )
 
         await run_benchmark_command(args)
@@ -185,7 +185,7 @@ async def test_benchmark_mode_logging(
             verbose=1,
             model="echo-server",
             timeout=None,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         with caplog.at_level("INFO"):
             await run_benchmark_command(args)
diff --git a/tests/integration/endpoint_client/conftest.py b/tests/integration/endpoint_client/conftest.py
@@ -26,7 +26,7 @@ def create_futures_client(
     url: str,
     num_workers: int = 1,
     max_connections: int = 10,
-    warmup_connections: bool = False,
+    warmup_connections: int = 0,
     zmq_context=None,
 ) -> FuturesHttpClient:
     """Helper to create a FuturesHttpClient with specific config.
@@ -35,7 +35,7 @@ def create_futures_client(
         url: The endpoint URL to connect to
         num_workers: Number of worker processes (default: 1)
         max_connections: Max connections per worker (default: 10 for tests)
-        warmup_connections: Whether to warmup connections (default: False for tests)
+        warmup_connections: Warmup connection count (0 = disabled, -1 = auto, >0 = explicit)
         zmq_context: ManagedZMQContext when using ZMQ transport (required by default config).
 
     Returns:
diff --git a/tests/integration/endpoint_client/test_external_serving.py b/tests/integration/endpoint_client/test_external_serving.py
@@ -41,7 +41,7 @@ def _create_custom_client(
             endpoint_urls=[f"{vllm_docker_server['url']}/v1/chat/completions"],
             num_workers=num_workers,
             max_connections=50,
-            warmup_connections=False,
+            warmup_connections=0,
         )
 
         # TODO(vir):
diff --git a/tests/integration/endpoint_client/test_http_client.py b/tests/integration/endpoint_client/test_http_client.py
@@ -156,7 +156,7 @@ async def test_many_workers(self, mock_http_echo_server):
                     num_workers=num_workers,
                     max_connections=num_workers
                     * 10,  # ensure each worker has connections
-                    warmup_connections=False,
+                    warmup_connections=0,
                     zmq_context=zmq_ctx,
                 )
 
@@ -330,7 +330,7 @@ async def test_streaming_error_propagation(self):
             # Use invalid endpoint to trigger errors
             client = create_futures_client(
                 "http://invalid-endpoint-12345:9999/v1/chat/completions",
-                warmup_connections=False,
+                warmup_connections=0,
                 zmq_context=zmq_ctx,
             )
 
diff --git a/tests/integration/endpoint_client/test_sglang_adapter.py b/tests/integration/endpoint_client/test_sglang_adapter.py
@@ -43,7 +43,7 @@ def sglang_futures_client():
         endpoint_urls=[SGLANG_ENDPOINT],
         num_workers=4,
         api_type="sglang",
-        warmup_connections=False,
+        warmup_connections=0,
     )
 
     client = FuturesHttpClient(http_config)
diff --git a/tests/integration/endpoint_client/test_worker.py b/tests/integration/endpoint_client/test_worker.py
@@ -36,7 +36,7 @@ def worker_config(self, mock_http_echo_server):
             endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
             num_workers=1,
             max_connections=10,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         return http_config
 
@@ -229,7 +229,7 @@ def worker_config(self, mock_http_echo_server):
             endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
             num_workers=1,
             max_connections=10,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         return http_config
 
@@ -240,7 +240,7 @@ def error_config(self):
             endpoint_urls=["http://localhost:59999/v1/chat/completions"],
             num_workers=1,
             max_connections=10,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         return http_config
 
@@ -416,7 +416,7 @@ async def malformed_json_non_streaming_handler(request):
                 endpoint_urls=[f"http://localhost:{server.port}/malformed"],
                 num_workers=1,
                 max_connections=10,
-                warmup_connections=False,
+                warmup_connections=0,
             )
 
             worker = Worker(
diff --git a/tests/integration/endpoint_client/test_worker_manager.py b/tests/integration/endpoint_client/test_worker_manager.py
@@ -112,7 +112,7 @@ def manager_config(self, mock_http_echo_server):
             endpoint_urls=[f"{mock_http_echo_server.url}/v1/chat/completions"],
             num_workers=2,
             max_connections=10,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         return http_config
 
@@ -270,7 +270,7 @@ def worker_death_config(self):
             endpoint_urls=["http://localhost:59999/advanced"],
             num_workers=2,
             max_connections=10,
-            warmup_connections=False,
+            warmup_connections=0,
         )
         return http_config
 
diff --git a/tests/integration/test_end_to_end_oracle.py b/tests/integration/test_end_to_end_oracle.py
@@ -45,7 +45,7 @@ class DeepSeekR1SampleIssuer(HttpClientSampleIssuer):
     def __init__(self, tmp_path: Path, url: str, zmq_context: ManagedZMQContext):
         self.http_config = HTTPClientConfig(
             endpoint_urls=[urljoin(url, "/v1/chat/completions")],
-            warmup_connections=False,
+            warmup_connections=0,
         )
         super().__init__(HTTPEndpointClient(self.http_config, zmq_context=zmq_context))
 
diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py
@@ -59,7 +59,7 @@ def http_client(perf_http_echo_server):
     http_config = HTTPClientConfig(
         endpoint_urls=[f"{perf_http_echo_server.url}/v1/chat/completions"],
         num_workers=1,
-        warmup_connections=False,
+        warmup_connections=0,
     )
 
     client = HTTPEndpointClient(config=http_config)
diff --git a/tests/unit/metrics/test_reporter.py b/tests/unit/metrics/test_reporter.py
@@ -75,6 +75,88 @@ def test_derive_tpot(events_db, sample_uuids, fake_outputs, tokenizer):
     assert all(tpot == expected_tpot2 for tpot in tpot2)
 
 
+def test_derive_tpot_with_string_output(tmp_path, sample_uuids, tokenizer):
+    """Test that derive_TPOT handles a plain string output gracefully.
+
+    A single-string output has only one chunk, so TPOT cannot be computed.
+    The reporter should not raise an exception and should return None.
+    """
+    test_db = str(tmp_path / "test_string_output.db")
+    uuid1 = sample_uuids(1)
+
+    with sqlite3_cursor(test_db) as (cursor, conn):
+        cursor.execute(
+            "CREATE TABLE IF NOT EXISTS events (sample_uuid VARCHAR(32), event_type VARCHAR(32), timestamp_ns INTEGER, data BLOB)"
+        )
+        cursor.executemany(
+            "INSERT INTO events (sample_uuid, event_type, timestamp_ns, data) VALUES (?, ?, ?, ?)",
+            [
+                ("", SessionEvent.TEST_STARTED.value, 5000, b""),
+                (uuid1, SessionEvent.LOADGEN_ISSUE_CALLED.value, 10000, b""),
+                (uuid1, SampleEvent.FIRST_CHUNK.value, 10010, b""),
+                (
+                    uuid1,
+                    SampleEvent.COMPLETE.value,
+                    10211,
+                    orjson.dumps({"output": "the final answer"}),
+                ),
+                ("", SessionEvent.TEST_ENDED.value, 10300, b""),
+            ],
+        )
+        conn.commit()
+
+    with MetricsReporter(test_db) as reporter:
+        tpot_rows = reporter.derive_TPOT(tokenizer)
+
+    # A single-string output produces only 1 chunk — TPOT requires at least 2
+    assert tpot_rows is None
+
+
+def test_derive_tpot_string_output_with_list_reasoning(
+    tmp_path, sample_uuids, tokenizer
+):
+    """Test that derive_TPOT computes TPOT when string output is paired with a list reasoning sequence.
+
+    derive_TPOT wraps a plain-string output into a single-element list so it can be
+    combined with the reasoning chunks. If the string output were instead rejected as a
+    non-list, the sample would be skipped before reasoning is considered and TPOT would
+    come back as None even though output + reasoning supply enough chunks to compute it.
+    """
+    test_db = str(tmp_path / "test_string_output_with_reasoning.db")
+    uuid1 = sample_uuids(1)
+
+    with sqlite3_cursor(test_db) as (cursor, conn):
+        cursor.execute(
+            "CREATE TABLE IF NOT EXISTS events (sample_uuid VARCHAR(32), event_type VARCHAR(32), timestamp_ns INTEGER, data BLOB)"
+        )
+        cursor.executemany(
+            "INSERT INTO events (sample_uuid, event_type, timestamp_ns, data) VALUES (?, ?, ?, ?)",
+            [
+                ("", SessionEvent.TEST_STARTED.value, 5000, b""),
+                (uuid1, SessionEvent.LOADGEN_ISSUE_CALLED.value, 10000, b""),
+                (uuid1, SampleEvent.FIRST_CHUNK.value, 10010, b""),
+                (
+                    uuid1,
+                    SampleEvent.COMPLETE.value,
+                    10211,
+                    orjson.dumps(
+                        {"output": "the answer", "reasoning": ["thought step"]}
+                    ),
+                ),
+                ("", SessionEvent.TEST_ENDED.value, 10300, b""),
+            ],
+        )
+        conn.commit()
+
+    with MetricsReporter(test_db) as reporter:
+        tpot_rows = reporter.derive_TPOT(tokenizer)
+
+    # String output ("the answer") + list reasoning (["thought step"]) = 2 chunks total,
+    # which is enough for TPOT computation.
+    assert tpot_rows is not None
+    assert len(tpot_rows) == 1
+
+
 def test_derive_sample_latency(events_db, sample_uuids):
     uuid1 = sample_uuids(1)
     uuid2 = sample_uuids(2)

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ async def run_probe_command(args: argparse.Namespace) -> None:`
`72`	`72`	`],`
`73`	`73`	`api_type=api_type,`
`74`	`74`	`num_workers=1,`
`75`		`- warmup_connections=False,`
	`75`	`+ warmup_connections=0,`
`76`	`76`	`)`
`77`	`77`	`# Client creates its own event loop in a separate thread`
`78`	`78`	`client = HTTPEndpointClient(http_config, zmq_context=zmq_ctx)`
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def _create_custom_client(`
`41`	`41`	`endpoint_urls=[f"{vllm_docker_server['url']}/v1/chat/completions"],`
`42`	`42`	`num_workers=num_workers,`
`43`	`43`	`max_connections=50,`
`44`		`- warmup_connections=False,`
	`44`	`+ warmup_connections=0,`
`45`	`45`	`)`
`46`	`46`
`47`	`47`	`# TODO(vir):`
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ def sglang_futures_client():`
`43`	`43`	`endpoint_urls=[SGLANG_ENDPOINT],`
`44`	`44`	`num_workers=4,`
`45`	`45`	`api_type="sglang",`
`46`		`- warmup_connections=False,`
	`46`	`+ warmup_connections=0,`
`47`	`47`	`)`
`48`	`48`
`49`	`49`	`client = FuturesHttpClient(http_config)`