Merged
Changes from 10 commits
58 changes: 58 additions & 0 deletions docs/CLIENT_PERFORMANCE_TUNING.md
@@ -92,3 +92,61 @@ python -m inference_endpoint.utils.benchmark_httpclient -w 8 --stream --stream-i
Pick the worker count where recv rate peaks and stall% is low.

For streaming workloads, also watch **SSE-pkts/s** — a small stream interval (fine-grained events) dramatically increases packet rate and may require more workers to keep up. If SSE-pkts/s plateaus while recv rate drops, the client is bottlenecked on SSE parsing overhead.
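As a rough back-of-envelope (not a measurement from the tool itself), the expected SSE event rate follows directly from character throughput and the stream interval. `expected_sse_pkts_per_sec` below is an illustrative helper, not part of the benchmark:

```python
def expected_sse_pkts_per_sec(chars_per_sec: float, stream_interval: int) -> float:
    # Each SSE event carries `stream_interval` characters, so at a fixed
    # character throughput the event (packet) rate scales inversely with it.
    return chars_per_sec / stream_interval

# At 1M chars/s: interval 1 -> 1,000,000 events/s; interval 50 -> 20,000 events/s.
fine = expected_sse_pkts_per_sec(1_000_000, 1)
coarse = expected_sse_pkts_per_sec(1_000_000, 50)
```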

---

## Test Servers

Two built-in servers for benchmarking without a real GPU endpoint.

### MaxThroughputServer

Returns identical pre-compiled responses instantly — zero compute, pure client roofline.

```bash
python -m inference_endpoint.testing.max_throughput_server --port 12345 --stats
python -m inference_endpoint.testing.max_throughput_server --stream --stream-interval 50 --stats
```

| Flag | Default | Description |
| ------------------- | ------- | ------------------------ |
| `--output-length` | 4000 | Characters in response |
| `--stream` | off | SSE streaming mode |
| `--stream-interval` | 1 | Characters per SSE event |
| `--num-workers` | 4 | Server worker processes |

### VariableResponseServer

Realistic LLM simulation with per-request variable output lengths, TTFT, and TPOT.

Two mutually exclusive timing modes:

- **Response-rate mode** (`--response-rate-mean`): per-worker token bucket controls global throughput
- **Inter-token mode** (`--inter-token-latency`): per-token generation time (TPOT) in ms. Inter-SSE-event delay = TPOT × stream_interval
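A minimal sketch of the inter-token mode's pacing, assuming (as the flags suggest) that `stream_interval` counts one token's worth of characters per unit; `inter_event_delay_s` is a hypothetical helper, not the server's code:

```python
def inter_event_delay_s(tpot_ms: float, stream_interval: int) -> float:
    # Each SSE event carries `stream_interval` tokens' worth of output,
    # so consecutive events are spaced TPOT x stream_interval apart.
    return (tpot_ms / 1000.0) * stream_interval

# --inter-token-latency 15 --stream-interval 10  ->  150 ms between events
delay = inter_event_delay_s(15, 10)
```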

```bash
# Non-streaming with response-rate control
python -m inference_endpoint.testing.variable_throughput_server --stats \
--response-rate-mean 1000

# Streaming with TPOT + TTFT
python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
--inter-token-latency 15 --first-chunk-latency 1.5 --stream-interval 10

# With jitter
python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
--response-rate-mean 50 --response-rate-spread 0.2 \
--first-chunk-latency 0.5 --first-chunk-spread 0.2
```

| Flag | Default | Description |
| ----------------------- | ------- | ----------------------------------------------------------------------------- |
| `--output-len-mean` | 1000 | Mean output length (chars) |
| `--output-len-spread` | 0.3 | CoV for output length (lognormal) |
| `--response-rate-mean` | 0 | Global throughput (resp/sec). Mutually exclusive with `--inter-token-latency` |
| `--inter-token-latency` | 0 | Per-token delay in ms (TPOT). Mutually exclusive with `--response-rate-mean` |
| `--first-chunk-latency` | 0 | Mean TTFT in seconds |
| `--first-chunk-spread` | 0.2 | CoV for TTFT |
| `--stream-interval` | 1 | Chars per SSE event |
| `--max-concurrency` | 0 | Max concurrent requests (0 = unlimited) |
| `--num-workers` | 10 | Server worker processes |
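The spread flags above parametrize a lognormal by mean and coefficient of variation (std/mean). One way such a sampler could be written — `sample_output_len` is an illustrative sketch, not the server's actual implementation:

```python
import math
import random

def sample_output_len(mean: float = 1000.0, cov: float = 0.3) -> int:
    # Convert (mean, CoV) into lognormal parameters using
    #   CoV^2 = exp(sigma^2) - 1   and   E[X] = exp(mu + sigma^2 / 2)
    sigma2 = math.log(1.0 + cov * cov)
    mu = math.log(mean) - sigma2 / 2.0
    return max(1, round(random.lognormvariate(mu, math.sqrt(sigma2))))
```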
27 changes: 0 additions & 27 deletions docs/endpoint_client.md → docs/ENDPOINT_CLIENT.md
@@ -1,32 +1,5 @@
<!--
DOCUMENT GUIDELINES:
- No performance claims without experiment data (no "2x faster", "50k+ QPS" etc.)
- Design choices presented as tables with rationale, not "Why X?" prose
- Keep explanations factual and descriptive, no marketing language
- DONT use Problem: Solution: pattern, or the Why XYZ patten, or purpose: pattern when writing sections
- Write the design doc for principle-engineer level readers, experts in python
-->

# MLPerf Inference Endpoints: HttpClient Design Document

| Item | Details |
| ------------------------- | ------------------------------------------------------------ |
| **Version** | 0.1 |
| **Author(s)** | Viraat Chandra US |
| **Status** | Draft Completed |
| **Last Updated** | 17/02/2026 |
| **Related Documentation** | [NV Internal] MLPerf Inference Endpoint Technical Design Doc |

### Review

| Name | Association | Date | Status | Notes |
| ------ | ----------- | ---- | ----------- | ----- |
| Alice | | | Not Started | |
| Rashid | | | Not Started | |
| Zhihan | | | Not Started | |

---

## Table of Contents

- [1. Introduction \& Constraints](#1-introduction--constraints)
10 changes: 5 additions & 5 deletions src/inference_endpoint/endpoint_client/config.py
@@ -105,7 +105,7 @@ class HTTPClientConfig:
# Values:
# - >0 = explicit minimum required connections
# - 0 = disable check (no warning if ports unavailable)
# - -1 = auto (defaults to 90% of system ephemeral port range)
# - -1 = auto (defaults to 12.5% of system ephemeral port range)
min_required_connections: int = -1

# GC strategy for worker processes to reduce latency spikes from collection pauses
@@ -168,9 +168,9 @@ def __post_init__(self):
# Auto: use available ephemeral ports
self.max_connections = available_ports

# Resolve min_required_connections: -1 means auto (90% of system max)
# Resolve min_required_connections: -1 means auto (12.5% of system max)
if self.min_required_connections == -1:
self.min_required_connections = int(system_maximum_ports * 0.90)
self.min_required_connections = int(system_maximum_ports * 0.125)
else:
# User specified explicit max_connections - validate against port limit
if self.max_connections > available_ports:
@@ -184,14 +184,14 @@ def _get_auto_num_workers() -> int:
"""
Compute optimal number of workers based on NUMA topology.

Defaults to NUMA domain size (min 8, max 24) for optimal memory locality.
Defaults to NUMA domain size (min 10, max 24) for optimal memory locality.
Users can override with explicit num_workers to use more cores (workers
will be pinned to additional cores outside NUMA domain if needed).

Returns:
Number of workers to use when num_workers is -1 (auto).
"""
min_workers = 8
min_workers = 10
max_workers = 24

numa_node = get_current_numa_node()
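The 12.5% auto-default changed above reduces to a simple computation over the ephemeral port range. This sketch assumes Linux's usual default range and uses a hypothetical helper name:

```python
def auto_min_required_connections(port_range: tuple[int, int] = (32768, 60999)) -> int:
    # Default span matches Linux's typical net.ipv4.ip_local_port_range;
    # -1 ("auto") resolves to 12.5% of the system's ephemeral ports.
    lo, hi = port_range
    return int((hi - lo + 1) * 0.125)
```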
4 changes: 2 additions & 2 deletions src/inference_endpoint/endpoint_client/cpu_affinity.py
@@ -56,11 +56,11 @@ def wrapper(*args, **kwargs):
return wrapper


# NOTE(vir): seeing high jitter when loadgen has <2 Physical CPUs
# NOTE(vir): seeing high jitter when loadgen has <=2 Physical CPUs
# Default physical cores for LoadGen (main process):
# - Session thread (scheduler, busy-wait timing)
# - Event loop thread (uvloop, response handling)
DEFAULT_LOADGEN_CORES = 2
DEFAULT_LOADGEN_CORES = 5


# =============================================================================
20 changes: 12 additions & 8 deletions src/inference_endpoint/endpoint_client/http.py
@@ -51,18 +51,22 @@ class _SocketConfig:

# Connection keepalive-probe settings for long-lived connections
# client kernel sends probe, server's kernel ACKs - no application overhead
#
# TODO(vir): verify impact on failure-detection, we want to fail fast
# detection time: KEEPIDLE + (KEEPCNT × KEEPINTVL) = 1 + 5×1 = 6s
SO_KEEPALIVE: int = 1 # Enable keepalive at socket level
TCP_KEEPIDLE: int = 1 # Probe after 1s idle
TCP_KEEPCNT: int = 1 # 1 failed probe = dead
TCP_KEEPCNT: int = 5 # 5 failed probes = dead
TCP_KEEPINTVL: int = 1 # 1s between probes

# Make sure socket buffers are never the bottle neck
# With HTTP/1.1, a TCP socket will only be used for a single request
# Largest message size would be server response in Offline Mode
# 4MB /4 bytes per token = 1M tokens in any given packet
# TODO(vir): analyze workloads to better tune buffer sizes
SO_RCVBUF: int = 1024 * 1024 * 4 # 4MB receive buffer
SO_SNDBUF: int = 1024 * 1024 * 4 # 4MB send buffer
# Socket buffer sizing: sliding windows, not full-message buffers.
# The event loop reads eagerly so the buffer only holds data between
# kernel delivery and application read — typically one RTT worth.
#
# 128KB ≈ 128K chars buffered in-flight at any instant.
# Responses larger than the buffer stream through fine (TCP sliding window).
SO_RCVBUF: int = 1024 * 128 # 128KB receive buffer
SO_SNDBUF: int = 1024 * 128 # 128KB send buffer

# Linux-specific:
# kernel closes socket if sent data not ACKed within timeout
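The detection-time arithmetic in the keepalive comment above can be reproduced with plain socket options. A sketch mirroring the `_SocketConfig` values (the per-probe TCP constants are Linux-only, hence the guard):

```python
import socket

# Mirrors the _SocketConfig keepalive values above.
KEEPIDLE_S, KEEPCNT, KEEPINTVL_S = 1, 5, 1

# Worst-case dead-peer detection: idle time before the first probe,
# plus one interval per failed probe.
detection_time_s = KEEPIDLE_S + KEEPCNT * KEEPINTVL_S  # 1 + 5*1 = 6s

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
if hasattr(socket, "TCP_KEEPIDLE"):  # Linux-only knobs
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, KEEPIDLE_S)
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, KEEPCNT)
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, KEEPINTVL_S)
sock.close()
```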
40 changes: 16 additions & 24 deletions src/inference_endpoint/endpoint_client/worker.py
@@ -78,6 +78,9 @@ def worker_main(
connector: Transport connector for IPC (ZMQ, shared memory, etc.).
http_config: HTTP client configuration.
"""
# Suppress transformers "no framework found" warning (only tokenizers used)
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")

worker_log_format = f"%(asctime)s - %(name)s[W{worker_id}/%(process)d] - %(funcName)s - %(levelname)s - %(message)s"
setup_logging(level=http_config.log_level, format_string=worker_log_format)

@@ -201,8 +204,8 @@ async def run(self) -> None:

# Create connection pool
# Naively divide max connections among workers
connections_per_worker = (
self.http_config.max_connections // self.http_config.num_workers
connections_per_worker = max(
1, self.http_config.max_connections // self.http_config.num_workers
)
self._pool = ConnectionPool(
host=self._host,
@@ -228,33 +231,22 @@ async def run(self) -> None:
warmup_count = warmup_cfg // self.http_config.num_workers
warmup_count = max(1, warmup_count)
warmed = await self._pool.warmup(count=warmup_count)
logger.debug(f"Warmed up {warmed} connections")

# Error if 0 connections warmed up
if warmed == 0:
msg = "Warmup: failed to establish connection to endpoint. Consider closing background TCP connections."
if self.http_config.min_required_connections == 0:
# log error but continue if disabled check
logger.error(msg)
else:
# NOTE(vir):
# 0 warmup connections is always fatal in practice,
# user needs to explicitly disable check to proceed
logger.error(
f"{msg} [ skip-check with --min_required_connections=0 ]"
)
sys.exit(1)
logger.debug(f"Warmed up {warmed}/{warmup_count} connections")

# Warn if below min_required_connections threshold (skip if 0 = disabled)
elif self.http_config.min_required_connections > 0:
min_required_per_worker = (
# Warn if warmup fell short of target
# min_required_connections=0 disables the check
if self.http_config.min_required_connections > 0:
min_per_worker = (
self.http_config.min_required_connections
// self.http_config.num_workers
)
if warmed < min_required_per_worker:
threshold = (
max(1, min_per_worker) if warmup_cfg == -1 else warmup_count
)
if warmed < threshold:
logger.warning(
f"Warmup: this worker has {warmed} connections, need {min_required_per_worker}. "
"Consider closing background TCP connections or adjusting --min_required_connections."
f"Warmup: only established {warmed}/{warmup_count} connections "
f"(need {threshold}). Consider closing background TCP connections."
)

# TODO(vir):
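The `max(1, ...)` guard added in this diff matters when workers outnumber the connection budget; isolated as a hypothetical helper for clarity:

```python
def connections_per_worker(max_connections: int, num_workers: int) -> int:
    # Integer division rounds to zero when workers outnumber the pool;
    # every worker still needs at least one connection to make progress.
    return max(1, max_connections // num_workers)
```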
10 changes: 7 additions & 3 deletions src/inference_endpoint/main.py
@@ -23,12 +23,16 @@

import asyncio
import logging
import os
import sys

import uvloop
# Suppress transformers "no framework found" warning (only tokenizers used)
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")

from inference_endpoint.cli import main as cli_main
from inference_endpoint.utils.logging import setup_logging
import uvloop # noqa: E402

from inference_endpoint.cli import main as cli_main # noqa: E402
from inference_endpoint.utils.logging import setup_logging # noqa: E402

logger = logging.getLogger(__name__)

4 changes: 4 additions & 0 deletions src/inference_endpoint/testing/max_throughput_server.py
@@ -570,7 +570,11 @@ def main():
stats=args.stats,
)

_main_pid = os.getpid()

def sig_handler(signum, frame):
if os.getpid() != _main_pid:
os._exit(0)
server.stop()
os._exit(0)
