# Warmup Example

This example demonstrates the **warmup phase** feature using
[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct),
a small 0.5B-parameter model that is easy to run locally.

The warmup phase issues randomly generated requests to the endpoint before the timed
performance window begins.
| 9 | + |
## What warmup does

Before the benchmark clock starts, the warmup phase sends a configurable number of
requests using randomly generated token sequences. This primes the endpoint by:

- Establishing and reusing TCP connections
- Filling KV caches to steady-state occupancy
- Triggering JIT compilation / CUDA graph capture in the inference runtime

Warmup samples are **excluded from all reported metrics**: they complete before
`TEST_STARTED` is recorded, so they do not affect throughput, latency, time to first
token (TTFT), or time per output token (TPOT).
| 21 | + |
## Warmup configuration

Add a `warmup` block to any YAML config:

```yaml
warmup:
  num_samples: 64        # number of warmup requests to issue
  input_seq_length: 256  # ISL: target input token count
  output_seq_length: 64  # OSL: max_new_tokens for warmup requests
  range_ratio: 0.9       # ISL variance: generates ISL in [256*0.9, 256]
  random_seed: 42        # seed for reproducible warmup sequences
```
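The keys above can be modeled as a small config object. This is a hypothetical sketch, not the tool's actual schema — the class name, defaults, and validation rules are illustrative assumptions:

```python
from dataclasses import dataclass


@dataclass
class WarmupConfig:
    """Hypothetical mirror of the `warmup` YAML block; not the tool's real schema."""
    num_samples: int = 64
    input_seq_length: int = 256
    output_seq_length: int = 64
    range_ratio: float = 1.0
    random_seed: int = 0

    def __post_init__(self):
        # Basic sanity checks on the values the YAML block accepts
        if self.num_samples < 1:
            raise ValueError("num_samples must be >= 1")
        if not 0.0 < self.range_ratio <= 1.0:
            raise ValueError("range_ratio must be in (0.0, 1.0]")

    @property
    def min_input_seq_length(self) -> int:
        # Lower bound of the sampled ISL range: [isl * range_ratio, isl]
        return int(self.input_seq_length * self.range_ratio)


cfg = WarmupConfig(num_samples=64, input_seq_length=256,
                   output_seq_length=64, range_ratio=0.9, random_seed=42)
```

With the example values, the sampled ISL range bottoms out at `int(256 * 0.9) == 230` tokens.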
No real dataset is needed for warmup: sequences are generated at runtime from random
token IDs using the model's own tokenizer.
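The runtime generation step can be sketched roughly as below. This is illustrative, not the project's implementation: the function name is invented, and the default `vocab_size` assumes the Qwen2.5 tokenizer's vocabulary rather than querying the tokenizer itself.

```python
import random


def make_warmup_prompts(num_samples, input_seq_length, range_ratio, random_seed,
                        vocab_size=151_936):
    """Generate random token-ID sequences for warmup (illustrative sketch).

    vocab_size=151_936 is an assumption (the Qwen2.5 vocabulary size);
    the real tool derives it from the model's own tokenizer.
    """
    rng = random.Random(random_seed)
    prompts = []
    for _ in range(num_samples):
        # ISL drawn uniformly from [input_seq_length * range_ratio, input_seq_length]
        isl = rng.randint(int(input_seq_length * range_ratio), input_seq_length)
        prompts.append([rng.randrange(vocab_size) for _ in range(isl)])
    return prompts


prompts = make_warmup_prompts(num_samples=64, input_seq_length=256,
                              range_ratio=0.9, random_seed=42)
```

Because the seed is fixed, the same `random_seed` always yields the same warmup sequences, which keeps warmup behavior reproducible across runs.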
## Quick test with the echo server

The built-in echo server lets you verify the warmup flow locally without a GPU.
```bash
# Terminal 1 — start the echo server
python -m inference_endpoint.testing.echo_server --port 8000

# Terminal 2 — run offline benchmark with warmup
inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
```
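To see what such an echo flow amounts to, here is a minimal stand-in written from scratch — it is **not** the project's echo server, just a rough sketch that assumes an OpenAI-style chat-completions request shape and echoes the last user message back:

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer


class EchoHandler(BaseHTTPRequestHandler):
    """Echoes the last user message back as the assistant completion."""

    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        body = json.loads(self.rfile.read(length) or b"{}")
        # Assumed request shape: {"messages": [{"role": ..., "content": ...}, ...]}
        content = (body.get("messages") or [{}])[-1].get("content", "")
        payload = json.dumps(
            {"choices": [{"message": {"role": "assistant", "content": content}}]}
        ).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, fmt, *args):
        pass  # suppress per-request logging noise


# To run standalone, e.g.:
# HTTPServer(("127.0.0.1", 8000), EchoHandler).serve_forever()
```

Since the body is echoed verbatim, response "token" counts track request sizes, which is enough to exercise the warmup and drain logic end to end without a model.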
| 49 | + |
The log output will show the warmup phase completing before the performance run starts:

```
INFO Warmup dataset ready: 64 samples (ISL=256, OSL=64)
INFO Warmup: issuing samples...
INFO Warmup samples issued, waiting for responses to drain...
INFO Warmup complete
INFO Running...
```
| 59 | + |
## Running against a real endpoint

### Prerequisites

```bash
export HF_TOKEN=<your Hugging Face token>
export HF_HOME=<path to your HuggingFace cache, e.g. ~/.cache/huggingface>
```

Download the model before launching so vLLM can reuse the local cache:

```bash
huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
```
| 74 | + |
### Launch a vLLM server

The `--trust-request-chat-template` flag is required because the CNN DailyMail dataset
sends requests with a custom chat template.

```bash
docker run --runtime nvidia --gpus all \
  -v ${HF_HOME}:/root/.cache/huggingface \
  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
  -p 8000:8000 --ipc=host \
  vllm/vllm-openai:latest \
  --model Qwen/Qwen2.5-0.5B-Instruct \
  --trust-request-chat-template
```
| 89 | + |
### Offline benchmark with warmup

```bash
inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
```

### Online benchmark with warmup

```bash
inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_online.yaml
```
| 101 | + |
## Tuning warmup parameters

| Parameter           | Guidance                                                                |
| ------------------- | ----------------------------------------------------------------------- |
| `num_samples`       | Use enough to saturate the KV cache; 32–128 is typical for small models |
| `input_seq_length`  | Match the ISL distribution of your real workload                        |
| `output_seq_length` | Match the OSL distribution; lower values make warmup finish faster      |
| `range_ratio`       | `1.0` = fixed ISL; `0.8`–`0.9` adds light variance for broader coverage |
| `random_seed`       | Change to vary which token sequences are generated                      |
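When tuning these values, a quick back-of-the-envelope token budget helps judge how long warmup will take relative to the benchmark itself. The helper below is illustrative arithmetic, not part of the tool:

```python
def warmup_token_budget(num_samples, input_seq_length, output_seq_length,
                        range_ratio=1.0):
    """Estimate total tokens processed during warmup (illustrative helper).

    Assumes ISL is drawn uniformly from [isl * range_ratio, isl], so the
    expected ISL is the midpoint of that interval.
    """
    mean_isl = input_seq_length * (1 + range_ratio) / 2
    prefill = num_samples * mean_isl           # expected prompt tokens
    decode = num_samples * output_seq_length   # worst-case generated tokens
    return {"prefill_tokens": prefill,
            "decode_tokens": decode,
            "total_tokens": prefill + decode}


budget = warmup_token_budget(64, 256, 64, range_ratio=0.9)
```

With the example config, warmup processes roughly 15.6k prompt tokens and up to 4,096 generated tokens, so lowering `output_seq_length` is the quickest way to shorten the phase without changing prefill behavior.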