47 changes: 47 additions & 0 deletions examples/09_Warmup_Example/README.md
@@ -0,0 +1,47 @@
# Warmup Example

This directory shows how to add a warmup phase to offline and online benchmark
configs for [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).

Warmup sends randomly generated requests before the timed run to prime the
endpoint. Those samples complete before `TEST_STARTED`, so they are excluded
from reported throughput and latency metrics.

## Files

- `warmup_offline.yaml`: offline max-throughput example
- `warmup_online.yaml`: online Poisson-QPS example

Both configs use the same `warmup` block:

```yaml
warmup:
  num_samples: 64
  input_seq_length: 256
  output_seq_length: 64
  input_range_ratio: 0.9
  random_seed: 42
```
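To see what `input_range_ratio` does: the `RandomDataset.generate` change in this PR draws each request's input length uniformly from `[int(ISL * ratio), ISL]`. A standalone sketch of that sampling (the `sample_isls` helper is hypothetical, not part of the tool):

```python
import numpy as np

def sample_isls(input_seq_length: int, input_range_ratio: float,
                num_sequences: int, random_seed: int) -> np.ndarray:
    """Draw one input sequence length per warmup request, uniformly from
    [int(input_seq_length * input_range_ratio), input_seq_length]."""
    rng = np.random.default_rng(random_seed)
    return rng.integers(
        int(input_seq_length * input_range_ratio),  # inclusive lower bound
        input_seq_length + 1,                       # exclusive upper bound
        num_sequences,
    )

# Offline example config: lengths fall in [230, 256].
isls = sample_isls(256, 0.9, 64, 42)
```

Because the generator is seeded, the same `random_seed` yields the same warmup lengths on every run.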

Warmup data is generated at runtime from random token IDs using the model
tokenizer, so no separate warmup dataset is needed.

> **Comment on lines +25 to +26**
>
> **Collaborator:** 👍
>
> **Collaborator:** Btw, we might want deterministic warmup data for reproducibility and debugging (at least controlled by an RNG seed).
>
> **Collaborator (Author):** Given the `random_seed` in the config, it should be deterministic.

## Run Locally

With the built-in echo server:

```bash
python -m inference_endpoint.testing.echo_server --port 8000
inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_online.yaml
```

To benchmark a real endpoint, point `endpoint_config.endpoints` in the YAML at
its URL and run the same commands.

## Tuning

- `num_samples`: use enough requests to reach a steady state
- `input_seq_length`: match the typical prompt length of the workload
- `output_seq_length`: match the expected response length
- `input_range_ratio`: use values below `1.0` to add light ISL variation
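As a rough sizing aid when tuning these knobs, an upper bound on the tokens the warmup phase touches is `num_samples * (ISL + OSL)`. A quick estimate (the `warmup_token_budget` helper is hypothetical, simple arithmetic only):

```python
def warmup_token_budget(num_samples: int, input_seq_length: int,
                        output_seq_length: int) -> int:
    # Upper bound: every warmup request at full ISL plus full OSL.
    return num_samples * (input_seq_length + output_seq_length)

# Offline example: 64 * (256 + 64) = 20480 tokens.
# Online example:  32 * (128 + 32) = 5120 tokens.
```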
62 changes: 62 additions & 0 deletions examples/09_Warmup_Example/warmup_offline.yaml
@@ -0,0 +1,62 @@
# Offline Throughput Benchmark with Warmup Phase
#
# The warmup phase issues randomly generated requests before the timed
# performance window starts. This primes the endpoint by:
# - Establishing and reusing TCP connections
# - Filling KV caches to steady-state
# - Triggering JIT compilation in the inference runtime
#
# Warmup samples are excluded from all reported metrics.
name: "warmup-offline-qwen2.5-0.5b"
version: "1.0"
type: "offline"

# Warmup configuration: runs before the timed performance test.
# Uses randomly generated token sequences; no real dataset required.
warmup:
  num_samples: 64          # Number of warmup requests to issue
  input_seq_length: 256    # ISL: target input sequence length in tokens
  output_seq_length: 64    # OSL: max_new_tokens for warmup requests
  input_range_ratio: 0.9   # ISL variance: generates ISL in [256*0.9, 256]
  random_seed: 42

model_params:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  temperature: 0.0
  top_p: 1.0
  max_new_tokens: 128

datasets:
  - name: cnn_dailymail::llama3_8b
    type: "performance"
    samples: 18
    parser:
      input: prompt

settings:

> **viraatc (Collaborator), Mar 25, 2026:** Should we rename this to `benchmark-settings` or something a bit more descriptive in a follow-up? `settings` seems a bit confusing / out-of-place here.

  runtime:
    min_duration_ms: 60000   # 1 minute
    max_duration_ms: 360000  # 6 minutes
    scheduler_random_seed: 137
    dataloader_random_seed: 111
    n_samples_to_issue: 4

  load_pattern:
    type: "max_throughput"

  client:
    workers: 4

metrics:
  collect:
    - "throughput"
    - "latency"
    - "ttft"
    - "tpot"

endpoint_config:
  endpoints:
    - "http://localhost:8000"
  api_key: null

report_dir: logs/warmup_offline_fixed
63 changes: 63 additions & 0 deletions examples/09_Warmup_Example/warmup_online.yaml
@@ -0,0 +1,63 @@
# Online (Sustained QPS) Benchmark with Warmup Phase
#
# The warmup phase issues randomly generated requests before the timed
# performance window starts. This primes the endpoint by:
# - Establishing and reusing TCP connections
# - Filling KV caches to steady-state
# - Triggering JIT compilation in the inference runtime
#
# Warmup samples are excluded from all reported metrics.
name: "warmup-online-qwen2.5-0.5b"
version: "1.0"
type: "online"

# Warmup configuration: runs before the timed performance test.
# Uses randomly generated token sequences; no real dataset required.
warmup:
  num_samples: 32          # Number of warmup requests to issue
  input_seq_length: 128    # ISL: target input sequence length in tokens
  output_seq_length: 32    # OSL: max_new_tokens for warmup requests
  input_range_ratio: 0.8   # ISL variance: generates ISL in [128*0.8, 128]
  random_seed: 42

model_params:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  temperature: 0.0
  top_p: 1.0
  max_new_tokens: 128
  streaming: "on"

datasets:
  - name: cnn_dailymail::llama3_8b
    type: "performance"
    samples: 13368
    parser:
      input: prompt

settings:
  runtime:
    min_duration_ms: 60000   # 1 minute
    max_duration_ms: 360000  # 6 minutes
    scheduler_random_seed: 137
    dataloader_random_seed: 111

  load_pattern:
    type: "poisson"
    target_qps: 10.0

  client:
    workers: 4

metrics:
  collect:
    - "throughput"
    - "latency"
    - "ttft"
    - "tpot"

endpoint_config:
  endpoints:
    - "http://localhost:8000"
  api_key: null

report_dir: logs/warmup_online
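The `poisson` load pattern above targets 10 QPS. As background on the general technique (a sketch, not necessarily this tool's scheduler), a Poisson arrival process spaces requests with exponentially distributed gaps of mean `1 / target_qps`:

```python
import random

def poisson_gaps(target_qps: float, n: int, seed: int = 137) -> list[float]:
    """Inter-arrival delays (seconds) for a seeded Poisson arrival process."""
    rng = random.Random(seed)
    return [rng.expovariate(target_qps) for _ in range(n)]

# With target_qps=10, the mean gap converges toward 0.1 s.
gaps = poisson_gaps(10.0, 1000)
```

Seeding the RNG (cf. `scheduler_random_seed`) makes the request timeline reproducible across runs.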
42 changes: 42 additions & 0 deletions src/inference_endpoint/commands/benchmark.py
@@ -63,6 +63,7 @@
from inference_endpoint.core.types import QueryResult
from inference_endpoint.dataset_manager.dataset import Dataset
from inference_endpoint.dataset_manager.factory import DataLoaderFactory
from inference_endpoint.dataset_manager.predefined.random import RandomDataset
from inference_endpoint.endpoint_client.config import HTTPClientConfig
from inference_endpoint.endpoint_client.cpu_affinity import (
    pin_loadgen,
@@ -458,6 +459,43 @@ def _run_benchmark(
        logger.info("Streaming: disabled (auto, offline mode)")
        config.model_params.streaming = StreamingMode.OFF

    # Build warmup dataset if configured
    warmup_dataset = None
    if config.warmup is not None:
        if tokenizer is None:
            raise InputValidationError(
                "A tokenizer is required to generate the warmup dataset. Ensure model_params.name is set."
            )

        warmup_cfg = config.warmup
        try:
            warmup_df = RandomDataset.generate(
                datasets_dir=None,
                force=False,
                num_sequences=warmup_cfg.num_samples,
                input_seq_length=warmup_cfg.input_seq_length,
                input_range_ratio=warmup_cfg.input_range_ratio,
                random_seed=warmup_cfg.random_seed,
                tokenizer=tokenizer,
            )
        except (ValueError, TypeError) as e:
            raise InputValidationError(
                f"Failed to generate warmup dataset from warmup config: {e}"
            ) from e

> **Collaborator:** [Codex] medium: Warmup `ModelParams` is reconstructed from scratch with only `name` and `max_new_tokens`, dropping workload-defining flags like streaming, sampling params, chat/template options, and API-specific fields. This means warmup doesn't exercise the same code path as the real test; e.g., online tests use streaming but warmup doesn't. Endpoints that require the full request shape may reject warmup requests entirely. Consider copying `config.model_params` and only overriding `max_new_tokens` for warmup.

        warmup_dataset = RandomDataset(warmup_df)
        # Copy the configured model params, overriding only the warmup output length
        warmup_model_params = config.model_params.model_copy(
            update={"max_new_tokens": warmup_cfg.output_seq_length}
        )
        warmup_dataset.load(
            api_type=config.endpoint_config.api_type,
            model_params=warmup_model_params,
        )
        logger.info(
            f"Warmup dataset ready: {warmup_dataset.num_samples()} samples "
            f"(ISL={warmup_cfg.input_seq_length}, OSL={warmup_cfg.output_seq_length})"
        )

    # Get dataset - from CLI or from config
    # TODO: Dataset Logic is not yet fully implemented

@@ -543,6 +581,9 @@ def _run_benchmark(
    total_samples += sum(
        [dataset.num_samples() * dataset.repeats for dataset in accuracy_datasets]
    )
    if warmup_dataset is not None:
        total_samples += warmup_dataset.num_samples()

    duration_s = rt_settings.min_duration_ms / 1000

    logger.info(
@@ -626,6 +667,7 @@ def _run_benchmark(
        dataloader,
        sample_issuer,
        scheduler,
        warmup_dataset=warmup_dataset,
        name=f"cli_benchmark_{uuid.uuid4().hex[0:8]}",
        report_dir=report_dir,
        tokenizer_override=tokenizer,
24 changes: 24 additions & 0 deletions src/inference_endpoint/config/schema.py
@@ -234,6 +234,29 @@ class AccuracyConfig(BaseModel):
    num_repeats: int = 1


class WarmupConfig(BaseModel):
    """Configuration for the warmup phase using randomly generated data.

    The warmup phase runs before the timed performance test to prime the
    endpoint (warm TCP connections, fill KV caches, trigger JIT compilation).
    Uses randomly generated token sequences with configurable ISL and OSL.

    Fields:
        num_samples: Number of warmup queries to issue.
        input_seq_length: Target input sequence length in tokens (ISL).
        output_seq_length: Max output tokens for warmup requests (OSL).
        input_range_ratio: ISL variance factor in [0.0, 1.0]. Generates ISL in
            the range [input_seq_length * input_range_ratio, input_seq_length].
        random_seed: Seed for reproducible warmup data generation.
    """

    num_samples: int = Field(100, gt=0)
    input_seq_length: int = Field(512, gt=0)
    output_seq_length: int = Field(128, gt=0)
    input_range_ratio: float = Field(1.0, ge=0.0, le=1.0)
    random_seed: int = 42

> **Collaborator:** Let's add cyclopts descriptions as well, since this will be showing up in the CLI.
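The field constraints above can be mimicked without pydantic. A stdlib-only sketch (the `WarmupConfigSketch` class is hypothetical, shown only to illustrate the `gt=0` and `[0.0, 1.0]` bounds):

```python
from dataclasses import dataclass

@dataclass
class WarmupConfigSketch:
    num_samples: int = 100
    input_seq_length: int = 512
    output_seq_length: int = 128
    input_range_ratio: float = 1.0
    random_seed: int = 42

    def __post_init__(self) -> None:
        # Mirrors Field(..., gt=0) on counts and lengths.
        if min(self.num_samples, self.input_seq_length, self.output_seq_length) <= 0:
            raise ValueError("sample count and sequence lengths must be > 0")
        # Mirrors Field(1.0, ge=0.0, le=1.0).
        if not 0.0 <= self.input_range_ratio <= 1.0:
            raise ValueError("input_range_ratio must be in [0.0, 1.0]")
```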


class RuntimeConfig(BaseModel):
    """Runtime configuration from YAML (user-facing).

@@ -408,6 +431,7 @@ class BenchmarkConfig(BaseModel):
    # - True = auto (compute optimal NUMA-aware plan)
    # - False = disabled (no CPU pinning)
    enable_cpu_affinity: bool = True
    warmup: WarmupConfig | None = None

    @classmethod
    def from_yaml_file(cls, path: Path) -> BenchmarkConfig:
@@ -37,7 +37,7 @@ def generate(
        *,
        num_sequences: int = 1024,
        input_seq_length: int = 1024,
-       range_ratio: float = 1.0,
+       input_range_ratio: float = 1.0,
        random_seed: int = 42,
        save_tokenized_data: bool = False,
        tokenizer: str | PreTrainedTokenizer,
@@ -49,7 +49,7 @@
        data = []
        # Generate the input sequence lengths given the range ratio
        input_seq_lengths = rng.integers(
-           int(input_seq_length * range_ratio),
+           int(input_seq_length * input_range_ratio),
            input_seq_length + 1,
            num_sequences,
        )
2 changes: 2 additions & 0 deletions src/inference_endpoint/load_generator/__init__.py
@@ -29,6 +29,7 @@
    PoissonDistributionScheduler,
    SampleOrder,
    Scheduler,
    SequentialSampleOrder,
    WithoutReplacementSampleOrder,
    WithReplacementSampleOrder,
)
@@ -46,6 +47,7 @@
    "MaxThroughputScheduler",
    "PoissonDistributionScheduler",
    "SampleOrder",
    "SequentialSampleOrder",
    "WithReplacementSampleOrder",
    "WithoutReplacementSampleOrder",
    "LoadGenerator",
18 changes: 18 additions & 0 deletions src/inference_endpoint/load_generator/scheduler.py
@@ -168,6 +168,24 @@ def next_sample_index(self) -> int:
        return self.rng.randint(0, self.n_samples_in_dataset - 1)


class SequentialSampleOrder(SampleOrder):
    """Sample ordering without randomness.

    Issues dataset rows in their natural order and wraps around if more
    samples are requested than the dataset contains.
    """

    def next_sample_index(self) -> int:
        """Get next sample index in dataset order.

        Returns:
            Sample index from dataset.
        """
        if self.n_samples_in_dataset <= 0:
            raise IndexError("Cannot issue samples from an empty dataset")
        return self._issued_samples % self.n_samples_in_dataset
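The wrap-around behavior reduces to modulo indexing. A minimal standalone sketch (the `sequential_indices` helper is hypothetical, shown only to illustrate the ordering):

```python
def sequential_indices(n_samples_in_dataset: int, n_to_issue: int) -> list[int]:
    """Dataset row indices issued in order, cycling back to row 0 when the
    dataset is exhausted, matching SequentialSampleOrder's modulo logic."""
    if n_samples_in_dataset <= 0:
        raise IndexError("Cannot issue samples from an empty dataset")
    return [i % n_samples_in_dataset for i in range(n_to_issue)]

# e.g. a 3-row dataset asked for 5 samples yields [0, 1, 2, 0, 1]
```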


def uniform_delay_fn(
    max_delay_ns: int = 0, rng: random.Random | None = None
) -> Callable[[], float]: