From 4565bb830fe73ee182a6747c642280344698f308 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Thu, 12 Mar 2026 18:38:41 -0700
Subject: [PATCH 01/10] Initial warmup implementation

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 examples/09_Warmup_Example/README.md          | 110 ++++++++++++
 .../09_Warmup_Example/warmup_offline.yaml     |  62 +++++++
 examples/09_Warmup_Example/warmup_online.yaml |  63 +++++++
 src/inference_endpoint/commands/benchmark.py  |  34 ++++
 src/inference_endpoint/config/schema.py       |  24 +++
 .../load_generator/__init__.py                |   2 +
 .../load_generator/scheduler.py               |  18 ++
 .../load_generator/session.py                 |  52 +++++-
 src/inference_endpoint/metrics/reporter.py    |  62 +++++--
 tests/integration/commands/test_warmup.py     | 161 ++++++++++++++++++
 tests/unit/load_generator/test_session.py     |  52 ++++++
 11 files changed, 624 insertions(+), 16 deletions(-)
 create mode 100644 examples/09_Warmup_Example/README.md
 create mode 100644 examples/09_Warmup_Example/warmup_offline.yaml
 create mode 100644 examples/09_Warmup_Example/warmup_online.yaml
 create mode 100644 tests/integration/commands/test_warmup.py
diff --git a/examples/09_Warmup_Example/README.md b/examples/09_Warmup_Example/README.md
new file mode 100644
index 00000000..f0e382bb
--- /dev/null
+++ b/examples/09_Warmup_Example/README.md
@@ -0,0 +1,110 @@
+# Warmup Example
+
+This example demonstrates the **warmup phase** feature using
+[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct),
+a small 0.5B parameter model that is easy to run locally.
+
+The warmup phase issues randomly generated requests to the endpoint before the timed
+performance window begins.
+
+## What warmup does
+
+Before the benchmark clock starts, the warmup phase sends a configurable number of
+requests using randomly generated token sequences. This primes the endpoint by:
+
+- Establishing and reusing TCP connections
+- Filling KV caches to steady-state occupancy
+- Triggering JIT compilation / CUDA graph capture in the inference runtime
+
+Warmup samples are **excluded from all reported metrics** — they complete before
+`TEST_STARTED` is recorded, so they do not affect throughput, latency, TTFT, or TPOT.
+
+## Warmup configuration
+
+Add a `warmup` block to any YAML config:
+
+```yaml
+warmup:
+  num_samples: 64 # number of warmup requests to issue
+  input_seq_length: 256 # ISL: target input token count
+  output_seq_length: 64 # OSL: max_new_tokens for warmup requests
+  range_ratio: 0.9 # ISL variance: generates ISL in [256*0.9, 256]
+  random_seed: 42
+```
+
+No real dataset is needed for warmup — sequences are generated at runtime from random
+token IDs using the model's own tokenizer.
+
+## Quick test with the echo server
+
+The built-in echo server lets you verify the warmup flow locally without a GPU.
+
+```bash
+# Terminal 1 — start the echo server
+python -m inference_endpoint.testing.echo_server --port 8000
+
+# Terminal 2 — run offline benchmark with warmup
+inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
+```
+
+The log output will show the warmup phase completing before the performance run starts:
+
+```
+INFO  Warmup dataset ready: 64 samples (ISL=256, OSL=64)
+INFO  Warmup: issuing samples...
+INFO  Warmup samples issued, waiting for responses to drain...
+INFO  Warmup complete
+INFO  Running...
+```
+
+## Running against a real endpoint
+
+### Prerequisites
+
+```bash
+export HF_TOKEN=<your Hugging Face token>
+export HF_HOME=<path to your HuggingFace cache, e.g. ~/.cache/huggingface>
+```
+
+Download the model before launching so vLLM can reuse the local cache:
+
+```bash
+huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
+```
+
+### Launch a vLLM server
+
+The `--trust-request-chat-template` flag is required because the CNN DailyMail dataset
+sends requests with a custom chat template.
+
+```bash
+docker run --runtime nvidia --gpus all \
+  -v ${HF_HOME}:/root/.cache/huggingface \
+  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+  -p 8000:8000 --ipc=host \
+  vllm/vllm-openai:latest \
+  --model Qwen/Qwen2.5-0.5B-Instruct \
+  --trust-request-chat-template
+```
+
+### Offline benchmark with warmup
+
+```bash
+inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
+```
+
+### Online benchmark with warmup
+
+```bash
+inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_online.yaml
+```
+
+## Tuning warmup parameters
+
+| Parameter           | Guidance                                                                |
+| ------------------- | ----------------------------------------------------------------------- |
+| `num_samples`       | Use enough to saturate the KV cache; 32–128 is typical for small models |
+| `input_seq_length`  | Match the ISL distribution of your real workload                        |
+| `output_seq_length` | Match the OSL distribution; lower values make warmup finish faster      |
+| `range_ratio`       | `1.0` = fixed ISL; `0.8`–`0.9` adds light variance for broader coverage |
+| `random_seed`       | Change to vary which token sequences are generated                      |
diff --git a/examples/09_Warmup_Example/warmup_offline.yaml b/examples/09_Warmup_Example/warmup_offline.yaml
new file mode 100644
index 00000000..13d92de7
--- /dev/null
+++ b/examples/09_Warmup_Example/warmup_offline.yaml
@@ -0,0 +1,62 @@
+# Offline Throughput Benchmark with Warmup Phase
+#
+# The warmup phase issues randomly generated requests before the timed
+# performance window starts. This primes the endpoint by:
+#   - Establishing and reusing TCP connections
+#   - Filling KV caches to steady-state
+#   - Triggering JIT compilation in the inference runtime
+#
+# Warmup samples are excluded from all reported metrics.
+name: "warmup-offline-qwen2.5-0.5b"
+version: "1.0"
+type: "offline"
+
+# Warmup configuration: runs before the timed performance test.
+# Uses randomly generated token sequences; no real dataset required.
+warmup:
+  num_samples: 64 # Number of warmup requests to issue
+  input_seq_length: 256 # ISL: target input sequence length in tokens
+  output_seq_length: 64 # OSL: max_new_tokens for warmup requests
+  range_ratio: 0.9 # ISL variance: generates ISL in [256*0.9, 256]
+  random_seed: 42
+
+model_params:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  temperature: 0.0
+  top_p: 1.0
+  max_new_tokens: 128
+
+datasets:
+  - name: cnn_dailymail::llama3_8b
+    type: "performance"
+    samples: 18
+    parser:
+      input: prompt
+
+settings:
+  runtime:
+    min_duration_ms: 60000 # 1 minute
+    max_duration_ms: 360000 # 6 minutes
+    scheduler_random_seed: 137
+    dataloader_random_seed: 111
+    n_samples_to_issue: 4
+
+  load_pattern:
+    type: "max_throughput"
+
+  client:
+    workers: 4
+
+metrics:
+  collect:
+    - "throughput"
+    - "latency"
+    - "ttft"
+    - "tpot"
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_key: null
+
+report_dir: logs/warmup_offline_fixed
diff --git a/examples/09_Warmup_Example/warmup_online.yaml b/examples/09_Warmup_Example/warmup_online.yaml
new file mode 100644
index 00000000..f267adbe
--- /dev/null
+++ b/examples/09_Warmup_Example/warmup_online.yaml
@@ -0,0 +1,63 @@
+# Online (Sustained QPS) Benchmark with Warmup Phase
+#
+# The warmup phase issues randomly generated requests before the timed
+# performance window starts. This primes the endpoint by:
+#   - Establishing and reusing TCP connections
+#   - Filling KV caches to steady-state
+#   - Triggering JIT compilation in the inference runtime
+#
+# Warmup samples are excluded from all reported metrics.
+name: "warmup-online-qwen2.5-0.5b"
+version: "1.0"
+type: "online"
+
+# Warmup configuration: runs before the timed performance test.
+# Uses randomly generated token sequences; no real dataset required.
+warmup:
+  num_samples: 32 # Number of warmup requests to issue
+  input_seq_length: 128 # ISL: target input sequence length in tokens
+  output_seq_length: 32 # OSL: max_new_tokens for warmup requests
+  range_ratio: 0.8 # ISL variance: generates ISL in [128*0.8, 128]
+  random_seed: 42
+
+model_params:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  temperature: 0.0
+  top_p: 1.0
+  max_new_tokens: 128
+  streaming: "on"
+
+datasets:
+  - name: cnn_dailymail::llama3_8b
+    type: "performance"
+    samples: 13368
+    parser:
+      input: prompt
+
+settings:
+  runtime:
+    min_duration_ms: 60000 # 1 minute
+    max_duration_ms: 360000 # 6 minutes
+    scheduler_random_seed: 137
+    dataloader_random_seed: 111
+
+  load_pattern:
+    type: "poisson"
+    target_qps: 10.0
+
+  client:
+    workers: 4
+
+metrics:
+  collect:
+    - "throughput"
+    - "latency"
+    - "ttft"
+    - "tpot"
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_key: null
+
+report_dir: logs/warmup_online
diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
index 69b8a20b..034eb406 100644
--- a/src/inference_endpoint/commands/benchmark.py
+++ b/src/inference_endpoint/commands/benchmark.py
@@ -451,6 +451,39 @@ def _run_benchmark(
             logger.info("Streaming: disabled (auto, offline mode)")
             config.model_params.streaming = StreamingMode.OFF
 
+    # Build warmup dataset if configured
+    warmup_dataset = None
+    if config.warmup is not None:
+        if tokenizer is None:
+            raise InputValidationError(
+                "A tokenizer is required to generate the warmup dataset. Ensure model_params.name is set."
+            )
+        from inference_endpoint.dataset_manager.predefined.random import RandomDataset
+
+        warmup_cfg = config.warmup
+        warmup_df = RandomDataset.generate(
+            datasets_dir=None,
+            force=False,
+            num_sequences=warmup_cfg.num_samples,
+            input_seq_length=warmup_cfg.input_seq_length,
+            range_ratio=warmup_cfg.range_ratio,
+            random_seed=warmup_cfg.random_seed,
+            tokenizer=tokenizer,
+        )
+        warmup_dataset = RandomDataset(warmup_df)
+        warmup_model_params = ModelParams(
+            name=config.model_params.name,
+            max_new_tokens=warmup_cfg.output_seq_length,
+        )
+        warmup_dataset.load(
+            api_type=config.endpoint_config.api_type,
+            model_params=warmup_model_params,
+        )
+        logger.info(
+            f"Warmup dataset ready: {warmup_dataset.num_samples()} samples "
+            f"(ISL={warmup_cfg.input_seq_length}, OSL={warmup_cfg.output_seq_length})"
+        )
+
     # Get dataset - from CLI or from config
     # TODO: Dataset Logic is not yet fully implemented
 
@@ -609,6 +642,7 @@ def _run_benchmark(
                 dataloader,
                 sample_issuer,
                 scheduler,
+                warmup_dataset=warmup_dataset,
                 name=f"cli_benchmark_{uuid.uuid4().hex[0:8]}",
                 report_dir=report_dir,
                 tokenizer_override=tokenizer,
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 96ca84c0..c300eaeb 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -234,6 +234,29 @@ class AccuracyConfig(BaseModel):
     num_repeats: int = 1
 
 
+class WarmupConfig(BaseModel):
+    """Configuration for the warmup phase using randomly generated data.
+
+    The warmup phase runs before the timed performance test to prime the
+    endpoint (warm TCP connections, fill KV caches, trigger JIT compilation).
+    Uses randomly generated token sequences with configurable ISL and OSL.
+
+    Fields:
+        num_samples: Number of warmup queries to issue.
+        input_seq_length: Target input sequence length in tokens (ISL).
+        output_seq_length: Max output tokens for warmup requests (OSL).
+        range_ratio: ISL variance factor in [0.0, 1.0]. Generates ISL in
+            the range [input_seq_length * range_ratio, input_seq_length].
+        random_seed: Seed for reproducible warmup data generation.
+    """
+
+    num_samples: int = 100
+    input_seq_length: int = 512
+    output_seq_length: int = 128
+    range_ratio: float = 1.0
+    random_seed: int = 42
+
+
 class RuntimeConfig(BaseModel):
     """Runtime configuration from YAML (user-facing).
 
@@ -392,6 +415,7 @@ class BenchmarkConfig(BaseModel):
     #   - True = auto (compute optimal NUMA-aware plan)
     #   - False = disabled (no CPU pinning)
     enable_cpu_affinity: bool = True
+    warmup: WarmupConfig | None = None
 
     @classmethod
     def from_yaml_file(cls, path: Path) -> BenchmarkConfig:
diff --git a/src/inference_endpoint/load_generator/__init__.py b/src/inference_endpoint/load_generator/__init__.py
index 94b6309f..65f5ebd5 100644
--- a/src/inference_endpoint/load_generator/__init__.py
+++ b/src/inference_endpoint/load_generator/__init__.py
@@ -29,6 +29,7 @@
     PoissonDistributionScheduler,
     SampleOrder,
     Scheduler,
+    SequentialSampleOrder,
     WithoutReplacementSampleOrder,
     WithReplacementSampleOrder,
 )
@@ -46,6 +47,7 @@
     "MaxThroughputScheduler",
     "PoissonDistributionScheduler",
     "SampleOrder",
+    "SequentialSampleOrder",
     "WithReplacementSampleOrder",
     "WithoutReplacementSampleOrder",
     "LoadGenerator",
diff --git a/src/inference_endpoint/load_generator/scheduler.py b/src/inference_endpoint/load_generator/scheduler.py
index ae691d09..1dc1709e 100644
--- a/src/inference_endpoint/load_generator/scheduler.py
+++ b/src/inference_endpoint/load_generator/scheduler.py
@@ -168,6 +168,24 @@ def next_sample_index(self) -> int:
         return self.rng.randint(0, self.n_samples_in_dataset - 1)
 
 
+class SequentialSampleOrder(SampleOrder):
+    """Sample ordering without randomness.
+
+    Issues dataset rows in their natural order and wraps around if more samples are
+    requested than the dataset contains.
+    """
+
+    def next_sample_index(self) -> int:
+        """Get next sample index in dataset order.
+
+        Returns:
+            Sample index from dataset.
+        """
+        if self.n_samples_in_dataset <= 0:
+            raise IndexError("Cannot issue samples from an empty dataset")
+        return self._issued_samples % self.n_samples_in_dataset
+
+
 def uniform_delay_fn(
     max_delay_ns: int = 0, rng: random.Random | None = None
 ) -> Callable[[], float]:
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index da5faf84..97694ce1 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -26,13 +26,14 @@
 from transformers import AutoTokenizer
 
 from ..config.runtime_settings import RuntimeSettings
+from ..config.schema import LoadPattern, LoadPatternType
 from ..dataset_manager.dataset import Dataset
 from ..metrics.recorder import EventRecorder
 from ..metrics.reporter import MetricsReporter
 from ..utils.version import get_version_info
 from .events import SessionEvent
 from .load_generator import LoadGenerator, SampleIssuer, SchedulerBasedLoadGenerator
-from .scheduler import Scheduler, WithoutReplacementSampleOrder
+from .scheduler import Scheduler, SequentialSampleOrder, WithoutReplacementSampleOrder
 
 logger = logging.getLogger(__name__)
 
@@ -79,13 +80,34 @@ def _run_test(
         self,
         perf_test_generator: LoadGenerator,
         accuracy_test_generators: dict[str, LoadGenerator] | None = None,
-        max_shutdown_timeout_s: float | None = 300.0,
+        warmup_generator: LoadGenerator | None = None,
+        max_shutdown_timeout_s: float = 300.0,
         report_dir: os.PathLike | None = None,
         tokenizer_override: AutoTokenizer | None = None,
         dump_events_log: bool = False,
     ):
         with self.event_recorder:
             try:
+                # Warmup phase: issue samples before the timed performance window.
+                # Warmup events land in the DB with timestamps before TEST_STARTED,
+                # so they are excluded from all perf metrics.
+                if warmup_generator is not None:
+                    self.logger.info("Warmup: issuing samples...")
+                    for _ in warmup_generator:
+                        pass
+                    self.logger.info(
+                        "Warmup samples issued, waiting for responses to drain..."
+                    )
+                    warmup_start = time.monotonic()
+                    while self.event_recorder.n_inflight_samples != 0:
+                        if time.monotonic() - warmup_start > max_shutdown_timeout_s:
+                            self.logger.warning(
+                                "Warmup drain timeout exceeded, proceeding to performance test"
+                            )
+                            break
+                        time.sleep(0.1)
+                    self.logger.info("Warmup complete")
+
                 EventRecorder.record_event(
                     SessionEvent.TEST_STARTED,
                     time.monotonic_ns(),
@@ -258,6 +280,7 @@ def start(
         sample_issuer: SampleIssuer,
         scheduler: Scheduler,
         *args,
+        warmup_dataset: Dataset | None = None,
         accuracy_datasets: list[Dataset] | None = None,
         load_generator_cls: type[LoadGenerator] = SchedulerBasedLoadGenerator,
         name: str | None = None,
@@ -290,6 +313,30 @@ def start(
         session = cls(runtime_settings, session_id=name)
         load_generator = load_generator_cls(sample_issuer, dataset, scheduler, *args)  # type: ignore[arg-type]
 
+        # Create warmup generator (max-throughput, issues all samples at t=0)
+        warmup_generator = None
+        if warmup_dataset is not None:
+            warmup_n = warmup_dataset.num_samples()
+            warmup_rt = RuntimeSettings(
+                metric_target=runtime_settings.metric_target,
+                reported_metrics=runtime_settings.reported_metrics,
+                min_duration_ms=0,
+                max_duration_ms=runtime_settings.max_duration_ms,
+                n_samples_from_dataset=warmup_n,
+                n_samples_to_issue=warmup_n,
+                min_sample_count=warmup_n,
+                rng_sched=runtime_settings.rng_sched,
+                rng_sample_index=runtime_settings.rng_sample_index,
+                load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
+            )
+            warmup_sched_cls = Scheduler.get_implementation(
+                LoadPatternType.MAX_THROUGHPUT
+            )
+            warmup_sched = warmup_sched_cls(warmup_rt, SequentialSampleOrder)
+            warmup_generator = load_generator_cls(
+                sample_issuer, warmup_dataset, warmup_sched, *args
+            )  # type: ignore[arg-type]
+
         # Create accuracy test generators
         accuracy_test_generators = None
         if accuracy_datasets:
@@ -328,6 +375,7 @@ def start(
             target=session._run_test,
             args=(load_generator,),
             kwargs={
+                "warmup_generator": warmup_generator,
                 "accuracy_test_generators": accuracy_test_generators,
                 "max_shutdown_timeout_s": max_shutdown_timeout_s,
                 "report_dir": report_dir,
diff --git a/src/inference_endpoint/metrics/reporter.py b/src/inference_endpoint/metrics/reporter.py
index 6ddd2d54..13fe053e 100644
--- a/src/inference_endpoint/metrics/reporter.py
+++ b/src/inference_endpoint/metrics/reporter.py
@@ -675,6 +675,22 @@ def init_connection(self):
             raise ValueError(f"Invalid client type: {self.client_type}")
         self.is_closed = False
 
+    @functools.cached_property
+    def test_started_timestamp_ns(self) -> int:
+        """Returns the timestamp_ns of the TEST_STARTED event.
+
+        Returns 0 if the event is not found, which acts as an open lower bound
+        (all samples issued after epoch 0 are included). This keeps the queries
+        correct for event logs that pre-date warmup support.
+        """
+        result = self.cur_.execute(f"""
+        SELECT timestamp_ns FROM events
+        WHERE event_type = '{SessionEvent.TEST_STARTED.value}'
+        ORDER BY timestamp_ns ASC
+        LIMIT 1
+        """).fetchone()
+        return int(result[0]) if result else 0
+
     @functools.cached_property
     def stop_performance_tracking_timestamp_ns(self) -> float:
         """Returns the timestamp_ns of the STOP_PERFORMANCE_TRACKING event.
@@ -708,16 +724,19 @@ def derive_metric(self, query: str, metric_type: str) -> RollupQueryTable:
 
     def derive_TTFT(self) -> RollupQueryTable:
         stop_ts = self.stop_performance_tracking_timestamp_ns
+        start_ts = self.test_started_timestamp_ns
 
         # Build the HAVING clause conditionally to handle infinity
         if stop_ts != float("inf"):
             before_stop_ts_clause = f"""
             HAVING COUNT(DISTINCT event_type) = 2
+                AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) >= {start_ts}
                 AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) < {stop_ts}
             """
         else:
-            before_stop_ts_clause = """
+            before_stop_ts_clause = f"""
             HAVING COUNT(DISTINCT event_type) = 2
+                AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) >= {start_ts}
             """
 
         return self.derive_metric(
@@ -870,16 +889,19 @@ def derive_sample_latency(self) -> RollupQueryTable:
             RollupQueryTable: A table containing per-sample latencies in nanoseconds.
         """
         stop_ts = self.stop_performance_tracking_timestamp_ns
+        start_ts = self.test_started_timestamp_ns
 
         # HAVING clause is different if there is a STOP_PERFORMANCE_TRACKING event
         if stop_ts != float("inf"):
             before_stop_ts_clause = f"""
             HAVING COUNT(DISTINCT event_type) = 2
+                AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) >= {start_ts}
                 AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) < {stop_ts}
             """
         else:
-            before_stop_ts_clause = """
+            before_stop_ts_clause = f"""
             HAVING COUNT(DISTINCT event_type) = 2
+                AND MAX(CASE WHEN event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}' THEN timestamp_ns END) >= {start_ts}
             """
 
         return self.derive_metric(
@@ -904,17 +926,26 @@ def get_sample_statuses(self) -> dict[int, str]:
         - "in_flight" (int): The number of samples in flight
         """
         stop_ts = self.stop_performance_tracking_timestamp_ns
+        start_ts = self.test_started_timestamp_ns
 
-        # Build WHERE clause to filter samples issued before stop_ts
-        where_clause = ""
+        # Build WHERE clause to filter samples in the performance window
         if stop_ts != float("inf"):
             where_clause = f"""
             WHERE sample_uuid IN (
                 SELECT sample_uuid FROM events
                 WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+                AND timestamp_ns >= {start_ts}
                 AND timestamp_ns < {stop_ts}
             )
             """
+        else:
+            where_clause = f"""
+            WHERE sample_uuid IN (
+                SELECT sample_uuid FROM events
+                WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+                AND timestamp_ns >= {start_ts}
+            )
+            """
 
         statuses = self.cur_.execute(f"""
         SELECT
@@ -951,16 +982,26 @@ def get_sample_outputs(
             Returns an empty list if no COMPLETE events are found.
         """
         stop_ts = self.stop_performance_tracking_timestamp_ns
+        start_ts = self.test_started_timestamp_ns
 
-        # Build WHERE clause to filter samples issued before STOP_PERFORMANCE_TRACKING
+        # Build WHERE clause to filter samples in the performance window
         if performance_only and stop_ts != float("inf"):
             before_stop_ts_clause = f"""
             AND sample_uuid IN (
                 SELECT sample_uuid FROM events
                 WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+                AND timestamp_ns >= {start_ts}
                 AND timestamp_ns < {stop_ts}
             )
             """
+        elif performance_only:
+            before_stop_ts_clause = f"""
+            AND sample_uuid IN (
+                SELECT sample_uuid FROM events
+                WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+                AND timestamp_ns >= {start_ts}
+            )
+            """
         else:
             before_stop_ts_clause = ""
 
@@ -1249,15 +1290,8 @@ def get_test_started_at(self) -> int | None:
         Returns:
             int|None: The timestamp of the TEST_STARTED event in nanoseconds, or None if not found.
         """
-        query = f"""
-        SELECT timestamp_ns FROM events
-        WHERE event_type = '{SessionEvent.TEST_STARTED.value}'
-        ORDER BY timestamp_ns ASC
-        LIMIT 1"""
-        result = self.cur_.execute(query).fetchone()
-        if result and result[0]:
-            return result[0]
-        return None
+        ts = self.test_started_timestamp_ns
+        return ts if ts != 0 else None
 
     def create_report(
         self,
diff --git a/tests/integration/commands/test_warmup.py b/tests/integration/commands/test_warmup.py
new file mode 100644
index 00000000..b94ac8a9
--- /dev/null
+++ b/tests/integration/commands/test_warmup.py
@@ -0,0 +1,161 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Integration tests for warmup phase against echo server."""
+
+import argparse
+import textwrap
+
+import pytest
+from inference_endpoint.commands.benchmark import run_benchmark_command
+
+
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_warmup_offline_with_echo_server(
+    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog
+):
+    """Warmup phase runs and completes before the performance test starts."""
+    config_yaml = textwrap.dedent(f"""
+        name: "warmup-test-offline"
+        version: "1.0"
+        type: "offline"
+
+        warmup:
+          num_samples: 4
+          input_seq_length: 16
+          output_seq_length: 8
+          range_ratio: 0.9
+          random_seed: 42
+
+        model_params:
+          name: "Qwen/Qwen2.5-0.5B-Instruct"
+          temperature: 0.0
+          max_new_tokens: 16
+
+        datasets:
+          - name: custom
+            type: "performance"
+            path: "{ds_pickle_dataset_path}"
+            parser:
+              input: prompt
+
+        settings:
+          runtime:
+            min_duration_ms: 0
+            max_duration_ms: 30000
+            n_samples_to_issue: 10
+          load_pattern:
+            type: "max_throughput"
+          client:
+            workers: 1
+            warmup_connections: 0
+
+        endpoint_config:
+          endpoints:
+            - "{mock_http_echo_server.url}"
+          api_key: null
+
+        report_dir: "{tmp_path}"
+    """).strip()
+
+    config_file = tmp_path / "warmup_test.yaml"
+    config_file.write_text(config_yaml)
+
+    args = argparse.Namespace(
+        benchmark_mode="from-config",
+        config=str(config_file),
+        output=None,
+        mode=None,
+        verbose=1,
+    )
+
+    with caplog.at_level("INFO"):
+        await run_benchmark_command(args)
+
+    log_text = caplog.text
+    assert "Warmup: issuing samples" in log_text, "Warmup did not start"
+    assert "Warmup complete" in log_text, "Warmup did not complete"
+    assert "QPS:" in log_text, "Performance test did not run after warmup"
+
+
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_warmup_online_with_echo_server(
+    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog
+):
+    """Warmup phase runs before the online (Poisson) performance test."""
+    config_yaml = textwrap.dedent(f"""
+        name: "warmup-test-online"
+        version: "1.0"
+        type: "online"
+
+        warmup:
+          num_samples: 4
+          input_seq_length: 16
+          output_seq_length: 8
+          range_ratio: 0.8
+          random_seed: 42
+
+        model_params:
+          name: "Qwen/Qwen2.5-0.5B-Instruct"
+          temperature: 0.0
+          max_new_tokens: 16
+          streaming: "on"
+
+        datasets:
+          - name: custom
+            type: "performance"
+            path: "{ds_pickle_dataset_path}"
+            parser:
+              input: prompt
+
+        settings:
+          runtime:
+            min_duration_ms: 0
+            max_duration_ms: 5000
+          load_pattern:
+            type: "poisson"
+            target_qps: 50.0
+          client:
+            workers: 1
+            warmup_connections: 0
+
+        endpoint_config:
+          endpoints:
+            - "{mock_http_echo_server.url}"
+          api_key: null
+
+        report_dir: "{tmp_path}"
+    """).strip()
+
+    config_file = tmp_path / "warmup_online_test.yaml"
+    config_file.write_text(config_yaml)
+
+    args = argparse.Namespace(
+        benchmark_mode="from-config",
+        config=str(config_file),
+        output=None,
+        mode=None,
+        verbose=1,
+    )
+
+    with caplog.at_level("INFO"):
+        await run_benchmark_command(args)
+
+    log_text = caplog.text
+    assert "Warmup: issuing samples" in log_text, "Warmup did not start"
+    assert "Warmup complete" in log_text, "Warmup did not complete"
+    assert "QPS:" in log_text, "Performance test did not run after warmup"
diff --git a/tests/unit/load_generator/test_session.py b/tests/unit/load_generator/test_session.py
index afcd9a4b..106485b7 100644
--- a/tests/unit/load_generator/test_session.py
+++ b/tests/unit/load_generator/test_session.py
@@ -157,3 +157,55 @@ def set_pbar(self, pbar: tqdm):
         assert stats["total_sent"] == 10_000
         assert stats["completed"] == 10_000
         assert stats["in_flight"] == 0
+
+
+@pytest.mark.unit
+def test_warmup_uses_sequential_sample_order():
+    rt_settings = RuntimeSettings(
+        metrics.Throughput(10),
+        [metrics.Throughput(10)],
+        min_duration_ms=1000,
+        max_duration_ms=10_000,
+        n_samples_from_dataset=8,
+        n_samples_to_issue=8,
+        min_sample_count=1,
+        rng_sched=random.Random(1234),
+        rng_sample_index=random.Random(5678),
+        load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
+    )
+
+    dl = DummyDataLoader(n_samples=8)
+    sample_issuer = PooledSampleIssuer(lambda n: [str(n)])
+    sched = MaxThroughputScheduler(rt_settings, WithoutReplacementSampleOrder)
+    warmup_dataset = DummyDataLoader(n_samples=4)
+
+    created_schedulers = []
+
+    class RecordingLoadGenerator:
+        def __init__(self, sample_issuer, dataloader, scheduler, *args):
+            self.sample_issuer = sample_issuer
+            self.dataloader = dataloader
+            self.scheduler = scheduler
+            self.args = args
+            self.uuid_to_index_map = {}
+            self.name = None
+            created_schedulers.append(scheduler)
+
+    with patch(
+        "inference_endpoint.load_generator.session.Scheduler.get_implementation",
+        return_value=MaxThroughputScheduler,
+    ):
+        with patch.object(BenchmarkSession, "_run_test", return_value=None):
+            BenchmarkSession.start(
+                rt_settings,
+                dl,
+                sample_issuer,
+                sched,
+                warmup_dataset=warmup_dataset,
+                load_generator_cls=RecordingLoadGenerator,
+                max_shutdown_timeout_s=0.01,
+            )
+
+    assert len(created_schedulers) >= 2
+    warmup_scheduler = created_schedulers[1]
+    assert [next(warmup_scheduler.sample_order) for _ in range(4)] == [0, 1, 2, 3]

From df1e8db32c7b85ff79e7316ed83f180f5474240b Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 16 Mar 2026 19:23:18 -0500
Subject: [PATCH 02/10] Fix test.

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 src/inference_endpoint/commands/benchmark.py  | 23 ++++++++-----
 src/inference_endpoint/config/schema.py       |  8 ++---
 .../load_generator/session.py                 |  9 +++--
 tests/integration/commands/test_warmup.py     | 34 +++++++++++++++----
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
index d555bbe0..d54030e7 100644
--- a/src/inference_endpoint/commands/benchmark.py
+++ b/src/inference_endpoint/commands/benchmark.py
@@ -461,15 +461,20 @@ def _run_benchmark(
         from inference_endpoint.dataset_manager.predefined.random import RandomDataset
 
         warmup_cfg = config.warmup
-        warmup_df = RandomDataset.generate(
-            datasets_dir=None,
-            force=False,
-            num_sequences=warmup_cfg.num_samples,
-            input_seq_length=warmup_cfg.input_seq_length,
-            range_ratio=warmup_cfg.range_ratio,
-            random_seed=warmup_cfg.random_seed,
-            tokenizer=tokenizer,
-        )
+        try:
+            warmup_df = RandomDataset.generate(
+                datasets_dir=None,
+                force=False,
+                num_sequences=warmup_cfg.num_samples,
+                input_seq_length=warmup_cfg.input_seq_length,
+                range_ratio=warmup_cfg.range_ratio,
+                random_seed=warmup_cfg.random_seed,
+                tokenizer=tokenizer,
+            )
+        except (ValueError, TypeError) as e:
+            raise InputValidationError(
+                f"Failed to generate warmup dataset from warmup config: {e}"
+            ) from e
         warmup_dataset = RandomDataset(warmup_df)
         warmup_model_params = ModelParams(
             name=config.model_params.name,
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index c300eaeb..8ce85836 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -250,10 +250,10 @@ class WarmupConfig(BaseModel):
         random_seed: Seed for reproducible warmup data generation.
     """
 
-    num_samples: int = 100
-    input_seq_length: int = 512
-    output_seq_length: int = 128
-    range_ratio: float = 1.0
+    num_samples: int = Field(100, gt=0)
+    input_seq_length: int = Field(512, gt=0)
+    output_seq_length: int = Field(128, gt=0)
+    range_ratio: float = Field(1.0, ge=0.0, le=1.0)
     random_seed: int = 42
 
 
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 97694ce1..928fea41 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -81,7 +81,7 @@ def _run_test(
         perf_test_generator: LoadGenerator,
         accuracy_test_generators: dict[str, LoadGenerator] | None = None,
         warmup_generator: LoadGenerator | None = None,
-        max_shutdown_timeout_s: float = 300.0,
+        max_shutdown_timeout_s: float | None = 300.0,
         report_dir: os.PathLike | None = None,
         tokenizer_override: AutoTokenizer | None = None,
         dump_events_log: bool = False,
@@ -100,7 +100,10 @@ def _run_test(
                     )
                     warmup_start = time.monotonic()
                     while self.event_recorder.n_inflight_samples != 0:
-                        if time.monotonic() - warmup_start > max_shutdown_timeout_s:
+                        if (
+                            max_shutdown_timeout_s is not None
+                            and time.monotonic() - warmup_start > max_shutdown_timeout_s
+                        ):
                             self.logger.warning(
                                 "Warmup drain timeout exceeded, proceeding to performance test"
                             )
@@ -321,7 +324,7 @@ def start(
                 metric_target=runtime_settings.metric_target,
                 reported_metrics=runtime_settings.reported_metrics,
                 min_duration_ms=0,
-                max_duration_ms=runtime_settings.max_duration_ms,
+                max_duration_ms=None,
                 n_samples_from_dataset=warmup_n,
                 n_samples_to_issue=warmup_n,
                 min_sample_count=warmup_n,
diff --git a/tests/integration/commands/test_warmup.py b/tests/integration/commands/test_warmup.py
index b94ac8a9..b6d16fc2 100644
--- a/tests/integration/commands/test_warmup.py
+++ b/tests/integration/commands/test_warmup.py
@@ -17,15 +17,25 @@
 
 import argparse
 import textwrap
+from unittest.mock import MagicMock, patch
 
 import pytest
 from inference_endpoint.commands.benchmark import run_benchmark_command
 
 
+@pytest.fixture
+def mock_tokenizer():
+    tok = MagicMock()
+    tok.vocab_size = 1000
+    tok.decode.side_effect = lambda ids, **kwargs: " ".join(str(i) for i in ids)
+    tok.tokenize.side_effect = lambda text, **kwargs: text.split()
+    return tok
+
+
 @pytest.mark.integration
 @pytest.mark.asyncio
 async def test_warmup_offline_with_echo_server(
-    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog
+    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog, mock_tokenizer
 ):
     """Warmup phase runs and completes before the performance test starts."""
     config_yaml = textwrap.dedent(f"""
@@ -82,19 +92,25 @@ async def test_warmup_offline_with_echo_server(
         verbose=1,
     )
 
-    with caplog.at_level("INFO"):
+    with (
+        caplog.at_level("INFO"),
+        patch(
+            "inference_endpoint.commands.benchmark.AutoTokenizer.from_pretrained",
+            return_value=mock_tokenizer,
+        ),
+    ):
         await run_benchmark_command(args)
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
     assert "Warmup complete" in log_text, "Warmup did not complete"
-    assert "QPS:" in log_text, "Performance test did not run after warmup"
+    assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"
 
 
 @pytest.mark.integration
 @pytest.mark.asyncio
 async def test_warmup_online_with_echo_server(
-    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog
+    mock_http_echo_server, ds_pickle_dataset_path, tmp_path, caplog, mock_tokenizer
 ):
     """Warmup phase runs before the online (Poisson) performance test."""
     config_yaml = textwrap.dedent(f"""
@@ -152,10 +168,16 @@ async def test_warmup_online_with_echo_server(
         verbose=1,
     )
 
-    with caplog.at_level("INFO"):
+    with (
+        caplog.at_level("INFO"),
+        patch(
+            "inference_endpoint.commands.benchmark.AutoTokenizer.from_pretrained",
+            return_value=mock_tokenizer,
+        ),
+    ):
         await run_benchmark_command(args)
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
     assert "Warmup complete" in log_text, "Warmup did not complete"
-    assert "QPS:" in log_text, "Performance test did not run after warmup"
+    assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"

From 057991ded1de94baa038c18c1460cfe202787301 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 16 Mar 2026 19:33:31 -0500
Subject: [PATCH 03/10] Fix type hint

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 src/inference_endpoint/load_generator/session.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 928fea41..266b7043 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -337,8 +337,11 @@ def start(
             )
             warmup_sched = warmup_sched_cls(warmup_rt, SequentialSampleOrder)
             warmup_generator = load_generator_cls(
-                sample_issuer, warmup_dataset, warmup_sched, *args
-            )  # type: ignore[arg-type]
+                sample_issuer,
+                warmup_dataset,
+                warmup_sched, # type: ignore[arg-type]
+                *args,  
+            )
 
         # Create accuracy test generators
         accuracy_test_generators = None

From 9a37881b81d1f1d896d99c78032bf2cff670e1ff Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 16 Mar 2026 19:36:05 -0500
Subject: [PATCH 04/10] Lint

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 src/inference_endpoint/commands/benchmark.py  |  5 +-
 .../load_generator/session.py                 | 14 ++++--
 src/inference_endpoint/metrics/reporter.py    | 20 +++++---
 tests/integration/commands/test_warmup.py     |  4 +-
 tests/unit/metrics/test_reporter.py           | 46 +++++++++++++++++++
 5 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
index d54030e7..9d9a9640 100644
--- a/src/inference_endpoint/commands/benchmark.py
+++ b/src/inference_endpoint/commands/benchmark.py
@@ -63,6 +63,7 @@
 from inference_endpoint.core.types import QueryResult
 from inference_endpoint.dataset_manager.dataset import Dataset
 from inference_endpoint.dataset_manager.factory import DataLoaderFactory
+from inference_endpoint.dataset_manager.predefined.random import RandomDataset
 from inference_endpoint.endpoint_client.config import HTTPClientConfig
 from inference_endpoint.endpoint_client.cpu_affinity import pin_loadgen
 from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
@@ -458,7 +459,6 @@ def _run_benchmark(
             raise InputValidationError(
                 "A tokenizer is required to generate the warmup dataset. Ensure model_params.name is set."
             )
-        from inference_endpoint.dataset_manager.predefined.random import RandomDataset
 
         warmup_cfg = config.warmup
         try:
@@ -574,6 +574,9 @@ def _run_benchmark(
         total_samples += sum(
             [dataset.num_samples() * dataset.repeats for dataset in accuracy_datasets]
         )
+    if warmup_dataset is not None:
+        total_samples += warmup_dataset.num_samples()
+
     duration_s = rt_settings.min_duration_ms / 1000
 
     logger.info(
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 266b7043..c858c491 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -108,8 +108,14 @@ def _run_test(
                                 "Warmup drain timeout exceeded, proceeding to performance test"
                             )
                             break
+                        if self.stop_requested:
+                            self.logger.info(
+                                f"Early stop requested (pending={self.event_recorder.n_inflight_samples}), shutting down test..."
+                            )
+                            break
                         time.sleep(0.1)
-                    self.logger.info("Warmup complete")
+
+                    self.logger.info("Warmup issue complete")
 
                 EventRecorder.record_event(
                     SessionEvent.TEST_STARTED,
@@ -299,6 +305,8 @@ def start(
             dataset: The dataset to use for the performance test.
             sample_issuer: The sample issuer to use for the session.
             scheduler: The scheduler to use for the session.
+            *args: Additional arguments to pass to the load generator constructor.
+            warmup_dataset: The dataset to use for the warmup test. If None, no warmup test will be run.
             accuracy_datasets: The datasets to use for the accuracy tests. If None, no accuracy tests will be run.
             load_generator_cls: The load generator class to use for the session.
             name: The name of the session.
@@ -339,8 +347,8 @@ def start(
             warmup_generator = load_generator_cls(
                 sample_issuer,
                 warmup_dataset,
-                warmup_sched, # type: ignore[arg-type]
-                *args,  
+                warmup_sched,  # type: ignore[arg-type]
+                *args,
             )
 
         # Create accuracy test generators
diff --git a/src/inference_endpoint/metrics/reporter.py b/src/inference_endpoint/metrics/reporter.py
index 7ee666eb..d7a79202 100644
--- a/src/inference_endpoint/metrics/reporter.py
+++ b/src/inference_endpoint/metrics/reporter.py
@@ -826,6 +826,7 @@ def derive_duration(self, check_malformed: bool = True) -> float | None:
 
         # Check if STOP_PERFORMANCE_TRACKING event exists
         stop_ts = self.stop_performance_tracking_timestamp_ns
+        start_ts = self.test_started_timestamp_ns
 
         if stop_ts != float("inf"):
             # Build list of sample_uuids with LOADGEN_ISSUE_CALLED before stop_ts
@@ -838,6 +839,7 @@ def derive_duration(self, check_malformed: bool = True) -> float | None:
                 FROM events
                 WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
                 AND timestamp_ns < {stop_ts}
+                AND timestamp_ns >= {start_ts}
             )
             AND event_type = '{SampleEvent.COMPLETE.value}'
             """).fetchone()
@@ -863,6 +865,7 @@ def derive_duration(self, check_malformed: bool = True) -> float | None:
                 SELECT COUNT(*) AS n_ends, MAX(timestamp_ns) AS end_ts
                 FROM events
                 WHERE event_type = '{SessionEvent.TEST_ENDED.value}'
+                AND timestamp_ns >= {start_ts}
                 """).fetchone()
 
                 n_test_ended = test_ended_result[0]
@@ -974,17 +977,22 @@ def get_error_count(self) -> int:
         Returns:
             int: The number of distinct failed sample UUIDs.
         """
+        start_ts = self.test_started_timestamp_ns
         stop_ts = self.stop_performance_tracking_timestamp_ns
 
-        where_clause = ""
+        where_clause = f"""
+        AND sample_uuid IN (
+            SELECT DISTINCT sample_uuid FROM events
+            WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+            AND timestamp_ns >= {start_ts}
+        """
         if stop_ts != float("inf"):
-            where_clause = f"""
-            AND sample_uuid IN (
-                SELECT DISTINCT sample_uuid FROM events
-                WHERE event_type = '{SessionEvent.LOADGEN_ISSUE_CALLED.value}'
+            where_clause += f"""
                 AND timestamp_ns < {stop_ts}
-            )
             """
+        where_clause += """
+        )
+        """
 
         return self.cur_.execute(f"""
         SELECT
diff --git a/tests/integration/commands/test_warmup.py b/tests/integration/commands/test_warmup.py
index b6d16fc2..b82109d4 100644
--- a/tests/integration/commands/test_warmup.py
+++ b/tests/integration/commands/test_warmup.py
@@ -103,7 +103,7 @@ async def test_warmup_offline_with_echo_server(
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
-    assert "Warmup complete" in log_text, "Warmup did not complete"
+    assert "Warmup issue complete" in log_text, "Warmup issue did not complete"
     assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"
 
 
@@ -179,5 +179,5 @@ async def test_warmup_online_with_echo_server(
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
-    assert "Warmup complete" in log_text, "Warmup did not complete"
+    assert "Warmup issue complete" in log_text, "Warmup issue did not complete"
     assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"
diff --git a/tests/unit/metrics/test_reporter.py b/tests/unit/metrics/test_reporter.py
index db6d97fa..6e947429 100644
--- a/tests/unit/metrics/test_reporter.py
+++ b/tests/unit/metrics/test_reporter.py
@@ -43,6 +43,52 @@ def test_error_counting(events_db):
         assert reporter.get_error_count() == 1
 
 
+@pytest.mark.unit
+def test_error_counting_excludes_warmup_errors(tmp_path, sample_uuids):
+    test_db = str(tmp_path / "test_error_counting_excludes_warmup_errors.db")
+    warmup_uuid = sample_uuids(1)
+    perf_error_uuid = sample_uuids(2)
+    perf_success_uuid = sample_uuids(3)
+
+    with sqlite3_cursor(test_db) as (cursor, conn):
+        cursor.execute(
+            "CREATE TABLE IF NOT EXISTS events (sample_uuid VARCHAR(32), event_type VARCHAR(32), timestamp_ns INTEGER, data BLOB)"
+        )
+        cursor.executemany(
+            "INSERT INTO events (sample_uuid, event_type, timestamp_ns, data) VALUES (?, ?, ?, ?)",
+            [
+                (
+                    warmup_uuid,
+                    SessionEvent.LOADGEN_ISSUE_CALLED.value,
+                    1000,
+                    b"",
+                ),
+                (warmup_uuid, SessionEvent.ERROR.value, 1010, b"warmup failed"),
+                ("", SessionEvent.TEST_STARTED.value, 5000, b""),
+                (
+                    perf_error_uuid,
+                    SessionEvent.LOADGEN_ISSUE_CALLED.value,
+                    6000,
+                    b"",
+                ),
+                (perf_error_uuid, SessionEvent.ERROR.value, 6010, b"perf failed"),
+                (
+                    perf_success_uuid,
+                    SessionEvent.LOADGEN_ISSUE_CALLED.value,
+                    7000,
+                    b"",
+                ),
+                (perf_success_uuid, SampleEvent.COMPLETE.value, 7010, b""),
+                ("", SessionEvent.STOP_PERFORMANCE_TRACKING.value, 8000, b""),
+                ("", SessionEvent.TEST_ENDED.value, 9000, b""),
+            ],
+        )
+        conn.commit()
+
+    with MetricsReporter(test_db) as reporter:
+        assert reporter.get_error_count() == 1
+
+
 def test_derive_ttft(events_db, sample_uuids):
     uuid1 = sample_uuids(1)
     uuid2 = sample_uuids(2)

From 4e1e0474b06a58cd4e1c32a26d5c2915e52d771d Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Wed, 18 Mar 2026 14:59:20 -0500
Subject: [PATCH 05/10] Update doc

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 examples/09_Warmup_Example/README.md | 113 ++++++---------------------
 1 file changed, 25 insertions(+), 88 deletions(-)

diff --git a/examples/09_Warmup_Example/README.md b/examples/09_Warmup_Example/README.md
index f0e382bb..db8ce3ed 100644
--- a/examples/09_Warmup_Example/README.md
+++ b/examples/09_Warmup_Example/README.md
@@ -1,110 +1,47 @@
 # Warmup Example
 
-This example demonstrates the **warmup phase** feature using
-[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct),
-a small 0.5B parameter model that is easy to run locally.
+This directory shows how to add a warmup phase to offline and online benchmark
+configs for [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
 
-The warmup phase issues randomly generated requests to the endpoint before the timed
-performance window begins.
+Warmup sends randomly generated requests before the timed run to prime the
+endpoint. Those samples complete before `TEST_STARTED`, so they are excluded
+from reported throughput and latency metrics.
 
-## What warmup does
+## Files
 
-Before the benchmark clock starts, the warmup phase sends a configurable number of
-requests using randomly generated token sequences. This primes the endpoint by:
+- `warmup_offline.yaml`: offline max-throughput example
+- `warmup_online.yaml`: online Poisson-QPS example
 
-- Establishing and reusing TCP connections
-- Filling KV caches to steady-state occupancy
-- Triggering JIT compilation / CUDA graph capture in the inference runtime
-
-Warmup samples are **excluded from all reported metrics** — they complete before
-`TEST_STARTED` is recorded, so they do not affect throughput, latency, TTFT, or TPOT.
-
-## Warmup configuration
-
-Add a `warmup` block to any YAML config:
+Both configs use the same `warmup` block:
 
 ```yaml
 warmup:
-  num_samples: 64 # number of warmup requests to issue
-  input_seq_length: 256 # ISL: target input token count
-  output_seq_length: 64 # OSL: max_new_tokens for warmup requests
-  range_ratio: 0.9 # ISL variance: generates ISL in [256*0.9, 256]
+  num_samples: 64
+  input_seq_length: 256
+  output_seq_length: 64
+  range_ratio: 0.9
   random_seed: 42
 ```
 
-No real dataset is needed for warmup — sequences are generated at runtime from random
-token IDs using the model's own tokenizer.
+Warmup data is generated at runtime from random token IDs using the model
+tokenizer, so no separate warmup dataset is needed.
 
-## Quick test with the echo server
+## Run Locally
 
-The built-in echo server lets you verify the warmup flow locally without a GPU.
+With the built-in echo server:
 
 ```bash
-# Terminal 1 — start the echo server
 python -m inference_endpoint.testing.echo_server --port 8000
-
-# Terminal 2 — run offline benchmark with warmup
-inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
-```
-
-The log output will show the warmup phase completing before the performance run starts:
-
-```
-INFO  Warmup dataset ready: 64 samples (ISL=256, OSL=64)
-INFO  Warmup: issuing samples...
-INFO  Warmup samples issued, waiting for responses to drain...
-INFO  Warmup complete
-INFO  Running...
-```
-
-## Running against a real endpoint
-
-### Prerequisites
-
-```bash
-export HF_TOKEN=<your Hugging Face token>
-export HF_HOME=<path to your HuggingFace cache, e.g. ~/.cache/huggingface>
-```
-
-Download the model before launching so vLLM can reuse the local cache:
-
-```bash
-huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
-```
-
-### Launch a vLLM server
-
-The `--trust-request-chat-template` flag is required because the CNN DailyMail dataset
-sends requests with a custom chat template.
-
-```bash
-docker run --runtime nvidia --gpus all \
-  -v ${HF_HOME}:/root/.cache/huggingface \
-  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-  -p 8000:8000 --ipc=host \
-  vllm/vllm-openai:latest \
-  --model Qwen/Qwen2.5-0.5B-Instruct \
-  --trust-request-chat-template
-```
-
-### Offline benchmark with warmup
-
-```bash
 inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_offline.yaml
-```
-
-### Online benchmark with warmup
-
-```bash
 inference-endpoint benchmark from-config -c examples/09_Warmup_Example/warmup_online.yaml
 ```
 
-## Tuning warmup parameters
+Against a real endpoint, point `endpoint_config.endpoints` in the YAML at that
+server and run the same commands.
+
+## Tuning
 
-| Parameter           | Guidance                                                                |
-| ------------------- | ----------------------------------------------------------------------- |
-| `num_samples`       | Use enough to saturate the KV cache; 32–128 is typical for small models |
-| `input_seq_length`  | Match the ISL distribution of your real workload                        |
-| `output_seq_length` | Match the OSL distribution; lower values make warmup finish faster      |
-| `range_ratio`       | `1.0` = fixed ISL; `0.8`–`0.9` adds light variance for broader coverage |
-| `random_seed`       | Change to vary which token sequences are generated                      |
+- `num_samples`: use enough requests to reach a steady state
+- `input_seq_length`: match the typical prompt length of the workload
+- `output_seq_length`: match the expected response length
+- `range_ratio`: use values below `1.0` to add light ISL variation

From e4cd553cf4efb1c346b1e4b29dce4ba02b4ca08e Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Thu, 19 Mar 2026 01:05:30 -0500
Subject: [PATCH 06/10] Address comments.

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 src/inference_endpoint/commands/benchmark.py  |  6 +++---
 .../load_generator/session.py                 | 19 +++++++++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
index 9d9a9640..3f2a4106 100644
--- a/src/inference_endpoint/commands/benchmark.py
+++ b/src/inference_endpoint/commands/benchmark.py
@@ -476,9 +476,9 @@ def _run_benchmark(
                 f"Failed to generate warmup dataset from warmup config: {e}"
             ) from e
         warmup_dataset = RandomDataset(warmup_df)
-        warmup_model_params = ModelParams(
-            name=config.model_params.name,
-            max_new_tokens=warmup_cfg.output_seq_length,
+        # Create a new model params object with the warmup output sequence length
+        warmup_model_params = config.model_params.model_copy(
+            update={"max_new_tokens": warmup_cfg.output_seq_length}
         )
         warmup_dataset.load(
             api_type=config.endpoint_config.api_type,
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index c858c491..701c0b63 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -98,24 +98,31 @@ def _run_test(
                     self.logger.info(
                         "Warmup samples issued, waiting for responses to drain..."
                     )
+                    # Enable idle signalling so EventRecorder wakes end_event when
+                    # all warmup responses are received, then reset it for the perf test.
+                    self.event_recorder.should_check_idle = True
                     warmup_start = time.monotonic()
                     while self.event_recorder.n_inflight_samples != 0:
                         if (
                             max_shutdown_timeout_s is not None
                             and time.monotonic() - warmup_start > max_shutdown_timeout_s
                         ):
-                            self.logger.warning(
-                                "Warmup drain timeout exceeded, proceeding to performance test"
+                            raise TimeoutError(
+                                f"Warmup drain timeout exceeded: "
+                                f"{self.event_recorder.n_inflight_samples} warmup requests "
+                                f"still in-flight after {max_shutdown_timeout_s}s"
                             )
-                            break
                         if self.stop_requested:
                             self.logger.info(
                                 f"Early stop requested (pending={self.event_recorder.n_inflight_samples}), shutting down test..."
                             )
-                            break
-                        time.sleep(0.1)
+                            return
+                        self.end_event.wait(timeout=SHUTDOWN_POLL_INTERVAL_S)
+                    # Reset so end_event is not pre-fired for the performance drain below.
+                    self.event_recorder.should_check_idle = False
+                    self.end_event.clear()
 
-                    self.logger.info("Warmup issue complete")
+                    self.logger.info("Warmup complete")
 
                 EventRecorder.record_event(
                     SessionEvent.TEST_STARTED,

From b8309b44cc8e86525f4842b708e9ad8cf660fc72 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Thu, 19 Mar 2026 01:14:58 -0500
Subject: [PATCH 07/10] Fixes.

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 examples/09_Warmup_Example/warmup_offline.yaml     |  2 +-
 examples/09_Warmup_Example/warmup_online.yaml      |  2 +-
 src/inference_endpoint/commands/benchmark.py       |  2 +-
 src/inference_endpoint/config/schema.py            |  6 +++---
 .../dataset_manager/predefined/random/__init__.py  |  4 ++--
 tests/integration/commands/test_warmup.py          |  8 ++++----
 tests/unit/dataset_manager/test_data_loader.py     | 14 +++++++-------
 7 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/examples/09_Warmup_Example/warmup_offline.yaml b/examples/09_Warmup_Example/warmup_offline.yaml
index 13d92de7..8ea56c1d 100644
--- a/examples/09_Warmup_Example/warmup_offline.yaml
+++ b/examples/09_Warmup_Example/warmup_offline.yaml
@@ -17,7 +17,7 @@ warmup:
   num_samples: 64 # Number of warmup requests to issue
   input_seq_length: 256 # ISL: target input sequence length in tokens
   output_seq_length: 64 # OSL: max_new_tokens for warmup requests
-  range_ratio: 0.9 # ISL variance: generates ISL in [256*0.9, 256]
+  input_range_ratio: 0.9 # ISL variance: generates ISL in [256*0.9, 256]
   random_seed: 42
 
 model_params:
diff --git a/examples/09_Warmup_Example/warmup_online.yaml b/examples/09_Warmup_Example/warmup_online.yaml
index f267adbe..a270273a 100644
--- a/examples/09_Warmup_Example/warmup_online.yaml
+++ b/examples/09_Warmup_Example/warmup_online.yaml
@@ -17,7 +17,7 @@ warmup:
   num_samples: 32 # Number of warmup requests to issue
   input_seq_length: 128 # ISL: target input sequence length in tokens
   output_seq_length: 32 # OSL: max_new_tokens for warmup requests
-  range_ratio: 0.8 # ISL variance: generates ISL in [128*0.8, 128]
+  input_range_ratio: 0.8 # ISL variance: generates ISL in [128*0.8, 128]
   random_seed: 42
 
 model_params:
diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
index 3f2a4106..408270da 100644
--- a/src/inference_endpoint/commands/benchmark.py
+++ b/src/inference_endpoint/commands/benchmark.py
@@ -467,7 +467,7 @@ def _run_benchmark(
                 force=False,
                 num_sequences=warmup_cfg.num_samples,
                 input_seq_length=warmup_cfg.input_seq_length,
-                range_ratio=warmup_cfg.range_ratio,
+                input_range_ratio=warmup_cfg.input_range_ratio,
                 random_seed=warmup_cfg.random_seed,
                 tokenizer=tokenizer,
             )
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 8ce85836..08d348db 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -245,15 +245,15 @@ class WarmupConfig(BaseModel):
         num_samples: Number of warmup queries to issue.
         input_seq_length: Target input sequence length in tokens (ISL).
         output_seq_length: Max output tokens for warmup requests (OSL).
-        range_ratio: ISL variance factor in [0.0, 1.0]. Generates ISL in
-            the range [input_seq_length * range_ratio, input_seq_length].
+        input_range_ratio: ISL variance factor in [0.0, 1.0]. Generates ISL in
+            the range [input_seq_length * input_range_ratio, input_seq_length].
         random_seed: Seed for reproducible warmup data generation.
     """
 
     num_samples: int = Field(100, gt=0)
     input_seq_length: int = Field(512, gt=0)
     output_seq_length: int = Field(128, gt=0)
-    range_ratio: float = Field(1.0, ge=0.0, le=1.0)
+    input_range_ratio: float = Field(1.0, ge=0.0, le=1.0)
     random_seed: int = 42
 
 
diff --git a/src/inference_endpoint/dataset_manager/predefined/random/__init__.py b/src/inference_endpoint/dataset_manager/predefined/random/__init__.py
index 45e096b0..f2e86a97 100644
--- a/src/inference_endpoint/dataset_manager/predefined/random/__init__.py
+++ b/src/inference_endpoint/dataset_manager/predefined/random/__init__.py
@@ -37,7 +37,7 @@ def generate(
         *,
         num_sequences: int = 1024,
         input_seq_length: int = 1024,
-        range_ratio: float = 1.0,
+        input_range_ratio: float = 1.0,
         random_seed: int = 42,
         save_tokenized_data: bool = False,
         tokenizer: str | PreTrainedTokenizer,
@@ -49,7 +49,7 @@ def generate(
         data = []
         # Generate the input sequence lengths given the range ratio
         input_seq_lengths = rng.integers(
-            int(input_seq_length * range_ratio),
+            int(input_seq_length * input_range_ratio),
             input_seq_length + 1,
             num_sequences,
         )
diff --git a/tests/integration/commands/test_warmup.py b/tests/integration/commands/test_warmup.py
index b82109d4..ed17af90 100644
--- a/tests/integration/commands/test_warmup.py
+++ b/tests/integration/commands/test_warmup.py
@@ -47,7 +47,7 @@ async def test_warmup_offline_with_echo_server(
           num_samples: 4
           input_seq_length: 16
           output_seq_length: 8
-          range_ratio: 0.9
+          input_range_ratio: 0.9
           random_seed: 42
 
         model_params:
@@ -103,7 +103,7 @@ async def test_warmup_offline_with_echo_server(
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
-    assert "Warmup issue complete" in log_text, "Warmup issue did not complete"
+    assert "Warmup complete" in log_text, "Warmup did not complete"
     assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"
 
 
@@ -122,7 +122,7 @@ async def test_warmup_online_with_echo_server(
           num_samples: 4
           input_seq_length: 16
           output_seq_length: 8
-          range_ratio: 0.8
+          input_range_ratio: 0.8
           random_seed: 42
 
         model_params:
@@ -179,5 +179,5 @@ async def test_warmup_online_with_echo_server(
 
     log_text = caplog.text
     assert "Warmup: issuing samples" in log_text, "Warmup did not start"
-    assert "Warmup issue complete" in log_text, "Warmup issue did not complete"
+    assert "Warmup complete" in log_text, "Warmup did not complete"
     assert "Estimated QPS:" in log_text, "Performance test did not run after warmup"
diff --git a/tests/unit/dataset_manager/test_data_loader.py b/tests/unit/dataset_manager/test_data_loader.py
index 4e08bdf9..e63083fd 100644
--- a/tests/unit/dataset_manager/test_data_loader.py
+++ b/tests/unit/dataset_manager/test_data_loader.py
@@ -50,8 +50,8 @@ def test_hf_squad_dataset(hf_squad_dataset):
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("range_ratio", [0.5, 0.8, 1.0])
-def test_random_data_loader(range_ratio):
+@pytest.mark.parametrize("input_range_ratio", [0.5, 0.8, 1.0])
+def test_random_data_loader(input_range_ratio):
     num_sequences = 1024
     input_seq_length = 1024
     random_seed = 42
@@ -60,7 +60,7 @@ def test_random_data_loader(range_ratio):
     random_data_loader = RandomDataset.get_dataloader(
         num_sequences=num_sequences,
         input_seq_length=input_seq_length,
-        range_ratio=range_ratio,
+        input_range_ratio=input_range_ratio,
         random_seed=random_seed,
         tokenizer=tokenizer,
         save_tokenized_data=save_tokenized_data,
@@ -83,11 +83,11 @@ def test_random_data_loader(range_ratio):
         # the decoding-encoding which may coalesce some sequences to newer tokens. We use a 0.8 factor to allow for this.
         # And we allow for a 20% overhead due to the decoding-encoding.
         assert (
-            len(sample["input_tokens"]) > input_seq_length * range_ratio * 0.8
+            len(sample["input_tokens"]) > input_seq_length * input_range_ratio * 0.8
             and len(sample["input_tokens"]) <= input_seq_length * 1.2
-        ), f"Expected {input_seq_length*range_ratio*0.8} to {input_seq_length*0.2} input tokens, got {len(sample["input_tokens"])}"
+        ), f"Expected {input_seq_length*input_range_ratio*0.8} to {input_seq_length*0.2} input tokens, got {len(sample["input_tokens"])}"
 
         assert (
-            len(sample["prompt"]) >= 1024 * range_ratio * 0.5
+            len(sample["prompt"]) >= 1024 * input_range_ratio * 0.5
             and len(sample["prompt"]) <= 7 * 1024
-        ), f"Expected length between 1024*range_ratio*0.5 and 1024, got {len(sample["prompt"])}"
+        ), f"Expected length between 1024*input_range_ratio*0.5 and 1024, got {len(sample["prompt"])}"

From 909ac9f1de06fe31f06422ee41f29825fe700cb8 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Thu, 19 Mar 2026 01:34:30 -0500
Subject: [PATCH 08/10] Address doc/yaml

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 examples/09_Warmup_Example/README.md             | 4 ++--
 src/inference_endpoint/load_generator/session.py | 2 +-
 tests/unit/dataset_manager/test_data_loader.py   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/09_Warmup_Example/README.md b/examples/09_Warmup_Example/README.md
index db8ce3ed..3c9c2497 100644
--- a/examples/09_Warmup_Example/README.md
+++ b/examples/09_Warmup_Example/README.md
@@ -19,7 +19,7 @@ warmup:
   num_samples: 64
   input_seq_length: 256
   output_seq_length: 64
-  range_ratio: 0.9
+  input_range_ratio: 0.9
   random_seed: 42
 ```
 
@@ -44,4 +44,4 @@ server and run the same commands.
 - `num_samples`: use enough requests to reach a steady state
 - `input_seq_length`: match the typical prompt length of the workload
 - `output_seq_length`: match the expected response length
-- `range_ratio`: use values below `1.0` to add light ISL variation
+- `input_range_ratio`: use values below `1.0` to add light ISL variation
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 701c0b63..243cc9f2 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -322,7 +322,7 @@ def start(
             report_dir: The path to save the report to. If None, no report will be saved.
             tokenizer_override: The tokenizer to use for the session. If None, a tokenizer will be automatically selected
                                 based on the model name in the runtime settings.
-            dump_events_csv: Whether to dump the events to a CSV file. Only use for debugging
+            dump_events_log: Whether to dump the events to a JSONL file. Only use for debugging
                              purposes, as the events database can get quite large.
 
         Returns:
diff --git a/tests/unit/dataset_manager/test_data_loader.py b/tests/unit/dataset_manager/test_data_loader.py
index e63083fd..73862966 100644
--- a/tests/unit/dataset_manager/test_data_loader.py
+++ b/tests/unit/dataset_manager/test_data_loader.py
@@ -85,9 +85,9 @@ def test_random_data_loader(input_range_ratio):
         assert (
             len(sample["input_tokens"]) > input_seq_length * input_range_ratio * 0.8
             and len(sample["input_tokens"]) <= input_seq_length * 1.2
-        ), f"Expected {input_seq_length*input_range_ratio*0.8} to {input_seq_length*0.2} input tokens, got {len(sample["input_tokens"])}"
+        ), f"Expected {input_seq_length*input_range_ratio*0.8} to {input_seq_length*1.2} input tokens, got {len(sample['input_tokens'])}"
 
         assert (
             len(sample["prompt"]) >= 1024 * input_range_ratio * 0.5
             and len(sample["prompt"]) <= 7 * 1024
-        ), f"Expected length between 1024*input_range_ratio*0.5 and 1024, got {len(sample["prompt"])}"
+        ), f"Expected length between 1024*input_range_ratio*0.5 and 7*1024, got {len(sample['prompt'])}"

From 5eb6b684c6fff94c63ba01efed0df1fc7d1a30b1 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 23 Mar 2026 10:56:30 -0700
Subject: [PATCH 09/10] Move warmup outside timeout

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 .../load_generator/session.py                 | 141 +++++++++++-------
 src/inference_endpoint/metrics/reporter.py    |   5 +
 tests/unit/load_generator/test_session.py     |  29 ++--
 tests/unit/metrics/test_recorder.py           |   2 +-
 tests/unit/metrics/test_reporter.py           |   1 +
 5 files changed, 115 insertions(+), 63 deletions(-)

diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 243cc9f2..955cf6d9 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -66,6 +66,9 @@ def __init__(
 
         self.sample_uuid_map: dict[str, dict[str, int]] | None = None
 
+        # Wall-clock nanoseconds spent in the warmup phase; set by _run_warmup.
+        self.warmup_duration_ns: int | None = None
+
     @property
     def is_running(self):
         return self.thread is not None and self.thread.is_alive()
@@ -76,54 +79,69 @@ def stop(self) -> None:
         # wakeup _run_test if needed, short-circuit SHUTDOWN_POLL_INTERVAL_S
         self.end_event.set()
 
+    def _run_warmup(
+        self,
+        warmup_generator: LoadGenerator,
+        max_shutdown_timeout_s: float | None = 300.0,
+    ) -> None:
+        """Issue warmup samples and drain responses using the already-open EventRecorder.
+
+        Warmup events land in the shared DB with timestamps before TEST_STARTED, so they
+        are excluded from all perf metrics. Wall-clock duration is stored on
+        ``self.warmup_duration_ns``.
+
+        Args:
+            warmup_generator: Pre-configured load generator for the warmup dataset.
+            max_shutdown_timeout_s: Drain timeout in seconds. None means wait forever.
+        """
+        warmup_start_ns = time.monotonic_ns()
+
+        self.logger.info("Warmup: issuing samples...")
+        for _ in warmup_generator:
+            pass
+        self.logger.info("Warmup samples issued, waiting for responses to drain...")
+
+        # Enable idle signalling so EventRecorder wakes end_event when all warmup
+        # responses are received, then reset it for the perf test.
+        self.event_recorder.should_check_idle = True
+        drain_start = time.monotonic()
+        while self.event_recorder.n_inflight_samples != 0:
+            if (
+                max_shutdown_timeout_s is not None
+                and time.monotonic() - drain_start > max_shutdown_timeout_s
+            ):
+                raise TimeoutError(
+                    f"Warmup drain timeout exceeded: "
+                    f"{self.event_recorder.n_inflight_samples} warmup requests "
+                    f"still in-flight after {max_shutdown_timeout_s}s"
+                )
+            if self.stop_requested:
+                self.logger.info(
+                    f"Early stop requested (pending={self.event_recorder.n_inflight_samples}), shutting down warmup..."
+                )
+                return
+            self.end_event.wait(timeout=SHUTDOWN_POLL_INTERVAL_S)
+
+        # Reset so end_event is not pre-fired for the performance drain below.
+        self.event_recorder.should_check_idle = False
+        self.end_event.clear()
+
+        self.warmup_duration_ns = time.monotonic_ns() - warmup_start_ns
+        self.logger.info("Warmup complete")
+
     def _run_test(
         self,
         perf_test_generator: LoadGenerator,
         accuracy_test_generators: dict[str, LoadGenerator] | None = None,
-        warmup_generator: LoadGenerator | None = None,
         max_shutdown_timeout_s: float | None = 300.0,
         report_dir: os.PathLike | None = None,
         tokenizer_override: AutoTokenizer | None = None,
         dump_events_log: bool = False,
     ):
-        with self.event_recorder:
+        # The EventRecorder was opened in start() before warmup ran, so both the warmup
+        # phase and the perf test share the same DB.  We are responsible for closing it.
+        try:
             try:
-                # Warmup phase: issue samples before the timed performance window.
-                # Warmup events land in the DB with timestamps before TEST_STARTED,
-                # so they are excluded from all perf metrics.
-                if warmup_generator is not None:
-                    self.logger.info("Warmup: issuing samples...")
-                    for _ in warmup_generator:
-                        pass
-                    self.logger.info(
-                        "Warmup samples issued, waiting for responses to drain..."
-                    )
-                    # Enable idle signalling so EventRecorder wakes end_event when
-                    # all warmup responses are received, then reset it for the perf test.
-                    self.event_recorder.should_check_idle = True
-                    warmup_start = time.monotonic()
-                    while self.event_recorder.n_inflight_samples != 0:
-                        if (
-                            max_shutdown_timeout_s is not None
-                            and time.monotonic() - warmup_start > max_shutdown_timeout_s
-                        ):
-                            raise TimeoutError(
-                                f"Warmup drain timeout exceeded: "
-                                f"{self.event_recorder.n_inflight_samples} warmup requests "
-                                f"still in-flight after {max_shutdown_timeout_s}s"
-                            )
-                        if self.stop_requested:
-                            self.logger.info(
-                                f"Early stop requested (pending={self.event_recorder.n_inflight_samples}), shutting down test..."
-                            )
-                            return
-                        self.end_event.wait(timeout=SHUTDOWN_POLL_INTERVAL_S)
-                    # Reset so end_event is not pre-fired for the performance drain below.
-                    self.event_recorder.should_check_idle = False
-                    self.end_event.clear()
-
-                    self.logger.info("Warmup complete")
-
                 EventRecorder.record_event(
                     SessionEvent.TEST_STARTED,
                     time.monotonic_ns(),
@@ -199,7 +217,9 @@ def _run_test(
                                 f"Error loading tokenizer for model {model}: {e}"
                             )
                             tokenizer = None
-                report = reporter.create_report(tokenizer)
+                report = reporter.create_report(
+                    tokenizer, warmup_duration_ns=self.warmup_duration_ns
+                )
 
                 # Store report on session so external callers can use it
                 self.report = report
@@ -272,6 +292,8 @@ def _run_test(
                     with open(report_path, "w") as f:
                         report.display(fn=f.write, summary_only=False, newline="\n")
                     logger.info(f"Report saved to {report_path}")
+        finally:
+            self.event_recorder.close()
 
     def wait_for_test_end(self, timeout: float | None = None) -> bool:
         """
@@ -392,17 +414,32 @@ def start(
                     *args,
                 )
 
-        session.thread = threading.Thread(
-            target=session._run_test,
-            args=(load_generator,),
-            kwargs={
-                "warmup_generator": warmup_generator,
-                "accuracy_test_generators": accuracy_test_generators,
-                "max_shutdown_timeout_s": max_shutdown_timeout_s,
-                "report_dir": report_dir,
-                "tokenizer_override": tokenizer_override,
-                "dump_events_log": dump_events_log,
-            },
-        )
-        session.thread.start()
+        # Open the recorder before warmup so both phases share the same DB.
+        # _run_test is responsible for closing it when the perf test finishes.
+        session.event_recorder.__enter__()
+        thread_started = False
+        try:
+            if warmup_generator is not None:
+                session._run_warmup(warmup_generator, max_shutdown_timeout_s or 300.0)
+
+            if not session.stop_requested:
+                session.thread = threading.Thread(
+                    target=session._run_test,
+                    args=(load_generator,),
+                    kwargs={
+                        "accuracy_test_generators": accuracy_test_generators,
+                        "max_shutdown_timeout_s": max_shutdown_timeout_s,
+                        "report_dir": report_dir,
+                        "tokenizer_override": tokenizer_override,
+                        "dump_events_log": dump_events_log,
+                    },
+                )
+                session.thread.start()
+                thread_started = True
+        finally:
+            # If the thread did not start (warmup stopped early or an exception was
+            # raised), we own the recorder and must close it here.
+            if not thread_started:
+                session.event_recorder.close()
+
         return session
diff --git a/src/inference_endpoint/metrics/reporter.py b/src/inference_endpoint/metrics/reporter.py
index d7a79202..1c6e426a 100644
--- a/src/inference_endpoint/metrics/reporter.py
+++ b/src/inference_endpoint/metrics/reporter.py
@@ -362,6 +362,7 @@ class Report:
     tpot: dict[str, float]
     latency: dict[str, float]
     output_sequence_lengths: dict[str, int]
+    warmup_duration_ns: int | None = None
     tpot_reporting_mode: TPOTReportingMode = TPOTReportingMode.REQUEST_WEIGHTED
 
     @functools.cached_property
@@ -497,6 +498,8 @@ def display(
         fn(f"Total samples issued: {self.n_samples_issued}{newline}")
         fn(f"Total samples completed: {self.n_samples_completed}{newline}")
         fn(f"Total samples failed: {self.n_samples_failed}{newline}")
+        if self.warmup_duration_ns is not None:
+            fn(f"Warmup duration: {self.warmup_duration_ns / 1e9:.2f} seconds{newline}")
         if self.duration_ns is not None:
             fn(f"Duration: {self.duration_ns / 1e9:.2f} seconds{newline}")
         else:
@@ -1331,6 +1334,7 @@ def create_report(
         self,
         tokenizer: Tokenizer | None = None,
         tpot_reporting_mode: TPOTReportingMode = TPOTReportingMode.REQUEST_WEIGHTED,
+        warmup_duration_ns: int | None = None,
     ) -> Report:
         """Creates a Report object from the metrics.
 
@@ -1382,6 +1386,7 @@ def create_report(
             n_samples_completed=sample_statuses["completed"],
             n_samples_failed=self.get_error_count(),
             duration_ns=self.derive_duration(),
+            warmup_duration_ns=warmup_duration_ns,
             ttft=ttft_summary,
             tpot=tpot_summary,
             latency=sample_latency_rollup.summarize(),
diff --git a/tests/unit/load_generator/test_session.py b/tests/unit/load_generator/test_session.py
index 106485b7..68fa4c6e 100644
--- a/tests/unit/load_generator/test_session.py
+++ b/tests/unit/load_generator/test_session.py
@@ -15,7 +15,7 @@
 
 import random
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import inference_endpoint.metrics as metrics
 import pytest
@@ -191,20 +191,29 @@ def __init__(self, sample_issuer, dataloader, scheduler, *args):
             self.name = None
             created_schedulers.append(scheduler)
 
+    mock_recorder = MagicMock()
+    mock_recorder.__enter__ = MagicMock(return_value=mock_recorder)
+    mock_recorder.close = MagicMock()
+
     with patch(
         "inference_endpoint.load_generator.session.Scheduler.get_implementation",
         return_value=MaxThroughputScheduler,
     ):
         with patch.object(BenchmarkSession, "_run_test", return_value=None):
-            BenchmarkSession.start(
-                rt_settings,
-                dl,
-                sample_issuer,
-                sched,
-                warmup_dataset=warmup_dataset,
-                load_generator_cls=RecordingLoadGenerator,
-                max_shutdown_timeout_s=0.01,
-            )
+            with patch.object(BenchmarkSession, "_run_warmup", return_value=None):
+                with patch(
+                    "inference_endpoint.load_generator.session.EventRecorder",
+                    return_value=mock_recorder,
+                ):
+                    BenchmarkSession.start(
+                        rt_settings,
+                        dl,
+                        sample_issuer,
+                        sched,
+                        warmup_dataset=warmup_dataset,
+                        load_generator_cls=RecordingLoadGenerator,
+                        max_shutdown_timeout_s=0.01,
+                    )
 
     assert len(created_schedulers) >= 2
     warmup_scheduler = created_schedulers[1]
diff --git a/tests/unit/metrics/test_recorder.py b/tests/unit/metrics/test_recorder.py
index 058ee50a..5d2b7757 100644
--- a/tests/unit/metrics/test_recorder.py
+++ b/tests/unit/metrics/test_recorder.py
@@ -342,7 +342,7 @@ def test_shm_usage(sample_uuids):
         rec.wait_for_writes()
     events_created_ev.set()
 
-    worker_proc.join(timeout=10)
+    worker_proc.join(timeout=60)
     if worker_proc.is_alive():
         worker_proc.terminate()
         worker_proc.join(timeout=1)
diff --git a/tests/unit/metrics/test_reporter.py b/tests/unit/metrics/test_reporter.py
index 6e947429..23752697 100644
--- a/tests/unit/metrics/test_reporter.py
+++ b/tests/unit/metrics/test_reporter.py
@@ -488,6 +488,7 @@ def test_reporter_json(events_db):
         "latency",
         "output_sequence_lengths",
         "tpot_reporting_mode",
+        "warmup_duration_ns",
         "qps",
         "tps",
         "test_started_at",

From dc8a302ac476b68bd5ebff75290eea601dd46671 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 23 Mar 2026 11:44:04 -0700
Subject: [PATCH 10/10] Fix docstring

Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
 src/inference_endpoint/load_generator/session.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 955cf6d9..a0c768d7 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -340,7 +340,7 @@ def start(
             load_generator_cls: The load generator class to use for the session.
             name: The name of the session.
             max_shutdown_timeout_s: The maximum timeout to wait for the test to complete after all samples have been issued.
-                                    If None, wait indefinitely. (Default: 300.0 seconds)
+                                    If None, wait indefinitely. Warmup drain defaults to 300.0 seconds if not specified.
             report_dir: The path to save the report to. If None, no report will be saved.
             tokenizer_override: The tokenizer to use for the session. If None, a tokenizer will be automatically selected
                                 based on the model name in the runtime settings.