
Commit ffb87d9

viraatc and claude committed
fix: CLI crash on --load-pattern + --target-qps, add fuzz tests and CI
Bug: LoadPattern.type had alias= instead of name= on cyclopts.Parameter, and the class was missing @cyclopts.Parameter(name="*"). This caused any CLI invocation with --load-pattern to crash with IndexError.

Tests:
- Hypothesis fuzz tests auto-discover all CLI flags from cyclopts assemble_argument_collection() and test 4000 random combinations (offline + online/poisson + online/concurrency)
- Added test_concurrency_benchmark with streaming on/off
- hypothesis==6.151.10 added to test deps, schema_fuzz pytest marker

CI & tooling:
- schema-updated CI job: fuzz tests + template validation on schema changes
- regenerate_templates.py: auto-generates YAML templates from schema defaults
- Pre-commit checks templates are up to date (--check mode)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 096c868 commit ffb87d9
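The combination-fuzzing idea from the commit message can be sketched with the standard library alone. Everything below is hypothetical scaffolding: the flag pool and toy parser stand in for the real CLI, which the actual tests drive via Hypothesis after discovering flags from cyclopts' assemble_argument_collection().

```python
import random

# Hypothetical flag pool -- the real tests discover these from
# cyclopts' assemble_argument_collection().
FLAGS = {
    "--load-pattern": ["max_throughput", "poisson", "concurrency"],
    "--target-qps": ["1.0", "10.0"],
    "--target-concurrency": ["1", "32"],
    "--streaming": ["on", "off"],
}


def parse(argv):
    """Toy stand-in for the CLI parser: pair each flag with its value."""
    it = iter(argv)
    return {flag: next(it) for flag in it}


def fuzz(n_cases=4000, seed=0):
    """Run n_cases random flag combinations through the parser."""
    rng = random.Random(seed)
    for _ in range(n_cases):
        # Pick a random subset of flags and a random value for each.
        chosen = rng.sample(sorted(FLAGS), rng.randint(1, len(FLAGS)))
        argv = [tok for flag in chosen for tok in (flag, rng.choice(FLAGS[flag]))]
        parse(argv)  # a regression like the alias= bug would raise here
    return n_cases
```

Any parser crash (the real bug surfaced as IndexError) fails the loop immediately, which is the property the schema_fuzz tests assert.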

File tree

11 files changed: +669 −101 lines


.github/workflows/test.yml

Lines changed: 39 additions & 0 deletions
```diff
@@ -53,3 +53,42 @@ jobs:
           python -m pip install pip==26.0.1
           pip install -e ".[dev,test,performance]"
           pip-audit
+
+  schema-updated:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'pull_request'
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Check for schema changes
+        id: schema
+        run: |
+          CHANGED=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- \
+            'src/inference_endpoint/config/schema.py' \
+            'src/inference_endpoint/endpoint_client/config.py' \
+            'src/inference_endpoint/commands/benchmark/cli.py')
+          echo "changed=$([[ -n "$CHANGED" ]] && echo true || echo false)" >> "$GITHUB_OUTPUT"
+
+      - name: Set up Python 3.12
+        if: steps.schema.outputs.changed == 'true'
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        if: steps.schema.outputs.changed == 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[test]
+
+      - name: Run schema fuzz tests
+        if: steps.schema.outputs.changed == 'true'
+        run: |
+          pytest -xv -m schema_fuzz
+
+      - name: Check YAML templates are up to date
+        if: steps.schema.outputs.changed == 'true'
+        run: |
+          python scripts/regenerate_templates.py --check
```
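The boolean step-output idiom in the "Check for schema changes" step can be exercised on its own (shown here in POSIX form; the workflow's bash `[[ -n ... ]]` behaves the same):

```shell
# Emit true when CHANGED is non-empty, false otherwise -- the value
# the workflow appends to $GITHUB_OUTPUT as changed=...
CHANGED="src/inference_endpoint/config/schema.py"
echo "changed=$([ -n "$CHANGED" ] && echo true || echo false)"   # changed=true

CHANGED=""
echo "changed=$([ -n "$CHANGED" ] && echo true || echo false)"   # changed=false
```

Downstream steps then gate on `steps.schema.outputs.changed == 'true'`, so the whole job is a no-op on PRs that don't touch the schema files.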

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
```diff
@@ -35,7 +35,7 @@ repos:
     hooks:
       - id: prettier
         types_or: [yaml, json, markdown]
-        exclude: ^(src/inference_endpoint/openai/openai_types_gen.py|src/inference_endpoint/openai/openapi.yaml)$
+        exclude: ^(src/inference_endpoint/openai/openai_types_gen.py|src/inference_endpoint/openai/openapi.yaml|src/inference_endpoint/config/templates/)
 
 - repo: local
   hooks:
@@ -48,12 +48,12 @@ repos:
         args: ["--tb=short", "--strict-markers"]
         stages: [manual]
 
-      - id: validate-templates
-        name: Validate YAML templates against schema
-        entry: python -c "from pathlib import Path; from inference_endpoint.config.schema import BenchmarkConfig; [BenchmarkConfig.from_yaml_file(f) for f in sorted(Path('src/inference_endpoint/config/templates').glob('*.yaml'))]"
+      - id: check-templates
+        name: Check YAML templates match schema defaults
+        entry: python scripts/regenerate_templates.py --check
         language: system
         pass_filenames: false
-        files: ^src/inference_endpoint/config/(schema\.py|templates/)
+        files: ^(src/inference_endpoint/config/schema\.py|scripts/regenerate_templates\.py)$
 
       - id: add-license-header
         name: Add license headers
```

docs/DEVELOPMENT.md

Lines changed: 12 additions & 0 deletions
````diff
@@ -276,6 +276,18 @@ pytest -s -v
 python -m pdb -m pytest test_file.py
 ```
 
+## 📄 YAML Config Templates
+
+Config templates in `src/inference_endpoint/config/templates/` are auto-generated from schema defaults. When you change `config/schema.py`, regenerate them:
+
+```bash
+python scripts/regenerate_templates.py
+```
+
+Pre-commit and CI will fail if committed templates are out of sync with the schema (`--check` mode).
+
+The script applies overrides (model name, endpoint URL, dataset path) defined in `scripts/regenerate_templates.py` on top of `BenchmarkConfig.create_default_config()` defaults. To change a template override, edit `_COMMON` or `_TEMPLATES` in the script and re-run.
+
 ## 📦 Package Management
 
 ### Adding Dependencies
````

pyproject.toml

Lines changed: 3 additions & 0 deletions
```diff
@@ -97,6 +97,8 @@ test = [
     "aiohttp==3.13.4",
     # Plotting for benchmark sweep mode
     "matplotlib==3.10.8",
+    # Property-based testing (CLI fuzz)
+    "hypothesis==6.151.10",
 ]
 performance = [
     "pytest-benchmark==5.2.3",
@@ -184,6 +186,7 @@ markers = [
     "integration: marks tests as integration tests",
     "unit: marks tests as unit tests",
     "run_explicitly: mark test to only run explicitly",
+    "schema_fuzz: hypothesis CLI fuzz tests (run in CI on schema changes)",
 ]
 filterwarnings = [
     "ignore:Session timeout reached:RuntimeWarning",
```

scripts/regenerate_templates.py

Lines changed: 128 additions & 0 deletions
```python
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Regenerate YAML config templates from schema defaults + overrides.

Used by pre-commit to keep templates in sync when schema.py changes.
Overrides below make the templates more readable; everything else
uses schema defaults.
"""

from pathlib import Path

import yaml
from inference_endpoint.config.schema import (
    BenchmarkConfig,
    Dataset,
    EndpointConfig,
    LoadPattern,
    LoadPatternType,
    ModelParams,
    OnlineSettings,
    TestType,
)
from inference_endpoint.exceptions import CLIError

TEMPLATES_DIR = Path(__file__).parent.parent / "src/inference_endpoint/config/templates"

_COMMON = {
    "model_params": ModelParams(
        name="<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>",
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=1024,
    ),
    "endpoint_config": EndpointConfig(
        endpoints=["<ENDPOINT_URL eg: http://localhost:8000>"],
    ),
    "datasets": [
        Dataset(
            name="perf-test",
            type="performance",
            path="<DATASET_PATH eg: tests/datasets/dummy_1k.jsonl>",
            samples=1000,
            parser={"prompt": "text_input"},
        )
    ],
}

_TEMPLATES = {
    "offline": {**_COMMON},
    "online": {
        **_COMMON,
        "settings": OnlineSettings(
            load_pattern=LoadPattern(type=LoadPatternType.POISSON, target_qps=10.0),
        ),
    },
    "concurrency": {
        **_COMMON,
        "settings": OnlineSettings(
            load_pattern=LoadPattern(
                type=LoadPatternType.CONCURRENCY, target_concurrency=32
            ),
        ),
    },
}

_EXCLUDE = {"verbose", "submission_ref", "benchmark_mode"}


def _clean(full: dict) -> dict:
    out = {k: v for k, v in full.items() if k not in _EXCLUDE}
    if "settings" in out and "client" in out["settings"]:
        out["settings"]["client"]["num_workers"] = 4
    return out


def main(check_only: bool = False):
    """Regenerate templates, or check they're up to date (--check flag)."""
    stale = False
    for name, overrides in _TEMPLATES.items():
        test_type = TestType.OFFLINE if name == "offline" else TestType.ONLINE
        try:
            base = BenchmarkConfig.create_default_config(test_type)
            cfg = base.with_updates(**overrides)
        except (CLIError, ValueError) as e:
            print(f"  FAIL: {name} ({e})")
            stale = True
            continue

        expected = yaml.dump(
            _clean(cfg.model_dump(mode="json")),
            default_flow_style=False,
            sort_keys=False,
        )
        path = TEMPLATES_DIR / f"{name}_template.yaml"

        if check_only:
            current = path.read_text() if path.exists() else ""
            if current != expected:
                print(f"  STALE: {path.name}")
                stale = True
            else:
                print(f"  OK: {path.name}")
        else:
            path.write_text(expected)
            print(f"  Generated: {path.name}")

    if stale:
        print("\nRun: python scripts/regenerate_templates.py")
        raise SystemExit(1)


if __name__ == "__main__":
    import sys

    main(check_only="--check" in sys.argv)
```

src/inference_endpoint/config/schema.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -339,6 +339,7 @@ def _validate_durations(self) -> Self:
         return self
 
 
+@cyclopts.Parameter(name="*")
 class LoadPattern(BaseModel):
     """Load pattern configuration.
 
@@ -352,7 +353,7 @@ class LoadPattern(BaseModel):
 
     type: Annotated[
         LoadPatternType,
-        cyclopts.Parameter(alias="--load-pattern", help="Load pattern type"),
+        cyclopts.Parameter(name="--load-pattern", help="Load pattern type"),
     ] = LoadPatternType.MAX_THROUGHPUT
     target_qps: Annotated[
         float | None, cyclopts.Parameter(alias="--target-qps", help="Target QPS")
```
Lines changed: 54 additions & 35 deletions
```diff
@@ -1,50 +1,69 @@
-# Online Concurrency-Based Benchmark (NOT YET IMPLEMENTED)
-# This template shows the future concurrency-based online mode
-name: "concurrency-benchmark"
-version: "1.0"
-type: "online"
-
+name: online_benchmark
+version: '1.0'
+type: online
 model_params:
-  name: "meta-llama/Llama-3.1-8B-Instruct"
+  name: '<MODEL_NAME eg: meta-llama/Llama-3.1-8B-Instruct>'
   temperature: 0.7
+  top_k: null
   top_p: 0.9
+  repetition_penalty: null
   max_new_tokens: 1024
-
+  osl_distribution: null
+  streaming: 'on'
 datasets:
-  - name: "concurrency-test"
-    type: "performance"
-    path: "datasets/queries.jsonl"
-    samples: 500
-
+- name: perf-test
+  type: performance
+  path: '<DATASET_PATH eg: tests/datasets/dummy_1k.jsonl>'
+  format: null
+  samples: 1000
+  eval_method: null
+  parser:
+    prompt: text_input
+  accuracy_config: null
 settings:
   runtime:
-    min_duration_ms: 600000 # 10 minutes
-    max_duration_ms: 1800000 # 30 minutes
-    scheduler_random_seed: 42 # For Poisson/distribution sampling
-    dataloader_random_seed: 42 # For dataset shuffling
-
+    min_duration_ms: 600000
+    max_duration_ms: 0
+    n_samples_to_issue: null
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
   load_pattern:
-    type: "concurrency" # NOT YET IMPLEMENTED
-    target_concurrency: 32 # Maintain 32 concurrent requests
-    # Note: target_qps is not used in this mode
-    # QPS will be determined by: concurrency / avg_latency
-
+    type: concurrency
+    target_qps: null
+    target_concurrency: 32
   client:
     num_workers: 4
-
+    record_worker_events: false
+    log_level: INFO
+    warmup_connections: -1
+    max_connections: -1
+    transport:
+      type: zmq
+      recv_buffer_size: 16777216
+      send_buffer_size: 16777216
+      io_threads: 4
+      worker_io_threads: 1
+      high_water_mark: 0
+      linger: -1
+      immediate: 1
+    stream_all_chunks: false
+    worker_initialization_timeout: 60.0
+    worker_graceful_shutdown_wait: 0.5
+    worker_force_kill_timeout: 0.5
+    max_idle_time: 4.0
+    min_required_connections: -1
+    worker_gc_mode: relaxed
   metrics:
     collect:
-      - "throughput" # Will be concurrency / avg_latency
-      - "latency" # p50, p90, p95, p99, p999 at this concurrency level
-      - "ttft"
-      - "tpot"
-
+    - throughput
+    - latency
+    - ttft
+    - tpot
 endpoint_config:
   endpoints:
-    - "http://localhost:8000"
+  - '<ENDPOINT_URL eg: http://localhost:8000>'
   api_key: null
-  api_type: "openai" # Options: openai or sglang
-  # How this differs from Poisson mode:
-  # - Poisson: Fixed QPS target, concurrency varies based on latency
-  # - Concurrency: Fixed N requests in-flight, QPS varies based on latency
-  # - Useful for: Measuring latency at specific concurrency levels
+  api_type: openai
+  report_dir: null
+  timeout: null
+  enable_cpu_affinity: true
```
