Merged
14 changes: 10 additions & 4 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -70,7 +70,8 @@ Multi-process, event-loop design optimized for throughput:
### CLI Modes

- **CLI mode** (`offline`/`online`): Parameters from command-line arguments
- **YAML mode** (`from-config`): All config from file, no CLI overrides except `--output`
- **YAML mode** (`from-config`): All config from file, no CLI overrides except `--timeout`
- **eval**: Accuracy evaluation — subcommand exists but is not yet implemented (raises `NotImplementedError`)

### Load Patterns

@@ -87,7 +88,9 @@ src/inference_endpoint/
├── exceptions.py # CLIError, ExecutionError, InputValidationError, SetupError
├── commands/ # benchmark, eval, probe, info, validate, init
│ ├── benchmark.py # Core benchmark command implementation
│ └── probe.py # Endpoint health checking
│ ├── eval.py # Accuracy evaluation command (not yet implemented)
│ ├── probe.py # Endpoint health checking
│ └── utils.py # info, validate, init command implementations
├── core/types.py # Query, QueryResult, StreamChunk, QueryStatus (msgspec Structs)
├── load_generator/
│ ├── session.py # BenchmarkSession - top-level orchestrator
@@ -104,7 +107,8 @@ src/inference_endpoint/
│ ├── config.py # HTTPClientConfig
│ ├── adapter_protocol.py # HttpRequestAdapter protocol
│ ├── accumulator_protocol.py # Response accumulation protocol
│ └── cpu_affinity.py # CPU pinning
│ ├── cpu_affinity.py # CPU pinning
│ └── utils.py # Port range helpers
├── async_utils/
│ ├── loop_manager.py # LoopManager (uvloop + eager_task_factory)
│ ├── event_publisher.py # Async event pub/sub
@@ -127,6 +131,7 @@ src/inference_endpoint/
│ ├── runtime_settings.py # RuntimeSettings dataclass
│ ├── ruleset_base.py # BenchmarkSuiteRuleset base
│ ├── ruleset_registry.py # Ruleset registry
│ ├── user_config.py # UserConfig dataclass for ruleset user overrides
│ ├── rulesets/mlcommons/ # MLCommons-specific rules, datasets, models
│ └── templates/ # YAML config templates (offline, online, eval, etc.)
├── openai/ # OpenAI-compatible API types and adapters
@@ -146,7 +151,8 @@ src/inference_endpoint/
└── utils/
├── logging.py # Logging setup
├── version.py # Version info
└── dataset_utils.py # Dataset utilities
├── dataset_utils.py # Dataset utilities
└── benchmark_httpclient.py # HTTP client throughput benchmarking utility

tests/
├── conftest.py # Shared fixtures (echo/oracle servers, datasets, settings)
12 changes: 5 additions & 7 deletions docs/CLI_QUICK_REFERENCE.md
@@ -159,8 +159,7 @@ inference-endpoint benchmark online \
--target-qps 100 \
--num-samples 10000 \
--workers 16 \
--output results.json \
--report-path production_report \
--report-dir production_report \
-v

# Or with duration (calculates samples from target_qps * duration)
Expand All @@ -172,8 +171,7 @@ inference-endpoint benchmark online \
--target-qps 100 \
--duration 300 \
--workers 16 \
--output results.json \
--report-path production_report \
--report-dir production_report \
-v
```

@@ -260,8 +258,8 @@ endpoint_config:

- All configuration from YAML file
- Reproducible, shareable configs
- No CLI parameter mixing (only --output auxiliary allowed)
- Example: `benchmark from-config --config file.yaml --output results.json`
- No CLI parameter mixing (only `--timeout` auxiliary allowed)
- Example: `benchmark from-config --config file.yaml --timeout 600`

## Tips

@@ -281,6 +279,6 @@
**Best Practices:**

- Share YAML configs for reproducible results across systems
- Use `--report-path` for detailed metrics with TTFT, TPOT, and token analysis
- Use `--report-dir` for detailed metrics with TTFT, TPOT, and token analysis
- Set `HF_TOKEN` environment variable for non-public models
- Use `--min-output-tokens` and `--max-output-tokens` to control output length
37 changes: 14 additions & 23 deletions docs/DEVELOPMENT.md
@@ -23,8 +23,7 @@ python3.12 -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate

# 3. Install development dependencies
pip install -e .
pip install -r requirements/base.txt
pip install -e ".[dev,test]"

# 4. Install pre-commit hooks
pre-commit install
@@ -49,7 +48,6 @@ inference-endpoint/
│ ├── metrics/ # Performance measurement and reporting
│ ├── openai/ # OpenAI API compatibility
│ ├── profiling/ # Performance profiling tools
│ ├── runtime/ # Runtime configuration
│ ├── testing/ # Test utilities (echo server, etc.)
│ └── utils/ # Common utilities
├── tests/ # Test suite
@@ -59,7 +57,6 @@ inference-endpoint/
│ └── datasets/ # Test datasets
├── docs/ # Documentation
├── examples/ # Usage examples
├── requirements/ # Dependency management
└── scripts/ # Utility scripts
```

@@ -112,7 +109,7 @@ class TestQuery:
assert query.prompt == "Test"
assert query.model == "test-model"

@pytest.mark.asyncio
@pytest.mark.asyncio(mode="strict")
async def test_async_operation(self):
"""Test async operations."""
# Your async test here
@@ -142,22 +139,18 @@ git commit --no-verify
### Code Formatting

```bash
# Format code with Black
black src/ tests/

# Sort imports with isort
isort src/ tests/
# Format code with ruff
ruff format src/ tests/

# Check formatting without changing files
black --check src/ tests/
isort --check-only src/ tests/
ruff format --check src/ tests/
```

### Linting

```bash
# Run flake8
flake8 src/ tests/
# Run ruff linter
ruff check src/ tests/

# Run mypy for type checking
mypy src/
@@ -195,7 +188,7 @@ When developing a new component:
3. **Implement the component** following the established patterns
4. **Add tests** in the corresponding `tests/unit/` directory
5. **Update main package** `__init__.py` if needed
6. **Add dependencies** to appropriate `requirements/` files
6. **Add dependencies** to `pyproject.toml` under `[project.dependencies]` or `[project.optional-dependencies]`

### 3. Testing Strategy

@@ -287,20 +280,18 @@ python -m pdb -m pytest test_file.py

### Adding Dependencies

1. **Base Dependencies** (`requirements/base.txt`): Required for package to function, development tools, linters, and pre-commit hooks
2. **Test Dependencies** (`requirements/test.txt`): Testing framework and utilities (pytest, pytest-asyncio, etc.)
Add dependencies to `pyproject.toml`:

### Updating Dependencies
- **Runtime dependencies**: `[project.dependencies]`
- **Optional groups** (dev, test, etc.): `[project.optional-dependencies]`
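
A sketch of the corresponding `pyproject.toml` sections follows. The tool names are drawn from elsewhere in this PR (ruff, mypy, pre-commit, pytest, pytest-asyncio, msgspec, uvloop); the exact entries and any version pins in the real file are assumptions.

```toml
# Illustrative layout only; the actual dependency lists and pins
# live in the repo's pyproject.toml.
[project]
name = "inference-endpoint"
dependencies = [
    "msgspec",
    "uvloop",
]

[project.optional-dependencies]
dev = ["ruff", "mypy", "pre-commit"]
test = ["pytest", "pytest-asyncio"]
```

With this layout, `pip install -e ".[dev,test]"` pulls in both optional groups alongside the runtime dependencies.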

Install after updating:

```bash
# Update all dependencies
pip install --upgrade -r requirements/base.txt
pip install -e ".[dev,test]"

# Check for outdated packages
pip list --outdated

# Update specific package
pip install --upgrade package-name
```

## 🚨 Troubleshooting
40 changes: 8 additions & 32 deletions docs/LOCAL_TESTING.md
@@ -81,8 +81,7 @@ inference-endpoint -v benchmark offline \
--dataset tests/datasets/dummy_1k.pkl \
--num-samples 5000 \
--workers 4 \
--output benchmark_results.json \
--report-path benchmark_report
--report-dir benchmark_report

# Note: Set HF_TOKEN environment variable if using non-public models
# export HF_TOKEN=your_huggingface_token
@@ -115,7 +114,7 @@ inference-endpoint -v benchmark online \
--dataset tests/datasets/dummy_1k.pkl \
--load-pattern poisson \
--target-qps 100 \
--report-path online_benchmark_report
--report-dir online_benchmark_report
```

**Expected Output:**
@@ -157,26 +156,7 @@ inference-endpoint benchmark offline \

### 6. View Results

```bash
# View benchmark results
cat benchmark_results.json | jq

# Example output:
{
"config": {
"endpoint": "http://localhost:8765",
"mode": null,
"qps": 10
},
"results": {
"total": 1000,
"successful": 1000,
"failed": 0,
"elapsed_time": 1.8,
"qps": 555.6
}
}
```
When run with `--report-dir`, a directory is created containing benchmark metrics files (JSON/CSV) with detailed QPS, latency, TTFT, and TPOT data.
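
As a rough sketch, consuming those files afterwards is plain JSON reading. The file name (`metrics.json`) and field names below are assumptions for illustration, not the tool's actual output schema; a stand-in file is written first so the snippet is self-contained.

```python
import json
from pathlib import Path

# Stand-in for a metrics file the benchmark would write under --report-dir.
# File name and keys are hypothetical, not the real schema.
report_dir = Path("benchmark_report")
report_dir.mkdir(exist_ok=True)
sample = {"qps": 555.6, "ttft_p50_ms": 12.3, "tpot_p50_ms": 4.5}
(report_dir / "metrics.json").write_text(json.dumps(sample))

# Reading the report back for post-processing:
loaded = json.loads((report_dir / "metrics.json").read_text())
print(f"QPS: {loaded['qps']}, TTFT p50: {loaded['ttft_p50_ms']} ms")
```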

### 7. Stop the Echo Server

@@ -262,13 +242,9 @@ inference-endpoint -v benchmark offline \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.pkl \
--workers 4 \
--output benchmark_results.json \
--report-path benchmark_report

# 6. Check results
cat benchmark_results.json | jq '.results'
--report-dir benchmark_report

# 7. Stop server
# 6. Stop server
pkill -f echo_server
```

Expand All @@ -280,7 +256,7 @@ inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.pkl \
--report-path offline_report
--report-dir offline_report

# Online (Poisson distribution)
inference-endpoint benchmark online \
Expand All @@ -289,7 +265,7 @@ inference-endpoint benchmark online \
--dataset tests/datasets/dummy_1k.pkl \
--load-pattern poisson \
--target-qps 500 \
--report-path online_report
--report-dir online_report

# With explicit sample count
inference-endpoint benchmark offline \
@@ -339,5 +315,5 @@ inference-endpoint benchmark online \
**Advanced:**

- Streaming: `auto` (default), `on`, or `off` - auto enables for online, disables for offline
- Use `--report-path` for detailed metrics reports with TTFT, TPOT, and token analysis
- Use `--report-dir` for detailed metrics reports with TTFT, TPOT, and token analysis
- Dataset format auto-inferred from file extension
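
Extension-based inference can be pictured with the sketch below. The mapping table and function are illustrative assumptions; the real logic lives in the tool's `dataset_utils.py` and may differ.

```python
from pathlib import Path

# Hypothetical extension-to-format table, for illustration only.
_FORMATS = {".pkl": "pickle", ".json": "json", ".jsonl": "jsonl", ".csv": "csv"}

def infer_dataset_format(path: str) -> str:
    """Guess a dataset format from the file extension (case-insensitive)."""
    suffix = Path(path).suffix.lower()
    try:
        return _FORMATS[suffix]
    except KeyError:
        raise ValueError(f"Unrecognized dataset extension: {suffix!r}")

print(infer_dataset_format("tests/datasets/dummy_1k.pkl"))  # pickle
```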
2 changes: 1 addition & 1 deletion examples/07_GPT-OSS-120B_SGLang_Example/README.md
@@ -67,7 +67,7 @@ If you already have the model weights or prefer a direct approach, follow the [i
LiveCodeBench has a few security concerns and dependency conflicts, so it is recommended to run LiveCodeBench via the
containerized workflow.

Follow the instructions in the [LiveCodeBench README](../../src/inference_endpoint/dataset_manager/predefined/livecodebench/README.md#running-the-container)
Follow the instructions in the [LiveCodeBench README](../../src/inference_endpoint/evaluation/livecodebench/README.md#running-the-container)

#### Non-containerized run (NOT RECOMMENDED)
