# MLPerf Inference DeepSeek Reference Implementation

## Model Download

> **Model**: [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) (revision: `56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad`)

- The DeepSeek-R1 model is downloaded automatically as part of setup.
- Checkpoint conversion is done transparently when needed.

## Dataset Download

### Preprocessed

You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket.

To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows).
To install Rclone on Linux/macOS/BSD systems, run:
```
sudo -v ; curl https://rclone.org/install.sh | sudo bash
```
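
To confirm the installation, you can print the installed version (a standard Rclone command):

```bash
# Check that rclone is available on the PATH
rclone version
```
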
Once Rclone is installed, run the following command to authenticate with the bucket:
```
rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
```
You can then navigate in the terminal to your desired download directory and run the following command to download the dataset:

```
rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl ./ -P
```
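
For example (the target directory below is illustrative), you can download into a dedicated data directory and confirm that the file arrived:

```bash
# Download the preprocessed dataset into ./data and verify the file is present
mkdir -p ./data && cd ./data
rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl ./ -P
ls -lh mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl
```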

### Calibration

Download and install Rclone as described in the previous section.

Then navigate in the terminal to your desired download directory and run the following command to download the calibration dataset:

```
rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl ./ -P
```

## Docker

The MLPerf DeepSeek reference implementation includes a comprehensive Docker launch system that supports multiple backends and provides advanced features such as user management, persistent storage, and flexible configuration.

### Launch a Backend-Specific Container

Launch a Docker container with your preferred backend:

```bash
# Launch PyTorch backend
./launch_docker.sh --backend pytorch

# Launch vLLM backend
./launch_docker.sh --backend vllm

# Launch SGLang backend
./launch_docker.sh --backend sglang

# See launch_docker.sh for the full list of arguments
./launch_docker.sh --backend vllm --gpu-count 2 --extra-mounts "/data:/data,/models:/models" --local-user 0
```
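
For example, to make a locally downloaded dataset visible inside the container, the `--extra-mounts` flag shown above can be combined with a backend selection (the host path is illustrative):

```bash
# Mount the directory containing the downloaded .pkl files into the container at /data
./launch_docker.sh --backend vllm --extra-mounts "/path/to/deepseek_data:/data"
```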

### Available Backends

- **pytorch**: via [deepseek-ai/DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) (reference implementation by DeepSeek-AI)
- **vllm**: vLLM's LLM API-based inference
- **sglang**: SGLang's OpenAI endpoint-based inference

## Backend-Specific Setup

After launching any Docker container, run the setup script, which automatically detects your backend:

```bash
# Automatic backend detection and setup
setup.sh
```

The setup script creates a virtual environment and configures it differently based on the backend:

#### All Backends
- The virtual environment is **activated** after `setup.sh` completes
- Re-activate the backend-specific venv in a new shell with `source .venv_[pytorch|vllm|sglang]/bin/activate` (see the sketch after this list)
- All subsequent commands should be run inside the virtual environment
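
A minimal sketch of re-activating a backend venv in a new shell inside the container, using the vLLM venv name from the list above:

```bash
# Activate the vLLM virtual environment created by setup.sh and confirm it is active
source .venv_vllm/bin/activate
which python
```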

## Running Evaluations

### PyTorch Backend (Distributed)

> ⚠️ **IMPORTANT NOTE**: The PyTorch reference implementation takes approximately 8 days to run on an H200x8 system. This is because the large maximum output sequence length (max-OSL of 32K) limits concurrency (max-BS of 16), and the PyTorch forward and decode logic is unoptimized.

The PyTorch backend uses distributed execution with `torchrun` and `run_eval_mpi.py`:

```bash
# Regular inference evaluation
(.venv_pytorch) $ torchrun --nproc_per_node=8 run_eval_mpi.py --input-file <input_dataset>.pkl --output-file pytorch_output.pkl --num-samples 32

# MLPerf performance benchmarks
(.venv_pytorch) $ torchrun --nproc_per_node=8 run_mlperf_mpi.py --mode offline --input-file <input_dataset>.pkl --output-dir mlperf_results

# MLPerf accuracy mode
(.venv_pytorch) $ torchrun --nproc_per_node=8 run_mlperf_mpi.py --mode offline --accuracy --input-file <input_dataset>.pkl --output-dir mlperf_results
```

### vLLM and SGLang Backends

For vLLM and SGLang, use single-process execution via `run_eval.py`:

```bash
# Regular inference evaluation
(.venv_vllm) $ python run_eval.py --input-file <input_dataset>.pkl
(.venv_sglang) $ python run_eval.py --input-file <input_dataset>.pkl

# MLPerf performance benchmarks
(.venv_vllm) $ python run_mlperf.py --mode offline --input-file <input_dataset>.pkl --output-dir mlperf_results
(.venv_sglang) $ python run_mlperf.py --mode server --input-file <input_dataset>.pkl --output-dir mlperf_results
```

## MLPerf Inference Support

The reference implementation includes full support for MLPerf inference benchmarks through a System Under Test (SUT) wrapper that integrates with MLPerf LoadGen.

### Running MLPerf Benchmarks

#### Offline Scenario
```bash
(.venv_BACKEND) $ python run_mlperf.py \
    --mode offline \
    --input-file <input_dataset>.pkl \
    --output-dir mlperf_results
```

#### Server Scenario
```bash
(.venv_BACKEND) $ python run_mlperf.py \
    --mode server \
    --input-file <input_dataset>.pkl \
    --output-dir mlperf_results
```

#### PyTorch Backend for MLPerf

The PyTorch backend uses distributed execution with `torchrun` and `run_mlperf_mpi.py`:

```bash
# PyTorch MLPerf offline scenario
(.venv_pytorch) $ torchrun --nproc_per_node=8 run_mlperf_mpi.py \
    --mode offline \
    --input-file <input_dataset>.pkl \
    --output-dir mlperf_results
```

### MLPerf Command Line Options

| Option         | Description                    | Default          |
| -------------- | ------------------------------ | ---------------- |
| `--mode`       | Scenario mode (offline/server) | `offline`        |
| `--accuracy`   | Run accuracy test              | `False`          |
| `--output-dir` | Output directory for results   | `mlperf_results` |

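For example, combining the options above, an accuracy run in the offline scenario looks like this (the venv name is a placeholder for whichever backend you are using):

```bash
(.venv_BACKEND) $ python run_mlperf.py \
    --mode offline \
    --accuracy \
    --input-file <input_dataset>.pkl \
    --output-dir mlperf_results
```
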
### Backend Support Matrix

The following table shows which backends support the different evaluation and MLPerf operations:

| Backend     | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` |
| ----------- | ------------- | ------------------------------ | ----------------------------- |
| pytorch-fp8 | x             | x                              |                               |
| vllm-fp8    | x             | x                              |                               |
| sglang-fp8  | x             | x                              | x                             |

> **Note**: For the PyTorch backend, use the `_mpi` versions with `torchrun`. For the vLLM and SGLang backends, use the single-process versions without `_mpi`.

## Accuracy Evaluation

Accuracy evaluation is handled uniformly across all backends:

```bash
# within container, with virtualenv activated
(.venv_BACKEND) $ python3 eval_accuracy.py --input-file <input_file>.pkl
```
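
For example, to score the output of the PyTorch evaluation run shown earlier (the file name comes from that example):

```bash
(.venv_pytorch) $ python3 eval_accuracy.py --input-file pytorch_output.pkl
```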

### Reference Evals

PyTorch reference scores:

```
Evaluation Results: {
  "mean-accuracy": 81.67730173199635,
  "mean-output-tok-len": 4043.449863263446,
  "num-samples": 4388
}
```