Commit 35d4529

Merge branch 'master' into system_check
2 parents: 4c6478d + 27db053


44 files changed: +3472 −1114 lines

.github/workflows/build_wheels.yml

Lines changed: 2 additions & 2 deletions
```diff
@@ -124,10 +124,10 @@ jobs:
           name: wheels-macos-latest
           path: wheels
       # Download the built wheels from macOS-13 (x86)
-      - name: Download macOS-13 (x86) wheels
+      - name: Download macOS-15 (x86) wheels
         uses: actions/download-artifact@v4
         with:
-          name: wheels-macos-13
+          name: wheels-macos-15
           path: wheels
       # Download the built wheels from Windows
       - name: Download Windows wheels
```
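For context, a `download-artifact` step can only fetch an artifact whose name matches what the producing job uploaded. The fragment below is a hypothetical sketch of the matching upload step (the step name, job layout, and paths are assumptions; only the artifact name `wheels-macos-15` comes from the diff above):

```yaml
      # Hypothetical upload step in the macOS build job; the artifact name must
      # equal the one used by the "Download macOS-15 (x86) wheels" step above.
      - name: Upload macOS-15 (x86) wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-15
          path: wheels
```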

.gitmodules

Lines changed: 3 additions & 0 deletions
```diff
@@ -10,3 +10,6 @@
 [submodule "language/deepseek-r1/submodules/LiveCodeBench"]
 	path = language/deepseek-r1/submodules/LiveCodeBench
 	url = https://github.com/LiveCodeBench/LiveCodeBench
+[submodule "text_to_video/wan2.2-t2v-14b/submodules/VBench"]
+	path = text_to_video/wan2.2-t2v-14b/submodules/VBench
+	url = https://github.com/Vchitect/VBench
```

docs/submission/index.md

Lines changed: 233 additions & 218 deletions
Large diffs are not rendered by default.

docs/submission/submission-cli.md

Lines changed: 231 additions & 0 deletions
---
hide:
- toc
---

Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the proposal slide for Common Automation for MLPerf Inference Submission Generation through MLCFlow.

Please refer to the [installation page](site:inference/install/) to install MLCFlow for automating the submission generation. In a typical development environment, `pip install mlc-scripts` should be enough.

=== "Custom automation based MLPerf results"

    If you have not followed the `mlcr` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. You can see real examples of the expected folder structure [here](https://github.com/mlcommons/inference/tree/submission-generation-examples).

    ```
    └── System description ID(SUT Name)
        ├── system_meta.json
        └── Benchmark
            └── Scenario
                ├── Performance
                |   └── run_1 run for all scenarios
                |       ├── mlperf_log_summary.txt
                |       └── mlperf_log_detail.txt
                ├── Accuracy
                |   ├── mlperf_log_summary.txt
                |   ├── mlperf_log_detail.txt
                |   ├── mlperf_log_accuracy.json
                |   └── accuracy.txt
                ├── Compliance_Test_ID
                |   ├── Performance
                |   |   └── run_x/#1 run for all scenarios
                |   |       ├── mlperf_log_summary.txt
                |   |       └── mlperf_log_detail.txt
                |   ├── Accuracy  # for TEST01 only
                |   |   ├── baseline_accuracy.txt (if test fails in deterministic mode)
                |   |   ├── compliance_accuracy.txt (if test fails in deterministic mode)
                |   |   ├── mlperf_log_accuracy.json
                |   |   └── accuracy.txt
                |   ├── verify_performance.txt
                |   └── verify_accuracy.txt  # for TEST01 only
                ├── user.conf
                └── measurements.json
    ```

    <details>
    <summary>Click here if you are submitting in the open division</summary>

    * A `model_mapping.json` file should be included inside the SUT folder; it maps each custom model's full name to the official model name. The format of the JSON file is:

    ```json
    {
        "custom_model_name_for_model1": "official_model_name_for_model1",
        "custom_model_name_for_model2": "official_model_name_for_model2"
    }
    ```
    </details>

=== "MLC automation based results"

    If you have followed the `mlcr` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated in the `mlc cache` folder. The following command can be used to browse the structure of the inference results folder generated by MLCFlow.

    ### Get results folder structure

    === "Unix Terminal"

        ```bash
        mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree
        ```

    === "Windows PowerShell"

        ```powershell
        mlc find cache --tags=get,mlperf,inference,results,dir | ForEach-Object { Get-ChildItem -Recurse $_ }
        ```

Once all the results across all the models are ready, you can use the below section to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1).

## Generate submission folder

The submission generation flow is explained in the diagram below.

```mermaid
flowchart LR
    subgraph Generation [Submission Generation SUT1]
      direction TB
      A[populate system details] --> B[generate submission structure]
      B --> C[truncate-accuracy-logs]
      C --> D{Infer low latency results <br>and/or<br> filter out invalid results}
      D -- yes --> E[preprocess-mlperf-inference-submission]
      D -- no --> F[run-mlperf-inference-submission-checker]
      E --> F
    end
    Input((Results SUT1)) --> Generation
    Generation --> Output((Submission Folder <br> SUT1))
```

### Command to generate submission folder

```bash
mlcr generate,inference,submission \
   --clean \
   --preprocess_submission=yes \
   --run_checker=yes \
   --submitter=MLCommons \
   --division=closed \
   --env.MLC_DETERMINE_MEMORY_CONFIGURATION=yes \
   --quiet
```

!!! tip
    * Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems).

    * Use `--submitter=<Your name>` if your organization is an official MLCommons member and would like to submit under your organization.

    * Use the `--hw_notes_extra` option to add additional notes, e.g. `--hw_notes_extra="Result taken by NAME"`.

    * Use the `--results_dir` option to specify the results folder. It is automatically taken from the MLC cache for MLPerf automation based runs.

    * Use the `--submission_dir` option to specify the submission folder. (You can avoid this if you are pushing to GitHub or only running a single SUT; MLC will then use its cache folder.)

    * Use `--division=open` for an open division submission.

    * Use the `--category` option to specify the category for which the submission is generated (datacenter/edge). By default, the category is taken from the `system_meta.json` file located in the SUT root directory.

    * Use `--submission_base_dir` to specify the directory to which the outputs from the preprocess submission script and the final submission are added. There is no need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`.


If MLPerf results are collected on multiple systems, the same process needs to be repeated on each of them. Once we have submission folders on all the SUTs, we need to sync them into a single submission folder.

=== "Sync Locally"

    If you have results on multiple systems, you need to merge them onto one system. You can use `rsync` for this. For example, the command below will sync the submission folder from SUT2 to the one in SUT1.

    ```
    rsync -avz username@host1:<path_to_submission_folder2>/ <path_to_submission_folder1>/
    ```

    The same needs to be repeated for all other SUTs so that we have the full submission on SUT1.

    ```mermaid
    flowchart LR
      subgraph SUT1 [Submission Generation SUT1]
        A[Submission Folder SUT1]
      end
      subgraph SUT2 [Submission Generation SUT2]
        B[Submission Folder SUT2]
      end
      subgraph SUT3 [Submission Generation SUT3]
        C[Submission Folder SUT3]
      end
      subgraph SUTN [Submission Generation SUTN]
        D[Submission Folder SUTN]
      end
      SUT2 --> SUT1
      SUT3 --> SUT1
      SUTN --> SUT1
    ```

=== "Sync via a GitHub repo"

    If you are collecting results across multiple systems, you can generate separate submissions, aggregate all of them in a GitHub repository (which can be private), and use it to generate a single tarball that can be uploaded to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission).

    Run the following command after **replacing `--repo_url` with your GitHub repository URL**.

    ```bash
    mlcr push,github,mlperf,inference,submission \
       --repo_url=https://github.com/mlcommons/mlperf_inference_submissions_v5.0 \
       --commit_message="Results on <HW name> added by <Name>" \
       --quiet
    ```

    > **Note:** The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument.

    <details>
    <summary>Click to see the sample output</summary>
    ```
    [2025-07-23 16:36:56,399 module.py:2197 INFO] -
    Path to the locally synced submission directory: mysubmissions/mlperf_submission
    ```
    </details>

    ```mermaid
    flowchart LR
      subgraph SUT1 [Submission Generation SUT1]
        A[Submission Folder SUT1]
      end
      subgraph SUT2 [Submission Generation SUT2]
        B[Submission Folder SUT2]
      end
      subgraph SUT3 [Submission Generation SUT3]
        C[Submission Folder SUT3]
      end
      subgraph SUTN [Submission Generation SUTN]
        D[Submission Folder SUTN]
      end
      SUT2 -- git sync and push --> G[Github Repo]
      SUT3 -- git sync and push --> G[Github Repo]
      SUTN -- git sync and push --> G[Github Repo]
      SUT1 -- git sync and push --> G[Github Repo]
    ```

## Upload the final submission

!!! warning
    If you are using GitHub to consolidate your results, make sure that you have run the [`push-to-github` command](#__tabbed_2_2) on the same system, to ensure results are synced as-is on the GitHub repository.

Once you have all the results on one system, you can upload them to the MLCommons submission server as follows:

=== "via CLI"

    Run the following command, which will run the submission checker and upload the results to the MLCommons submission server.

    ```
    mlcr run,mlperf,submission,checker,inference \
       --submitter_id=<> \
       --submission_dir=<Path to the locally synced submission directory> --quiet
    ```

=== "via Browser"

    Run the following command to generate the final submission tar file and then upload it to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission).

    ```
    mlcr run,mlperf,submission,checker,inference \
       --submission_dir=<Path to the submission folder> \
       --tar=yes \
       --submission_tar_file=mysubmission.tar.gz --quiet
    ```

```mermaid
flowchart LR
    subgraph SUT [Combined Submissions]
      A[Combined Submission Folder in SUT1]
    end
    SUT --> B[Run submission checker]
    B --> C[Upload to MLC Submission server]
    C --> D[Receive validation email]
```

<!--Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop: Streamlining your MLPerf Inference results using CM.-->
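As an illustrative aid (not part of the new doc or the MLC tooling), the results layout that this page expects from custom automation can be scaffolded with a short script. The SUT, benchmark, and scenario names below are hypothetical placeholders, and the compliance-test subtree is omitted for brevity:

```python
from pathlib import Path

def scaffold_results(root: str, sut: str, benchmark: str, scenario: str) -> Path:
    """Create the bare results skeleton described in submission-cli.md.

    Produces: <root>/<sut>/system_meta.json and, per benchmark/scenario,
    Performance/run_1, Accuracy, user.conf, and measurements.json.
    """
    base = Path(root) / sut
    base.mkdir(parents=True, exist_ok=True)
    (base / "system_meta.json").touch()

    scenario_dir = base / benchmark / scenario
    perf = scenario_dir / "Performance" / "run_1"
    acc = scenario_dir / "Accuracy"
    perf.mkdir(parents=True, exist_ok=True)
    acc.mkdir(parents=True, exist_ok=True)

    # Loadgen output logs expected by the submission checker
    for name in ("mlperf_log_summary.txt", "mlperf_log_detail.txt"):
        (perf / name).touch()
    for name in ("mlperf_log_summary.txt", "mlperf_log_detail.txt",
                 "mlperf_log_accuracy.json", "accuracy.txt"):
        (acc / name).touch()

    # Scenario-level files from the documented tree
    (scenario_dir / "user.conf").touch()
    (scenario_dir / "measurements.json").touch()
    return base

if __name__ == "__main__":
    out = scaffold_results("results", "my_sut", "resnet50", "Offline")
    print((out / "system_meta.json").exists())
```

The empty files are placeholders only; in a real run they are produced by loadgen and the accuracy scripts.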

loadgen/README_BUILD.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -12,7 +12,7 @@
 pip install absl-py numpy
 git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
 cd mlperf_inference/loadgen
-CFLAGS="-std=c++14 -O3" python -m pip install .
+python -m pip install .

 This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo.
```

loadgen/VERSION.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-6.0.1
+6.0.3
```

loadgen/bindings/python_api.cc

Lines changed: 10 additions & 6 deletions
```diff
@@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::server_num_issue_query_threads)
       .def_readwrite("offline_expected_qps",
                      &TestSettings::offline_expected_qps)
+      .def_readwrite("sample_concatenate_permutation",
+                     &TestSettings::sample_concatenate_permutation)
       .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
       .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
       .def_readwrite("min_query_count", &TestSettings::min_query_count)
@@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::accuracy_log_rng_seed)
       .def_readwrite("accuracy_log_probability",
                      &TestSettings::accuracy_log_probability)
+      .def_readwrite("accuracy_log_sampling_target",
+                     &TestSettings::accuracy_log_sampling_target)
+      .def_readwrite("test05", &TestSettings::test05)
+      .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
+      .def_readwrite("test05_sample_index_rng_seed",
+                     &TestSettings::test05_sample_index_rng_seed)
+      .def_readwrite("test05_schedule_rng_seed",
+                     &TestSettings::test05_schedule_rng_seed)
       .def_readwrite("print_timestamps", &TestSettings::print_timestamps)
       .def_readwrite("performance_issue_unique",
                      &TestSettings::performance_issue_unique)
@@ -333,12 +343,6 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
                      &TestSettings::performance_issue_same_index)
       .def_readwrite("performance_sample_count_override",
                      &TestSettings::performance_sample_count_override)
-      .def_readwrite("test05", &TestSettings::test05)
-      .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
-      .def_readwrite("test05_sample_index_rng_seed",
-                     &TestSettings::test05_sample_index_rng_seed)
-      .def_readwrite("test05_schedule_rng_seed",
-                     &TestSettings::test05_schedule_rng_seed)
       .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
       .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
       .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
```

loadgen/mlperf.conf

Lines changed: 8 additions & 2 deletions
```diff
@@ -27,6 +27,7 @@ pointpainting.*.performance_sample_count_override = 1024
 deepseek-r1.*.performance_sample_count_override = 4388
 deepseek-r1-interactive.*.performance_sample_count_override = 4388
 whisper.*.performance_sample_count_override = 1633
+qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289
 # set to 0 to let entire sample set to be performance sample
 3d-unet.*.performance_sample_count_override = 0
@@ -69,7 +70,7 @@ llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
 deepseek-r1.*.sample_concatenate_permutation = 1
 deepseek-r1-interactive.*.sample_concatenate_permutation = 1
 whisper.*.sample_concatenate_permutation = 1
-
+qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1
 *.Server.target_latency = 10
 *.Server.target_latency_percentile = 99
 *.Server.target_duration = 0
@@ -94,7 +95,9 @@ llama3_1-8b-interactive.*.use_token_latencies = 1
 deepseek-r1.*.use_token_latencies = 1
 deepseek-r1-interactive.*.use_token_latencies = 1
 whisper.*.use_token_latencies = 1
-
+# For the VLM benchmark, the model response is relatively short, therefore we track
+# end-to-end latency instead of token latencies.
+qwen3-vl-235b-a22b.*.use_token_latencies = 0
 # gptj benchmark infers token latencies
 gptj.*.infer_token_latencies = 1
 gptj.*.token_latency_scaling_factor = 69
@@ -140,6 +143,8 @@ deepseek-r1-interactive.Server.target_latency = 0
 deepseek-r1-interactive.Server.ttft_latency = 1500
 deepseek-r1-interactive.Server.tpot_latency = 15

+qwen3-vl-235b-a22b.Server.target_latency = 12000
+
 *.Offline.target_latency_percentile = 90
 *.Offline.min_duration = 600000
@@ -164,6 +169,7 @@ mixtral-8x7b.Offline.min_query_count = 15000
 rgat.Offline.min_query_count = 788379
 deepseek-r1.Offline.min_query_count = 4388
 whisper.Offline.min_query_count = 1633
+qwen3-vl-235b-a22b.Offline.min_query_count = 48289

 # These fields should be defined and overridden by user.conf.
 *.SingleStream.target_latency = 10
```
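The keys in `mlperf.conf` follow a `benchmark.scenario.parameter = value` pattern, where `*` wildcards the benchmark or scenario. As an illustration only (this is not loadgen's actual parser), a minimal sketch of how a specific benchmark/scenario pair resolves such overrides, with later matching lines winning:

```python
from fnmatch import fnmatch

def parse_conf(text):
    """Parse mlperf.conf-style lines into (model, scenario, key, value) tuples.

    Simplification: assumes model names contain no dots, which holds for the
    model IDs in this file.
    """
    entries = []
    for line in text.splitlines():
        line = line.split("#", 1)[0].strip()  # drop comments
        if not line or "=" not in line:
            continue
        lhs, value = (part.strip() for part in line.split("=", 1))
        model, scenario, key = lhs.split(".", 2)
        entries.append((model, scenario, key, value))
    return entries

def lookup(entries, model, scenario, key):
    """Return the last matching value; '*' in the conf matches anything."""
    result = None
    for m, s, k, v in entries:
        if k == key and fnmatch(model, m) and fnmatch(scenario, s):
            result = v
    return result

# Small excerpt of the settings added/changed in this commit
conf = """
whisper.*.use_token_latencies = 1
# VLM responses are short, so end-to-end latency is tracked instead
qwen3-vl-235b-a22b.*.use_token_latencies = 0
qwen3-vl-235b-a22b.Offline.min_query_count = 48289
*.Offline.min_duration = 600000
"""

entries = parse_conf(conf)
print(lookup(entries, "qwen3-vl-235b-a22b", "Offline", "min_query_count"))  # -> 48289
print(lookup(entries, "qwen3-vl-235b-a22b", "Server", "use_token_latencies"))  # -> 0
```

Note how the `qwen3-vl-235b-a22b.*` line matches any scenario, while the `Offline`-scoped line applies only there.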
