Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
5b45943
Enable VllmDeployer to fail fast if the underlying vLLM process fails.
wangshangsam Dec 10, 2025
bad5387
example slurm script for submitting jobs
wangshangsam Dec 10, 2025
08b32cc
fix slurm scripts
wangshangsam Dec 11, 2025
1cdf563
small fix
wangshangsam Dec 11, 2025
d9caddc
[Automated Commit] Format Codebase
github-actions[bot] Dec 11, 2025
6f62339
Update the readme about the example slurm scripts.
wangshangsam Dec 11, 2025
88b34a4
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 11, 2025
59dc167
Change the default endpoint startup timeout to 1 hour in case someone…
wangshangsam Dec 11, 2025
d9c0bcc
change server's expected qps and target latency
johncalesp Dec 11, 2025
a75dc68
Change the default dataset repo_id to the new name of the public dataset
wangshangsam Dec 12, 2025
866eba9
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
a8a8870
evaluate the json file with multiprocess
johncalesp Dec 12, 2025
9f3b52e
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
0342909
change default server_target_latency to 12
wangshangsam Dec 12, 2025
7576e0c
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 12, 2025
d10d634
revert evaluation changeS
johncalesp Dec 12, 2025
e75a34a
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
2209ae6
update slurm script
wangshangsam Dec 14, 2025
1450143
update slurm script
wangshangsam Dec 15, 2025
6a5f17d
revert evaluation.py changes after analysing the discrepancy in is_se…
johncalesp Dec 15, 2025
d5d2cc8
[Automated Commit] Format Codebase
github-actions[bot] Dec 15, 2025
f72d82d
linting
wangshangsam Dec 16, 2025
0e731ed
[Automated Commit] Format Codebase
github-actions[bot] Dec 16, 2025
4771f13
lock in model and dataset SHA
wangshangsam Dec 16, 2025
55a8cf1
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 16, 2025
d4d6f78
[Automated Commit] Format Codebase
github-actions[bot] Dec 16, 2025
c0d0925
Specify model quality target and server target latency in the README
wangshangsam Dec 16, 2025
e2adf60
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 16, 2025
7dabbfe
Update loadgen/mlperf.conf
wangshangsam Dec 18, 2025
423cea4
aligning TestSettings' C++ code with its Python binding
wangshangsam Dec 18, 2025
817f0e8
[Automated Commit] Format Codebase
github-actions[bot] Dec 18, 2025
9d3b36b
remove ttft and tpot from mlperf.conf
wangshangsam Dec 18, 2025
29e7c1a
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 18, 2025
95f4179
Enable CLI to take in user.conf
wangshangsam Dec 18, 2025
5370ecd
[Automated Commit] Format Codebase
github-actions[bot] Dec 18, 2025
f9d983f
readme
wangshangsam Dec 19, 2025
897894d
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 19, 2025
8f8e886
Merge branch 'master' into wangshangsam/fix-req-timeout
wangshangsam Dec 19, 2025
f8e6bf8
readme
wangshangsam Dec 19, 2025
8bfbeb9
rename vl2l -> q3vl
wangshangsam Dec 19, 2025
b589ddd
[Automated Commit] Format Codebase
github-actions[bot] Dec 19, 2025
3b065ee
empty
wangshangsam Dec 19, 2025
eb65590
rerun ci
wangshangsam Dec 19, 2025
38ff6f9
rerun ci
wangshangsam Dec 19, 2025
c1534ae
Introduce sampling parameters
wangshangsam Dec 20, 2025
472471f
[Automated Commit] Format Codebase
github-actions[bot] Dec 20, 2025
e9117a7
Merge branch 'master' into wangshangsam/fix-req-timeout
wangshangsam Dec 22, 2025
1b04e7b
[Automated Commit] Format Codebase
github-actions[bot] Dec 22, 2025
4c66f1c
empty
wangshangsam Dec 22, 2025
69c8b08
move CFLAGS="-std=c++14 -O3" into extra_compile_args of Pybind11Exten…
wangshangsam Dec 22, 2025
c24d286
[Automated Commit] Format Codebase
github-actions[bot] Dec 22, 2025
f24a6a9
enable specifying loadgen source in the Dockerfile
wangshangsam Dec 22, 2025
bc1449a
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 22, 2025
8a517cd
update slurm scripts
wangshangsam Dec 22, 2025
deb6dd0
Maintain None as the default value for the sampling params
wangshangsam Dec 22, 2025
3e55d26
[Automated Commit] Format Codebase
github-actions[bot] Dec 22, 2025
8fa86ab
update readme
wangshangsam Dec 22, 2025
f859932
Merge branch 'master' into wangshangsam/fix-req-timeout
wangshangsam Dec 22, 2025
8c600ce
[Automated Commit] Format Codebase
github-actions[bot] Dec 22, 2025
ff8a727
empty
wangshangsam Dec 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion loadgen/README_BUILD.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
pip install absl-py numpy
git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
cd mlperf_inference/loadgen
CFLAGS="-std=c++14 -O3" python -m pip install .
python -m pip install .

This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo.

Expand Down
16 changes: 10 additions & 6 deletions loadgen/bindings/python_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::server_num_issue_query_threads)
.def_readwrite("offline_expected_qps",
&TestSettings::offline_expected_qps)
.def_readwrite("sample_concatenate_permutation",
&TestSettings::sample_concatenate_permutation)
.def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
.def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
.def_readwrite("min_query_count", &TestSettings::min_query_count)
Expand All @@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::accuracy_log_rng_seed)
.def_readwrite("accuracy_log_probability",
&TestSettings::accuracy_log_probability)
.def_readwrite("accuracy_log_sampling_target",
&TestSettings::accuracy_log_sampling_target)
.def_readwrite("test05", &TestSettings::test05)
.def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
.def_readwrite("test05_sample_index_rng_seed",
&TestSettings::test05_sample_index_rng_seed)
.def_readwrite("test05_schedule_rng_seed",
&TestSettings::test05_schedule_rng_seed)
.def_readwrite("print_timestamps", &TestSettings::print_timestamps)
.def_readwrite("performance_issue_unique",
&TestSettings::performance_issue_unique)
Expand All @@ -333,12 +343,6 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::performance_issue_same_index)
.def_readwrite("performance_sample_count_override",
&TestSettings::performance_sample_count_override)
.def_readwrite("test05", &TestSettings::test05)
.def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
.def_readwrite("test05_sample_index_rng_seed",
&TestSettings::test05_sample_index_rng_seed)
.def_readwrite("test05_schedule_rng_seed",
&TestSettings::test05_schedule_rng_seed)
.def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
.def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
.def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
Expand Down
10 changes: 8 additions & 2 deletions loadgen/mlperf.conf
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pointpainting.*.performance_sample_count_override = 1024
deepseek-r1.*.performance_sample_count_override = 4388
deepseek-r1-interactive.*.performance_sample_count_override = 4388
whisper.*.performance_sample_count_override = 1633
qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289
# Set to 0 to use the entire sample set as the performance sample set.
3d-unet.*.performance_sample_count_override = 0

Expand Down Expand Up @@ -69,7 +70,7 @@ llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
deepseek-r1.*.sample_concatenate_permutation = 1
deepseek-r1-interactive.*.sample_concatenate_permutation = 1
whisper.*.sample_concatenate_permutation = 1

qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1
*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
*.Server.target_duration = 0
Expand All @@ -94,7 +95,9 @@ llama3_1-8b-interactive.*.use_token_latencies = 1
deepseek-r1.*.use_token_latencies = 1
deepseek-r1-interactive.*.use_token_latencies = 1
whisper.*.use_token_latencies = 1

# For the VLM benchmark, the model response is relatively short, therefore we track
# end-to-end latency instead of token latencies.
qwen3-vl-235b-a22b.*.use_token_latencies = 0
# gptj benchmark infers token latencies
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69
Expand Down Expand Up @@ -140,6 +143,8 @@ deepseek-r1-interactive.Server.target_latency = 0
deepseek-r1-interactive.Server.ttft_latency = 1500
deepseek-r1-interactive.Server.tpot_latency = 15

qwen3-vl-235b-a22b.Server.target_latency = 12000

*.Offline.target_latency_percentile = 90
*.Offline.min_duration = 600000

Expand All @@ -164,6 +169,7 @@ mixtral-8x7b.Offline.min_query_count = 15000
rgat.Offline.min_query_count = 788379
deepseek-r1.Offline.min_query_count = 4388
whisper.Offline.min_query_count = 1633
qwen3-vl-235b-a22b.Offline.min_query_count = 48289

# These fields should be defined and overridden by user.conf.
*.SingleStream.target_latency = 10
Expand Down
47 changes: 24 additions & 23 deletions loadgen/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
# and binaries. Use one of the gn build targets instead if you want
# to avoid polluting the source tree.

from setuptools import Extension, setup
from pathlib import Path

from pybind11 import get_include
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup
from version_generator import generate_loadgen_version_definitions
import subprocess

generated_version_source_filename = "generated/version_generated.cc"
generate_loadgen_version_definitions(generated_version_source_filename, ".")
Expand All @@ -42,7 +42,7 @@
"test_settings.h",
"issue_query_controller.h",
"early_stopping.h",
"query_dispatch_library.h"
"query_dispatch_library.h",
]

lib_headers = [
Expand All @@ -54,7 +54,7 @@
"results.h",
"bindings/c_api.h",
"version_generator.py",
"mlperf_conf.h"
"mlperf_conf.h",
]

lib_sources = [
Expand Down Expand Up @@ -93,18 +93,18 @@


try:
with open("mlperf.conf", 'r') as file:
with open("mlperf.conf", "r") as file:
conf_contents = file.read()

# Escape backslashes and double quotes
conf_contents = conf_contents.replace('\\', '\\\\').replace('"', '\\"')
conf_contents = conf_contents.replace("\\", "\\\\").replace('"', '\\"')

# Convert newlines
conf_contents = conf_contents.replace('\n', '\\n"\n"')
conf_contents = conf_contents.replace("\n", '\\n"\n"')

formatted_content = f'const char* mlperf_conf =\n"{conf_contents}";\n'

with open("mlperf_conf.h", 'w') as header_file:
with open("mlperf_conf.h", "w") as header_file:
header_file.write(formatted_content)

except IOError as e:
Expand All @@ -113,24 +113,25 @@
mlperf_loadgen_module = Pybind11Extension(
"mlperf_loadgen",
define_macros=[
("MAJOR_VERSION",
version_split[0]),
("MINOR_VERSION",
version_split[1])
("MAJOR_VERSION", version_split[0]),
("MINOR_VERSION", version_split[1]),
],
include_dirs=[".", get_include()],
sources=mlperf_loadgen_sources,
depends=mlperf_loadgen_headers,
extra_compile_args=["-std=c++14", "-O3"],
)

setup(name="mlcommons_loadgen",
version=version,
description="MLPerf Inference LoadGen python bindings",
url="https://mlcommons.org/",
cmdclass={"build_ext": build_ext},
ext_modules=[mlperf_loadgen_module],
packages=['mlcommons_loadgen'],
package_dir={'mlcommons_loadgen': '.'},
include_package_data=True,
long_description=mlperf_long_description,
long_description_content_type='text/markdown')
setup(
name="mlcommons_loadgen",
version=version,
description="MLPerf Inference LoadGen python bindings",
url="https://mlcommons.org/",
cmdclass={"build_ext": build_ext},
ext_modules=[mlperf_loadgen_module],
packages=["mlcommons_loadgen"],
package_dir={"mlcommons_loadgen": "."},
include_package_data=True,
long_description=mlperf_long_description,
long_description_content_type="text/markdown",
)
8 changes: 4 additions & 4 deletions loadgen/test_settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,10 +234,6 @@ struct TestSettings {
uint64_t test05_qsl_rng_seed = 0;
uint64_t test05_sample_index_rng_seed = 0;
uint64_t test05_schedule_rng_seed = 0;

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario, int conf_type = 1);
/**@}*/

// ==================================
Expand Down Expand Up @@ -272,6 +268,10 @@ struct TestSettings {
bool infer_token_latencies = false;
uint64_t token_latency_scaling_factor;
/**@}*/

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario, int conf_type = 1);
};

///
Expand Down
File renamed without changes.
Loading