Skip to content

Commit 66b89f6

Browse files
authored
patch errors for VLM submissions with version v6.0 (#2426)
1 parent 571fa92 commit 66b89f6

File tree

3 files changed

+29
-4
lines changed

3 files changed

+29
-4
lines changed

multimodal/qwen3-vl/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to th
133133
mlperf-inf-mm-q3vl evaluate --filename output/mlperf_log_accuracy.json
134134
```
135135

136+
The command will generate the file `accuracy.txt` that you can use in your submission.
137+
Additionally, don't forget to truncate your original file `mlperf_log_accuracy.json` to a size less
138+
than 10KB for your submission.
139+
136140
## Docker
137141

138142
[docker/](docker/) provides examples of Dockerfiles that install the Q3VL benchmarking

multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from rapidfuzz import fuzz # type: ignore[import-untyped]
1616
from sklearn.metrics import f1_score # type: ignore[import-untyped]
1717
from tabulate import tabulate
18+
import hashlib
1819

1920
if TYPE_CHECKING:
2021
from typing import Any
@@ -398,3 +399,13 @@ def run_evaluation(random_seed: int, filename: FilePath,
398399
tablefmt="fancy_grid",
399400
),
400401
)
402+
403+
# Generate accuracy.txt file
404+
results_dict = {"f1": category_f1_score}
405+
data_string = json.dumps(results_dict, sort_keys=True)
406+
file_hash = hashlib.sha256(data_string.encode()).hexdigest()
407+
408+
with open("accuracy.txt", "w") as f:
409+
f.write("Results\n\n")
410+
f.write(f"{data_string}\n\n")
411+
f.write(f"hash={file_hash}")

tools/submission/submission_checker.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@
208208
# TODO: Placeholder for now
209209
"gpt-oss-120b": ("exact_match", 83.13 * 0.99),
210210
# TODO: Placeholder for now
211-
"qwen3-vl-235b-a22b": ("F1", 0.7903 * 0.99),
211+
"qwen3-vl-235b-a22b": ("F1_HIERARCHICAL", 0.7903 * 0.99),
212212
"dlrm-v3": ("AUC", 78.663 * 0.99), # TODO: Placeholder for now
213213
},
214214
"accuracy-upper-limit": {
@@ -313,7 +313,7 @@
313313
"llama3.1-405b": {"Server": 60000000000},
314314
"deepseek-r1": {"Server": 60000000000},
315315
"gpt-oss-120b": {"Server": 60000000000},
316-
"qwen3-vl-235b-a22b": {"Server": 60000000000},
316+
"qwen3-vl-235b-a22b": {"Server": 12000000000},
317317
"dlrm-v3": {"Server": 60000000000},
318318
},
319319
"min-queries": {
@@ -779,6 +779,12 @@
779779
"MultiStream": "early_stopping_latency_ms",
780780
"Server": "result_completed_samples_per_sec",
781781
},
782+
"v6.0": {
783+
"Offline": "result_samples_per_second",
784+
"SingleStream": "early_stopping_latency_ss",
785+
"MultiStream": "early_stopping_latency_ms",
786+
"Server": "result_completed_samples_per_sec",
787+
},
782788
}
783789

784790
RESULT_FIELD_BENCHMARK_OVERWRITE = {
@@ -938,7 +944,8 @@
938944
"FID_SCORE": r".*'FID_SCORE':\s+'?([\d.]+).*",
939945
"gsm8k_accuracy": r".*'gsm8k':\s([\d.]+).*",
940946
"mbxp_accuracy": r".*'mbxp':\s([\d.]+).*",
941-
"exact_match": r".*'exact_match':\s([\d.]+).*"
947+
"exact_match": r".*'exact_match':\s([\d.]+).*",
948+
"F1_HIERARCHICAL": r'\{.*"f1":\s*([\d\.]+).*\}'
942949
}
943950

944951
SYSTEM_DESC_REQUIRED_FIELDS = [
@@ -1507,6 +1514,7 @@ def check_accuracy_dir(config, model, path, verbose):
15071514
is_valid = False
15081515
else:
15091516
if os.stat(fname).st_size > MAX_ACCURACY_LOG_SIZE:
1517+
log.error("Max expected file size is: %s bytes", MAX_ACCURACY_LOG_SIZE)
15101518
log.error("%s is not truncated", fname)
15111519
is_valid = False
15121520

@@ -3302,6 +3310,7 @@ def check_compliance_dir(
33023310
"rgat",
33033311
"deepseek-r1",
33043312
"whisper",
3313+
"qwen3-vl-235b-a22b"
33053314
]:
33063315
test_list.remove("TEST04")
33073316

@@ -3312,7 +3321,8 @@ def check_compliance_dir(
33123321
"llama2-70b-99.9",
33133322
"mixtral-8x7b",
33143323
"llama3.1-405b",
3315-
"deepseek-r1"
3324+
"deepseek-r1",
3325+
"qwen3-vl-235b-a22b"
33163326
]:
33173327
test_list.remove("TEST01")
33183328

0 commit comments

Comments (0)