Commit 4412556

add vlm acc benchmark
1 parent 29fd280 commit 4412556

1 file changed: +75 -0

test/acc/test_vlm_models.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
import argparse
import glob
import json
import os
import random
import subprocess
import sys
import unittest
from types import SimpleNamespace

"""
git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
pip install -e lmms-eval/
"""

# VLM models for testing
MODELS = [
    SimpleNamespace(
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        mmmu_accuracy=0.4,
    ),
]
os.environ["OPENAI_API_KEY"] = "lightllm123"
os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"


def run_mmmu_eval(
    model_version: str,
    output_path: str,
):
    """
    Evaluate a VLM on the MMMU validation set with lmms-eval.

    Only `model_version` (the checkpoint to evaluate) and `output_path`
    vary between runs; every other setting below is fixed.
    Only the validation split is evaluated due to resource constraints.
    """
    # -------- fixed settings --------
    model = "openai_compatible"
    tp = 1
    tasks = "mmmu_val"
    batch_size = 16
    log_suffix = "openai_compatible"
    os.makedirs(output_path, exist_ok=True)

    # -------- compose --model_args --------
    model_args = f"model_version={model_version},tp={tp}"
    print(model_args)

    # -------- build command list --------
    cmd = [
        "python3",
        "-m",
        "lmms_eval",
        "--model",
        model,
        "--model_args",
        model_args,
        "--tasks",
        tasks,
        "--batch_size",
        str(batch_size),
        "--log_samples",
        "--log_samples_suffix",
        log_suffix,
        "--output_path",
        str(output_path),
    ]

    subprocess.run(
        cmd,
        check=True,
        timeout=3600,
    )


run_mmmu_eval("Qwen/Qwen2.5-VL-7B-Instruct", "./logs")
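
Note that the script as committed only launches the evaluation; it never reads the scores back or compares them against the mmmu_accuracy threshold declared in MODELS, even though glob, json, and unittest are imported. The following is a minimal sketch of what such a check could look like. It assumes the trailing direct call to run_mmmu_eval is removed or guarded, and that lmms-eval writes a results JSON under output_path containing a "results" -> "mmmu_val" entry; the glob pattern and key names are guesses and would need to be verified against the actual output files.

class TestVLMAccuracy(unittest.TestCase):
    def test_mmmu_accuracy(self):
        for m in MODELS:
            output_path = "./logs"
            run_mmmu_eval(m.model, output_path)

            # Locate the newest results file written by lmms-eval
            # (assumed filename pattern; adjust to the real layout).
            result_files = sorted(
                glob.glob(
                    os.path.join(output_path, "**", "*results*.json"),
                    recursive=True,
                ),
                key=os.path.getmtime,
            )
            self.assertTrue(result_files, "no results JSON found under output_path")

            with open(result_files[-1]) as f:
                results = json.load(f)

            # Assumed key layout: {"results": {"mmmu_val": {"<accuracy key>": ...}}}.
            metrics = results["results"]["mmmu_val"]
            accuracy = next(
                (
                    v
                    for k, v in metrics.items()
                    if "acc" in k.lower() and isinstance(v, (int, float))
                ),
                None,
            )
            self.assertIsNotNone(
                accuracy, f"no accuracy-like metric in {result_files[-1]}"
            )
            self.assertGreaterEqual(float(accuracy), m.mmmu_accuracy)


if __name__ == "__main__":
    unittest.main()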
