
Commit 43e446b

add mmvu task (#713)
* add mmvu task
* fix linting videomathqa
1 parent f12b79a commit 43e446b

File tree

3 files changed (+377, -0 lines)


lmms_eval/tasks/mmvu/mmvu_val.yaml

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
dataset_path: lmms-lab/MMVU
dataset_kwargs:
  token: True
  cache_dir: mmvu
  video: True
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0.7
  top_p: 1.0
  num_beams: 1
  do_sample: false
task: mmvu_val
test_split: validation
output_type: generate_until
doc_to_visual: !function utils.mmvu_doc_to_visual_val
doc_to_text: !function utils.mmvu_doc_to_text
doc_to_target: "answer"

process_results: !function utils.mmvu_process_results

metric_list:
  - metric: accuracy
    aggregation: !function utils.mmvu_aggregate_results_val
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
Second new YAML file (path not preserved in this view; it defines the mmvu_val_cot task)

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
dataset_path: lmms-lab/MMVU
dataset_kwargs:
  token: True
  cache_dir: mmvu
  video: True
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0.7
  top_p: 1.0
  num_beams: 1
  do_sample: false
task: mmvu_val_cot
test_split: validation
output_type: generate_until
doc_to_visual: !function utils.mmvu_doc_to_visual_val
doc_to_text: !function utils.mmvu_doc_to_text_cot
doc_to_target: "answer"

process_results: !function utils.mmvu_process_results

metric_list:
  - metric: accuracy
    aggregation: !function utils.mmvu_aggregate_results_val
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""

lmms_eval/tasks/mmvu/utils.py

Lines changed: 321 additions & 0 deletions
@@ -0,0 +1,321 @@
import base64
import datetime
import json
import os
import re
import sys
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Union

import cv2
import numpy as np
import requests
import yaml
from loguru import logger as eval_logger
from openai import AzureOpenAI, OpenAI

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

hf_home = os.getenv("HF_HOME", "~/.cache/huggingface")

base_cache_dir = os.path.expanduser(hf_home)


with open(Path(__file__).parent / "mmvu_val.yaml", "r") as f:
    raw_data_val = f.readlines()
    safe_data_val = []
    for i, line in enumerate(raw_data_val):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data_val.append(line)
cache_name_val = yaml.safe_load("".join(safe_data_val))["dataset_kwargs"]["cache_dir"]
cache_dir_val = os.path.join(base_cache_dir, cache_name_val)
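With the defaults above, cache_dir_val resolves to ~/.cache/huggingface/mmvu (or $HF_HOME/mmvu when HF_HOME is set); mmvu_doc_to_visual_val below expects the MMVU videos to already be extracted under that directory and exits if a referenced path is missing.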
def mmvu_doc_to_visual_val(doc):
    video_path = doc["video_path"]
    video_path = os.path.join(cache_dir_val, video_path)
    if os.path.exists(video_path):
        video_path = video_path
    else:
        sys.exit(f"video path:{video_path} does not exist, please check")
    return [video_path]


multiple_choice_prompt = """
Question:{question}
A: {a}
B: {b}
C: {c}
D: {d}
E: {e}
Visual Information: processed video
Do not generate any intermediate reasoning process. Answer directly with the option letter from the given choices.
"""

open_ended_prompt = """
Question:{question}
Visual Information: processed video
Do not generate any intermediate reasoning process. Directly output the final answer.
"""

multiple_choice_prompt_cot = """
Question:{question}
A: {a}
B: {b}
C: {c}
D: {d}
E: {e}
Visual Information: processed video
Answer the given multiple-choice question step by step. Begin by explaining your reasoning process clearly. Conclude by stating the final answer using the following format: "Therefore, the final answer is: $LETTER" (without quotes), where $LETTER is one of the options. Think step by step before answering.
"""

open_ended_prompt_cot = """
Question:{question}
Visual Information: processed video
Answer the given question step by step. Begin by explaining your reasoning process clearly. Conclude by stating the final answer using the following format: "Therefore, the final answer is: "Answer: $ANSWER" (without quotes), where $ANSWER is the final answer of the question. Think step by step before answering.
"""


def mmvu_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    question_type = doc["question_type"]
    if question_type == "multiple_choice":
        question = doc["question"]
        choices = doc["choices"]
        full_prompt = multiple_choice_prompt.format(question=question, a=choices["A"], b=choices["B"], c=choices["C"], d=choices["D"], e=choices["E"])
    else:
        question = doc["question"]
        full_prompt = open_ended_prompt.format(question=question)
    return full_prompt


def mmvu_doc_to_text_cot(doc, lmms_eval_specific_kwargs=None):
    question_type = doc["question_type"]
    if question_type == "multiple_choice":
        question = doc["question"]
        choices = doc["choices"]
        full_prompt = multiple_choice_prompt_cot.format(question=question, a=choices["A"], b=choices["B"], c=choices["C"], d=choices["D"], e=choices["E"])
    else:
        question = doc["question"]
        full_prompt = open_ended_prompt_cot.format(question=question)
    return full_prompt
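As a quick sketch of how these prompt builders are used: the record below is fabricated, and only the key names question_type, question, and choices come from the code above.

    # Hypothetical MMVU-style record; field values are made up for illustration.
    fake_doc = {
        "question_type": "multiple_choice",
        "question": "Which organelle is shown dividing in the clip?",
        "choices": {"A": "Nucleus", "B": "Mitochondrion", "C": "Ribosome", "D": "Golgi body", "E": "Lysosome"},
    }
    print(mmvu_doc_to_text(fake_doc))      # direct-answer prompt
    print(mmvu_doc_to_text_cot(fake_doc))  # CoT prompt ending with the "Therefore, the final answer is: $LETTER" instruction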
mcq_eval_prompt = """
[Instruction]
Evaluate whether the model's final answer is correct by comparing it to the ground-truth answer provided for the given question.
You should first extract the final answer from the model's response, and then compare the extracted answer with the ground-truth answer to determine its accuracy. Output your response in the following structured format:
{{
"extracted answer": // str value "A" "B" "C" "D" "E", should be a single character
"correct": // boolean value, true if the answer is correct, false otherwise
}}
[User]
Question:{question}
A: {a}
B: {b}
C: {c}
D: {d}
E: {e}
Ground Truth Answer: {ground_truth}
Model Response to the Question: {model_response}
"""

open_ended_eval_prompt = """
[Instruction]
Evaluate whether the model's final answer is correct by comparing it to the ground-truth answer provided for the given question. You should first extract the final answer from the model's response, and then compare the extracted answer with the ground-truth answer to determine its accuracy. The final answer generated by the model does not need to match the ground-truth answer word-for-word. However, it should only be considered correct if it demonstrates the exact same technique or concept explicitly and unambiguously equivalent to the ground-truth answer. Output your response in the following structured format:
{{
"extracted answer": // str value, the short final answer extracted from the model's response, do not hallucinate one that is not present in the response
"correct": // boolean value, true if the answer is correct, false otherwise
}}
[User]
Question:{question}
Ground Truth Answer: {ground_truth}
Model Response to the Question: {model_response}
"""

MAX_ITER = 5
NUM_SECONDS_TO_SLEEP = 1
API_TYPE = os.getenv("API_TYPE", "azure")
if API_TYPE == "openai":
    endpoint = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1")
    deployment = os.getenv("DEPLOYMENT_NAME", "gpt-4o")
    subscription_key = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    client = OpenAI(
        api_key=subscription_key,
        base_url=endpoint,  # the v1 OpenAI client takes base_url; it has no api_base/api_version kwargs
    )
elif API_TYPE == "azure":
    endpoint = os.getenv("ENDPOINT_URL", "your_endpoint_url")
    deployment = os.getenv("DEPLOYMENT_NAME", "gpt-4o")
    subscription_key = os.getenv("AZURE_OPENAI_API_KEY", "your_api_key")
    client = AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=subscription_key,
        api_version="2025-01-01-preview",
    )
else:
    raise ValueError(f"Unsupported API_TYPE: {API_TYPE}. Please set it to 'openai' or 'azure'.")
def gpt_parser(response, doc):
    question_type = doc["question_type"]
    if question_type == "multiple-choice":
        prompt = mcq_eval_prompt.format(
            question=doc["question"],
            a=doc["choices"]["A"],
            b=doc["choices"]["B"],
            c=doc["choices"]["C"],
            d=doc["choices"]["D"],
            e=doc["choices"]["E"],
            ground_truth=doc["answer"] + " " + doc["choices"][doc["answer"]],
            model_response=response,
        )
    else:
        prompt = open_ended_eval_prompt.format(question=doc["question"], ground_truth=doc["answer"], model_response=response)

    prompt_message = [
        {
            "role": "user",
            "content": prompt,
        }
    ]

    params = {
        "model": "gpt-4o",
        "messages": prompt_message,
        "max_tokens": 512,
        "temperature": 0.0,
    }

    try:
        response = client.chat.completions.create(**params)
        response_text = response.choices[0].message.content
        eval_logger.debug(f"Raw GPT response: {response_text}")
        return json.loads(response_text)

    except Exception as e:
        print(response)
        eval_logger.error(f"Error parsing GPT response: {e}")
        return None
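On success the judge is expected to reply with bare JSON matching the format spelled out in the eval prompts, so a parsed result looks roughly like the following (values fabricated):

    parsed = {"extracted answer": "B", "correct": True}

If the reply is not valid JSON (for example, wrapped in markdown fences), json.loads raises, gpt_parser returns None, and the caller below retries up to MAX_ITER times before scoring the item as incorrect.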
def extract_category(doc):
    category = doc["video_path"].split("/")[-2]
    return category


def mmvu_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case accuracy), value: metric value
    """
    pred = results[0]
    pred_ans = pred
    category = extract_category(doc)
    curr_iter = 0
    parsed_response = None
    while parsed_response is None and curr_iter < MAX_ITER:
        parsed_response = gpt_parser(pred_ans, doc)
        curr_iter += 1
        time.sleep(NUM_SECONDS_TO_SLEEP)
    if parsed_response is None:
        parsed_response = {"extracted answer": "N/A", "correct": False}

    pred_ans = parsed_response.get("extracted answer", "N/A")
    correct = parsed_response.get("correct", False)

    data_dict = {"question_id": doc["id"], "category": category, "pred_answer": pred_ans, "answer": doc["answer"], "correct": correct}

    return {"accuracy": data_dict}
def mmvu_aggregate_results_val(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score
    """

    TASK_MAP = {
        "Biology": "Science",
        "Chemistry": "Science",
        "Modern_Physics": "Science",
        "Astronomy": "Science",
        "Geography": "Science",
        "Materials_Science": "Science",
        "Neurobiology": "Science",
        "Electromagnetism": "Science",
        "Thermodynamics": "Science",
        "Mechanics": "Science",
        "Civil_Engineering": "Engineering",
        "Electrical_Engineering": "Engineering",
        "Mechanical_Engineering": "Engineering",
        "Biomedical_Engineering": "Engineering",
        "Electronics_and_Communication": "Engineering",
        "Computer_Science": "Engineering",
        "Clinical_Medicine": "Healthcare",
        "Basic_Medicine": "Healthcare",
        "Preventive_Medicine": "Healthcare",
        "Pharmacy": "Healthcare",
        "Dentistry": "Healthcare",
        "Art": "Humanities_and_Social_Science",
        "Literature": "Humanities_and_Social_Science",
        "History": "Humanities_and_Social_Science",
        "Law": "Humanities_and_Social_Science",
        "Economics": "Humanities_and_Social_Science",
        "Management": "Humanities_and_Social_Science",
    }

    TASK_TYPES = list(set(TASK_MAP.values()))

    category2score = {}
    for task_type in TASK_TYPES:
        category2score[task_type] = {"correct": 0, "answered": 0}

    for result in results:
        category = result["category"]
        if category in TASK_MAP:
            category = TASK_MAP[category]
            category2score[category]["answered"] += 1
            category2score[category]["correct"] += result.get("correct", False)
    category_scores = {}

    for category in TASK_TYPES:
        total_correct = category2score[category]["correct"]
        total_answered = category2score[category]["answered"]
        accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
        category_scores[category] = accuracy

    total_correct = sum(category2score[category]["correct"] for category in TASK_TYPES)
    total_answered = sum(category2score[category]["answered"] for category in TASK_TYPES)
    accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
    eval_logger.info("=" * 50)
    eval_logger.info(f"Average Accuracy: {accuracy:.2f}%")
    eval_logger.info("Categorical accuracy: ")
    for key, value in category_scores.items():
        eval_logger.info(f"{key} accuracy: {value:.2f}%")
    eval_logger.info("=" * 50)
    return accuracy
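A minimal sanity-check sketch, using two fabricated per-question dicts in the shape produced by mmvu_process_results:

    fake_results = [
        {"question_id": "val_0001", "category": "Biology", "pred_answer": "B", "answer": "B", "correct": True},
        {"question_id": "val_0002", "category": "Law", "pred_answer": "A", "answer": "C", "correct": False},
    ]
    print(mmvu_aggregate_results_val(fake_results))  # 50.0: one correct Science item, one missed Humanities_and_Social_Science item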

0 commit comments
