Commit 736454b

[FIX] Resolve MMMU-test submission file generation issue (#724)
* fix(mmmu): resolve MMMU-test submission file generation issue
* fix(mmmu): increase max_new_tokens to 512 for Qwen2.5-VL series models
* style(*): fix linting issues
1 parent dc4424a commit 736454b

File tree

3 files changed: 22 insertions(+), 20 deletions(-)

- lmms_eval/tasks/mmmu/mmmu_test.yaml
- lmms_eval/tasks/mmmu/utils.py
- lmms_eval/tasks/mmsi_bench/utils.py

lmms_eval/tasks/mmmu/mmmu_test.yaml

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,11 @@ doc_to_target: "answer"
 # The return value of process_results will be used by metrics
 process_results: !function utils.mmmu_process_results
 
+generation_kwargs:
+  # Set max_new_tokens to a higher value (e.g., 512) for models like Qwen2.5-VL series
+  # that tend to generate longer, more detailed responses and require larger token limits
+  max_new_tokens: 512
+
 metric_list:
   - metric: submission
     aggregation: !function utils.mmmu_test_aggregate_results_for_submission
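Task-level `generation_kwargs` are forwarded to the model's generation call, so the value added here directly bounds how long a response can get before it is cut off. A minimal sketch of why that matters, with a stand-in generator in place of a real model (only the YAML fragment comes from this commit):

```python
import yaml

# Parse a config fragment shaped like the addition above.
config = yaml.safe_load("generation_kwargs:\n  max_new_tokens: 512\n")
gen_kwargs = config["generation_kwargs"]

def fake_generate(prompt, max_new_tokens=128):
    # Stand-in for a model that stops after max_new_tokens. With a small
    # cap, a long chain-of-thought reply is truncated before the final
    # answer letter, which is the failure the higher limit avoids for
    # verbose models like the Qwen2.5-VL series.
    long_reply = "Let me reason step by step... " * 40 + "The answer is B."
    return " ".join(long_reply.split()[:max_new_tokens])

print(fake_generate("q", **gen_kwargs).endswith("The answer is B."))  # True
print(fake_generate("q").endswith("The answer is B."))                # False: truncated
```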

lmms_eval/tasks/mmmu/utils.py

Lines changed: 3 additions & 3 deletions
@@ -159,11 +159,11 @@ def mmmu_process_results(doc, results):
             parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
         else:
             parsed_pred = parse_open_response(pred)
-
+            parsed_pred = str(parsed_pred[0]) if parsed_pred else ""
         parsed_preds.append(parsed_pred)
-
+    mmmu_submission = {doc["id"]: parsed_preds[0]}
     mmmu_exact_acc = {"id": doc["id"], "subdomain": extract_subset_name(doc["id"]), "question_type": doc["question_type"], "answer": doc["answer"], "parsed_pred": parsed_preds}
-    return {"mmmu_acc": mmmu_exact_acc, "mmmu_acc_pass_at_k": mmmu_exact_acc}
+    return {"mmmu_acc": mmmu_exact_acc, "mmmu_acc_pass_at_k": mmmu_exact_acc, "submission": mmmu_submission}
 
 
 def mmmu_reasoning_process_results(doc, results):
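The substance of the fix: the `submission` metric declared in the YAML needs a matching key in the dict returned by `mmmu_process_results`, which previously returned only the two accuracy entries. The new `mmmu_submission` entry maps the question id to a single string, and since `parse_open_response` returns a list of candidate answers, the first candidate is stringified (empty string when nothing parses). A sketch of the resulting entry, with an illustrative id and candidate list:

```python
import json

doc_id = "test_Art_1"        # made-up MMMU-style question id
parsed_pred = ["quadratic"]  # assumed list shape from parse_open_response

# Reduce the candidate list to one plain string, then key it by question
# id, mirroring the two lines added in the diff above.
parsed = str(parsed_pred[0]) if parsed_pred else ""
mmmu_submission = {doc_id: parsed}

print(json.dumps(mmmu_submission))  # {"test_Art_1": "quadratic"}
```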

lmms_eval/tasks/mmsi_bench/utils.py

Lines changed: 14 additions & 17 deletions
@@ -1,13 +1,14 @@
+import io
 import logging
 import re
-from PIL import Image
+from collections import defaultdict
+
 import numpy as np
-import io
 import pandas as pd
-from collections import defaultdict
+from PIL import Image
+
 from lmms_eval.filters.extraction import ExtendedRegexFilter
 from lmms_eval.filters.transformation import MapFilter
-import re
 
 eval_logger = logging.getLogger("lmms-eval")
 
@@ -31,30 +32,28 @@ def msr_doc_to_visual(doc):
     return image_list
 
 
-
-
 def extract_single_choice_with_word_boundary(pred, gt):
-    pattern_1 = r'``([^`]*)``'
+    pattern_1 = r"``([^`]*)``"
     match = re.search(pattern_1, pred)
     if match:
-        pred = match.group(1)
+        pred = match.group(1)
 
-    pattern_2 = r'`([^`]*)`'
+    pattern_2 = r"`([^`]*)`"
     match = re.search(pattern_2, pred)
     if match:
-        pred = match.group(1)
+        pred = match.group(1)
 
-    pattern_add = r'\{([^}]*)\}'
+    pattern_add = r"\{([^}]*)\}"
     match = re.search(pattern_add, pred)
     if match:
-        pred = match.group(1)
+        pred = match.group(1)
 
-    pattern_3 = r'\b[A-D]\b(?!\s[a-zA-Z])'
+    pattern_3 = r"\b[A-D]\b(?!\s[a-zA-Z])"
     match = re.search(pattern_3, pred)
     if match:
-        pred = match.group()
+        pred = match.group()
     else:
-        return None
+        return None
 
     answer = gt.lower().replace("\n", " ").strip()
     predict = pred.lower().replace("\n", " ").strip()
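Everything in this hunk is a quote-style or whitespace fix; the extraction logic is unchanged. For reference, here is the same pattern chain reassembled as a self-contained demo (the ground-truth comparison that follows in the real function is omitted):

```python
import re

def extract_choice(pred):
    # Narrow the text through ``...``, `...`, and {...} wrappers in turn,
    # then pick out a bare A-D not followed by whitespace and a letter
    # (the lookahead rejects, e.g., the article in "A cat").
    for pattern in (r"``([^`]*)``", r"`([^`]*)`", r"\{([^}]*)\}"):
        match = re.search(pattern, pred)
        if match:
            pred = match.group(1)
    match = re.search(r"\b[A-D]\b(?!\s[a-zA-Z])", pred)
    return match.group() if match else None

print(extract_choice("The answer is ``C``."))        # C
print(extract_choice("I think {B} is correct."))     # B
print(extract_choice("A cat is shown; answer: D."))  # D
print(extract_choice("no option letter here"))       # None
```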
@@ -72,7 +71,6 @@ def extract_single_choice_with_word_boundary(pred, gt):
         return 0.0
 
 
-
 def msr_process_results(doc, results):
     """
     Args:
@@ -114,4 +112,3 @@ def msr_aggregate_results(results):
     all_scores = [score for scores in l2_category_scores.values() for score in scores]
     avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
     return avg_score
-
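The last hunk only drops a trailing blank line, but its context shows how `msr_aggregate_results` averages: scores are flattened across all L2 categories first, so the mean is weighted per sample rather than per category. A toy check with made-up category names:

```python
# Flatten per-category score lists, then average over individual samples,
# mirroring the context lines above.
l2_category_scores = {"counting": [1.0, 0.0, 1.0], "depth": [0.0]}

all_scores = [score for scores in l2_category_scores.values() for score in scores]
avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0

print(avg_score)  # 0.5, not the 1/3 a mean of per-category means would give
```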
