Skip to content

Commit e316f86

Browse files
authored
Add macro metric (#700)
1 parent 9935012 commit e316f86

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

lmms_eval/tasks/xlrs/XLRS-lite.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ metric_list:
1818
- metric: xlrs_micro_score
1919
aggregation: !function mcq_utils.xlrs_aggregate_results
2020
higher_is_better: true
21+
- metric: xlrs_macro_score
22+
aggregation: !function mcq_utils.xlrs_aggregate_results_macro_score
23+
higher_is_better: true
2124
lmms_eval_specific_kwargs:
2225
default:
2326
pre_prompt: ""

lmms_eval/tasks/xlrs/mcq_utils.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
from contextlib import contextmanager
23

34
from loguru import logger as eval_logger
45
from PIL import Image
@@ -101,10 +102,10 @@ def xlrs_process_results(doc, results):
101102
"answer": doc["answer"],
102103
}
103104

104-
return {"xlrs_micro_score": data_dict}
105+
return {"xlrs_micro_score": data_dict, "xlrs_macro_score": data_dict}
105106

106107

107-
def xlrs_aggregate_results(results):
108+
def xlrs_aggregate_results(results, macro=False):
108109
"""
109110
Args:
110111
results: a list of values returned by process_results
@@ -141,7 +142,7 @@ def xlrs_aggregate_results(results):
141142
else:
142143
metrics[Task][Subtask][f"{Category}"]["true"] += cnt
143144
metrics[Task][Subtask][f"{Category}"]["false"] += 1 - cnt
144-
145+
macros = []
145146
sum_all, succ_all = 0, 0
146147
for task, tasks_values in metrics.items():
147148
eval_logger.info("*" * 32 + f"{task} (Task Start)")
@@ -160,6 +161,7 @@ def xlrs_aggregate_results(results):
160161
else:
161162
acc_subtasks = cnt_subtask / sum_subtask
162163
eval_logger.info("+" * 16 + "\t Acc " + "{:.4f}".format(acc_subtasks) + f"\t{substask} ({sum_subtask} items)")
164+
macros.append(acc_subtasks)
163165
cnt_task += cnt_subtask
164166
sum_task += sum_subtask
165167

@@ -171,4 +173,21 @@ def xlrs_aggregate_results(results):
171173
sum_all += sum_task
172174
eval_logger.info("*" * 32 + "Acc " + "{:.4f}".format(acc_task) + f"\t{task} ({sum_task} items)\n")
173175
eval_logger.info("*" * 32 + "Overall Acc " + "{:.4f}".format(succ_all / sum_all))
174-
return succ_all / sum_all
176+
if macro is True:
177+
return sum(macros) / len(macros)
178+
else:
179+
return succ_all / sum_all
180+
181+
182+
@contextmanager
def mute_eval_logger():
    """Temporarily silence this module's ``eval_logger`` output.

    Disables loguru logging for this module on entry and re-enables it on
    exit, even if the wrapped code raises.

    NOTE(review): assumes logging for this module was enabled beforehand —
    the ``finally`` re-enables unconditionally. Confirm callers never rely
    on a pre-disabled state, and that usages are not nested.
    """
    module_name = __name__
    eval_logger.disable(module_name)
    try:
        yield
    finally:
        # Restore logging no matter how the body exits.
        eval_logger.enable(module_name)
189+
190+
191+
def xlrs_aggregate_results_macro_score(results):
    """Compute the macro-averaged score without duplicating log output.

    Delegates to ``xlrs_aggregate_results`` with ``macro=True`` while the
    module's logger is muted, so the detailed per-task breakdown is only
    printed once (by the micro-score aggregation).

    Args:
        results: the list of per-example dicts produced by
            ``xlrs_process_results``.

    Returns:
        The macro-averaged accuracy across subtasks.
    """
    with mute_eval_logger():
        macro_score = xlrs_aggregate_results(results, macro=True)
    return macro_score

0 commit comments

Comments
 (0)