Skip to content

Commit 48fe6c0

Browse files
authored
Merge pull request #80 from llm-jp/70-record
add record benchmark result table
2 parents bd80f1f + 0a4c848 commit 48fe6c0

File tree

2 files changed

+149
-17
lines changed

2 files changed

+149
-17
lines changed

scripts/record_table.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import json
2+
import os
3+
import csv
4+
from argparse import ArgumentParser
5+
6+
# Metrics to record for each benchmark, keyed by benchmark name (the name of
# the benchmark's sub-directory inside the result directory).
#
# A value is either a list of metric names, or a mapping whose keys are metric
# names.  ``extract_results`` iterates the value and looks each yielded name up
# in the loaded evaluation JSON, so for mapping values only the keys are
# consulted.  The inner lists presumably enumerate the sub-scores expected
# under that key -- they are not read by any code in this file (TODO confirm).
#
# Every collection here is an ordered type (list / dict) on purpose: iteration
# order determines the row order of the generated CSV, and a set of strings
# would iterate in a different order from run to run.
BENCHMARK_METRICS = {
    "japanese-heron-bench": {
        "llm_as_a_judge_heron_bench": [
            "conv",
            "detail",
            "complex",
            "overall",
            "conv_rel",
            "detail_rel",
            "complex_rel",
            "overall_rel",
        ],
    },
    "ja-vlm-bench-in-the-wild": [
        "rougel",
        "llm_as_a_judge",
    ],
    "ja-vg-vqa-500": [
        "rougel",
        "llm_as_a_judge",
    ],
    "jdocqa": {
        "jdocqa": [
            "yesno_exact",
            "factoid_exact",
            "numerical_exact",
            "open-ended_bleu",
        ],
    },
    "ja-multi-image-vqa": [
        "rougel",
        "llm_as_a_judge",
    ],
    "jmmmu": {
        "jmmmu": [
            "Overall-Art and Psychology",
            "Design",
            "Music",
            "Psychology",
            "Overall-Business",
            "Accounting",
            "Economics",
            "Finance",
            "Manage",
            "Marketing",
            "Overall-Science",
            "Biology",
            "Chemistry",
            "Math",
            "Physics",
            "Overall-Health and Medicine",
            "Basic_Medical_Science",
            "Clinical_Medicine",
            "Diagnostics_and_Laboratory_Medicine",
            "Pharmacy",
            "Public_Health",
            "Overall-Tech and Engineering",
            "Agriculture",
            "Architecture_and_Engineering",
            "Computer_Science",
            "Electronics",
            "Energy_and_Power",
            "Materials",
            "Mechanical_Engineering",
            "Overall",
        ],
    },
}
74+
75+
def get_benchmark_metrics(benchmark_name: str):
    """Look up the metric configuration registered for ``benchmark_name``.

    Returns the matching entry of ``BENCHMARK_METRICS``, or ``None`` when the
    benchmark has no entry there.
    """
    try:
        return BENCHMARK_METRICS[benchmark_name]
    except KeyError:
        return None
78+
79+
def process_metrics(model_name: str, benchmark_name: str, metric_name: str, metric_scores: float|list|dict):
80+
"""Process metrics and return them in a standardized format.
81+
"""
82+
results = []
83+
if isinstance(metric_scores, float):
84+
results.append([model_name, benchmark_name, metric_name, metric_scores])
85+
elif isinstance(metric_scores, list):
86+
results.extend(
87+
[[model_name, benchmark_name, metric_name, score] for score in metric_scores]
88+
)
89+
elif isinstance(metric_scores, dict):
90+
results.extend(
91+
[[model_name, benchmark_name, name, value] for name, value in metric_scores.items()]
92+
)
93+
else:
94+
raise ValueError(f"Unsupported metric type for {benchmark_name}: {metric_name}")
95+
return results
96+
97+
def extract_results(result_dir: str):
    """Collect CSV rows from the evaluation output found under ``result_dir``.

    The directory is scanned for ``<benchmark_name>/evaluation/<model_name>.jsonl``
    files. Each file is parsed with ``json.load`` (i.e. it must hold a single
    JSON document) and, for every metric name registered for the benchmark,
    the value stored under that name is turned into rows by
    ``process_metrics``. Metric names missing from a file are skipped.

    Entries of ``result_dir`` that are not registered benchmarks, or that have
    no ``evaluation`` sub-directory, are ignored.

    Args:
        result_dir: Root directory holding one sub-directory per benchmark.

    Returns:
        A list of ``[model_name, benchmark_name, metric_name, score]`` rows,
        ordered by benchmark name, then by file name.
    """
    csv_data = []
    # os.listdir returns entries in arbitrary order; sort so that the
    # generated table is identical from run to run.
    for benchmark_name in sorted(os.listdir(result_dir)):
        # The metric configuration depends only on the benchmark, so resolve
        # it once here and skip unregistered entries before touching any file.
        metrics = get_benchmark_metrics(benchmark_name)
        if not metrics:
            continue

        evaluation_dir = os.path.join(result_dir, benchmark_name, "evaluation")
        # Also false when the entry itself is a plain file or the benchmark
        # has not been evaluated yet.
        if not os.path.isdir(evaluation_dir):
            continue

        for metrics_file in sorted(os.listdir(evaluation_dir)):
            if not metrics_file.endswith(".jsonl"):
                continue
            model_name = metrics_file.removesuffix(".jsonl")
            metrics_path = os.path.join(evaluation_dir, metrics_file)

            with open(metrics_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for metric_name in metrics:
                metric_scores = data.get(metric_name)
                if metric_scores is not None:
                    csv_data.extend(process_metrics(model_name, benchmark_name, metric_name, metric_scores))
    return csv_data
125+
126+
def write_to_csv(csv_data: list, output_file: str):
    """Write the extracted rows to ``output_file`` as CSV.

    A header row (``Model Name, Benchmark Name, Metric Name, Score``) is
    written first, followed by ``csv_data``. Missing parent directories of
    ``output_file`` are created; an existing file is overwritten.

    Args:
        csv_data: Rows to write, each an iterable of four cell values.
        output_file: Destination path; may be a bare file name.
    """
    output_dir = os.path.dirname(output_file)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Model Name", "Benchmark Name", "Metric Name", "Score"])
        writer.writerows(csv_data)
133+
134+
def get_args():
    """Parse the command-line options of this script.

    Returns the parsed namespace with ``result_dir`` (where evaluation results
    are read from) and ``output_csv`` (where the table is written).
    """
    cli = ArgumentParser(description="Extract evaluation results and write to a CSV file.")
    option_specs = (
        ("--result_dir", "result", "Directory containing evaluation results."),
        ("--output_csv", "result/benchmark_results.csv", "Output CSV file."),
    )
    for flag, fallback, description in option_specs:
        cli.add_argument(flag, default=fallback, help=description)
    return cli.parse_args()
139+
# Script entry point: read results from --result_dir and write the table to
# --output_csv.
if __name__ == "__main__":
    args = get_args()
    try:
        csv_data = extract_results(args.result_dir)
        write_to_csv(csv_data, args.output_csv)
    except Exception as e:
        # Exiting through SystemExit with a message prints it to stderr and
        # sets a non-zero exit status, so callers can detect the failure.
        raise SystemExit(f"An error occurred: {e}") from e
    # The path lives on the parsed namespace; there is no bare ``output_csv``
    # name in scope.
    print(f"Results written to {args.output_csv}")

src/eval_mm/metrics/jmmmu_scorer.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -443,26 +443,12 @@ def aggregate(scores: list[int], **kwargs) -> float:
443443
for cat_results in in_domain_cat_results.values()
444444
]
445445
)
446-
printable_results["Overall-" + domain] = {
447-
"num": int(in_domain_data_num),
448-
"acc": round(in_domain_ins_acc, 5),
449-
}
446+
printable_results["Overall-" + domain] = round(in_domain_ins_acc, 5)
450447
# add sub category
451448
for cat_name, cat_results in in_domain_cat_results.items():
452-
printable_results[cat_name] = {
453-
"num": int(cat_results["num_example"]),
454-
"acc": round(cat_results["acc"], 5),
455-
}
449+
printable_results[cat_name] = round(cat_results["acc"], 5)
456450
all_ins_acc = calculate_ins_level_acc(evaluation_result)
457-
printable_results["Overall"] = {
458-
"num": sum(
459-
[
460-
cat_results["num_example"]
461-
for cat_results in evaluation_result.values()
462-
]
463-
),
464-
"acc": round(all_ins_acc, 5),
465-
}
451+
printable_results["Overall"] = round(all_ins_acc, 5)
466452
return printable_results
467453

468454

0 commit comments

Comments
 (0)