Skip to content

Commit 14d32c8

Browse files
authored
Fix Heron-bench scoring and Add Asagi model (#146)
* Fix Heron-bench scoring
* Add Asagi
* Emphasize top-1 score
1 parent b188f8c commit 14d32c8

File tree

10 files changed

+197
-94
lines changed

10 files changed

+197
-94
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,14 @@ uv run --group normal python examples/sample.py \
7676
--overwrite
7777
```
7878

79-
The evaluation score and output results will be saved in
80-
`test/{task_id}/{model_id}/evaluation.jsonl` and `test/{task_id}/{model_id}/prediction.jsonl`.
79+
The evaluation score and model outputs will be saved in the `result` directory as shown below:
80+
```
81+
├── japanese-heron-bench
82+
│ ├── llava-hf
83+
│ │ ├── llava-1.5-7b-hf
84+
│ │ │ ├── evaluation.jsonl
85+
│ │ │ └── prediction.jsonl
86+
```
8187

8288
If you want to evaluate multiple models on multiple tasks, please check `eval_all.sh`.
8389

examples/Asagi.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import torch
2+
from PIL import Image
3+
from transformers import AutoModel, AutoProcessor
4+
from base_vlm import BaseVLM
5+
from utils import GenerationConfig
6+
7+
8+
class VLM(BaseVLM):
9+
def __init__(self, model_id: str = "MIL-UT/Asagi-14B") -> None:
10+
self.model_id = model_id
11+
self.model = AutoModel.from_pretrained(
12+
self.model_id,
13+
trust_remote_code=True,
14+
torch_dtype=torch.bfloat16,
15+
device_map="auto",
16+
)
17+
self.processor = AutoProcessor.from_pretrained(self.model_id)
18+
19+
def generate(
20+
self,
21+
images: list[Image.Image],
22+
text: str,
23+
gen_kwargs: GenerationConfig = GenerationConfig(),
24+
) -> str:
25+
prompt = f"""以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。
26+
### 指示:
27+
{"<image>"*len(images)}
28+
{text}
29+
### 応答:
30+
"""
31+
32+
if len(images) == 0:
33+
images = None
34+
inputs = self.processor(text=prompt, images=images, return_tensors="pt")
35+
inputs_text = self.processor.tokenizer(prompt, return_tensors="pt")
36+
inputs["input_ids"] = inputs_text["input_ids"]
37+
inputs["attention_mask"] = inputs_text["attention_mask"]
38+
inputs = {
39+
k: inputs[k].to(self.model.device) for k in inputs if k != "token_type_ids"
40+
}
41+
42+
generate_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
43+
generated_text = self.processor.batch_decode(
44+
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
45+
)[0]
46+
# truncate the text to remove the prompt
47+
generated_text = generated_text.split("### 応答:")[1].strip()
48+
return generated_text
49+
50+
51+
if __name__ == "__main__":
52+
vlm = VLM()
53+
vlm.test_vlm()

examples/base_vlm.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,6 @@ def generate(
1919

2020
def test_vlm(self):
2121
"""Test the model with one or two images."""
22-
output = self.generate([], "画像には何が映っていますか?")
23-
logger.info(f"Output: {output}")
24-
assert isinstance(
25-
output, str
26-
), f"Expected output to be a string, but got {type(output)}"
27-
2822
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
2923
image = Image.open(requests.get(image_file, stream=True).raw)
3024
image_file2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
@@ -40,3 +34,9 @@ def test_vlm(self):
4034
assert isinstance(
4135
output, str
4236
), f"Expected output to be a string, but got {type(output)}"
37+
38+
output = self.generate([], "画像には何が映っていますか?")
39+
logger.info(f"Output: {output}")
40+
assert isinstance(
41+
output, str
42+
), f"Expected output to be a string, but got {type(output)}"

examples/japanese_stable_vlm.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,16 +181,15 @@ def generate(
181181
# instruct blip does not expect the <image> tag
182182
prompt = build_prompt(task="vqa", input=text)
183183
if len(images) == 0:
184-
images = None
185-
inputs = self.processor(text=prompt, return_tensors="pt")
184+
raise ValueError("Please provide at least one image.")
185+
186186
else:
187187
images = [process_images(images)]
188188
inputs = self.processor(images=images, return_tensors="pt", truncation=True)
189189
text_encoding = self.tokenizer(
190190
prompt, add_special_tokens=False, return_tensors="pt"
191191
)
192192
inputs.update(text_encoding)
193-
194193
# autoregressively complete prompt
195194
output = self.model.generate(
196195
**inputs.to(self.device, dtype=self.model.dtype), **gen_kwargs.__dict__

examples/model_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"sbintuitions/sarashina2-vision-8b": "sarashina.VLM",
3636
"sbintuitions/sarashina2-vision-14b": "sarashina.VLM",
3737
"microsoft/Phi-4-multimodal-instruct": "phi_4_mm.VLM",
38+
"MIL-UT/Asagi-14B": "Asagi.VLM",
3839
}
3940

4041

examples/sample.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def parse_args():
1515
parser = argparse.ArgumentParser()
1616
parser.add_argument("--model_id", type=str, default="llava-hf/llava-1.5-7b-hf")
1717
parser.add_argument("--task_id", type=str, default="japanese-heron-bench")
18-
parser.add_argument("--judge_model", type=str, default="gpt-4o-mini-2024-07-18")
18+
parser.add_argument("--judge_model", type=str, default="gpt-4o-2024-11-20")
1919
parser.add_argument("--batch_size_for_evaluation", type=int, default=10)
2020
parser.add_argument("--overwrite", action="store_true")
2121
parser.add_argument("--result_dir", type=str, default="result")
@@ -85,6 +85,11 @@ def parse_args():
8585
logger.info(task.dataset)
8686
error_count = 0
8787
for doc in tqdm(task.dataset):
88+
if error_count > len(task.dataset) * 0.1:
89+
logger.error(
90+
f"Error count is too high. Error count: {error_count}, Dataset length: {len(task.dataset)}. You need to re-run the evaluation."
91+
)
92+
exit()
8893
images = task.doc_to_visual(doc)
8994
text = task.doc_to_text(doc)
9095
if "<image>" in text:

scripts/browse_prediction.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import streamlit as st
2-
from datasets import load_dataset
32
import random
43
import eval_mm
54
from argparse import ArgumentParser

scripts/make_leaderboard.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
import pandas as pd
44
from argparse import ArgumentParser
5-
from typing import Dict, List, Optional
5+
from typing import List, Optional
66
from loguru import logger
77
import eval_mm
88
import eval_mm.metrics
@@ -35,7 +35,12 @@
3535
}
3636

3737

38-
def main(result_dir: str, model_list: List[str], output_path: Optional[str] = None):
38+
def main(
39+
result_dir: str,
40+
model_list: List[str],
41+
output_path: Optional[str] = None,
42+
output_format: str = "markdown",
43+
):
3944
task_dirs = [d for d in os.listdir(result_dir) if not d.startswith(".")]
4045

4146
df = pd.DataFrame()
@@ -67,6 +72,8 @@ def main(result_dir: str, model_list: List[str], output_path: Optional[str] = No
6772
df = df._append(model_results, ignore_index=True)
6873

6974
df = df.set_index("Model")
75+
# round to 2 decimal places
76+
df = df.round(2)
7077
df = df.rename(
7178
columns={
7279
k: f"{TASK_ALIAS[k.split('/')[0]]}/{METRIC_ALIAS[k.split('/')[1]]}"
@@ -76,16 +83,31 @@ def main(result_dir: str, model_list: List[str], output_path: Optional[str] = No
7683
# sort columns
7784
df = df.reindex(sorted(df.columns), axis=1)
7885

79-
print(df.to_markdown(mode="github"))
86+
# textbf top1 score for each column
87+
for col in df.columns:
88+
top1_model = df[col].idxmax()
89+
if output_format == "latex":
90+
df.loc[top1_model, col] = f"\\textbf{{{df.loc[top1_model, col]}}}"
91+
else:
92+
df.loc[top1_model, col] = f"**{df.loc[top1_model, col]}**"
93+
94+
if output_format == "markdown":
95+
table = df.to_markdown(mode="github", floatfmt=".2f")
96+
elif output_format == "latex":
97+
table = df.to_latex(float_format="%.2f")
98+
print(table)
8099

81100
with open(output_path, "w") as f:
82-
f.write(df.to_markdown(mode="github"))
101+
f.write(table)
83102

84103

85104
def parse_args():
86105
parser = ArgumentParser()
87106
parser.add_argument("--result_dir", type=str, default="result")
88107
parser.add_argument("--output_path", type=str, default="leaderboard.md")
108+
parser.add_argument(
109+
"--output_format", type=str, default="markdown", choices=["markdown", "latex"]
110+
)
89111
return parser.parse_args()
90112

91113

@@ -94,11 +116,29 @@ def parse_args():
94116

95117
# モデルは実行時引数でも取れるようにしても良い
96118
model_list = [
97-
"Qwen/Qwen2.5-VL-7B-Instruct",
119+
"stabilityai/japanese-instructblip-alpha",
120+
"stabilityai/japanese-stable-vlm",
121+
"SakanaAI/Llama-3-EvoVLM-JP-v2",
122+
"cyberagent/llava-calm2-siglip",
123+
"llm-jp/llm-jp-3-vila-14b",
98124
"sbintuitions/sarashina2-vision-8b",
99125
"sbintuitions/sarashina2-vision-14b",
100-
"google/gemma-3-12b-it",
126+
"MIL-UT/Asagi-14B",
101127
"llava-hf/llava-1.5-7b-hf",
128+
"llava-hf/llava-v1.6-mistral-7b-hf",
129+
"neulab/Pangea-7B-hf",
130+
"mistralai/Pixtral-12B-2409",
131+
"meta-llama/Llama-3.2-11B-Vision-Instruct",
132+
"Efficient-Large-Model/VILA1.5-13b",
133+
"OpenGVLab/InternVL2-8B",
134+
"OpenGVLab/InternVL2-26B",
135+
"Qwen/Qwen2.5-VL-7B-Instruct",
136+
"Qwen/Qwen2.5-VL-72B-Instruct",
137+
"google/gemma-3-4b-it",
138+
"google/gemma-3-12b-it",
139+
"google/gemma-3-27b-it",
140+
"microsoft/Phi-4-multimodal-instruct",
141+
"gpt-4o-2024-11-20",
102142
]
103143

104-
main(args.result_dir, model_list, args.output_path)
144+
main(args.result_dir, model_list, args.output_path, args.output_format)

0 commit comments

Comments (0)