Skip to content

Commit b188f8c

Browse files
authored
Add visualization script (#143)
1 parent 78cd8cf commit b188f8c

File tree

4 files changed

+116
-0
lines changed

4 files changed

+116
-0
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ This tool automatically evaluates Japanese multi-modal large language models acr
2121
- [Supported Tasks](#supported-tasks)
2222
- [Required Libraries for Each VLM Model Inference](#required-libraries-for-each-vlm-model-inference)
2323
- [Benchmark-Specific Required Libraries](#benchmark-specific-required-libraries)
24+
- [Analyze VLMs Prediction](#analyze-vlms-prediction)
2425
- [License](#license)
2526
- [Contribution](#contribution)
2627
- [How to Add a Benchmark Task](#how-to-add-a-benchmark-task)
@@ -139,6 +140,16 @@ JIC-VQA only provide the image URL, so you need to download the images from the
139140
python scripts/prepare_jic_vqa.py
140141
```
141142

143+
## Analyze VLMs Prediction
144+
145+
Let's analyze VLM predictions!
146+
```bash
147+
uv run streamlit run scripts/browse_prediction.py --task_id "japanese-heron-bench" --result_dir "result"
148+
```
149+
You will see a visualization like the one below.
150+
![Streamlit](./assets/streamlit_visualization.png)
151+
152+
142153
## License
143154

144155
This repository is licensed under the Apache-2.0 License.

assets/streamlit_visualization.png

885 KB
Loading

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ dev = [
5959
"mypy>=1.15.0",
6060
"pytest>=8.3.4",
6161
"seaborn>=0.13.2",
62+
"streamlit>=1.43.2",
6263
]
6364

6465
evovlm = [

scripts/browse_prediction.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import streamlit as st
2+
from datasets import load_dataset
3+
import random
4+
import eval_mm
5+
from argparse import ArgumentParser
6+
import os
7+
import json
8+
9+
10+
def parse_args(argv=None):
    """Parse command-line options for the prediction browser.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case ``sys.argv[1:]`` is used (original behavior); passing
            an explicit list makes the function testable.

    Returns:
        argparse.Namespace with ``task_id`` (benchmark task to browse) and
        ``result_dir`` (root directory holding per-model predictions).
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--task_id",
        type=str,
        default="japanese-heron-bench",
        help="Task ID of the benchmark to browse (default: %(default)s).",
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        default="result",
        help="Directory containing <task_id>/<model_id>/prediction.jsonl files.",
    )
    return parser.parse_args(argv)
16+
17+
18+
def scrollable_text(text):
    """Return *text* wrapped in an HTML div capped at 300px with a vertical scrollbar."""
    style = "max-height: 300px; overflow-y: auto; height: auto;"
    return f'<div style="{style}">{text}</div>'
22+
23+
24+
if __name__ == "__main__":
    args = parse_args()

    # Instantiate the requested benchmark task from the registry with the
    # default task configuration.
    task = eval_mm.tasks.TaskRegistry().get_task_cls(args.task_id)(
        eval_mm.tasks.TaskConfig()
    )

    # Models whose predictions are compared side by side. Each model must
    # have a prediction.jsonl under <result_dir>/<task_id>/<model_id>/.
    model_list = [
        "google/gemma-3-12b-it",
        "google/gemma-3-27b-it",
        "microsoft/Phi-4-multimodal-instruct",
    ]
    predictions_per_model = {}
    for model_id in model_list:
        prediction_path = os.path.join(
            args.result_dir, args.task_id, model_id, "prediction.jsonl"
        )
        with open(prediction_path, "r") as f:
            predictions_per_model[model_id] = [json.loads(line) for line in f]

    # Load the VQA dataset for the task.
    ds = task.dataset
    # Initialize session state.
    st.set_page_config(layout="wide")
    if "page" not in st.session_state:
        st.session_state.page = 0  # current page number (0-based)

    SAMPLES_PER_PAGE = 30  # number of samples shown per page
    # Last valid page index. Using (len(ds) - 1) // SAMPLES_PER_PAGE avoids
    # paging one past the final sample (an empty page) when len(ds) is an
    # exact multiple of SAMPLES_PER_PAGE.
    max_page = max(len(ds) - 1, 0) // SAMPLES_PER_PAGE
    # Columns: Question ID, Image, Question, Answer, then one per model.
    column_width_list = [1, 3, 3, 3] + [4] * len(model_list)
    st.write(f"# {args.task_id} dataset")

    def show_sample(idx):
        """Render one dataset sample and each model's prediction as a row."""
        sample = ds[idx]
        cols = st.columns(column_width_list)
        cols[0].markdown(task.doc_to_id(sample))
        cols[1].image(task.doc_to_visual(sample)[0], width=300)
        cols[2].markdown(
            scrollable_text(task.doc_to_text(sample)), unsafe_allow_html=True
        )
        cols[3].markdown(
            scrollable_text(task.doc_to_answer(sample)), unsafe_allow_html=True
        )
        # enumerate instead of model_list.index(): one O(1) offset per model
        # rather than an O(n) lookup inside the loop.
        for offset, model_id in enumerate(model_list):
            cols[4 + offset].markdown(
                scrollable_text(predictions_per_model[model_id][idx]["text"]),
                unsafe_allow_html=True,
            )

    # Navigation buttons
    nav_col1, nav_col2, nav_col3 = st.columns(3)
    if nav_col1.button(f"Prev {SAMPLES_PER_PAGE}"):
        st.session_state.page = max(st.session_state.page - 1, 0)
    if nav_col2.button("Random"):
        st.session_state.page = random.randint(0, max_page)
    if nav_col3.button(f"Next {SAMPLES_PER_PAGE}"):
        st.session_state.page = min(st.session_state.page + 1, max_page)

    # Compute the slice of samples on the current page.
    start_idx = st.session_state.page * SAMPLES_PER_PAGE
    end_idx = min(start_idx + SAMPLES_PER_PAGE, len(ds))

    st.write(f"### Showing samples {start_idx + 1} to {end_idx} / {len(ds)}")

    # Header row.
    header_cols = st.columns(column_width_list)
    header_cols[0].markdown("ID")
    header_cols[1].markdown("Image")
    header_cols[2].markdown("Question")
    header_cols[3].markdown("Answer")
    for offset, model_id in enumerate(model_list):
        header_cols[4 + offset].markdown(f"Prediction ({model_id})")

    # Sample rows.
    for idx in range(start_idx, end_idx):
        with st.container():
            show_sample(idx)
            st.markdown("---")

0 commit comments

Comments
 (0)