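"""Scorer for the JDocQA benchmark (Japanese document QA).

Open-ended answers are scored with sentence-level BLEU (MeCab
tokenization); yes/no, factoid, and numerical answers are scored by
normalized substring match.
"""
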
from eval_mm.metrics.scorer import Scorer
from sacrebleu import sentence_bleu
from unicodedata import normalize

ANSWER_TYPE_MAP = {
    "yesno": 0,  # Yes/No questions
    "factoid": 1,  # Factoid questions
    "numerical": 2,  # Numerical questions
    "open-ended": 3,  # Open-ended questions
}

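# Reverse mapping: numeric label -> answer-type name.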
NUM_TO_ANSWER_TYPE = {v: k for k, v in ANSWER_TYPE_MAP.items()}


def jdocqa_normalize(text: str) -> str:
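    """Normalize a Japanese answer for exact matching.

    Strips the copula "です", punctuation, and spaces, then applies
    NFKC Unicode normalization (e.g. full-width -> half-width digits).
    """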
    text = (
        text.replace("です", "")
        .replace("。", "")
        .replace("、", "")
        .replace(" ", "")
        .strip()
    )
    text = normalize("NFKC", text)
    return text


def bleu_ja(refs: list[str], pred: str) -> float:
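    """Sentence-level BLEU for Japanese text via sacrebleu.

    Uses the "ja-mecab" tokenizer, which requires sacrebleu's MeCab
    support (e.g. `pip install sacrebleu[ja]`).
    """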
    bleu_score = sentence_bleu(
        hypothesis=pred,
        references=refs,
        smooth_method="exp",
        smooth_value=0.0,
        tokenize="ja-mecab",
        use_effective_order=False,
        lowercase=False,
    )
    return bleu_score.score


class JDocQAScorer(Scorer):
    @staticmethod
    def score(refs: list[str], preds: list[str], **kwargs) -> list[float]:
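        """Score predictions against references.

        Open-ended answers are scored with sentence-level BLEU;
        yes/no, factoid, and numerical answers score 1 when the
        normalized reference appears as a substring of the normalized
        prediction, else 0. Expects per-example records via
        kwargs["docs"].
        """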
        docs = kwargs["docs"]
        scores = []

        for doc, ref, pred in zip(docs, refs, preds):
            if doc["answer_type"] == ANSWER_TYPE_MAP["open-ended"]:
                scores.append(bleu_ja([ref], pred))
            elif doc["answer_type"] in [
                ANSWER_TYPE_MAP["yesno"],
                ANSWER_TYPE_MAP["factoid"],
                ANSWER_TYPE_MAP["numerical"],
            ]:
                ref = jdocqa_normalize(ref)
                pred = jdocqa_normalize(pred)
                scores.append(1 if ref in pred else 0)
            else:
                raise ValueError(f"Unknown answer type: {doc['answer_type']}")

        return scores

    @staticmethod
    def aggregate(scores: list[float], **kwargs) -> dict:
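        """Aggregate per-example scores into per-answer-type metrics.

        Exact-match types report accuracy; open-ended reports mean
        BLEU. A type with no examples reports 0.0.
        """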
        docs = kwargs["docs"]
        metrics = {
            "yesno_exact": [],
            "factoid_exact": [],
            "numerical_exact": [],
            "open-ended_bleu": [],
        }
        for doc, score in zip(docs, scores):
            answer_type = doc["answer_type"]
            if answer_type == ANSWER_TYPE_MAP["open-ended"]:
                metrics["open-ended_bleu"].append(score)
            else:
                metrics[f"{NUM_TO_ANSWER_TYPE[answer_type]}_exact"].append(score)

        for key, value in metrics.items():
            if len(value) == 0:
                metrics[key] = 0.0
                continue
            metrics[key] = sum(value) / len(value)

        return metrics


if __name__ == "__main__":
    from datasets import load_dataset

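    # Smoke test on a small slice of the JDocQA test split.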
    ds = load_dataset("shunk031/JDocQA", split="test")
    ds = ds.select(range(10))

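    # Sanity check: BLEU of an answer against itself should be very high.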
    ref = ds["answer"][0]
    pred = ds["answer"][0]
    print(ref)
    print(pred)
    print(bleu_ja([ref], pred))
    answer_types = ds["answer_type"]
    answers = ds["answer"]
    print("Original answers")
    for answer_type, answer in zip(answer_types, answers):
        print(NUM_TO_ANSWER_TYPE[answer_type], answer)

    print("JDocQA normalized answers")
    normalized_answers = [jdocqa_normalize(x) for x in ds["answer"]]
    for answer_type, answer in zip(answer_types, normalized_answers):
        print(NUM_TO_ANSWER_TYPE[answer_type], answer)

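    # Gold vs. gold: exact-match types score 1; open-ended types get self-BLEU.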
    scores = JDocQAScorer.score(refs=ds["answer"], preds=ds["answer"], docs=ds)
    print(scores)
    metrics = JDocQAScorer.aggregate(scores, docs=ds)
    print(metrics)