Skip to content

Commit bd80f1f

Browse files
authored
Merge pull request #78 from llm-jp/74-custom_metrics
Add custom metrics features and Refactoring
2 parents e3230ac + 56a19b3 commit bd80f1f

20 files changed

+1477
-1347
lines changed

README.md

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
- [LLM-jp-eval-mm](#llm-jp-eval-mm)
1919
- [目次](#目次)
2020
- [環境構築](#環境構築)
21+
- [PyPIでインストールする](#pypiでインストールする)
22+
- [GitHubをCloneする場合](#githubをcloneする場合)
2123
- [評価方法](#評価方法)
22-
- [サンプルコードの実行](#サンプルコードの実行)
24+
- [評価の実行](#評価の実行)
2325
- [評価結果の確認](#評価結果の確認)
2426
- [リーダーボードの公開](#リーダーボードの公開)
2527
- [サポートするタスク](#サポートするタスク)
28+
- [各VLMモデル推論時の必要ライブラリ情報](#各vlmモデル推論時の必要ライブラリ情報)
2629
- [ライセンス](#ライセンス)
2730
- [Contribution](#contribution)
2831

@@ -98,10 +101,13 @@ rye run bash examples/evaluate.sh
98101
その場合は以下のコマンドを実行してください.
99102

100103
```bash
101-
rye run python3 examples/sample.py \
102-
--class_path llava_1_5 \
103-
--task_id japanese-heron-bench \
104-
--openai_model_id gpt-4o-mini-2024-07-18
104+
python3 examples/sample.py \
105+
--class_path llava_1_5_7b_hf \
106+
--task_id japanese-heron-bench \
107+
--result_dir test \
108+
--metrics "llm_as_a_judge_heron_bench,exact_match,rougel" \
109+
--judge_model "gpt-4o-2024-05-13" \
110+
--overwrite
105111
```
106112

107113
### 評価結果の確認
@@ -135,6 +141,30 @@ rye run python3 scripts/japanese-heron-bench/record_output.py
135141
- JA-Multi-Image-VQA
136142
- JMMMU
137143

144+
## 各VLMモデル推論時の必要ライブラリ情報
145+
146+
- OpenGVLab/InternVL2-8B
147+
148+
OOM防止のためFlashAttentionのInstallが必要です.
149+
```bash
150+
uv pip install flash-attn --no-build-isolation --python .venv
151+
```
152+
153+
- Llama_3_EvoVLM_JP_v2
154+
155+
mantis-vl のインストールが必要です.
156+
```bash
157+
rye add "datasets==2.18.0"
158+
rye add --dev mantis-vl --git=https://github.com/TIGER-AI-Lab/Mantis.git
159+
```
160+
161+
- Qwen/Qwen2-VL-7B-Instruct
162+
163+
qwen-vl-utils のインストールが必要です.
164+
```bash
165+
rye add --dev qwen-vl-utils
166+
```
167+
138168
## ライセンス
139169

140170
各評価データセットのライセンスは[DATASET.md](./DATASET.md)を参照してください.

examples/InternVL2_8B.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,13 +166,16 @@ def generate(
166166
pixel_values = (
167167
load_image(image, max_num=12).to(self.model.device).to(self.model.dtype)
168168
)
169+
import copy
170+
generation_config = copy.deepcopy(gen_kwargs.__dict__)
171+
generation_config.pop("use_cache")
169172

170173
response = self.model.chat(
171174
self.tokenizer,
172175
pixel_values,
173176
text,
174177
num_patches_list=num_patches_list,
175-
generation_config=gen_kwargs.__dict__,
178+
generation_config=generation_config,
176179
)
177180
generated_text = response
178181
return generated_text

examples/sample.py

Lines changed: 50 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
parser = argparse.ArgumentParser()
1212
parser.add_argument("--class_path", type=str, default="llava_1_5_7b_hf")
1313
parser.add_argument("--task_id", type=str, default="japanese-heron-bench")
14-
parser.add_argument("--openai_model_id", type=str, default="gpt-4o-mini-2024-07-18")
14+
parser.add_argument("--judge_model", type=str, default="gpt-4o-mini-2024-07-18")
1515
parser.add_argument("--batch_size_for_evaluation", type=int, default=10)
1616
parser.add_argument("--overwrite", action="store_true")
1717
parser.add_argument("--result_dir", type=str, default="result")
@@ -22,6 +22,18 @@
2222
parser.add_argument("--top_p", type=float, default=1.0)
2323
parser.add_argument("--do_sample", action="store_true", default=False)
2424
parser.add_argument("--use_cache", action="store_true", default=True)
25+
parser.add_argument(
26+
"--max_dataset_len",
27+
type=int,
28+
default=None,
29+
help="max data size for evaluation. If None, use all data. Else, use the first n data.",
30+
)
31+
parser.add_argument(
32+
"--metrics",
33+
type=str,
34+
default="llm_as_a_judge_heron_bench",
35+
help="metrics to evaluate. You can specify multiple metrics separated by comma (e.g. --metrics exact_match,rougel).",
36+
)
2537

2638
args = parser.parse_args()
2739

@@ -36,13 +48,16 @@
3648

3749
class_path = args.class_path
3850
task_id = args.task_id
39-
openai_model_id = args.openai_model_id
4051

4152
module = importlib.import_module(class_path)
4253
model_id = module.VLM.model_id.replace("/", "-")
4354

44-
task = eval_mm.api.registry.get_task(task_id)
45-
dataset = task.dataset
55+
task_config = eval_mm.api.task.TaskConfig(
56+
max_dataset_len=args.max_dataset_len,
57+
judge_model=args.judge_model,
58+
batch_size_for_evaluation=args.batch_size_for_evaluation,
59+
)
60+
task = eval_mm.api.registry.get_task_cls(task_id)(task_config)
4661

4762
# save the predictions to jsonl file
4863
os.makedirs(args.result_dir, exist_ok=True)
@@ -57,16 +72,19 @@
5772

5873
prediction_result_file_path = os.path.join(prediction_result_dir, f"{model_id}.jsonl")
5974

60-
6175
# if prediction is already done, load the prediction
6276
if os.path.exists(prediction_result_file_path) and not args.overwrite:
6377
with open(prediction_result_file_path, "r") as f:
6478
preds = [json.loads(line) for line in f]
79+
assert (
80+
len(preds) == len(task.dataset)
81+
), f"Prediction result length is not equal to the dataset length. Prediction result length: {len(preds)}, Dataset length: {len(task.dataset)}"
6582
print(f"Prediction result loaded from {prediction_result_file_path}")
6683
else:
6784
model = module.VLM()
6885
preds = []
69-
for doc in tqdm(dataset):
86+
print(task.dataset)
87+
for doc in tqdm(task.dataset):
7088
# print("doc", doc)
7189
image = task.doc_to_visual(doc)
7290
text = task.doc_to_text(doc)
@@ -90,20 +108,36 @@
90108
exit()
91109
print("Evaluation start")
92110
# evaluate the predictions
93-
metrics, eval_results = task.compute_metrics(
94-
preds, model_id=openai_model_id, batch_size=args.batch_size_for_evaluation
95-
)
111+
112+
metrics = args.metrics.split(",")
113+
114+
scores_for_each_metric = {}
115+
116+
for metric in metrics:
117+
scores_for_each_metric[metric] = task.calc_scores(preds, metric)
118+
print(f"Scores for {metric}: {scores_for_each_metric[metric]}")
119+
120+
calculated_metrics = {}
121+
122+
for metric in metrics:
123+
calculated_metrics[metric] = task.gather_scores(
124+
scores_for_each_metric[metric], metric
125+
)
126+
print(f"{metric}: {calculated_metrics[metric]}")
96127

97128

98-
results = task.format_result(preds, eval_results)
99129
with open(os.path.join(prediction_result_file_path), "w") as f:
100-
for result in results:
101-
f.write(json.dumps(result, ensure_ascii=False) + "\n")
130+
for i, pred in enumerate(preds):
131+
question_id = pred["question_id"]
132+
text = pred["text"]
133+
answer = task.doc_to_answer(task.dataset[i])
134+
content = {"question_id": question_id, "text": text, "answer": answer}
135+
for metric in metrics:
136+
content[metric] = scores_for_each_metric[metric][i]
137+
f.write(json.dumps(content, ensure_ascii=False) + "\n")
102138
print(f"Prediction result saved to {prediction_result_file_path}")
103139

104140
eval_result_file_path = os.path.join(evaluation_result_dir, f"{model_id}.jsonl")
105141
with open(eval_result_file_path, "w") as f:
106-
f.write(json.dumps(metrics, ensure_ascii=False) + "\n")
107-
108-
print(f"Metrics: {metrics}")
109-
print(f"Evaluation result example: {eval_results[0]}")
142+
f.write(json.dumps(calculated_metrics, ensure_ascii=False) + "\n")
143+
print(f"Evaluation result saved to {eval_result_file_path}")

src/eval_mm/api/registry.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@ def decorate(fn):
1818
return decorate
1919

2020

21-
def get_task(task_name):
21+
def get_task_cls(task_name):
2222
try:
2323
task_cls = TASK_REGISTRY[task_name]
24-
task = task_cls()
25-
return task
24+
return task_cls
2625
except KeyError:
2726
raise KeyError(f"Missing task {task_name}")

src/eval_mm/api/task.py

Lines changed: 28 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,15 @@
11
import abc
2-
from collections.abc import Callable
3-
from dataclasses import asdict, dataclass
42

3+
from dataclasses import dataclass
4+
from eval_mm.utils.azure_client import OpenAIChatAPI
5+
from datasets import Dataset
56

6-
@dataclass
7-
class TaskConfig(dict):
8-
def __getitem__(self, item):
9-
return getattr(self, item)
10-
11-
def __setitem__(self, item, value):
12-
return setattr(self, item, value)
137

14-
def to_dict(self):
15-
"""dumps the current config as a dictionary object, as a printable format.
16-
:return: dict
17-
A printable dictionary version of the TaskConfig object.
18-
"""
19-
cfg_dict = asdict(self)
20-
# remove values that are `None`
21-
for k, v in list(cfg_dict.items()):
22-
if v is None:
23-
cfg_dict.pop(k)
24-
elif isinstance(v, Callable):
25-
# TODO: this should handle Promptsource template objects as a separate case?
26-
cfg_dict[k] = str(v)
27-
return cfg_dict
8+
@dataclass
9+
class TaskConfig:
10+
max_dataset_len: int | None = None
11+
judge_model: str = "gpt-4o-mini-2024-07-18"
12+
batch_size_for_evaluation: int = 10
2813

2914

3015
class Task(abc.ABC):
@@ -34,24 +19,21 @@ class Task(abc.ABC):
3419
{"question": ..., "answer": ...} or {"question": ..., question, answer)
3520
"""
3621

37-
def __init__(self, config=None) -> None:
38-
self._config = TaskConfig({**config}) if config else TaskConfig()
22+
def __init__(self, config: TaskConfig):
3923
self._dataset = None
40-
self.prepare_task(config)
41-
42-
@property
43-
def config(self):
44-
"""Returns the TaskConfig associated with this class."""
45-
return self._config
24+
self.client = OpenAIChatAPI()
25+
self.config = config
4626

47-
@property
48-
def dataset(self):
49-
"""Returns the dataset associated with this class."""
50-
return self._dataset
27+
if self.config.max_dataset_len is not None:
28+
self.dataset = self._prepare_dataset().select(
29+
range(self.config.max_dataset_len)
30+
)
31+
else:
32+
self.dataset = self._prepare_dataset()
5133

5234
@abc.abstractmethod
53-
def prepare_task(self, config):
54-
"""Prepares a document for evaluation."""
35+
def _prepare_dataset(self) -> Dataset:
36+
"""Prepares the dataset."""
5537
pass
5638

5739
@abc.abstractmethod
@@ -70,18 +52,16 @@ def doc_to_id(self, doc):
7052
pass
7153

7254
@abc.abstractmethod
73-
def evaluate(self, docs: list, preds: list) -> list[dict]:
74-
"""Evaluate batch prediction."""
55+
def doc_to_answer(self, doc):
56+
"""Converts a document to answer."""
57+
pass
58+
59+
@abc.abstractmethod
60+
def calc_scores(self, preds: list, metric: str) -> list:
61+
"""Calculates scores for the predictions."""
7562
pass
7663

7764
@abc.abstractmethod
78-
def compute_metrics(self, preds):
79-
"""
80-
Args:
81-
doc: a instance of the eval dataset
82-
results: [pred]
83-
Returns:
84-
metrics: a dictionary with key: metric name (in this case coco_bleu), value: metric value
85-
results_verbose: a dictionary with key: metric name, value: a dictionary with key: 'score' and 'verbose'
86-
"""
65+
def gather_scores(self, scores: list[dict], metric: str) -> dict:
66+
"""Aggregates the scores."""
8767
pass

src/eval_mm/metrics/__init__.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from .heron_bench_scorer import HeronBenchScorer
2+
from .exact_match_scorer import ExactMatchScorer
3+
from .llm_as_a_judge_scorer import LlmAsaJudgeScorer
4+
from .rougel_scorer import RougeLScorer
5+
from .substring_match_scorer import SubstringMatchScorer
6+
from .scorer import Scorer
7+
from .jmmmu_scorer import JMMMUScorer
8+
9+
10+
class ScorerRegistry:
11+
"""Registry to map metrics to their corresponding scorer classes."""
12+
13+
_scorers = {
14+
"llm_as_a_judge_heron_bench": HeronBenchScorer,
15+
"exact_match": ExactMatchScorer,
16+
"llm_as_a_judge": LlmAsaJudgeScorer,
17+
"rougel": RougeLScorer,
18+
"substring_match": SubstringMatchScorer,
19+
"jmmmu": JMMMUScorer,
20+
}
21+
22+
@classmethod
23+
def register(cls, metric: str, scorer_class: type):
24+
"""Register a new scorer for a metric."""
25+
cls._scorers[metric] = scorer_class
26+
27+
@classmethod
28+
def get_scorer(cls, metric: str) -> Scorer:
29+
"""Get the scorer class for the given metric."""
30+
try:
31+
return cls._scorers[metric]
32+
except KeyError:
33+
raise ValueError(f"Metric '{metric}' is not supported.")
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from .scorer import Scorer
2+
3+
4+
class ExactMatchScorer(Scorer):
5+
@staticmethod
6+
def score(refs: list[str], preds: list[str], **kwargs) -> list[int]:
7+
scores = [int(ref == pred) for ref, pred in zip(refs, preds)]
8+
return scores
9+
10+
@staticmethod
11+
def aggregate(scores: list[int], **kwargs) -> float:
12+
return sum(scores) / len(scores)

0 commit comments

Comments (0)