Title: Add Benchmark from "Vision-Language Models Can’t See the Obvious" (ICCV 2025) (#744)

dunghuynhandy · web-flow · commit 978eb7f1ccb9 · 2025-07-16T12:23:49.000+08:00
* add salbench tasks

* Apply pre-commit formatting

* remove duplicates

* 1. Optimize salbench utils\n2. Recover qwen2.5vl example\n3. Pre-commit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 25.1.0
     hooks:
       - id: black
         language_version: python3
         args: ["--line-length=240"]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
+    rev: 6.0.1
     hooks:
       - id: isort
         language_version: python3
diff --git a/docs/current_tasks.md b/docs/current_tasks.md
@@ -159,6 +159,13 @@
   - WildVision 0617(wildvision_0617)
   - WildVision 0630 (wildvision_0630)
 - [SeedBench 2 Plus](https://huggingface.co/datasets/AILab-CVC/SEED-Bench-2-plus) (seedbench_2_plus)
+- [SalBench](https://salbench.github.io/)
+  - p3
+  - p3_box
+  - p3_box_img
+  - o3
+  - o3_box
+  - o3_box_img
 
 ## 2. Multi-image tasks:
 
diff --git a/examples/models/qwen25vl.sh b/examples/models/qwen25vl.sh
@@ -15,4 +15,4 @@ accelerate launch --num_processes=8 --main_process_port=12346 -m lmms_eval \
     --model qwen2_5_vl \
     --model_args=pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_pixels=12845056,attn_implementation=flash_attention_2,interleave_visuals=False \
     --tasks mme \
-    --batch_size 1
+    --batch_size 1
diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py
@@ -37,9 +37,7 @@ def get_context(self, doc, num_fewshot):
                     + (
                         str(self.doc_to_target(doc)[0])
                         if type(self.doc_to_target(doc)) is list
-                        else self.doc_to_target(doc)
-                        if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str)
-                        else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
+                        else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
                     )
                     for doc in selected_docs
                 ]
diff --git a/lmms_eval/models/mplug_owl_video/configuration_mplug_owl.py b/lmms_eval/models/mplug_owl_video/configuration_mplug_owl.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" MplugOwl model configuration """
+"""MplugOwl model configuration"""
 import copy
 import os
 from typing import Union
diff --git a/lmms_eval/models/mplug_owl_video/modeling_mplug_owl.py b/lmms_eval/models/mplug_owl_video/modeling_mplug_owl.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch MplugOwl model. """
+"""PyTorch MplugOwl model."""
 
 import math
 from typing import Any, Optional, Tuple, Union
diff --git a/lmms_eval/tasks/librispeech/cn_tn.py b/lmms_eval/tasks/librispeech/cn_tn.py
@@ -41,7 +41,12 @@
 
 FILLER_CHARS = ["呃", "啊"]
 
-ER_WHITELIST = "(儿女|儿子|儿孙|女儿|儿媳|妻儿|" "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|" "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|" "佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)"
+ER_WHITELIST = (
+    "(儿女|儿子|儿孙|女儿|儿媳|妻儿|"
+    "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|"
+    "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|"
+    "佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)"
+)
 ER_WHITELIST_PATTERN = re.compile(ER_WHITELIST)
 
 # 中文数字系统类型
diff --git a/lmms_eval/tasks/salbench/_o3_default b/lmms_eval/tasks/salbench/_o3_default
@@ -0,0 +1,73 @@
+dataset_path: salbench-vlm/salbench
+dataset_kwargs:
+  token: True
+
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.p3o3_doc_to_visual
+doc_to_text: !function utils.p3o3_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 128
+  # temperature: 0
+  # top_p: 0
+  # num_beams: 1
+  # do_sample: false
+
+process_results: !function utils.o3_process_results
+metric_list:
+  - metric: exact_match
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_precision
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_recall
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_f1
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+
+  - metric: all_cat_precision
+    aggregation: !function utils.p3_aggregate_all_category_precision
+    higher_is_better: true
+  - metric: all_cat_recall
+    aggregation: !function utils.p3_aggregate_all_category_recall
+    higher_is_better: true
+  - metric: all_cat_f1
+    aggregation: !function utils.p3_aggregate_all_category_f1
+    higher_is_better: true
+
+  - metric: orientation_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: orientation_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: orientation_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+  - metric: color_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: color_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: color_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+  - metric: size_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: size_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: size_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/salbench/_p3_default b/lmms_eval/tasks/salbench/_p3_default
@@ -0,0 +1,73 @@
+dataset_path: salbench-vlm/salbench
+dataset_kwargs:
+  token: True
+
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.p3o3_doc_to_visual
+doc_to_text: !function utils.p3o3_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 128
+#   temperature: 0
+#   top_p: 0
+#   num_beams: 1
+#   do_sample: false
+
+process_results: !function utils.p3_process_results
+metric_list:
+  - metric: exact_match
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_precision
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_recall
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+  - metric: sample_f1
+    aggregation: !function utils.aggregate_per_sample_score
+    higher_is_better: true
+
+  - metric: all_cat_precision
+    aggregation: !function utils.p3_aggregate_all_category_precision
+    higher_is_better: true
+  - metric: all_cat_recall
+    aggregation: !function utils.p3_aggregate_all_category_recall
+    higher_is_better: true
+  - metric: all_cat_f1
+    aggregation: !function utils.p3_aggregate_all_category_f1
+    higher_is_better: true
+
+  - metric: orientation_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: orientation_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: orientation_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+  - metric: color_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: color_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: color_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+  - metric: size_precision
+    aggregation: !function utils.aggregate_per_category_precision
+    higher_is_better: true
+  - metric: size_recall
+    aggregation: !function utils.aggregate_per_category_recall
+    higher_is_better: true
+  - metric: size_f1
+    aggregation: !function utils.aggregate_per_category_f1
+    higher_is_better: true
+
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/salbench/o3.yaml b/lmms_eval/tasks/salbench/o3.yaml
@@ -0,0 +1,3 @@
+dataset_name: O3
+task: "o3"
+include: _o3_default
diff --git a/lmms_eval/tasks/salbench/o3_box.yaml b/lmms_eval/tasks/salbench/o3_box.yaml
@@ -0,0 +1,3 @@
+dataset_name: O3_box
+task: "o3_box"
+include: _o3_default
diff --git a/lmms_eval/tasks/salbench/o3_box_img.yaml b/lmms_eval/tasks/salbench/o3_box_img.yaml
@@ -0,0 +1,3 @@
+dataset_name: O3_box_img
+task: "o3_box_img"
+include: _o3_default
diff --git a/lmms_eval/tasks/salbench/p3.yaml b/lmms_eval/tasks/salbench/p3.yaml
@@ -0,0 +1,3 @@
+dataset_name: P3
+task: "p3"
+include: _p3_default
diff --git a/lmms_eval/tasks/salbench/p3_box.yaml b/lmms_eval/tasks/salbench/p3_box.yaml
@@ -0,0 +1,3 @@
+dataset_name: P3_box
+task: "p3_box"
+include: _p3_default
diff --git a/lmms_eval/tasks/salbench/p3_box_img.yaml b/lmms_eval/tasks/salbench/p3_box_img.yaml
@@ -0,0 +1,3 @@
+dataset_name: P3_box_img
+task: "p3_box_img"
+include: _p3_default
diff --git a/lmms_eval/tasks/salbench/utils.py b/lmms_eval/tasks/salbench/utils.py

Original file line number	Diff line number	Diff line change
`@@ -37,9 +37,7 @@ def get_context(self, doc, num_fewshot):`
`37`	`37`	`+ (`
`38`	`38`	`str(self.doc_to_target(doc)[0])`
`39`	`39`	`if type(self.doc_to_target(doc)) is list`
`40`		`- else self.doc_to_target(doc)`
`41`		`- if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str)`
`42`		`- else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])`
	`40`	`+ else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])`
`43`	`41`	`)`
`44`	`42`	`for doc in selected_docs`
`45`	`43`	`]`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+dataset_name: O3`
	`2`	`+task: "o3"`
	`3`	`+include: _o3_default`