EvolvingLMMs-Lab
diff --git a/.gitignore
Lines changed: 1 addition & 0 deletions b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/lmms_eval/models/qwen2_5_vl.py
Lines changed: 25 additions & 12 deletions b/lmms_eval/models/qwen2_5_vl.py
Lines changed: 25 additions & 12 deletions
diff --git a/lmms_eval/tasks/vlmsareblind/README.md
Lines changed: 98 additions & 0 deletions b/lmms_eval/tasks/vlmsareblind/README.md
Lines changed: 98 additions & 0 deletions
diff --git a/lmms_eval/tasks/vlmsareblind/__init__.py
Lines changed: 2 additions & 0 deletions b/lmms_eval/tasks/vlmsareblind/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/lmms_eval/tasks/vlmsareblind/utils.py
Lines changed: 58 additions & 0 deletions b/lmms_eval/tasks/vlmsareblind/utils.py
Lines changed: 58 additions & 0 deletions
diff --git a/lmms_eval/tasks/vlmsareblind/vlmsareblind.yaml
Lines changed: 27 additions & 0 deletions b/lmms_eval/tasks/vlmsareblind/vlmsareblind.yaml
Lines changed: 27 additions & 0 deletions
diff --git a/lmms_eval/tasks/vlmsareblind/vlmsareblind_lite.yaml
Lines changed: 30 additions & 0 deletions b/lmms_eval/tasks/vlmsareblind/vlmsareblind_lite.yaml
Lines changed: 30 additions & 0 deletions
diff --git a/lmms_eval/tasks/vstar_bench/README.md
Lines changed: 84 additions & 0 deletions b/lmms_eval/tasks/vstar_bench/README.md
Lines changed: 84 additions & 0 deletions
diff --git a/lmms_eval/tasks/vstar_bench/__init__.py
Lines changed: 1 addition & 0 deletions b/lmms_eval/tasks/vstar_bench/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ lmms_eval/tasks/mlvu/__pycache__/utils.cpython-310.pyc
 
 scripts/
 .env
+.venv
 outputs/
 span.log
 uv.lock
@@ -42,7 +42,7 @@ def __init__(
         device_map: Optional[str] = "auto",
         batch_size: Optional[Union[int, str]] = 1,
         use_cache=True,
-        use_flash_attention_2: Optional[bool] = False,
+        attn_implementation: Optional[str] = None,
         min_pixels: int = 256 * 28 * 28,
         max_pixels: int = 1605632,
         max_num_frames: int = 32,
@@ -58,6 +58,11 @@ def __init__(
         # Do not use kwargs for now
         assert kwargs == {}, f"Unexpected kwargs: {kwargs}"
 
+        # Validate attention implementation
+        valid_attn_implementations = [None, "flash_attention_2", "sdpa", "eager"]
+        if attn_implementation not in valid_attn_implementations:
+            raise ValueError(f"attn_implementation must be one of {valid_attn_implementations}, got {attn_implementation}")
+
         self.use_custom_video_loader = use_custom_video_loader
         self.fps = fps
         # if self.fps and not self.use_custom_video_loader:
@@ -74,15 +79,17 @@ def __init__(
             self._device = torch.device(device)
             self.device_map = device_map if device_map else device
 
-        if use_flash_attention_2:
-            self._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                pretrained,
-                torch_dtype=torch.bfloat16,
-                device_map=self.device_map,
-                attn_implementation="flash_attention_2",
-            ).eval()
-        else:
-            self._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(pretrained, torch_dtype="auto", device_map=self.device_map).eval()
+        # Prepare model loading arguments
+        model_kwargs = {
+            "torch_dtype": "auto",
+            "device_map": self.device_map,
+        }
+
+        # Add attention implementation if specified
+        if attn_implementation is not None:
+            model_kwargs["attn_implementation"] = attn_implementation
+
+        self._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(pretrained, **model_kwargs).eval()
         self.max_pixels = max_pixels
         self.min_pixels = min_pixels
         self.max_num_frames = max_num_frames
@@ -296,14 +303,20 @@ def _collate(x):
             }
             # Update with provided kwargs
             current_gen_kwargs = {**default_gen_kwargs, **gen_kwargs}
-
             pad_token_id = self.tokenizer.pad_token_id
 
+            if current_gen_kwargs["temperature"] > 0:
+                current_gen_kwargs["do_sample"] = True
+            else:
+                current_gen_kwargs["do_sample"] = False
+                current_gen_kwargs["temperature"] = None
+                current_gen_kwargs["top_p"] = None
+
             cont = self.model.generate(
                 **inputs,
                 eos_token_id=self.tokenizer.eos_token_id,
                 pad_token_id=pad_token_id,
-                do_sample=True if current_gen_kwargs["temperature"] > 0 else False,
+                do_sample=current_gen_kwargs["do_sample"],
                 temperature=current_gen_kwargs["temperature"],
                 top_p=current_gen_kwargs["top_p"],
                 num_beams=current_gen_kwargs["num_beams"],
 
@@ -0,0 +1,98 @@
+# VLMs Are Blind
+
+## Overview
+
+VLMs Are Blind is a benchmark designed to test the visual reasoning capabilities of Vision-Language Models (VLMs) through path-counting tasks in subway connection diagrams. The benchmark reveals fundamental limitations in VLMs' ability to process visual information, showing that many models struggle with basic visual tasks that are trivial for humans.
+
+## Paper Information
+
+- **Paper Title**: VLMs Are Blind
+- **Paper**: https://arxiv.org/abs/2407.06581
+- **GitHub**: https://github.com/xai-org/vlmsareblind
+- **Dataset**: https://huggingface.co/datasets/XAI/vlmsareblind
+
+## Dataset Details
+
+The benchmark consists of path-counting tasks where models must count the number of paths between two stations in subway-style connection diagrams. Each instance contains:
+- A subway map diagram image
+- A question asking for the number of paths between two specified stations
+- The correct answer in the format `{N}`, where N is the number of paths
+
+## Task Configuration
+
+### Main Task
+- **Task Name**: `vlmsareblind`
+- **Split**: `valid`
+- **Output Type**: `generate_until`
+- **Metric**: Exact match accuracy
+
+### Lite Version
+- **Task Name**: `vlmsareblind_lite`
+- A subset version for faster evaluation
+
+## Evaluation
+
+The benchmark uses exact match accuracy as the primary metric. Models must output their answer in the format `{N}` where N is the number of paths.
+
+### Metrics
+- **exact_match**: Binary score for each instance (1 if prediction matches ground truth, 0 otherwise)
+- **Aggregation**: Mean across all instances
+
+## Running the Benchmark
+
+```bash
+# Run the full benchmark
+lmms-eval --model <model_name> --tasks vlmsareblind --batch_size 1
+
+# Run the lite version
+lmms-eval --model <model_name> --tasks vlmsareblind_lite --batch_size 1
+```
+
+## Implementation Details
+
+### Generation Configuration
+- **max_new_tokens**: 32
+- **temperature**: 0
+- **top_p**: 1.0
+- **num_beams**: 1
+- **do_sample**: false
+
+### Answer Extraction
+The evaluation expects answers in the format `{N}`. The answer extraction logic:
+1. First looks for a number within curly brackets, e.g. `{3}`, and uses the first such match
+2. If none is found, takes the first standalone number in the response and wraps it in curly brackets
+3. Falls back to the whitespace-stripped raw response if no number is found
+
+### Prompt Format
+```
+[Image of subway diagram]
+[Question about counting paths]
+Answer with a number in curly brackets, e.g., {3}.
+```
+
+## File Structure
+```
+vlmsareblind/
+├── __init__.py             # Package marker (comments only)
+├── README.md               # This file
+├── utils.py                # Evaluation utilities
+├── vlmsareblind.yaml       # Main task configuration
+└── vlmsareblind_lite.yaml  # Lite version configuration
+```
+
+## Citation
+
+```bibtex
+@article{vlmsareblind2024,
+  title={VLMs Are Blind},
+  author={XAI Team},
+  journal={arXiv preprint arXiv:2407.06581},
+  year={2024}
+}
+```
+
+## Notes
+
+- The benchmark is designed to test fundamental visual reasoning capabilities
+- Results often reveal significant gaps between VLM performance and human abilities
+- The simple counting task format makes it easy to verify model outputs
+- Temperature is set to 0 for deterministic outputs
@@ -0,0 +1,2 @@
+# VLMs Are Blind benchmark task
+# Tests visual reasoning capabilities through path-counting in subway connection diagrams
@@ -0,0 +1,58 @@
+import re
+from typing import Dict, List, Optional
+
+
+def vlmsareblind_doc_to_visual(doc: Dict) -> List:
+    """Return the visual inputs for one document.
+
+    The result is a one-element list holding ``doc["image"]`` when the
+    ``image`` key is present, and an empty list otherwise.
+    """
+    return [doc["image"]] if "image" in doc else []
+
+
+def vlmsareblind_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Optional[Dict] = None) -> str:
+    """Build the text prompt for one document.
+
+    Args:
+        doc: Dataset record; its ``prompt`` field (empty string when absent)
+            supplies the question text.
+        lmms_eval_specific_kwargs: Optional mapping that may carry
+            ``pre_prompt`` and ``post_prompt`` strings. ``None`` is treated
+            like an empty mapping, so the function can also be called with
+            the document alone.
+
+    Returns:
+        ``pre_prompt + prompt + post_prompt``, with any missing part
+        replaced by the empty string.
+    """
+    # Guard against a missing kwargs mapping instead of failing on ``None.get``.
+    kwargs = lmms_eval_specific_kwargs or {}
+    prompt = doc.get("prompt", "")
+    pre_prompt = kwargs.get("pre_prompt", "")
+    post_prompt = kwargs.get("post_prompt", "")
+    return f"{pre_prompt}{prompt}{post_prompt}"
+
+
+def vlmsareblind_doc_to_target(doc: Dict) -> str:
+    """Return the reference answer of *doc* as a string.
+
+    A document without an ``answer`` field yields the empty string.
+    """
+    # Reference answers are expected to be a number in curly brackets, e.g. {3}.
+    reference = doc["answer"] if "answer" in doc else ""
+    return str(reference)
+
+
+def extract_answer(response: str) -> str:
+    """Normalise a free-form response to the ``{N}`` answer format.
+
+    Resolution order:
+      1. the first ``{digits}`` group in the text, returned with its brackets;
+      2. otherwise the first standalone run of digits, wrapped in brackets;
+      3. otherwise the whitespace-stripped response itself.
+    """
+    bracketed = re.search(r"\{(\d+)\}", response)
+    if bracketed:
+        return bracketed.group(0)  # already in {N} form
+
+    bare = re.search(r"\b(\d+)\b", response)
+    if bare:
+        return f"{{{bare.group(1)}}}"  # add the missing brackets
+
+    return response.strip()
+
+
+def vlmsareblind_process_result(doc: Dict, result) -> Dict:
+    """Score one model response against the document's reference answer.
+
+    Args:
+        doc: Dataset record; the reference is read from its ``answer`` field.
+        result: The model output, either a string or a list whose first
+            element is the response (an empty list counts as an empty
+            response).
+
+    Returns:
+        A dict with ``exact_match`` (bool), ``pred`` (normalised prediction)
+        and ``target`` (normalised reference).
+    """
+    # Handle case where result is a list (common with generate_until)
+    if isinstance(result, list):
+        result = result[0] if result else ""
+
+    pred = extract_answer(str(result))
+
+    # Normalise the reference through the same path as the prediction so an
+    # integer 3, the string "3" and the string "{3}" all compare equal to a
+    # "{3}" prediction; a raw comparison could never match a non-string or
+    # unbracketed reference.
+    # NOTE(review): verify that the dataset exposes the reference under the
+    # "answer" key; a missing key silently becomes an empty target.
+    target = extract_answer(str(doc.get("answer", "")))
+
+    # Exact match comparison on the normalised forms
+    correct = pred == target
+
+    return {
+        "exact_match": correct,
+        "pred": pred,
+        "target": target,
+    }
@@ -0,0 +1,27 @@
+dataset_path: XAI/vlmsareblind
+task: "vlmsareblind"
+test_split: valid
+output_type: generate_until
+doc_to_visual: !function utils.vlmsareblind_doc_to_visual
+doc_to_text: !function utils.vlmsareblind_doc_to_text
+doc_to_target: !function utils.vlmsareblind_doc_to_target
+process_results: !function utils.vlmsareblind_process_result
+generation_kwargs:
+  max_new_tokens: 32
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  - version: 0.0
+  - description: "VLMs Are Blind: A benchmark testing visual reasoning capabilities of VLMs through path-counting tasks in subway connection diagrams."
+  - reference: "https://arxiv.org/abs/2407.06581"
+  
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with a number in curly brackets, e.g., {3}."
@@ -0,0 +1,30 @@
+dataset_path: XAI/vlmsareblind
+task: "vlmsareblind_lite"
+test_split: valid
+output_type: generate_until
+doc_to_visual: !function utils.vlmsareblind_doc_to_visual
+doc_to_text: !function utils.vlmsareblind_doc_to_text
+doc_to_target: !function utils.vlmsareblind_doc_to_target
+process_results: !function utils.vlmsareblind_process_result
+generation_kwargs:
+  max_new_tokens: 32
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
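+# Intended to sample only 100 examples for the lite version.
+# NOTE(review): confirm the dataset loader honours `num_examples`; verify this key is not rejected as an unknown dataset kwarg.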
+dataset_kwargs:
+  num_examples: 100
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  - version: 0.0
+  - description: "VLMs Are Blind (Lite): A smaller subset for quick testing. Tests visual reasoning through path-counting in subway diagrams."
+  - reference: "https://arxiv.org/abs/2407.06581"
+  
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with a number in curly brackets, e.g., {3}."
@@ -0,0 +1,84 @@
+# V*-Bench (Visual Star Benchmark)
+
+## Overview
+
+V*-Bench is a visual question-answering benchmark designed to evaluate multimodal language models' capabilities in visual perception and reasoning. The benchmark focuses on assessing models' ability to accurately identify and reason about visual attributes in images through multiple-choice questions.
+
+## Dataset Details
+
+- **Dataset**: `lmms-lab/vstar-bench`
+- **Size**: 191 test samples
+- **Format**: Multiple-choice questions with 4 options (A, B, C, D)
+- **Modalities**: Image + Text
+
+## Task Categories
+
+The benchmark includes two main categories:
+
+1. **Direct Attributes** (`vstar_bench_direct_attributes`)
+   - Questions about direct visual properties such as colors, objects, counts, and characteristics
+   - Examples: "What is the color of the glove?", "What is the breed of the dog?", "How many people are in the image?"
+
+2. **Relative Position** (`vstar_bench_relative_position`)
+   - Questions about spatial relationships and positioning of objects within images
+   - Evaluates understanding of spatial concepts and object relationships
+
+## Evaluation
+
+### Metrics
+- **Overall Accuracy**: Percentage of correctly answered questions across all categories
+- **Category-specific Accuracy**: Accuracy for each individual category (direct_attributes, relative_position)
+
+### Running the Benchmark
+
+To evaluate a model on V*-Bench:
+
+```bash
+# Run the full benchmark
+lmms-eval --model <model_name> --tasks vstar_bench --output_path ./results
+
+# Run specific categories
+lmms-eval --model <model_name> --tasks vstar_bench_direct_attributes --output_path ./results
+lmms-eval --model <model_name> --tasks vstar_bench_relative_position --output_path ./results
+```
+
+## Configuration
+
+The benchmark uses the following configuration:
+- **Generation Settings**:
+  - `max_new_tokens`: 16
+  - `temperature`: 0
+  - `top_p`: 1.0
+  - `num_beams`: 1
+  - `do_sample`: false
+
+- **Prompt Template**:
+  - Post-prompt: "\nAnswer with the option's letter from the given choices directly."
+
+## Implementation Details
+
+### Answer Extraction
+The evaluation system extracts answer letters (A, B, C, or D) from model responses using multiple patterns to handle various response formats:
+- Direct letter: "A"
+- With punctuation: "A.", "A)", "(A)"
+- Full answer format: "Answer: A", "The answer is A"
+
+### Aggregation
+Results are aggregated both by category and overall, providing detailed performance metrics for different aspects of visual understanding.
+
+## File Structure
+
+```
+vstar_bench/
+├── __init__.py
+├── README.md
+├── _default_template_yaml         # Base configuration
+├── vstar_bench.yaml              # Main task configuration
+├── vstar_bench_direct_attributes.yaml
+├── vstar_bench_relative_position.yaml
+└── utils.py                      # Processing and evaluation functions
+```
+
+## References
+
+- Dataset: https://huggingface.co/datasets/lmms-lab/vstar-bench
@@ -0,0 +1 @@
+# V* Benchmark: Guided Visual Search as a Core Mechanism in Multimodal LLMs
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# VLMs Are Blind benchmark task`
	`2`	`+# Tests visual reasoning capabilities through path-counting in subway connection diagrams`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# V* Benchmark: Guided Visual Search as a Core Mechanism in Multimodal LLMs`