4 changes: 3 additions & 1 deletion lmms_eval/evaluator.py
@@ -41,6 +41,7 @@
get_git_commit_hash,
handle_non_serializable,
hash_string,
is_multimodal_content,
make_table,
positional_deprecated,
run_task_tests,
@@ -562,7 +563,8 @@ def evaluate(
target = task.doc_to_target(doc)
saved_doc = {}
for key, value in doc.items():
saved_doc[key] = value
if not is_multimodal_content(value):
saved_doc[key] = value
Comment on lines 565 to +567

P2: Filter lists of multimodal content before logging

The new filter only checks the top-level value type, so a list of images/tensors still passes through unchanged. Several tasks populate doc["images"] with a list of PIL images (e.g., lmms_eval/tasks/stare/utils.py around lines 105–109 and live_bench/utils.py), meaning --log_samples will still attempt to serialize those lists and can hit the same serialization/size problems this change is meant to fix. Consider recursively filtering lists/tuples or stripping multimodal elements before saving.
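
A minimal sketch of what that recursive filter could look like (hypothetical: `contains_multimodal_content` is not part of this PR; it only reuses the `is_multimodal_content` helper added in `lmms_eval/utils.py`):

```python
from typing import Any

from lmms_eval.utils import is_multimodal_content


def contains_multimodal_content(value: Any) -> bool:
    # True if the value itself is multimodal content, or if any element
    # nested inside a list/tuple/dict is.
    if is_multimodal_content(value):
        return True
    if isinstance(value, (list, tuple)):
        return any(contains_multimodal_content(v) for v in value)
    if isinstance(value, dict):
        return any(contains_multimodal_content(v) for v in value.values())
    return False


# The saved_doc loop in evaluator.py could then read:
# for key, value in doc.items():
#     if not contains_multimodal_content(value):
#         saved_doc[key] = value
```

That would drop a `doc["images"]` list of PIL images before `--log_samples` tries to serialize it, at the cost of walking nested containers for every saved document.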

filtered_arguments = []
for req in requests:
# check if req.args is a list of tuples, and each item in the list is a serializable object
89 changes: 59 additions & 30 deletions lmms_eval/tasks/spatialviz/utils.py
@@ -2,9 +2,11 @@
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import yaml
from huggingface_hub import snapshot_download
from loguru import logger as eval_logger
from PIL import Image

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
@@ -23,7 +25,7 @@
)


def spatialviz_doc_to_visual(doc):
def spatialviz_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
visual = []

category = doc["Category"]
@@ -33,16 +35,21 @@ def spatialviz_doc_to_visual(doc):
image_path = f"{cache_dir}/{category}/{task}/{level}/{image_id}.png"

if os.path.exists(image_path):
image_path = image_path
visual.append(Image.open(image_path).convert("RGB"))
else:
raise FileExistsError(f"video path:{image_path} does not exist.")
raise FileNotFoundError(f"image path: {image_path} does not exist.")
return visual


def spatialviz_doc_to_text(doc):
def spatialviz_doc_to_text(doc: Dict[str, Any]) -> str:
ops = ["A", "B", "C", "D"]
prompt = "You should first provide a reasoning process, then provide a single option(A, B, C or D) as the final answer. The reasoning process and the answer are enclosed within <think></think> and <answer></answer> tags, respectively, i.e., <think>reasoning process</think>, <answer>answer</answer>.\n"
prompt = (
"You should first provide a reasoning process, then provide a single "
"option(A, B, C or D) as the final answer. The reasoning process and "
"the answer are enclosed within <think></think> and <answer></answer> "
"tags, respectively, i.e., <think>reasoning process</think>, "
"<answer>answer</answer>.\n"
)
question = doc["Question"]
choices = doc["Choices"]
choice_text = ""
@@ -53,7 +60,7 @@ def spatialviz_doc_to_text(doc):
return text


def spatialviz_process_results(doc, results):
def spatialviz_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]:
key_name = "spatialviz_score"
grounded_output = doc["Answer"]
response = results[0]
@@ -63,14 +70,28 @@ def spatialviz_process_results(doc, results):

think_match = re.search(think_pattern, response, re.DOTALL)
answer_match = re.search(answer_pattern, response, re.DOTALL)

op: List[str] = []
if think_match and answer_match:
final_answer = answer_match.group(1).strip()
pred_answer = final_answer.split(".")[0]
op = re.findall(r"[A-D]", pred_answer)

else:
print("No match for think/answer \n")
final_answer_patterns = ["<answer>", "Answer:", "Final answer", "final answer", "Final Answer", "the answer is", "The answer is", "correct answer", "Correct answer", "Correct Answer", "答案" "correct path"]
eval_logger.debug("No match for think/answer tags in response")
final_answer_patterns = [
"<answer>",
"Answer:",
"Final answer",
"final answer",
"Final Answer",
"the answer is",
"The answer is",
"correct answer",
"Correct answer",
"Correct Answer",
"答案",
"correct path",
]
if len(response) == 1:
op = re.findall(r"[A-D]", response)
else:
@@ -88,14 +109,23 @@ def spatialviz_aggregate_results(results):
is_correct = False

query = spatialviz_doc_to_text(doc)
spatialviz_submission = {"id": doc["Image_id"], "query": query, "gt_content": grounded_output, "pred": response, "category": doc["Category"], "task": doc["Task"], "level": doc["Level"], "is_correct": is_correct}
spatialviz_submission = {
"id": doc["Image_id"],
"query": query,
"gt_content": grounded_output,
"pred": response,
"category": doc["Category"],
"task": doc["Task"],
"level": doc["Level"],
"is_correct": is_correct,
}
return {key_name: spatialviz_submission}


def spatialviz_aggregate_results(results):
task_to_eval_samples = defaultdict(list)
category_to_eval_samples = defaultdict(list)
key_to_eval_samples = defaultdict(list)
def spatialviz_aggregate_results(results: List[Dict[str, Any]]) -> float:
task_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
category_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
key_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
total_samples = len(results)
total_correct = 0

@@ -120,26 +150,25 @@ def spatialviz_aggregate_results(results):
task_accuracies = {task: sum(scores) / len(scores) for task, scores in task_to_eval_samples.items()}
category_accuracies = {category: sum(scores) / len(scores) for category, scores in category_to_eval_samples.items()}
key_accuracies = {key: sum(scores) / len(scores) for key, scores in key_to_eval_samples.items()}
print(f"{'Total Samples':<20}: {total_samples}")
print(f"{'Total Correct':<20}: {total_correct}")
print(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
print()

print(f"{'Per-Task Accuracy':<40}")
print("-" * 40)
eval_logger.info(f"{'Total Samples':<20}: {total_samples}")
eval_logger.info(f"{'Total Correct':<20}: {total_correct}")
eval_logger.info(f"{'Overall Accuracy':<20}: {accuracy:.4f}")

eval_logger.info(f"{'Per-Task Accuracy':<40}")
eval_logger.info("-" * 40)
for task, acc in task_accuracies.items():
print(f"{task:<20}: {acc:.4f}")
print()
eval_logger.info(f"{task:<20}: {acc:.4f}")

print(f"{'Per-Category Accuracy':<40}")
print("-" * 40)
eval_logger.info(f"{'Per-Category Accuracy':<40}")
eval_logger.info("-" * 40)
for category, acc in category_accuracies.items():
print(f"{category:<20}: {acc:.4f}")
print("=" * 40)
eval_logger.info(f"{category:<20}: {acc:.4f}")
eval_logger.info("=" * 40)

print(f"{'Per-Key Accuracy':<40}")
print("-" * 40)
eval_logger.info(f"{'Per-Key Accuracy':<40}")
eval_logger.info("-" * 40)
for key, acc in key_accuracies.items():
print(f"{key:<20}: {acc:.4f}")
print()
eval_logger.info(f"{key:<20}: {acc:.4f}")

return accuracy
28 changes: 28 additions & 0 deletions lmms_eval/utils.py
@@ -102,6 +102,34 @@ def handle_non_serializable(o):
return str(o)


def is_multimodal_content(value: Any) -> bool:
"""
Check if a value is multimodal content (image, audio, video) that should
not be serialized to log files.

Returns True for:
- PIL.Image objects
- numpy arrays (typically image/audio data)
- bytes (binary data)
- torch tensors
- dicts with 'array' key (HuggingFace audio format)
- dicts with 'bytes' key (HuggingFace image format)
"""
if isinstance(value, (bytes, bytearray, np.ndarray, torch.Tensor)):
return True
if isinstance(value, dict):
if "array" in value or "bytes" in value:
return True
try:
from PIL import Image

if isinstance(value, Image.Image):
return True
except ImportError:
pass
return False


def sanitize_list(sub):
"""
Takes possible nested list and recursively converts all inner component to strings