diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index eedd6dd3e..872e75a67 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -41,6 +41,7 @@
get_git_commit_hash,
handle_non_serializable,
hash_string,
+ is_multimodal_content,
make_table,
positional_deprecated,
run_task_tests,
@@ -562,7 +563,8 @@ def evaluate(
target = task.doc_to_target(doc)
saved_doc = {}
for key, value in doc.items():
- saved_doc[key] = value
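+ # skip multimodal payloads (images/audio/tensors) so saved_doc stays JSON-serializable for per-sample logging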
+ if not is_multimodal_content(value):
+ saved_doc[key] = value
filtered_arguments = []
for req in requests:
# check if req.args is a list of tuples, and each item in the list is a serializable object
diff --git a/lmms_eval/tasks/spatialviz/utils.py b/lmms_eval/tasks/spatialviz/utils.py
index fa248f451..853df7fd2 100644
--- a/lmms_eval/tasks/spatialviz/utils.py
+++ b/lmms_eval/tasks/spatialviz/utils.py
@@ -2,9 +2,11 @@
import re
from collections import defaultdict
from pathlib import Path
+from typing import Any, Dict, List
import yaml
from huggingface_hub import snapshot_download
+from loguru import logger as eval_logger
from PIL import Image
with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
@@ -23,7 +25,7 @@
)
-def spatialviz_doc_to_visual(doc):
+def spatialviz_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
visual = []
category = doc["Category"]
@@ -33,16 +35,21 @@ def spatialviz_doc_to_visual(doc):
image_path = f"{cache_dir}/{category}/{task}/{level}/{image_id}.png"
if os.path.exists(image_path):
- image_path = image_path
visual.append(Image.open(image_path).convert("RGB"))
else:
- raise FileExistsError(f"video path:{image_path} does not exist.")
+ raise FileNotFoundError(f"image path: {image_path} does not exist.")
return visual
-def spatialviz_doc_to_text(doc):
+def spatialviz_doc_to_text(doc: Dict[str, Any]) -> str:
ops = ["A", "B", "C", "D"]
- prompt = "You should first provide a reasoning process, then provide a single option(A, B, C or D) as the final answer. The reasoning process and the answer are enclosed within and tags, respectively, i.e., reasoning process, answer.\n"
+ prompt = (
+ "You should first provide a reasoning process, then provide a single "
+ "option(A, B, C or D) as the final answer. The reasoning process and "
+ "the answer are enclosed within and "
+ "tags, respectively, i.e., reasoning process, "
+ "answer.\n"
+ )
question = doc["Question"]
choices = doc["Choices"]
choice_text = ""
@@ -53,7 +60,7 @@ def spatialviz_doc_to_text(doc):
return text
-def spatialviz_process_results(doc, results):
+def spatialviz_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]:
key_name = "spatialviz_score"
grounded_output = doc["Answer"]
response = results[0]
@@ -63,14 +70,28 @@ def spatialviz_process_results(doc, results):
think_match = re.search(think_pattern, response, re.DOTALL)
answer_match = re.search(answer_pattern, response, re.DOTALL)
+
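+ # pre-initialize so `op` is always bound, even when no option letter can be extracted below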
+ op: List[str] = []
if think_match and answer_match:
final_answer = answer_match.group(1).strip()
pred_answer = final_answer.split(".")[0]
op = re.findall(r"[A-D]", pred_answer)
-
else:
- print("No match for think/answer \n")
- final_answer_patterns = ["<answer>", "Answer:", "Final answer", "final answer", "Final Answer", "the answer is", "The answer is", "correct answer", "Correct answer", "Correct Answer", "答案" "correct path"]
+ eval_logger.debug("No match for think/answer tags in response")
+ final_answer_patterns = [
+ "",
+ "Answer:",
+ "Final answer",
+ "final answer",
+ "Final Answer",
+ "the answer is",
+ "The answer is",
+ "correct answer",
+ "Correct answer",
+ "Correct Answer",
+ "答案",
+ "correct path",
+ ]
if len(response) == 1:
op = re.findall(r"[A-D]", response)
else:
@@ -88,14 +109,23 @@ def spatialviz_process_results(doc, results):
is_correct = False
query = spatialviz_doc_to_text(doc)
- spatialviz_submission = {"id": doc["Image_id"], "query": query, "gt_content": grounded_output, "pred": response, "category": doc["Category"], "task": doc["Task"], "level": doc["Level"], "is_correct": is_correct}
+ spatialviz_submission = {
+ "id": doc["Image_id"],
+ "query": query,
+ "gt_content": grounded_output,
+ "pred": response,
+ "category": doc["Category"],
+ "task": doc["Task"],
+ "level": doc["Level"],
+ "is_correct": is_correct,
+ }
return {key_name: spatialviz_submission}
-def spatialviz_aggregate_results(results):
- task_to_eval_samples = defaultdict(list)
- category_to_eval_samples = defaultdict(list)
- key_to_eval_samples = defaultdict(list)
+def spatialviz_aggregate_results(results: List[Dict[str, Any]]) -> float:
+ task_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+ category_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+ key_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
total_samples = len(results)
total_correct = 0
@@ -120,26 +150,25 @@ def spatialviz_aggregate_results(results):
task_accuracies = {task: sum(scores) / len(scores) for task, scores in task_to_eval_samples.items()}
category_accuracies = {category: sum(scores) / len(scores) for category, scores in category_to_eval_samples.items()}
key_accuracies = {key: sum(scores) / len(scores) for key, scores in key_to_eval_samples.items()}
- print(f"{'Total Samples':<20}: {total_samples}")
- print(f"{'Total Correct':<20}: {total_correct}")
- print(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
- print()
- print(f"{'Per-Task Accuracy':<40}")
- print("-" * 40)
+ eval_logger.info(f"{'Total Samples':<20}: {total_samples}")
+ eval_logger.info(f"{'Total Correct':<20}: {total_correct}")
+ eval_logger.info(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
+
+ eval_logger.info(f"{'Per-Task Accuracy':<40}")
+ eval_logger.info("-" * 40)
for task, acc in task_accuracies.items():
- print(f"{task:<20}: {acc:.4f}")
- print()
+ eval_logger.info(f"{task:<20}: {acc:.4f}")
- print(f"{'Per-Category Accuracy':<40}")
- print("-" * 40)
+ eval_logger.info(f"{'Per-Category Accuracy':<40}")
+ eval_logger.info("-" * 40)
for category, acc in category_accuracies.items():
- print(f"{category:<20}: {acc:.4f}")
- print("=" * 40)
+ eval_logger.info(f"{category:<20}: {acc:.4f}")
+ eval_logger.info("=" * 40)
- print(f"{'Per-Key Accuracy':<40}")
- print("-" * 40)
+ eval_logger.info(f"{'Per-Key Accuracy':<40}")
+ eval_logger.info("-" * 40)
for key, acc in key_accuracies.items():
- print(f"{key:<20}: {acc:.4f}")
- print()
+ eval_logger.info(f"{key:<20}: {acc:.4f}")
+
return accuracy
diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py
index f8f1050b2..af4e23ab1 100755
--- a/lmms_eval/utils.py
+++ b/lmms_eval/utils.py
@@ -102,6 +102,34 @@ def handle_non_serializable(o):
return str(o)
+def is_multimodal_content(value: Any) -> bool:
+ """
+ Check if a value is multimodal content (image, audio, video) that should
+ not be serialized to log files.
+
+ Returns True for:
+ - PIL.Image objects
+ - numpy arrays (typically image/audio data)
+ - bytes (binary data)
+ - torch tensors
+ - dicts with 'array' key (HuggingFace audio format)
+ - dicts with 'bytes' key (HuggingFace image format)
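+
+ Examples:
+     >>> is_multimodal_content(b"raw image bytes")
+     True
+     >>> is_multimodal_content("a plain string field")
+     False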
+ """
+ if isinstance(value, (bytes, bytearray, np.ndarray, torch.Tensor)):
+ return True
+ if isinstance(value, dict):
+ if "array" in value or "bytes" in value:
+ return True
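+ # PIL is imported lazily; if it is unavailable, fall through and treat the value as non-multimodal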
+ try:
+ from PIL import Image
+
+ if isinstance(value, Image.Image):
+ return True
+ except ImportError:
+ pass
+ return False
+
+
def sanitize_list(sub):
"""
Takes possible nested list and recursively converts all inner component to strings