Commit c79490b
fix: filter multimodal content from log samples while preserving metadata (#962)
* fix: improve spatialviz utils quality
  - Fix FileExistsError -> FileNotFoundError (correct exception type)
  - Replace print() with eval_logger for consistent logging
  - Add type hints to all functions
  - Fix missing comma bug in final_answer_patterns list
  - Remove redundant image_path = image_path assignment
  - Initialize op variable to prevent potential UnboundLocalError
  - Break long prompt string for readability (88 char line limit)

* style: apply black formatting

* fix: filter multimodal content from log samples while preserving metadata

  When using --log_samples, the previous implementation either saved all fields (causing serialization issues with images/audio) or filtered based on key names (missing useful metadata like image_id, image_path). This fix introduces is_multimodal_content() that detects actual multimodal data types (PIL.Image, numpy arrays, torch tensors, HuggingFace audio/image dicts) while preserving all scalar metadata fields for dataset traceability.

  Github-Issue: #943
1 parent 76d573d commit c79490b

3 files changed: +90 −31 lines changed

lmms_eval/evaluator.py

Lines changed: 3 additions & 1 deletion
@@ -41,6 +41,7 @@
     get_git_commit_hash,
     handle_non_serializable,
     hash_string,
+    is_multimodal_content,
     make_table,
     positional_deprecated,
     run_task_tests,
@@ -562,7 +563,8 @@ def evaluate(
             target = task.doc_to_target(doc)
             saved_doc = {}
             for key, value in doc.items():
-                saved_doc[key] = value
+                if not is_multimodal_content(value):
+                    saved_doc[key] = value
             filtered_arguments = []
             for req in requests:
                 # check if req.args is a list of tuples, and each item in the list is a serializable object
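To illustrate what the new filtering loop does, here is a minimal sketch; the doc fields below are hypothetical examples, and is_multimodal_content is the helper added to lmms_eval/utils.py in this commit:

from PIL import Image

from lmms_eval.utils import is_multimodal_content

# Hypothetical doc mixing a multimodal payload with scalar metadata
doc = {
    "image": Image.new("RGB", (64, 64)),  # PIL image -> dropped from the log
    "image_id": "CP_0001",                # scalar metadata -> kept
    "Question": "Which option shows the folded cube?",
    "Answer": "B",
}

saved_doc = {}
for key, value in doc.items():
    if not is_multimodal_content(value):
        saved_doc[key] = value

# saved_doc now holds only JSON-serializable fields:
# {"image_id": "CP_0001", "Question": "...", "Answer": "B"}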

lmms_eval/tasks/spatialviz/utils.py

Lines changed: 59 additions & 30 deletions
@@ -2,9 +2,11 @@
 import re
 from collections import defaultdict
 from pathlib import Path
+from typing import Any, Dict, List
 
 import yaml
 from huggingface_hub import snapshot_download
+from loguru import logger as eval_logger
 from PIL import Image
 
 with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
@@ -23,7 +25,7 @@
 )
 
 
-def spatialviz_doc_to_visual(doc):
+def spatialviz_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
     visual = []
 
     category = doc["Category"]
@@ -33,16 +35,21 @@ def spatialviz_doc_to_visual(doc):
     image_path = f"{cache_dir}/{category}/{task}/{level}/{image_id}.png"
 
     if os.path.exists(image_path):
-        image_path = image_path
         visual.append(Image.open(image_path).convert("RGB"))
     else:
-        raise FileExistsError(f"video path:{image_path} does not exist.")
+        raise FileNotFoundError(f"image path: {image_path} does not exist.")
     return visual
 
 
-def spatialviz_doc_to_text(doc):
+def spatialviz_doc_to_text(doc: Dict[str, Any]) -> str:
     ops = ["A", "B", "C", "D"]
-    prompt = "You should first provide a reasoning process, then provide a single option(A, B, C or D) as the final answer. The reasoning process and the answer are enclosed within <think></think> and <answer></answer> tags, respectively, i.e., <think>reasoning process</think>, <answer>answer</answer>.\n"
+    prompt = (
+        "You should first provide a reasoning process, then provide a single "
+        "option(A, B, C or D) as the final answer. The reasoning process and "
+        "the answer are enclosed within <think></think> and <answer></answer> "
+        "tags, respectively, i.e., <think>reasoning process</think>, "
+        "<answer>answer</answer>.\n"
+    )
     question = doc["Question"]
     choices = doc["Choices"]
     choice_text = ""
@@ -53,7 +60,7 @@ def spatialviz_doc_to_text(doc):
     return text
 
 
-def spatialviz_process_results(doc, results):
+def spatialviz_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]:
     key_name = "spatialviz_score"
     grounded_output = doc["Answer"]
     response = results[0]
@@ -63,14 +70,28 @@ def spatialviz_process_results(doc, results):
 
     think_match = re.search(think_pattern, response, re.DOTALL)
     answer_match = re.search(answer_pattern, response, re.DOTALL)
+
+    op: List[str] = []
     if think_match and answer_match:
         final_answer = answer_match.group(1).strip()
         pred_answer = final_answer.split(".")[0]
         op = re.findall(r"[A-D]", pred_answer)
-
     else:
-        print("No match for think/answer \n")
-        final_answer_patterns = ["<answer>", "Answer:", "Final answer", "final answer", "Final Answer", "the answer is", "The answer is", "correct answer", "Correct answer", "Correct Answer", "答案" "correct path"]
+        eval_logger.debug("No match for think/answer tags in response")
+        final_answer_patterns = [
+            "<answer>",
+            "Answer:",
+            "Final answer",
+            "final answer",
+            "Final Answer",
+            "the answer is",
+            "The answer is",
+            "correct answer",
+            "Correct answer",
+            "Correct Answer",
+            "答案",
+            "correct path",
+        ]
         if len(response) == 1:
             op = re.findall(r"[A-D]", response)
         else:
@@ -88,14 +109,23 @@ def spatialviz_process_results(doc, results):
         is_correct = False
 
     query = spatialviz_doc_to_text(doc)
-    spatialviz_submission = {"id": doc["Image_id"], "query": query, "gt_content": grounded_output, "pred": response, "category": doc["Category"], "task": doc["Task"], "level": doc["Level"], "is_correct": is_correct}
+    spatialviz_submission = {
+        "id": doc["Image_id"],
+        "query": query,
+        "gt_content": grounded_output,
+        "pred": response,
+        "category": doc["Category"],
+        "task": doc["Task"],
+        "level": doc["Level"],
+        "is_correct": is_correct,
+    }
     return {key_name: spatialviz_submission}
 
 
-def spatialviz_aggregate_results(results):
-    task_to_eval_samples = defaultdict(list)
-    category_to_eval_samples = defaultdict(list)
-    key_to_eval_samples = defaultdict(list)
+def spatialviz_aggregate_results(results: List[Dict[str, Any]]) -> float:
+    task_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+    category_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+    key_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
     total_samples = len(results)
     total_correct = 0
 
@@ -120,26 +150,25 @@ def spatialviz_aggregate_results(results):
     task_accuracies = {task: sum(scores) / len(scores) for task, scores in task_to_eval_samples.items()}
     category_accuracies = {category: sum(scores) / len(scores) for category, scores in category_to_eval_samples.items()}
     key_accuracies = {key: sum(scores) / len(scores) for key, scores in key_to_eval_samples.items()}
-    print(f"{'Total Samples':<20}: {total_samples}")
-    print(f"{'Total Correct':<20}: {total_correct}")
-    print(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
-    print()
 
-    print(f"{'Per-Task Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Total Samples':<20}: {total_samples}")
+    eval_logger.info(f"{'Total Correct':<20}: {total_correct}")
+    eval_logger.info(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
+
+    eval_logger.info(f"{'Per-Task Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for task, acc in task_accuracies.items():
-        print(f"{task:<20}: {acc:.4f}")
-    print()
+        eval_logger.info(f"{task:<20}: {acc:.4f}")
 
-    print(f"{'Per-Category Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Per-Category Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for category, acc in category_accuracies.items():
-        print(f"{category:<20}: {acc:.4f}")
-    print("=" * 40)
+        eval_logger.info(f"{category:<20}: {acc:.4f}")
+    eval_logger.info("=" * 40)
 
-    print(f"{'Per-Key Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Per-Key Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for key, acc in key_accuracies.items():
-        print(f"{key:<20}: {acc:.4f}")
-    print()
+        eval_logger.info(f"{key:<20}: {acc:.4f}")
+
     return accuracy
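For readers unfamiliar with the tag-based parsing in spatialviz_process_results, the standalone sketch below walks through the happy path. The exact think_pattern and answer_pattern regexes live in unchanged code outside this diff, so the patterns used here are assumptions chosen to match the <think>/<answer> convention described in the prompt:

import re

# Assumed patterns; the real ones are defined earlier in utils.py (not shown in this diff)
think_pattern = r"<think>(.*?)</think>"
answer_pattern = r"<answer>(.*?)</answer>"

response = "<think>The net folds so that B faces up.</think><answer>B.</answer>"

think_match = re.search(think_pattern, response, re.DOTALL)
answer_match = re.search(answer_pattern, response, re.DOTALL)

op = []
if think_match and answer_match:
    final_answer = answer_match.group(1).strip()  # "B."
    pred_answer = final_answer.split(".")[0]      # "B"
    op = re.findall(r"[A-D]", pred_answer)        # ["B"]

print(op)  # ["B"], later compared against doc["Answer"]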

lmms_eval/utils.py

Lines changed: 28 additions & 0 deletions
@@ -102,6 +102,34 @@ def handle_non_serializable(o):
     return str(o)
 
 
+def is_multimodal_content(value: Any) -> bool:
+    """
+    Check if a value is multimodal content (image, audio, video) that should
+    not be serialized to log files.
+
+    Returns True for:
+    - PIL.Image objects
+    - numpy arrays (typically image/audio data)
+    - bytes (binary data)
+    - torch tensors
+    - dicts with 'array' key (HuggingFace audio format)
+    - dicts with 'bytes' key (HuggingFace image format)
+    """
+    if isinstance(value, (bytes, bytearray, np.ndarray, torch.Tensor)):
+        return True
+    if isinstance(value, dict):
+        if "array" in value or "bytes" in value:
+            return True
+    try:
+        from PIL import Image
+
+        if isinstance(value, Image.Image):
+            return True
+    except ImportError:
+        pass
+    return False
+
+
 def sanitize_list(sub):
     """
     Takes possible nested list and recursively converts all inner component to strings
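As a quick sanity check of the detection logic added above, here is a small usage sketch; the values are illustrative only, and the helper is imported from lmms_eval/utils.py as introduced in this commit:

import numpy as np
from PIL import Image

from lmms_eval.utils import is_multimodal_content

# Multimodal payloads: filtered out of --log_samples output
assert is_multimodal_content(Image.new("RGB", (8, 8)))                       # PIL image
assert is_multimodal_content(np.zeros(16000, dtype=np.float32))              # raw audio samples
assert is_multimodal_content(b"\x89PNG\r\n")                                 # binary image bytes
assert is_multimodal_content({"array": [0.0, 0.1], "sampling_rate": 16000})  # HF audio dict
assert is_multimodal_content({"bytes": b"...", "path": "img.png"})           # HF image dict

# Scalar metadata: preserved for dataset traceability
assert not is_multimodal_content("CP_0001")            # e.g. image_id
assert not is_multimodal_content("data/img/0001.png")  # e.g. image_path
assert not is_multimodal_content(42)
assert not is_multimodal_content(["A", "B", "C", "D"])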
