import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import yaml
from huggingface_hub import snapshot_download
from loguru import logger as eval_logger
from PIL import Image

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    # ... (collapsed in the source diff: the template yaml is parsed here and
    # `cache_dir` is set to the downloaded dataset snapshot path) ...
    ...

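# The four functions below follow the lmms-eval task-hook convention
# (doc_to_visual / doc_to_text / process_results / aggregate_results) and are
# referenced from the task yaml via !function.
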
def spatialviz_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
    visual = []

    category = doc["Category"]
    task = doc["Task"]
    level = doc["Level"]
    image_id = doc["Image_id"]
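    # Images live in the downloaded snapshot, laid out by category/task/level.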
    image_path = f"{cache_dir}/{category}/{task}/{level}/{image_id}.png"

    if os.path.exists(image_path):
        visual.append(Image.open(image_path).convert("RGB"))
    else:
        raise FileNotFoundError(f"image path: {image_path} does not exist.")
    return visual


def spatialviz_doc_to_text(doc: Dict[str, Any]) -> str:
    ops = ["A", "B", "C", "D"]
    prompt = (
        "You should first provide a reasoning process, then provide a single "
        "option(A, B, C or D) as the final answer. The reasoning process and "
        "the answer are enclosed within <think></think> and <answer></answer> "
        "tags, respectively, i.e., <think>reasoning process</think>, "
        "<answer>answer</answer>.\n"
    )
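    # The grading logic in spatialviz_process_results expects exactly this
    # <think>/<answer> tag format.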
    question = doc["Question"]
    choices = doc["Choices"]
    choice_text = ""
    # (reconstructed from context; the loop body is collapsed in the source
    # diff) letter each choice A-D and append it; the exact prompt assembly
    # below is an assumption.
    for i, choice in enumerate(choices):
        choice_text += f"{ops[i]}. {choice}\n"
    text = prompt + question + "\n" + choice_text
    return text


def spatialviz_process_results(
    doc: Dict[str, Any], results: List[str]
) -> Dict[str, Dict[str, Any]]:
    key_name = "spatialviz_score"
    grounded_output = doc["Answer"]
    response = results[0]
    # (reconstructed from context; collapsed in the source diff) regexes that
    # capture the tagged reasoning and answer spans.
    think_pattern = r"<think>(.*?)</think>"
    answer_pattern = r"<answer>(.*?)</answer>"

    think_match = re.search(think_pattern, response, re.DOTALL)
    answer_match = re.search(answer_pattern, response, re.DOTALL)

    op: List[str] = []
    if think_match and answer_match:
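        # Well-formed response: take the <answer> span, drop anything after
        # the first ".", and pull out the option letter.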
        final_answer = answer_match.group(1).strip()
        pred_answer = final_answer.split(".")[0]
        op = re.findall(r"[A-D]", pred_answer)
    else:
        eval_logger.debug("No match for think/answer tags in response")
        final_answer_patterns = [
            "<answer>",
            "Answer:",
            "Final answer",
            "final answer",
            "Final Answer",
            "the answer is",
            "The answer is",
            "correct answer",
            "Correct answer",
            "Correct Answer",
            "答案",  # "answer" in Chinese
            "correct path",
        ]
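        # A single-character reply is treated as the bare option letter.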
        if len(response) == 1:
            op = re.findall(r"[A-D]", response)
        else:
            # (reconstructed from context; collapsed in the source diff) scan
            # for a cue phrase and read the first A-D letter after it; the
            # exact scan below is an assumption.
            for pattern in final_answer_patterns:
                if pattern in response:
                    op = re.findall(r"[A-D]", response.split(pattern)[-1])
                    break

    # (reconstructed from context) the extracted option is compared against
    # the ground-truth letter.
    if op and op[0] == grounded_output:
        is_correct = True
    else:
        is_correct = False

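    # Everything needed for error analysis is recorded per sample; the
    # aggregator below buckets on category/task/level and reads is_correct.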
    query = spatialviz_doc_to_text(doc)
    spatialviz_submission = {
        "id": doc["Image_id"],
        "query": query,
        "gt_content": grounded_output,
        "pred": response,
        "category": doc["Category"],
        "task": doc["Task"],
        "level": doc["Level"],
        "is_correct": is_correct,
    }
    return {key_name: spatialviz_submission}

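# A minimal sketch of how these hooks are typically wired into the task yaml
# (assumed; the actual yaml for this task is not shown in this diff):
#
#   doc_to_visual: !function utils.spatialviz_doc_to_visual
#   doc_to_text: !function utils.spatialviz_doc_to_text
#   process_results: !function utils.spatialviz_process_results
#   metric_list:
#     - metric: spatialviz_score
#       aggregation: !function utils.spatialviz_aggregate_results
#       higher_is_better: true
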
def spatialviz_aggregate_results(results: List[Dict[str, Any]]) -> float:
    task_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
    category_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
    key_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
    total_samples = len(results)
    total_correct = 0

    # (reconstructed from context; collapsed in the source diff) tally each
    # sample's 0/1 score under its task, its category, and a combined key;
    # the key construction below (task + level) is an assumption.
    for result in results:
        task = result["task"]
        category = result["category"]
        key = f"{result['task']}_{result['level']}"
        if result["is_correct"]:
            total_correct += 1
            task_to_eval_samples[task].append(1)
            category_to_eval_samples[category].append(1)
            key_to_eval_samples[key].append(1)
        else:
            task_to_eval_samples[task].append(0)
            category_to_eval_samples[category].append(0)
            key_to_eval_samples[key].append(0)

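    # Per-bucket accuracy is the mean of its 0/1 scores; overall accuracy is
    # total_correct over all samples.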
    accuracy = total_correct / total_samples if total_samples > 0 else 0
    task_accuracies = {
        task: sum(scores) / len(scores) for task, scores in task_to_eval_samples.items()
    }
    category_accuracies = {
        category: sum(scores) / len(scores)
        for category, scores in category_to_eval_samples.items()
    }
    key_accuracies = {
        key: sum(scores) / len(scores) for key, scores in key_to_eval_samples.items()
    }
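
    # Emit a readable summary to the evaluation log.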
    eval_logger.info(f"{'Total Samples':<20} : {total_samples}")
    eval_logger.info(f"{'Total Correct':<20} : {total_correct}")
    eval_logger.info(f"{'Overall Accuracy':<20} : {accuracy:.4f}")

    eval_logger.info(f"{'Per-Task Accuracy':<40}")
    eval_logger.info("-" * 40)
    for task, acc in task_accuracies.items():
        eval_logger.info(f"{task:<20} : {acc:.4f}")

    eval_logger.info(f"{'Per-Category Accuracy':<40}")
    eval_logger.info("-" * 40)
    for category, acc in category_accuracies.items():
        eval_logger.info(f"{category:<20} : {acc:.4f}")
    eval_logger.info("=" * 40)

    eval_logger.info(f"{'Per-Key Accuracy':<40}")
    eval_logger.info("-" * 40)
    for key, acc in key_accuracies.items():
        eval_logger.info(f"{key:<20} : {acc:.4f}")

    return accuracy