diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index eedd6dd3e..872e75a67 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -41,6 +41,7 @@
     get_git_commit_hash,
     handle_non_serializable,
     hash_string,
+    is_multimodal_content,
     make_table,
     positional_deprecated,
     run_task_tests,
@@ -562,7 +563,8 @@ def evaluate(
                 target = task.doc_to_target(doc)
                 saved_doc = {}
                 for key, value in doc.items():
-                    saved_doc[key] = value
+                    if not is_multimodal_content(value):
+                        saved_doc[key] = value
                 filtered_arguments = []
                 for req in requests:
                     # check if req.args is a list of tuples, and each item in the list is a serializable object
diff --git a/lmms_eval/tasks/spatialviz/utils.py b/lmms_eval/tasks/spatialviz/utils.py
index fa248f451..853df7fd2 100644
--- a/lmms_eval/tasks/spatialviz/utils.py
+++ b/lmms_eval/tasks/spatialviz/utils.py
@@ -2,9 +2,11 @@
 import re
 from collections import defaultdict
 from pathlib import Path
+from typing import Any, Dict, List
 
 import yaml
 from huggingface_hub import snapshot_download
+from loguru import logger as eval_logger
 from PIL import Image
 
 with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
@@ -23,7 +25,7 @@
 )
 
 
-def spatialviz_doc_to_visual(doc):
+def spatialviz_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]:
     visual = []
 
     category = doc["Category"]
@@ -33,16 +35,21 @@ def spatialviz_doc_to_visual(doc):
     image_path = f"{cache_dir}/{category}/{task}/{level}/{image_id}.png"
 
     if os.path.exists(image_path):
-        image_path = image_path
         visual.append(Image.open(image_path).convert("RGB"))
     else:
-        raise FileExistsError(f"video path:{image_path} does not exist.")
+        raise FileNotFoundError(f"image path: {image_path} does not exist.")
     return visual
 
 
-def spatialviz_doc_to_text(doc):
+def spatialviz_doc_to_text(doc: Dict[str, Any]) -> str:
     ops = ["A", "B", "C", "D"]
-    prompt = "You should first provide a reasoning process, then provide a single option(A, B, C or D) as the final answer. The reasoning process and the answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process </think>, <answer> answer </answer>.\n"
+    prompt = (
+        "You should first provide a reasoning process, then provide a single "
+        "option(A, B, C or D) as the final answer. The reasoning process and "
+        "the answer are enclosed within <think> </think> and <answer> </answer> "
+        "tags, respectively, i.e., <think> reasoning process </think>, "
+        "<answer> answer </answer>.\n"
+    )
     question = doc["Question"]
     choices = doc["Choices"]
     choice_text = ""
@@ -53,7 +60,7 @@ def spatialviz_doc_to_text(doc):
     return text
 
 
-def spatialviz_process_results(doc, results):
+def spatialviz_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]:
     key_name = "spatialviz_score"
     grounded_output = doc["Answer"]
     response = results[0]
@@ -63,14 +70,28 @@ def spatialviz_process_results(doc, results):
     think_match = re.search(think_pattern, response, re.DOTALL)
     answer_match = re.search(answer_pattern, response, re.DOTALL)
+
+    op: List[str] = []
     if think_match and answer_match:
         final_answer = answer_match.group(1).strip()
         pred_answer = final_answer.split(".")[0]
         op = re.findall(r"[A-D]", pred_answer)
-    else:
-        print("No match for think/answer \n")
-        final_answer_patterns = ["<answer>", "Answer:", "Final answer", "final answer", "Final Answer", "the answer is", "The answer is", "correct answer", "Correct answer", "Correct Answer", "答案" "correct path"]
+    eval_logger.debug("No match for think/answer tags in response")
+    final_answer_patterns = [
+        "<answer>",
+        "Answer:",
+        "Final answer",
+        "final answer",
+        "Final Answer",
+        "the answer is",
+        "The answer is",
+        "correct answer",
+        "Correct answer",
+        "Correct Answer",
+        "答案",
+        "correct path",
+    ]
     if len(response) == 1:
         op = re.findall(r"[A-D]", response)
     else:
@@ -88,14 +109,23 @@ def spatialviz_process_results(doc, results):
         is_correct = False
 
     query = spatialviz_doc_to_text(doc)
-    spatialviz_submission = {"id": doc["Image_id"], "query": query, "gt_content": grounded_output, "pred": response, "category": doc["Category"], "task": doc["Task"], "level": doc["Level"], "is_correct": is_correct}
+    spatialviz_submission = {
+        "id": doc["Image_id"],
+        "query": query,
+        "gt_content": grounded_output,
+        "pred": response,
+        "category": doc["Category"],
+        "task": doc["Task"],
+        "level": doc["Level"],
+        "is_correct": is_correct,
+    }
     return {key_name: spatialviz_submission}
 
 
-def spatialviz_aggregate_results(results):
-    task_to_eval_samples = defaultdict(list)
-    category_to_eval_samples = defaultdict(list)
-    key_to_eval_samples = defaultdict(list)
+def spatialviz_aggregate_results(results: List[Dict[str, Any]]) -> float:
+    task_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+    category_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
+    key_to_eval_samples: Dict[str, List[int]] = defaultdict(list)
     total_samples = len(results)
     total_correct = 0
@@ -120,26 +150,25 @@ def spatialviz_aggregate_results(results):
     task_accuracies = {task: sum(scores) / len(scores) for task, scores in task_to_eval_samples.items()}
     category_accuracies = {category: sum(scores) / len(scores) for category, scores in category_to_eval_samples.items()}
     key_accuracies = {key: sum(scores) / len(scores) for key, scores in key_to_eval_samples.items()}
-    print(f"{'Total Samples':<20}: {total_samples}")
-    print(f"{'Total Correct':<20}: {total_correct}")
-    print(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
-    print()
-    print(f"{'Per-Task Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Total Samples':<20}: {total_samples}")
+    eval_logger.info(f"{'Total Correct':<20}: {total_correct}")
+    eval_logger.info(f"{'Overall Accuracy':<20}: {accuracy:.4f}")
+
+    eval_logger.info(f"{'Per-Task Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for task, acc in task_accuracies.items():
-        print(f"{task:<20}: {acc:.4f}")
-    print()
+        eval_logger.info(f"{task:<20}: {acc:.4f}")
 
-    print(f"{'Per-Category Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Per-Category Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for category, acc in category_accuracies.items():
-        print(f"{category:<20}: {acc:.4f}")
-    print("=" * 40)
+        eval_logger.info(f"{category:<20}: {acc:.4f}")
+    eval_logger.info("=" * 40)
 
-    print(f"{'Per-Key Accuracy':<40}")
-    print("-" * 40)
+    eval_logger.info(f"{'Per-Key Accuracy':<40}")
+    eval_logger.info("-" * 40)
     for key, acc in key_accuracies.items():
-        print(f"{key:<20}: {acc:.4f}")
-    print()
+        eval_logger.info(f"{key:<20}: {acc:.4f}")
+
     return accuracy
diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py
index f8f1050b2..af4e23ab1 100755
--- a/lmms_eval/utils.py
+++ b/lmms_eval/utils.py
@@ -102,6 +102,34 @@ def handle_non_serializable(o):
     return str(o)
 
 
+def is_multimodal_content(value: Any) -> bool:
+    """
+    Check if a value is multimodal content (image, audio, video) that should
+    not be serialized to log files.
+
+    Returns True for:
+    - PIL.Image objects
+    - numpy arrays (typically image/audio data)
+    - bytes (binary data)
+    - torch tensors
+    - dicts with 'array' key (HuggingFace audio format)
+    - dicts with 'bytes' key (HuggingFace image format)
+    """
+    if isinstance(value, (bytes, bytearray, np.ndarray, torch.Tensor)):
+        return True
+    if isinstance(value, dict):
+        if "array" in value or "bytes" in value:
+            return True
+    try:
+        from PIL import Image
+
+        if isinstance(value, Image.Image):
+            return True
+    except ImportError:
+        pass
+    return False
+
+
 def sanitize_list(sub):
     """
     Takes possible nested list and recursively converts all inner component to strings
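Note for reviewers: below is a minimal sketch of how the new is_multimodal_content helper is expected to classify typical doc fields once this patch is applied. The sample document is purely illustrative (it is not taken from the spatialviz dataset), and the snippet assumes an installed lmms_eval with this patch.

# Illustrative only: assumes this patch is applied and lmms_eval is importable.
import numpy as np
from PIL import Image

from lmms_eval.utils import is_multimodal_content

doc = {
    "Question": "Which option matches the rotated view?",  # plain text -> kept
    "Answer": "B",                                         # plain text -> kept
    "image": Image.new("RGB", (32, 32)),                   # PIL image -> filtered out
    "audio": {"array": np.zeros(16000), "sampling_rate": 16000},  # HF audio dict -> filtered out
}

# Mirrors the filtering loop added to evaluator.py: only non-multimodal
# fields are kept in the per-sample log entry.
saved_doc = {k: v for k, v in doc.items() if not is_multimodal_content(v)}
print(sorted(saved_doc))  # ['Answer', 'Question']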