EvolvingLMMs-Lab
diff --git a/lmms_eval/models/qwen2_5_omni.py b/lmms_eval/models/qwen2_5_omni.py
Lines changed: 4 additions & 6 deletions
diff --git a/lmms_eval/models/qwen2_vl.py b/lmms_eval/models/qwen2_vl.py
Lines changed: 1 addition & 1 deletion
diff --git a/lmms_eval/tasks/tomato/tomato.yaml b/lmms_eval/tasks/tomato/tomato.yaml
Lines changed: 25 additions & 0 deletions
@@ -11,6 +11,7 @@
 import torch
 from accelerate import Accelerator, DistributedType
 from loguru import logger as eval_logger
+from moviepy import VideoFileClip
 from PIL import Image
 from tqdm import tqdm
 from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
@@ -167,11 +168,8 @@ def resample_audio(self, audio: np.ndarray, current_sample_rate: int):
         return audio
 
     def _check_if_video_has_audio(self, video_path):
-        container = av.open(video_path)
-        audio_streams = [stream for stream in container.streams if stream.type == "audio"]
-        if not audio_streams:
-            return False
-        return True
+        with VideoFileClip(video_path) as clip:  # context manager closes the clip once the audio check is done
+            return clip.audio is not None
 
     def generate_until(self, requests: List[Instance]) -> List[str]:
         res = []
@@ -220,7 +218,7 @@ def _collate(x):
                 if len(visuals) > 0:
                     visual = visuals[i] if i < len(visuals) else None
                     if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")):  # Video file
-                        current_use_audio = True
+                        current_use_audio = self._check_if_video_has_audio(visual)
                         if self.use_custom_video_loader:
                             visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG", max_image_size=self.max_image_size)
                             image_contents = list(map(lambda x: f"data:image/jpeg;base64,{x}", visual))
 
@@ -41,7 +41,7 @@ def __init__(
         use_cache=True,
         use_flash_attention_2: Optional[bool] = False,
         max_length: Optional[int] = 2048,  # Added max_length parameter
-        max_pixels: int = 12845056,
+        max_pixels: int = 602112,
         min_pixels: int = 3136,
         max_num_frames: int = 32,
         system_prompt: Optional[str] = "You are a helpful assistant.",
 
@@ -0,0 +1,25 @@
+dataset_path: lmms-lab/TOMATO
+dataset_kwargs:
+  token: True
+  cache_dir: TOMATO
+  video: True
+task: "tomato"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.tomato_doc_to_visual
+doc_to_text: !function utils.tomato_doc_to_text
+doc_to_target: utils.tomato_doc_to_target  # NOTE(review): no !function tag here, unlike doc_to_visual/doc_to_text above, so this is a plain string - confirm whether that is intended
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.tomato_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: tomato_score
+    aggregation: !function utils.tomato_aggregate_results
+    higher_is_better: true
+