Skip to content

Commit 17de94e

Browse files
Dev/tomato (#709)
* ADD new task TOMATO * [Fix] check whether a video has an audio track, to handle non-audio videos in qwen_omni_2_5 * [Fix] max_pixels in qwen2_vl, typo in Worldsense task * delete comments * Fix linting videomathqa
1 parent 43e446b commit 17de94e

File tree

6 files changed

+401
-9
lines changed

6 files changed

+401
-9
lines changed

lmms_eval/models/qwen2_5_omni.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import torch
1212
from accelerate import Accelerator, DistributedType
1313
from loguru import logger as eval_logger
14+
from moviepy import VideoFileClip
1415
from PIL import Image
1516
from tqdm import tqdm
1617
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
@@ -167,11 +168,8 @@ def resample_audio(self, audio: np.ndarray, current_sample_rate: int):
167168
return audio
168169

169170
def _check_if_video_has_audio(self, video_path):
170-
container = av.open(video_path)
171-
audio_streams = [stream for stream in container.streams if stream.type == "audio"]
172-
if not audio_streams:
173-
return False
174-
return True
171+
clip = VideoFileClip(video_path)
172+
return clip.audio is not None
175173

176174
def generate_until(self, requests: List[Instance]) -> List[str]:
177175
res = []
@@ -220,7 +218,7 @@ def _collate(x):
220218
if len(visuals) > 0:
221219
visual = visuals[i] if i < len(visuals) else None
222220
if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")): # Video file
223-
current_use_audio = True
221+
current_use_audio = self._check_if_video_has_audio(visual)
224222
if self.use_custom_video_loader:
225223
visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG", max_image_size=self.max_image_size)
226224
image_contents = list(map(lambda x: f"data:image/jpeg;base64,{x}", visual))

lmms_eval/models/qwen2_vl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(
4141
use_cache=True,
4242
use_flash_attention_2: Optional[bool] = False,
4343
max_length: Optional[int] = 2048, # Added max_length parameter
44-
max_pixels: int = 12845056,
44+
max_pixels: int = 602112,
4545
min_pixels: int = 3136,
4646
max_num_frames: int = 32,
4747
system_prompt: Optional[str] = "You are a helpful assistant.",

lmms_eval/tasks/tomato/tomato.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
dataset_path: lmms-lab/TOMATO
2+
dataset_kwargs:
3+
token: True
4+
cache_dir: TOMATO
5+
video: True
6+
task: "tomato"
7+
test_split: test
8+
output_type: generate_until
9+
doc_to_visual: !function utils.tomato_doc_to_visual
10+
doc_to_text: !function utils.tomato_doc_to_text
11+
doc_to_target: utils.tomato_doc_to_target
12+
generation_kwargs:
13+
max_new_tokens: 1024
14+
temperature: 0
15+
top_p: 1.0
16+
num_beams: 1
17+
do_sample: false
18+
# The return value of process_results will be used by metrics
19+
process_results: !function utils.tomato_process_results
20+
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
21+
metric_list:
22+
- metric: tomato_score
23+
aggregation: !function utils.tomato_aggregate_results
24+
higher_is_better: true
25+

0 commit comments

Comments
 (0)