@@ -244,7 +244,9 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
244244 video , video_kwargs = fetch_video ({'video' : video }, return_video_sample_fps = True )
245245 if isinstance (video , torch .Tensor ):
246246 video = video .to (torch .uint8 )
247- inputs .videos [index ] = (video , video_kwargs )
247+ inputs .videos [index ] = video
248+ if self .version == 'v2_5' :
249+ inputs .mm_processor_kwargs .setdefault ('fps' , []).append (video_kwargs )
248250 return ['<|vision_start|><|video_pad|><|vision_end|>' ]
249251
250252 def replace_ref (self , ref : str , index : int , inputs : StdTemplateInputs ) -> List [Context ]:
@@ -259,14 +261,12 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
259261 input_ids = encoded ['input_ids' ]
260262 labels = encoded ['labels' ]
261263 loss_scale = encoded .get ('loss_scale' , None )
262- images = inputs .images
263- videos = [video [0 ] for video in inputs .videos ]
264- fps = [video [1 ] for video in inputs .videos ]
265264 for media_type in ['images' , 'videos' ]:
266- if locals ()[media_type ]:
265+ mm_data = getattr (inputs , media_type )
266+ if mm_data :
267267 if media_type == 'images' :
268268 media_token = self .image_token_id
269- media_inputs = processor .image_processor (images = images , return_tensors = 'pt' , do_resize = False )
269+ media_inputs = processor .image_processor (images = mm_data , return_tensors = 'pt' , do_resize = False )
270270 media_grid_thw = media_inputs ['image_grid_thw' ]
271271 else :
272272 kwargs = {}
@@ -275,10 +275,11 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
275275 else :
276276 processor_func = processor .image_processor
277277 kwargs ['images' ] = None
278- media_inputs = processor_func (videos = videos , return_tensors = 'pt' , do_resize = False , ** kwargs )
278+ media_inputs = processor_func (videos = mm_data , return_tensors = 'pt' , do_resize = False , ** kwargs )
279279 media_grid_thw = media_inputs ['video_grid_thw' ]
280280 media_token = self .video_token_id
281281 if self .version == 'v2_5' :
282+ fps = inputs .mm_processor_kwargs ['fps' ]
282283 media_inputs ['second_per_grid_ts' ] = [
283284 processor .image_processor .temporal_patch_size / tmp for tmp in fps
284285 ]
0 commit comments