@@ -198,15 +198,13 @@ def prepare_inputs(
         if visual_pos_masks is not None:
             inputs["visual_pos_masks"] = visual_pos_masks
         else:
-            inputs["visual_pos_masks"] = torch.ones(1, 1, dtype=torch.bool)
+            inputs["visual_pos_masks"] = torch.zeros(1, 1, dtype=torch.bool)
 
         if "deepstack_visual_embeds" in self.input_names:
             if isinstance(deepstack_visual_embeds, list):
                 inputs["deepstack_visual_embeds"] = torch.Tensor(deepstack_visual_embeds)
             else:
-                inputs["deepstack_visual_embeds"] = torch.ones((3, 1, 1), dtype=torch.float32)
-                print(inputs["deepstack_visual_embeds"].shape)
-
+                inputs["deepstack_visual_embeds"] = torch.zeros((3, 1, 1), dtype=torch.float32)
         if "token_type_ids" in self.input_names:
             if token_type_ids is None:
                 token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int)
@@ -216,11 +214,6 @@ def prepare_inputs(
             inputs["beam_idx"] = (
                 self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
             )
-        for key, value in inputs.items():
-            if hasattr(value, 'dtype'):
-                print(f"{key}: {value.dtype}")
-            else:
-                print(f"{key}: {type(value)}")
         return inputs
 
     def forward(
@@ -2549,27 +2542,6 @@ class QWen2VLModelOutputWithPast(ModelOutput):
     rope_deltas: Optional[torch.FloatTensor] = None
     second_per_grid_ts: Optional[torch.FloatTensor] = None
 
-
-# @dataclass
-# class QWen3VLModelOutputWithPast(ModelOutput):
-#     r"""
-#     past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-#         Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-#         `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
-#         Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-#         `past_key_values` input) to speed up sequential decoding.
-#     rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-#         The rope index difference between sequence length and multimodal rope.
-#     """
-
-#     last_hidden_state: Optional[torch.FloatTensor] = None
-#     past_key_values: Optional[list[torch.FloatTensor]] = None
-#     hidden_states: Optional[tuple[torch.FloatTensor]] = None
-#     attentions: Optional[tuple[torch.FloatTensor]] = None
-#     rope_deltas: Optional[torch.LongTensor] = None
-
-
 class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_embeddings_merger"]
 
@@ -3855,9 +3827,7 @@ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Op
         image_embeds, deepstack_image_embeds = self.get_vision_embeddings(pixel_values, image_grid_thw)
         image_embeds, deepstack_image_embeds = torch.from_numpy(image_embeds), torch.from_numpy(deepstack_image_embeds)
         deepstack_image_embeds = deepstack_image_embeds.tolist()
-        print(image_grid_thw.prod(-1))
         split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist()
-        print(image_embeds.shape)
         image_embeds = torch.split(image_embeds, split_sizes)
         return image_embeds, deepstack_image_embeds
 