@@ -349,13 +349,40 @@ class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator):
349349 "visual_pos_masks" ,
350350 "deepstack_visual_embeds" ,
351351 )
352-
352+
353+ def __init__ (
354+ self ,
355+ task : str ,
356+ normalized_config : NormalizedTextConfig ,
357+ batch_size : int = DEFAULT_DUMMY_SHAPES ["batch_size" ],
358+ sequence_length : int = DEFAULT_DUMMY_SHAPES ["sequence_length" ],
359+ num_choices : int = DEFAULT_DUMMY_SHAPES ["num_choices" ],
360+ random_batch_size_range : Optional [Tuple [int , int ]] = None ,
361+ random_sequence_length_range : Optional [Tuple [int , int ]] = None ,
362+ random_num_choices_range : Optional [Tuple [int , int ]] = None ,
363+ padding_side : str = "right" ,
364+ ** kwargs ,
365+ ):
366+ super ().__init__ (
367+ task = task ,
368+ normalized_config = normalized_config ,
369+ batch_size = batch_size ,
370+ sequence_length = sequence_length ,
371+ num_choices = num_choices ,
372+ random_batch_size_range = random_batch_size_range ,
373+ random_sequence_length_range = random_sequence_length_range ,
374+ random_num_choices_range = random_num_choices_range ,
375+ padding_side = padding_side ,
376+ ** kwargs ,
377+ )
378+ self .embed_dim = normalized_config .hidden_size
379+
353380 def generate (self , input_name : str , framework : str = "pt" , int_dtype : str = "int64" , float_dtype : str = "fp32" , bool_dtype : str = "bool" ):
354381 if input_name == "deepstack_visual_embeds" :
355- return self .random_float_tensor ([3 , 32 , 2560 ], framework = framework , dtype = float_dtype )
382+ return self .random_float_tensor ([3 , 2 * self . sequence_length , self . embed_dim ], framework = framework , dtype = float_dtype )
356383 if input_name == "visual_pos_masks" :
357384 return self .constant_tensor (
358- shape = [self .batch_size , 16 ],
385+ shape = [self .batch_size , self . sequence_length ],
359386 framework = framework ,
360387 value = 1 ,
361388 dtype = DTYPE_MAPPER .pt (bool_dtype ),
@@ -381,7 +408,7 @@ class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
381408 def inputs (self ) -> Dict [str , Dict [int , str ]]:
382409 common_inputs = super ().inputs
383410 common_inputs ["visual_pos_masks" ] = {0 : "batch_size" , 1 : "sequence_length" }
384- common_inputs ["deepstack_visual_embeds" ] = {0 : "num_layers" , 1 : "visual_seqlen" , 2 : "embed_dim" }
411+ common_inputs ["deepstack_visual_embeds" ] = {0 : "num_layers" , 1 : "visual_seqlen" }
385412 return common_inputs
386413
387414 def patch_model_for_export (
@@ -3962,7 +3989,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
39623989 }
39633990 if self ._behavior == Qwen3VLConfigBehavior .VISION_EMBEDDINGS_POS :
39643991 return {
3965- "input" : {0 : "sequence_length" , 1 : "sequence_length" },
3992+ "input" : {1 : "sequence_length" },
39663993 }
39673994
39683995
0 commit comments