    Qwen2MoEPatcher,
    Qwen2VLLanguageModelPatcher,
    Qwen2VLVisionEmbMergerPatcher,
    Qwen3MoeModelPatcher,
    Qwen3VLLanguageModelPatcher,
    Qwen3VLVisionEmbMergerPatcher,
    QwenModelPatcher,
    SanaTextEncoderModelPatcher,
@@ -164,6 +166,10 @@ def init_model_configs():
164166 "transformers" ,
165167 "AutoModelForImageTextToText" ,
166168 )
169+ TasksManager ._CUSTOM_CLASSES [("pt" , "qwen3_vl" , "image-text-to-text" )] = (
170+ "transformers" ,
171+ "AutoModelForImageTextToText" ,
172+ )
167173 TasksManager ._CUSTOM_CLASSES [("pt" , "llava_next_video" , "image-text-to-text" )] = (
168174 "transformers" ,
169175 "AutoModelForVision2Seq" ,
@@ -333,6 +339,57 @@ def patch_model_for_export(
    ) -> "ModelPatcher":
        return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)

class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "input_ids",
        "attention_mask",
        "encoder_attention_mask",
        "token_type_ids",
        "position_ids",
        "visual_pos_masks",
        "deepstack_visual_embeds",
    )

    def generate(
        self,
        input_name: str,
        framework: str = "pt",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        bool_dtype: str = "bool",
    ):
        if input_name == "deepstack_visual_embeds":
            # placeholder features of shape [num_deepstack_layers, visual_seq_len, hidden_size]
            return self.random_float_tensor([3, 32, 2560], framework=framework, dtype=float_dtype)
        if input_name == "visual_pos_masks":
            # mark every token position as "visual" in the dummy mask
            return self.constant_tensor(
                shape=[self.batch_size, 16],
                framework=framework,
                value=1,
                dtype=DTYPE_MAPPER.pt(bool_dtype),
            )
        return super().generate(input_name, framework, int_dtype, float_dtype)

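# Usage sketch (illustrative only; `normalized_cfg` stands for a NormalizedTextConfig
# built from the text config). The export machinery requests each declared input by name:
#   gen = DummyQwen3VLLMInputGenerator("text-generation", normalized_cfg, batch_size=1, sequence_length=16)
#   gen.generate("visual_pos_masks")         # bool tensor, shape [1, 16]
#   gen.generate("deepstack_visual_embeds")  # float tensor, shape [3, 32, 2560]
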
@register_in_tasks_manager(
    "qwen3_vl_text",
    *[
        "text-generation",
        "text-generation-with-past",
    ],
    library_name="transformers",
)
class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
    MIN_TRANSFORMERS_VERSION = version.parse("4.56.0")

    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLLMInputGenerator, GemmaDummyPastKeyValuesGenerator)
    DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        common_inputs = super().inputs
        common_inputs["visual_pos_masks"] = {0: "batch_size", 1: "sequence_length"}
        common_inputs["deepstack_visual_embeds"] = {0: "num_layers", 1: "visual_seqlen", 2: "embed_dim"}
        return common_inputs

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)

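# For reference, a sketch of the resulting export signature (assuming the standard
# decoder inputs contributed by TextDecoderWithPositionIdsOnnxConfig): besides
# input_ids / attention_mask / position_ids / past_key_values, the exported language
# model takes the two vision-conditioning tensors declared above:
#   "visual_pos_masks":        which token positions receive visual features
#   "deepstack_visual_embeds": per-layer visual features injected into the decoder
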
@register_in_tasks_manager(
    "qwen3_moe",
@@ -3437,6 +3494,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
        return generated_input


class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "hidden_states",
@@ -3503,6 +3562,75 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
        return self.random_int_tensor([hidden_size], max_value=hidden_size)


class DummyQwen3VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "hidden_states",
        "attention_mask",
        "window_attention_mask",
        "window_index",
        "rotary_pos_emb",
        "input",
    )

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = 1,
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = 420,
        height: int = 420,
        **kwargs,
    ):
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.num_channels = num_channels
        self.temporal_patch_size = normalized_config.config.temporal_patch_size
        self.patch_size = normalized_config.config.patch_size
        if normalized_config.use_embed_dim:
            # merger/pos-embed stages consume features in the vision embedding space
            self.embed_dim = (
                normalized_config.config.embed_dim
                if hasattr(normalized_config.config, "embed_dim")
                else normalized_config.hidden_size
            )
        else:
            # the patch-embed stage consumes flattened raw pixel patches
            self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
        self.num_heads = normalized_config.config.num_heads
        self.spatial_merge_size = getattr(normalized_config.config, "spatial_merge_size", None)

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
        grid_t = self.batch_size

        if input_name == "hidden_states":
            return self.random_float_tensor(
                [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
            )

        if input_name in ["attention_mask", "window_attention_mask"]:
            return self.random_mask_tensor(
                [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
            )

        if input_name == "rotary_pos_emb":
            dim = self.embed_dim // self.num_heads // 2
            return self.random_float_tensor([grid_t * grid_h * grid_w, dim], framework=framework, dtype=float_dtype)

        if input_name == "input":
            # fixed-size integer placeholder consumed by the position-embedding submodel
            return self.constant_tensor([4, 2520], framework=framework, value=0, dtype=DTYPE_MAPPER.pt(int_dtype))

        if input_name == "window_index":
            if self.spatial_merge_size is None:
                raise ValueError(
                    "`spatial_merge_size` parameter is not found in model config. "
                    "Can not generate dummy input data for `window_index` input"
                )
            spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
            hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit
            return self.random_int_tensor([hidden_size], max_value=hidden_size)

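
# Shape arithmetic behind the generator above, as a standalone sketch. The concrete
# numbers assume the 420x420 default used here plus illustrative patch settings
# (patch_size=16, temporal_patch_size=2); real values come from the model's vision_config.
def _dummy_vision_shapes(height=420, width=420, patch_size=16, temporal_patch_size=2, num_channels=3, batch_size=1):
    grid_h, grid_w = height // patch_size, width // patch_size  # spatial patch grid
    grid_t = batch_size  # one temporal slot per batch element
    seq_len = grid_t * grid_h * grid_w  # flattened patch sequence length
    raw_patch_dim = num_channels * temporal_patch_size * patch_size * patch_size
    return seq_len, raw_patch_dim

# _dummy_vision_shapes() -> (676, 1536): with use_embed_dim=False, "hidden_states"
# for the patch-embed stage is generated with shape [676, 1536].
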
class Qwen2VLConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
@@ -3674,6 +3802,180 @@ def patch_model_for_export(
        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return Qwen2_5_VLVisionEmbMergerPatcher(self, model, model_kwargs)
        return super().patch_model_for_export(model, model_kwargs)


class Qwen3VLConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
    VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger"
    TEXT_EMBEDDINGS = "text_embeddings"
    VISION_EMBEDDINGS_POS = "vision_embeddings_pos"

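# How the behaviors map onto exported submodels (summarizing get_model_for_behavior
# below; attribute names follow the Hugging Face Qwen3-VL module layout):
#   LANGUAGE                 -> the full model (decoder consuming merged embeddings)
#   VISION_EMBEDDINGS        -> model.visual.patch_embed
#   VISION_EMBEDDINGS_MERGER -> model.visual
#   VISION_EMBEDDINGS_POS    -> model.visual.pos_embed
#   TEXT_EMBEDDINGS          -> the language model's embed_tokens
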
@register_in_tasks_manager(
    "qwen3_vl",
    *["image-text-to-text", "video-text-to-text"],
    library_name="transformers",
)
class Qwen3_VLOpenVINOConfig(BaseVLMOpenVINOConfig):
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen3VLConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,)
    MIN_TRANSFORMERS_VERSION = version.parse("4.56.0")

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: Qwen3VLConfigBehavior = Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
    ):
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )
        self._behavior = behavior
        self._orig_config = config
        if self._behavior in (
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS,
        ) and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            # the patch-embed stage consumes flattened pixel patches; the merger and
            # pos-embed stages operate in the vision embedding space
            self._normalized_config.use_embed_dim = self._behavior != Qwen3VLConfigBehavior.VISION_EMBEDDINGS

    @staticmethod
    def get_model_for_behavior(model, behavior: Union[str, Qwen3VLConfigBehavior]):
        if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior):
            behavior = Qwen3VLConfigBehavior(behavior)

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            return model

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            vision_embeddings = model.visual.patch_embed
            vision_embeddings.config = model.config.vision_config
            return vision_embeddings

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            vision_emb_merger = model.visual
            vision_emb_merger.config = model.config.vision_config
            return vision_emb_merger

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            vision_emb_pos = model.visual.pos_embed
            vision_emb_pos.config = model.config.vision_config
            return vision_emb_pos

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = (
                model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens
            )
            text_embedding.config = model.config
            return text_embedding

    def with_behavior(
        self,
        behavior: Union[str, Qwen3VLConfigBehavior],
    ):
        """
        Creates a config for a different behavior.

        Args:
            behavior ([`Qwen3VLConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior):
            behavior = Qwen3VLConfigBehavior(behavior)

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            return get_vlm_text_embeddings_config(
                "qwen3_vl_text", self._orig_config.text_config, self.int_dtype, self.float_dtype
            )

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            return get_vlm_text_generation_config(
                "qwen3_vl_text",
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
                model_patcher=Qwen3VLLanguageModelPatcher,
                dummy_input_generator=DummyQwen2VLLMInputGenerator,
                # position_ids carry an extra leading dimension (one set of ids per rope section)
                inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
            )

        if behavior in (
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS,
        ):
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        model_kwargs = model_kwargs or {}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs)
        if self._behavior in (Qwen3VLConfigBehavior.VISION_EMBEDDINGS, Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS):
            return ModelPatcher(self, model, model_kwargs=model_kwargs)
        return super().patch_model_for_export(model, model_kwargs)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return {
                "hidden_states": {0: "sequence_length"},
                "attention_mask": {1: "sequence_length", 2: "sequence_length"},
                "rotary_pos_emb": {0: "sequence_length"},
            }
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            return {
                "input": {0: "sequence_length", 1: "sequence_length"},
            }
        return {}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            return {"last_hidden_state": {0: "seq_len"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return {"last_hidden_state": {0: "seq_len"}, "deepstack_feature_lists": {0: "seq_len"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}}
        return {}
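
# End-to-end sketch of how the pieces compose at export time (illustration only;
# `model` stands for a loaded Qwen3-VL checkpoint, and the loop mirrors how the
# other multi-submodel VLM configs in this file are driven):
#
#   export_config = Qwen3_VLOpenVINOConfig(model.config, task="image-text-to-text")
#   for behavior in Qwen3VLConfigBehavior:
#       submodel = Qwen3_VLOpenVINOConfig.get_model_for_behavior(model, behavior)
#       sub_config = export_config.with_behavior(behavior)
#       patcher = sub_config.patch_model_for_export(submodel)
#       # each (submodel, sub_config, patcher) triple is exported as a separate model
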
@register_in_tasks_manager(