146146 XverseModelPatcher ,
147147)
148148
149-
150149def init_model_configs ():
151150 if "open_clip" not in TasksManager ._LIBRARY_TO_SUPPORTED_MODEL_TYPES :
152151 TasksManager ._LIBRARY_TO_SUPPORTED_MODEL_TYPES ["open_clip" ] = {}
@@ -170,6 +169,10 @@ def init_model_configs():
170169 "transformers" ,
171170 "AutoModelForImageTextToText" ,
172171 )
172+ TasksManager ._CUSTOM_CLASSES [("pt" , "qwen3_vl_moe" , "image-text-to-text" )] = (
173+ "transformers" ,
174+ "AutoModelForImageTextToText" ,
175+ )
173176 TasksManager ._CUSTOM_CLASSES [("pt" , "llava_next_video" , "image-text-to-text" )] = (
174177 "transformers" ,
175178 "AutoModelForVision2Seq" ,
@@ -397,6 +400,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
397400 ],
398401 library_name = "transformers" ,
399402)
403+ @register_in_tasks_manager (
404+ "qwen3_vl_moe_text" ,
405+ * [
406+ "text-generation" ,
407+ "text-generation-with-past" ,
408+ ],
409+ library_name = "transformers" ,
410+ )
400411class Qwen3VLTextOpenVINOConfig (TextDecoderWithPositionIdsOnnxConfig ):
401412 MIN_TRANSFORMERS_VERSION = "4.56.0"
402413
@@ -4004,6 +4015,68 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
40044015 return {}
40054016
40064017
@register_in_tasks_manager(
    "qwen3_vl_moe",
    "image-text-to-text",
    "video-text-to-text",
    library_name="transformers",
)
class Qwen3_VL_MOEOpenVINOConfig(Qwen3_VLOpenVINOConfig):
    """Config registered for the ``qwen3_vl_moe`` model type.

    Everything is inherited from ``Qwen3_VLOpenVINOConfig`` except
    ``with_behavior``, which routes the text sub-models through the
    ``qwen3_vl_moe_text`` model type.
    """

    def with_behavior(
        self,
        behavior: Union[str, Qwen3VLConfigBehavior],
    ):
        """
        Build the config that corresponds to one sub-model behavior.

        Args:
            behavior ([`Qwen3VLConfigBehavior`] or `str`):
                The behavior to use for the new instance. A plain string is
                converted to a `Qwen3VLConfigBehavior` member first.

        Returns:
            For `TEXT_EMBEDDINGS`, the result of
            `get_vlm_text_embeddings_config`; for `LANGUAGE`, the result of
            `get_vlm_text_generation_config`; for the three vision behaviors,
            a new instance of this class carrying `behavior`. Any other
            behavior yields `None`.
        """
        # Normalise plain strings to the enum; enum members pass through untouched.
        needs_conversion = isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior)
        if needs_conversion:
            behavior = Qwen3VLConfigBehavior(behavior)

        text_model_type = "qwen3_vl_moe_text"

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            return get_vlm_text_embeddings_config(
                text_model_type,
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
            )

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            position_ids_axes = {1: "batch_size", 2: "sequence_length"}
            return get_vlm_text_generation_config(
                text_model_type,
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
                model_patcher=Qwen3VLLanguageModelPatcher,
                dummy_input_generator=DummyQwen2VLLMInputGenerator,
                inputs_update={"position_ids": position_ids_axes},
            )

        # All three vision behaviors are handled the same way: clone this
        # config with the requested behavior attached.
        wants_vision_part = (
            behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS
            or behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER
            or behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS
        )
        if not wants_vision_part:
            # No branch matched; callers receive None.
            return None

        return self.__class__(
            self._orig_config,
            task=self.task,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
            behavior=behavior,
            preprocessors=self._preprocessors,
        )
4078+
4079+
40074080@register_in_tasks_manager (
40084081 "glm" ,
40094082 * [
0 commit comments