Skip to content

Commit 6c88fbf

Browse files
add qwen3vl_moe support
1 parent 047e30b commit 6c88fbf

File tree

3 files changed

+78
-4
lines changed

3 files changed

+78
-4
lines changed

optimum/exporters/openvino/model_configs.py

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@
146146
XverseModelPatcher,
147147
)
148148

149-
150149
def init_model_configs():
151150
if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
152151
TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
@@ -170,6 +169,10 @@ def init_model_configs():
170169
"transformers",
171170
"AutoModelForImageTextToText",
172171
)
172+
TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl_moe", "image-text-to-text")] = (
173+
"transformers",
174+
"AutoModelForImageTextToText",
175+
)
173176
TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = (
174177
"transformers",
175178
"AutoModelForVision2Seq",
@@ -397,6 +400,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
397400
],
398401
library_name="transformers",
399402
)
403+
@register_in_tasks_manager(
404+
"qwen3_vl_moe_text",
405+
*[
406+
"text-generation",
407+
"text-generation-with-past",
408+
],
409+
library_name="transformers",
410+
)
400411
class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
401412
MIN_TRANSFORMERS_VERSION = "4.56.0"
402413

@@ -4004,6 +4015,68 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
40044015
return {}
40054016

40064017

4018+
@register_in_tasks_manager(
    "qwen3_vl_moe",
    "image-text-to-text",
    "video-text-to-text",
    library_name="transformers",
)
class Qwen3_VL_MOEOpenVINOConfig(Qwen3_VLOpenVINOConfig):
    """Export config registered for the ``qwen3_vl_moe`` model type.

    Inherits everything from ``Qwen3_VLOpenVINOConfig`` and overrides only
    ``with_behavior`` so that the text sub-configs are requested under the
    ``qwen3_vl_moe_text`` model type.
    """

    def with_behavior(
        self,
        behavior: Union[str, Qwen3VLConfigBehavior],
    ):
        """
        Build the config that corresponds to the requested behavior.

        Args:
            behavior ([`Qwen3VLConfigBehavior`] or `str`):
                The behavior to use for the new instance. A plain string is
                converted to the enum first.

        Returns:
            The object returned by ``get_vlm_text_embeddings_config`` or
            ``get_vlm_text_generation_config`` for the text behaviors, a new
            instance of this class for the three vision behaviors, and
            ``None`` for any other value.
        """
        # Only plain strings are converted; enum members are used untouched.
        is_plain_string = isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior)
        if is_plain_string:
            behavior = Qwen3VLConfigBehavior(behavior)

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            return get_vlm_text_embeddings_config(
                "qwen3_vl_moe_text",
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
            )

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            position_ids_axes = {1: "batch_size", 2: "sequence_length"}
            return get_vlm_text_generation_config(
                "qwen3_vl_moe_text",
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
                model_patcher=Qwen3VLLanguageModelPatcher,
                dummy_input_generator=DummyQwen2VLLMInputGenerator,
                inputs_update={"position_ids": position_ids_axes},
            )

        # The three vision behaviors are built the same way: a fresh instance
        # of this class carrying the requested behavior.
        vision_behaviors = (
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS,
        )
        if behavior in vision_behaviors:
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

        # No branch matched: same implicit result as falling off the end.
        return None
4078+
4079+
40074080
@register_in_tasks_manager(
40084081
"glm",
40094082
*[

optimum/exporters/openvino/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ def get_submodels(model):
229229
"qwen2_vl",
230230
"qwen2_5_vl",
231231
"qwen3_vl",
232+
"qwen3_vl_moe",
232233
"got_ocr2",
233234
"gemma3",
234235
"idefics3",

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def prepare_inputs(
189189
if past_len:
190190
position_ids = position_ids[:, -inputs_embeds.shape[1] :]
191191

192-
if (self.config.model_type == "qwen2_vl" or self.config.model_type == "qwen3_vl") and position_ids.ndim != 3:
192+
if (self.config.model_type == "qwen2_vl" or self.config.model_type == "qwen3_vl" or self.config.model_type == "qwen3_vl_moe") and position_ids.ndim != 3:
193193
position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0)
194194

195195
inputs["position_ids"] = position_ids
@@ -230,7 +230,6 @@ def forward(
230230
**kwargs,
231231
):
232232
self.compile()
233-
234233
inputs = self.prepare_inputs(
235234
input_ids=input_ids,
236235
attention_mask=attention_mask,
@@ -787,7 +786,7 @@ def forward(
787786
):
788787
if pixel_values is None:
789788
pixel_values = images if images is not None else image_pixel_values
790-
if self.config.model_type == "qwen3_vl":
789+
if self.config.model_type == "qwen3_vl" or self.config.model_type == "qwen3_vl_moe":
791790
inputs_embeds, attention_mask, position_ids, visual_pos_masks, deepstack_visual_embeds = self.get_multimodal_embeddings(
792791
input_ids,
793792
pixel_values,
@@ -4986,6 +4985,7 @@ def preprocess_inputs(
49864985
"qwen2_vl": _OVQwen2VLForCausalLM,
49874986
"qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
49884987
"qwen3_vl": _OVQwen3VLForCausalLM,
4988+
"qwen3_vl_moe": _OVQwen3VLForCausalLM,
49894989
"got_ocr2": _OVGotOCR2ForCausalLM,
49904990
"gemma3": _OVGemma3ForCausalLM,
49914991
"idefics3": _OVIdefics3ForCausalLM,

0 commit comments

Comments
 (0)