
Commit 2c95f78

add qwen3_vl
1 parent b9c151f commit 2c95f78

3 files changed: +1078 −35 lines

optimum/exporters/openvino/model_configs.py

Lines changed: 302 additions & 0 deletions
@@ -138,6 +138,8 @@
    Qwen2MoEPatcher,
    Qwen2VLLanguageModelPatcher,
    Qwen2VLVisionEmbMergerPatcher,
    Qwen3MoeModelPatcher,
    Qwen3VLLanguageModelPatcher,
    Qwen3VLVisionEmbMergerPatcher,
    QwenModelPatcher,
    SanaTextEncoderModelPatcher,
@@ -164,6 +166,10 @@ def init_model_configs():
        "transformers",
        "AutoModelForImageTextToText",
    )
    TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl", "image-text-to-text")] = (
        "transformers",
        "AutoModelForImageTextToText",
    )
    TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = (
        "transformers",
        "AutoModelForVision2Seq",
@@ -333,6 +339,57 @@ def patch_model_for_export(
    ) -> "ModelPatcher":
        return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)


class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "input_ids",
        "attention_mask",
        "encoder_attention_mask",
        "token_type_ids",
        "position_ids",
        "visual_pos_masks",
        "deepstack_visual_embeds",
    )

    def generate(
        self,
        input_name: str,
        framework: str = "pt",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        bool_dtype: str = "bool",
    ):
        if input_name == "deepstack_visual_embeds":
            return self.random_float_tensor([3, 32, 2560], framework=framework, dtype=float_dtype)
        if input_name == "visual_pos_masks":
            return self.constant_tensor(
                shape=[self.batch_size, 16],
                framework=framework,
                value=1,
                dtype=DTYPE_MAPPER.pt(bool_dtype),
            )
        return super().generate(input_name, framework, int_dtype, float_dtype)


@register_in_tasks_manager(
    "qwen3_vl_text",
    *[
        "text-generation",
        "text-generation-with-past",
    ],
    library_name="transformers",
)
class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
    MIN_TRANSFORMERS_VERSION = version.parse("4.56.0")

    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLLMInputGenerator, GemmaDummyPastKeyValuesGenerator)
    DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        common_inputs = super().inputs
        common_inputs["visual_pos_masks"] = {0: "batch_size", 1: "sequence_length"}
        common_inputs["deepstack_visual_embeds"] = {0: "num_layers", 1: "visual_seqlen", 2: "embed_dim"}
        return common_inputs

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
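
For reference, a minimal sketch in plain `torch` of the two inputs this config adds on top of a standard decoder, with shapes copied from the hard-coded defaults in `DummyQwen3VLLMInputGenerator` above. Judging by the dynamic axes, `visual_pos_masks` flags which sequence positions hold visual tokens, and `deepstack_visual_embeds` carries one slice of visual features per deepstack layer.

import torch

# Shapes mirror the dummy-generator defaults above; illustration only.
batch_size, seq_len = 1, 16
num_layers, visual_seqlen, embed_dim = 3, 32, 2560

visual_pos_masks = torch.ones(batch_size, seq_len, dtype=torch.bool)
deepstack_visual_embeds = torch.rand(num_layers, visual_seqlen, embed_dim)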


@register_in_tasks_manager(
    "qwen3_moe",
@@ -3437,6 +3494,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
        return generated_input


class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "hidden_states",
@@ -3503,6 +3562,75 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
        return self.random_int_tensor([hidden_size], max_value=hidden_size)


class DummyQwen3VLVisionEmbedInputGenerator(DummyVisionInputGenerator):
    SUPPORTED_INPUT_NAMES = (
        "hidden_states",
        "attention_mask",
        "window_attention_mask",
        "window_index",
        "rotary_pos_emb",
        "input",
    )

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = 1,
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = 420,
        height: int = 420,
        **kwargs,
    ):
        self.batch_size = batch_size
        self.height = height
        self.width = width
        self.num_channels = num_channels
        self.temporal_patch_size = normalized_config.config.temporal_patch_size
        self.patch_size = normalized_config.config.patch_size
        if normalized_config.use_embed_dim:
            self.embed_dim = (
                normalized_config.config.embed_dim
                if hasattr(normalized_config.config, "embed_dim")
                else normalized_config.hidden_size
            )
        else:
            self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
        self.num_heads = normalized_config.config.num_heads
        self.spatial_merge_size = None
        if hasattr(normalized_config.config, "spatial_merge_size"):
            self.spatial_merge_size = normalized_config.config.spatial_merge_size

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
        grid_t = self.batch_size

        if input_name == "hidden_states":
            return self.random_float_tensor(
                [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
            )

        if input_name in ["attention_mask", "window_attention_mask"]:
            return self.random_mask_tensor(
                [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
            )

        if input_name == "rotary_pos_emb":
            dim = self.embed_dim // self.num_heads // 2
            return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype)

        if input_name == "input":
            return self.constant_tensor([4, 2520], framework=framework, value=0, dtype=DTYPE_MAPPER.pt(int_dtype))

        if input_name == "window_index":
            if self.spatial_merge_size is None:
                raise ValueError(
                    "`spatial_merge_size` parameter is not found in model config. Cannot generate dummy input data for `window_index` input"
                )
            spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
            hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit
            return self.random_int_tensor([hidden_size], max_value=hidden_size)
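
As a quick sanity check of the shapes this generator emits, here is the arithmetic with assumed Qwen-VL-style values (`patch_size=14`, `temporal_patch_size=2`; real values come from the model's `vision_config`):

# Assumed values for illustration; not read from a real config.
patch_size, temporal_patch_size, num_channels = 14, 2, 3
height = width = 420
grid_t = 1  # batch_size doubles as the temporal grid size here

grid_h, grid_w = height // patch_size, width // patch_size  # 30, 30
embed_dim = num_channels * temporal_patch_size * patch_size**2  # 1176

# "hidden_states" dummy input shape: (grid_t * grid_h * grid_w, embed_dim)
print((grid_t * grid_h * grid_w, embed_dim))  # (900, 1176)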


class Qwen2VLConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
@@ -3674,6 +3802,180 @@ def patch_model_for_export(
        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return Qwen2_5_VLVisionEmbMergerPatcher(self, model, model_kwargs)
        return super().patch_model_for_export(model, model_kwargs)


class Qwen3VLConfigBehavior(str, enum.Enum):
    LANGUAGE = "language"
    VISION_EMBEDDINGS = "vision_embeddings"
    VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger"
    TEXT_EMBEDDINGS = "text_embeddings"
    VISION_EMBEDDINGS_POS = "vision_embeddings_pos"


@register_in_tasks_manager(
    "qwen3_vl",
    *["image-text-to-text", "video-text-to-text"],
    library_name="transformers",
)
class Qwen3_VLOpenVINOConfig(BaseVLMOpenVINOConfig):
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen3VLConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,)
    MIN_TRANSFORMERS_VERSION = version.parse("4.56.0")

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: Qwen3VLConfigBehavior = Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
    ):
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )
        self._behavior = behavior
        self._orig_config = config
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            self._normalized_config.use_embed_dim = False
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            self._normalized_config.use_embed_dim = True
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
            self._normalized_config.use_embed_dim = True

    @staticmethod
    def get_model_for_behavior(model, behavior: Union[str, Qwen3VLConfigBehavior]):
        if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior):
            behavior = Qwen3VLConfigBehavior(behavior)

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            return model

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            vision_embeddings = model.visual.patch_embed
            vision_embeddings.config = model.config.vision_config
            return vision_embeddings

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            vision_emb_merger = model.visual
            vision_emb_merger.config = model.config.vision_config
            return vision_emb_merger

        if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            vision_emb_pos = model.visual.pos_embed
            vision_emb_pos.config = model.config.vision_config
            return vision_emb_pos

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = (
                model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens
            )
            text_embedding.config = model.config
            return text_embedding

    def with_behavior(
        self,
        behavior: Union[str, Qwen3VLConfigBehavior],
    ):
        """
        Creates a config for a different behaviour.

        Args:
            behavior ([`ConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior):
            behavior = Qwen3VLConfigBehavior(behavior)

        if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS:
            return get_vlm_text_embeddings_config(
                "qwen3_vl_text", self._orig_config.text_config, self.int_dtype, self.float_dtype
            )

        if behavior == Qwen3VLConfigBehavior.LANGUAGE:
            return get_vlm_text_generation_config(
                "qwen3_vl_text",
                self._orig_config.text_config,
                self.int_dtype,
                self.float_dtype,
                model_patcher=Qwen3VLLanguageModelPatcher,
                dummy_input_generator=DummyQwen2VLLMInputGenerator,
                inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
            )

        if behavior in (
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER,
            Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS,
        ):
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        model_kwargs = model_kwargs or {}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs)
        if self._behavior in (Qwen3VLConfigBehavior.VISION_EMBEDDINGS, Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS):
            return ModelPatcher(self, model, model_kwargs=model_kwargs)
        return super().patch_model_for_export(model, model_kwargs)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return {
                "hidden_states": {0: "sequence_length"},
                "attention_mask": {1: "sequence_length", 2: "sequence_length"},
                "rotary_pos_emb": {0: "sequence_length"},
            }
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            return {
                "input": {0: "sequence_length", 1: "sequence_length"},
            }
        return {}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:
            return {"last_hidden_state": {0: "seq_len"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
            return {"last_hidden_state": {0: "seq_len"}, "deepstack_feature_lists": {0: "seq_len"}}
        if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS:
            return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}}
        return {}


@register_in_tasks_manager(
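
Taken together, an export pairs each behavior with its submodel and config roughly as in this hand-written sketch (`model` is a loaded Qwen3-VL checkpoint; the exporter's real loop and IR serialization are omitted):

# Sketch only: how behavior-specific configs and submodels line up.
config = Qwen3_VLOpenVINOConfig(model.config, task="image-text-to-text")
for behavior in Qwen3VLConfigBehavior:
    sub_config = config.with_behavior(behavior)
    submodel = Qwen3_VLOpenVINOConfig.get_model_for_behavior(model, behavior)
    # each (submodel, sub_config) pair becomes its own OpenVINO IR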
