From 546952df2bf3bebd036a090af4056cc64287ee06 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Tue, 15 Jul 2025 20:06:32 -0700
Subject: [PATCH 1/7] add glm4v support

---
 optimum/exporters/openvino/model_configs.py | 212 +++++++++++++++
 optimum/exporters/openvino/model_patcher.py | 279 ++++++++++++++++++++
 2 files changed, 491 insertions(+)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 2577836967..e4f28d2bd9 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -96,6 +96,9 @@
     FluxTransfromerModelPatcher,
     Gemma2ModelPatcher,
     Gemma3LMModelPatcher,
+    Glm4vVisionEmbMergerPatcher,
+    Glm4vVisionEmbeddingsPatcher,
+    Glm4vLanguageModelPatcher,
     GptBigCodeModelPatcher,
     GptJModelPatcher,
     GptNeoModelPatcher,
@@ -154,6 +157,10 @@
 def init_model_configs():
     if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
         TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
+    TasksManager._CUSTOM_CLASSES[("pt", "glm4v", "image-text-to-text")] = (
+        "transformers",
+        "Glm4vForConditionalGeneration",
+    )
     TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = (
         "transformers",
         "LlavaForConditionalGeneration",
@@ -4490,3 +4497,208 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
                 )
 
         return dummy_inputs
+
+
+class DummyGlm4vVisionEmbedInputGenerator(DummyVisionInputGenerator):  # dummy inputs for the GLM-4V vision exports
+    SUPPORTED_INPUT_NAMES = (
+        "hidden_states",
+        "seqlens",
+        "grid_thw",
+        "attention_mask",
+        "image_type_ids",
+        "rotary_pos_emb",
+    )
+
+    def __init__(
+        self,
+        task: str,  # not used by this generator
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = 1,
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = 420,
+        height: int = 420,
+        **kwargs,  # accepted but not used
+    ):  # note: the parent class __init__ is not called; all state is set below
+        self.batch_size = batch_size
+        self.height = height
+        self.width = width
+        self.num_channels = num_channels
+        self.temporal_patch_size = normalized_config.config.temporal_patch_size
+        self.patch_size = normalized_config.config.patch_size
+        if normalized_config.use_embed_dim:  # flag is set on the normalized config by Glm4vOpenVINOConfig
+            self.embed_dim = (
+                normalized_config.config.embed_dim
+                if hasattr(normalized_config.config, "embed_dim")
+                else normalized_config.hidden_size
+            )
+        else:  # width of one flattened patch: channels * temporal_patch_size * patch_size * patch_size
+            self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
+        self.num_heads = normalized_config.config.num_heads
+        self.spatial_merge_size = None  # stays None when the config has no spatial_merge_size
+        if hasattr(normalized_config.config, "spatial_merge_size"):
+            self.spatial_merge_size = normalized_config.config.spatial_merge_size
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size  # patches per side
+        grid_t = self.batch_size  # the batch size is used as the temporal grid dimension
+        import torch  # local import; used for the fixed-value integer tensors below
+        if input_name == "hidden_states":
+            return self.random_float_tensor(
+                [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "seqlens":
+            return torch.tensor([grid_t * grid_h * grid_w], dtype=torch.int64)  # framework/int_dtype not applied
+        # "window_attention_mask" is also accepted here although it is not listed in SUPPORTED_INPUT_NAMES.
+        if input_name in ["attention_mask", "window_attention_mask"]:
+            return self.random_mask_tensor(
+                [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "rotary_pos_emb":
+            dim = self.embed_dim // self.num_heads // 2  # half of the per-head dimension
+            return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype)
+
+        if input_name == "image_type_ids":
+            return self.random_int_tensor(
+                [grid_t * grid_h * grid_w, 2], max_value=grid_h, framework=framework, dtype=int_dtype
+            )
+        # grid_thw is the last handled name; any other input name falls through and returns None.
+        if input_name == "grid_thw":
+            return torch.tensor([[grid_t, grid_h, grid_w]], dtype=torch.int64)  # framework/int_dtype not applied
+
+@register_in_tasks_manager("glm4v", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
+class Glm4vOpenVINOConfig(BaseVLMOpenVINOConfig):  # export config for GLM-4V, one instance per behavior
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]  # reuses the Qwen2-VL enum
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyGlm4vVisionEmbedInputGenerator,)
+    MIN_TRANSFORMERS_VERSION = version.parse("4.54.0")
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: Qwen2VLConfigBehavior = Qwen2VLConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+        **kwargs,  # accepted but not forwarded to the parent constructor
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config  # full model config, reused by with_behavior()
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self._normalized_config.use_embed_dim = False  # read by DummyGlm4vVisionEmbedInputGenerator
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self._normalized_config.use_embed_dim = True  # read by DummyGlm4vVisionEmbedInputGenerator
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior):
+            behavior = Qwen2VLConfigBehavior(behavior)  # plain strings are converted to the enum member
+
+        if behavior == Qwen2VLConfigBehavior.LANGUAGE:
+            return model  # the language behavior works on the full model
+
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
+            vision_embeddings = model.visual  # same submodule as the merger; the patcher picks the forward
+            vision_embeddings.config = model.config.vision_config
+            return vision_embeddings
+
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            vision_emb_merger = model.visual
+            vision_emb_merger.config = model.config.vision_config
+            return vision_emb_merger
+
+        if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = (
+                model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens
+            )
+            text_embedding.config = model.config
+            return text_embedding  # any behavior not matched above falls through and returns None
+
+    def with_behavior(
+        self,
+        behavior: Union[str, Qwen2VLConfigBehavior],
+    ):
+        """
+        Creates the export config to use for the requested behavior.
+        Args:
+            behavior (`str` or [`Qwen2VLConfigBehavior`]):
+                The behavior to use for the new instance; strings are converted to the enum member.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior):
+            behavior = Qwen2VLConfigBehavior(behavior)
+
+        if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS:  # NOTE(review): "qwen2" configs are reused - confirm
+            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == Qwen2VLConfigBehavior.LANGUAGE:
+            return get_vlm_text_generation_config(
+                "qwen2",
+                self._orig_config,
+                self.int_dtype,
+                self.float_dtype,
+                model_patcher=Glm4vLanguageModelPatcher,
+                dummy_input_generator=DummyQwen2VLLMInputGenerator,
+                inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
+            )
+
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+        if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:  # same construction as the branch above
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            return Glm4vVisionEmbMergerPatcher(self, model, model_kwargs)
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
+            return Glm4vVisionEmbeddingsPatcher(self, model, model_kwargs=model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)  # other behaviors use the parent's patcher
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS:
+            return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}}
+        if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER:
+            return {
+                "hidden_states": {0: "sequence_length"},
+                "seqlens": {0: "sequence_length"},  # NOTE(review): dummy seqlens has length 1 - confirm axis name
+                "grid_thw": {0: "sequence_length"},  # NOTE(review): dummy grid_thw has a single row - confirm
+                "attention_mask": {1: "sequence_length", 2: "sequence_length"},
+                "image_type_ids": {0: "sequence_length"},
+                "rotary_pos_emb": {0: "sequence_length"},
+            }  # other behaviors fall through and this property returns None
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]:
+            return {"last_hidden_state": {0: "seq_len"}}
+        return {}  # no outputs are declared here for the remaining behaviors
\ No newline at end of file
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 0ffb612d6b..5478bbcf0a 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -6754,3 +6754,282 @@ def __exit__(self, exc_type, exc_value, traceback):
         setattr(self._model, self.orig_forward_name, self.orig_forward)
         for layer in self._model.backbone.layers:
             layer.mixer.forward = layer.mixer._orig_forward
+
+def _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    cache_position: torch.Tensor,
+    batch_size: int,
+    **kwargs,  # accepted and ignored
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+            `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache,
+            to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to place the 4D attention mask on.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`int`):
+            Batch size.
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        min_dtype = torch.finfo(dtype).min  # most negative finite value of dtype, used as the "masked" fill
+        causal_mask = torch.full(
+            (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+        )
+        if sequence_length != 1:  # a single query row skips the triangular step
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)  # 2D -> 4D, one copy per batch
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
+            padding_mask = padding_mask == 0  # True where the causal entry is 0 and attention_mask is 0 too
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+    return causal_mask  # either the caller's 4D mask or the one built above
+            
+def _glm4v_update_causal_mask(
+    attention_mask,
+    input_tensor,
+    cache_position,
+    past_key_values,
+    output_attentions: bool = False,
+):
+    """Build the 4D causal mask for the patched text-model forward; `output_attentions` is unused."""
+    from transformers.cache_utils import StaticCache
+
+    # No SDPA `is_causal` shortcut is taken here: an explicit 4D mask is always built.
+    # Key/value length: `get_max_cache_shape()` when a StaticCache is used, otherwise the width of
+    # the given attention mask, otherwise past + current + 1.
+    past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+    using_static_cache = isinstance(past_key_values, StaticCache)
+
+    dtype, device = input_tensor.dtype, input_tensor.device
+    sequence_length = input_tensor.shape[1]
+    if using_static_cache:
+        target_length = past_key_values.get_max_cache_shape()
+    else:
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length + 1
+        )
+
+    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+    causal_mask = _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask,
+        sequence_length=sequence_length,
+        target_length=target_length,
+        dtype=dtype,
+        device=device,
+        cache_position=cache_position,
+        batch_size=input_tensor.shape[0],
+    )
+
+    return causal_mask
+
+class Glm4vLanguageModelPatcher(DecoderModelPatcher):  # swaps language_model.forward for an export-friendly one
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        model.language_model.__orig_forward = model.language_model.forward  # restored in __exit__
+
+        def forward_wrap(
+            self,
+            input_ids=None,  # every forward argument defaults to None so callers may omit any of them
+            attention_mask=None,
+            position_ids=None,
+            past_key_values=None,
+            inputs_embeds=None,
+            use_cache=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            cache_position=None,
+            **kwargs,  # forwarded unchanged to every decoder layer
+        ) -> Union[tuple, BaseModelOutputWithPast]:
+            from transformers.cache_utils import DynamicCache
+
+            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+            output_hidden_states = (
+                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            )
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+            if (input_ids is None) ^ (inputs_embeds is not None):  # also raised when neither is given
+                raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+            # torch.jit.trace() doesn't support cache objects in the output
+            if use_cache and past_key_values is None and not torch.jit.is_tracing():
+                past_key_values = DynamicCache()
+
+            if inputs_embeds is None:
+                inputs_embeds = self.embed_tokens(input_ids)
+
+            if cache_position is None:  # default: positions continue after the tokens already in the cache
+                past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+                cache_position = torch.arange(
+                    past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+                )
+
+            # the hard coded `3` is for temporal, height and width.
+            if position_ids is None:
+                position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+            elif position_ids.dim() == 2:  # 2D ids are replicated across the 3 position axes
+                position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+            # Build the explicit 4D causal mask from the given attention_mask (which may be None).
+            causal_mask = _glm4v_update_causal_mask(
+                attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+            )
+
+            hidden_states = inputs_embeds
+
+            # create position embeddings to be shared across the decoder layers
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+            # decoder layers
+            all_hidden_states = () if output_hidden_states else None
+            all_self_attns = () if output_attentions else None
+
+            for decoder_layer in self.layers:
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    position_embeddings=position_embeddings,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    **kwargs,
+                )
+
+                hidden_states = layer_outputs[0]
+
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)
+
+            hidden_states = self.norm(hidden_states)
+
+            # add hidden states from the last decoder layer
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            return BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=past_key_values if use_cache else None,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+            )
+
+        model.language_model.forward = types.MethodType(forward_wrap, model.language_model)
+        super().__init__(config, model, model_kwargs)  # runs after the swap, as in the other Glm4v patchers
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.language_model.forward = self._model.language_model.__orig_forward  # undo the swap
+        
+def glm4v_vision_embeddings_forward(self, hidden_states: torch.FloatTensor):
+    """Apply the module's patch embedding followed by its post-conv layer norm."""
+    patch_embeds = self.patch_embed(hidden_states)
+    return self.post_conv_layernorm(patch_embeds)
+
+class Glm4vVisionEmbeddingsPatcher(ModelPatcher):  # installs glm4v_vision_embeddings_forward as model.forward
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward  # saved so __exit__ can restore it
+        model.forward = types.MethodType(glm4v_vision_embeddings_forward, model)
+        super().__init__(config, model, model_kwargs)  # called after the swap
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward  # put the saved forward back
+            
+
+class Glm4vVisionEmbMergerPatcher(ModelPatcher):  # installs a trace-friendly forward on the vision model
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        # The replacement forward is installed first and ModelPatcher.__init__ is called once, afterwards
+        # (as in Glm4vVisionEmbeddingsPatcher); the saved forward below is restored in __exit__.
+        model.__orig_forward = model.forward
+
+        # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L405
+        # takes a precomputed attention_mask input instead of processing cu_lens inside the model
+        # (unsupported by tracing due to cycle with dynamic len)
+        # patch_embed is not called here and rotary_pos_emb is passed in as an input
+        def image_embed_forward(
+            self,
+            hidden_states: torch.Tensor,
+            seqlens: torch.Tensor,
+            grid_thw: torch.Tensor,
+            attention_mask: torch.Tensor,
+            image_type_ids: torch.Tensor,
+            rotary_pos_emb: torch.Tensor,
+        ) -> torch.Tensor:
+
+            hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
+
+            for blk in self.blocks:
+                hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
+
+            hidden_states = self.post_layernorm(hidden_states)
+
+            hidden_states = hidden_states.view(
+                -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
+            )
+            hidden_states = hidden_states.permute(0, 3, 1, 2)  # move the feature dimension ahead of the merge window
+            hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
+
+            hidden_states = self.merger(hidden_states)
+            return hidden_states
+
+        model.forward = types.MethodType(image_embed_forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __enter__(self):
+        patch_qwen2vl_vision_blocks(self._model, force_new_behaviour=True)  # block patching is undone in __exit__
+        super().__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+        for block in self._model.blocks:  # restore the forwards saved as `_orig_forward` on each block
+            block.forward = block._orig_forward
+            block.attn.forward = block.attn._orig_forward
\ No newline at end of file

From 1e7f17ccc7a7fa3e4fcba790d7f5ce3416cf63f6 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Thu, 17 Jul 2025 20:21:21 -0700
Subject: [PATCH 2/7] add vlm pipeline

---
 .../openvino/modeling_visual_language.py      | 306 ++++++++++++++++++
 1 file changed, 306 insertions(+)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index f1d0ccb16f..f088bcd77b 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4348,6 +4348,311 @@ def preprocess_inputs(
         inputs = processor(images=image, text=text_prompt, return_tensors="pt")
         return inputs
 
+class _OVGlm4vForCausalLM(_OVQwen2VLForCausalLM):
+    additional_parts = ["vision_embeddings_merger"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = None,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super(_OVQwen2VLForCausalLM, self).__init__(  # bypasses _OVQwen2VLForCausalLM.__init__ itself
+            language_model=language_model,
+            text_embeddings=text_embeddings,
+            vision_embeddings=vision_embeddings,
+            config=config,
+            device=device,
+            dynamic_shapes=dynamic_shapes,
+            ov_config=ov_config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+        self.rope_deltas = None  # cache rope_deltas here
+
+        if is_transformers_version(">=", "4.53.0"):  # the glm4v module is imported only behind this guard
+            from transformers.models.glm4v.modeling_glm4v import (
+                Glm4vVisionRotaryEmbedding,
+            )
+
+            self._rotary_pos_emb = Glm4vVisionRotaryEmbedding(
+                self.config.vision_config.hidden_size // self.config.vision_config.num_heads // 2
+            )
+        else:  # the reported minimum matches the version checked above
+            raise ValueError(
+                f"Initialization model for {self.config.model_type} required at least transformers >= 4.53.0"
+            )
+            
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
+        """
+
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_start_token_id = self.config.video_start_token_id
+        video_end_token_id = self.config.video_end_token_id
+
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                input_type_group = []
+                for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+
+                for modality_type, start_idx, end_idx in input_type_group:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                        h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                        w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                        llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+
+                            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten()
+                            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten()
+                            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx)
+
+                        video_index += 1
+
+                        video_frame_num += 1
+
+                    else:
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+            
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)  # row index of every patch, shape (h, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.config.vision_config.spatial_merge_size,
+                self.config.vision_config.spatial_merge_size,
+                w // self.config.vision_config.spatial_merge_size,
+                self.config.vision_config.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)  # (h // m, w // m, m, m): group by merge tile
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)  # column index of every patch, shape (h, w)
+            wpos_ids = wpos_ids.reshape(
+                h // self.config.vision_config.spatial_merge_size,
+                self.config.vision_config.spatial_merge_size,
+                w // self.config.vision_config.spatial_merge_size,
+                self.config.vision_config.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)  # same tile-major reordering as hpos_ids
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))  # (row, col) pairs, once per t
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()  # largest h or w over all grids
+        rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)  # index by (row, col), then flatten from dim 1
+        return rotary_pos_emb, pos_ids  # get_vision_embeddings forwards pos_ids as image_type_ids
+
+    # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        model_kwargs = super()._update_model_kwargs_for_generation(  # let the parent class update the kwargs first
+            outputs=outputs,
+            model_kwargs=model_kwargs,
+            is_encoder_decoder=is_encoder_decoder,
+            num_new_tokens=num_new_tokens,
+        )
+
+        if getattr(outputs, "rope_deltas", None) is not None:  # attribute may be absent or None on outputs
+            model_kwargs["rope_deltas"] = outputs.rope_deltas  # copy it into the kwargs returned to the caller
+
+        return model_kwargs
+
+
+    def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs):
+        hidden_states = self.vision_embeddings(pixel_values)[0]
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )  # running end offsets: one h * w span per temporal step of each grid
+
+        cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)  # leading 0 = start of first span
+        attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool)
+        causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32)  # additive: 0 = attend, -inf = blocked
+        for i in range(1, len(cu_seqlens)):  # one square block per span between consecutive offsets
+            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
+
+        causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf"))  # block cross-span pairs
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+
+        res = self.vision_embeddings_merger(
+            pixel_values=hidden_states, image_type_ids=image_type_ids, attention_mask=causal_mask, seqlens=seqlens, grid_thw=grid_thw, rotary_pos_emb=rotary_pos_emb
+        )[0]
+        return res
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
@@ -4367,4 +4672,5 @@ def preprocess_inputs(
     "phi4mm": _OVPhi4MMForCausalLM,
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
+    "glm4v": _OVGlm4vForCausalLM,
 }

From eefe590755c2b663e4252d2567a2d0a035564834 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Thu, 17 Jul 2025 21:40:27 -0700
Subject: [PATCH 3/7] add glm4v to exporter utils model list

---
 optimum/exporters/openvino/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 51aa411775..0ed539bf52 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -235,6 +235,7 @@ def get_submodels(model):
     "phi4mm",
     "phi4_multimodal",
     "llama4",
+    "glm4v",
 ]
 
 SSM_MODELS = ["mamba", "falcon_mamba"]

From 42e8acb7cf2266281167edeafbfcd7343f01a4c2 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Thu, 17 Jul 2025 21:49:51 -0700
Subject: [PATCH 4/7] import itertools in glm4v get_rope_index

---
 optimum/intel/openvino/modeling_visual_language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index f088bcd77b..497c28eb9f 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4449,7 +4449,7 @@ def get_rope_index(
             position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
             mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
         """
-
+        import itertools
         spatial_merge_size = self.config.vision_config.spatial_merge_size
         image_token_id = self.config.image_token_id
         video_start_token_id = self.config.video_start_token_id

From 2975feffcb6b2126f922e0447f1f0f119caf88f7 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Fri, 18 Jul 2025 00:01:55 -0700
Subject: [PATCH 5/7] reformat

---
 optimum/exporters/openvino/model_configs.py   |  8 +++--
 optimum/exporters/openvino/model_patcher.py   | 30 ++++++++++---------
 .../openvino/modeling_visual_language.py      | 15 +++++++---
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index e4f28d2bd9..168a0195b7 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -4542,6 +4542,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
         grid_t = self.batch_size
         import torch
+
         if input_name == "hidden_states":
             return self.random_float_tensor(
                 [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
@@ -4549,7 +4550,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
         if input_name == "seqlens":
             return torch.tensor([grid_t * grid_h * grid_w], dtype=torch.int64)
-            
+
         if input_name in ["attention_mask", "window_attention_mask"]:
             return self.random_mask_tensor(
                 [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype
@@ -4563,10 +4564,11 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
             return self.random_int_tensor(
                 [grid_t * grid_h * grid_w, 2], max_value=grid_h, framework=framework, dtype=int_dtype
             )
-            
+
         if input_name == "grid_thw":
             return torch.tensor([[grid_t, grid_h, grid_w]], dtype=torch.int64)
 
+
 @register_in_tasks_manager("glm4v", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
 class Glm4vOpenVINOConfig(BaseVLMOpenVINOConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
@@ -4701,4 +4703,4 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
     def outputs(self) -> Dict[str, Dict[int, str]]:
         if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]:
             return {"last_hidden_state": {0: "seq_len"}}
-        return {}
\ No newline at end of file
+        return {}
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 5478bbcf0a..3498c44e8e 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -6755,6 +6755,7 @@ def __exit__(self, exc_type, exc_value, traceback):
         for layer in self._model.backbone.layers:
             layer.mixer.forward = layer.mixer._orig_forward
 
+
 def _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
     attention_mask: torch.Tensor,
     sequence_length: int,
@@ -6792,9 +6793,7 @@ def _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
         causal_mask = attention_mask
     else:
         min_dtype = torch.finfo(dtype).min
-        causal_mask = torch.full(
-            (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
-        )
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
         if sequence_length != 1:
             causal_mask = torch.triu(causal_mask, diagonal=1)
         causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
@@ -6802,16 +6801,15 @@ def _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-                causal_mask.device
-            )
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype
             )
 
-    return causal_mask            
-            
+    return causal_mask
+
+
 def _glm4v_update_causal_mask(
     attention_mask,
     input_tensor,
@@ -6852,6 +6850,7 @@ def _glm4v_update_causal_mask(
 
     return causal_mask
 
+
 class Glm4vLanguageModelPatcher(DecoderModelPatcher):
     def __init__(
         self,
@@ -6903,7 +6902,7 @@ def forward_wrap(
                 position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
             elif position_ids.dim() == 2:
                 position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
-            
+
             causal_mask = _glm4v_update_causal_mask(
                 attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
             )
@@ -6957,12 +6956,14 @@ def forward_wrap(
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.language_model.forward = self._model.language_model.__orig_forward
-        
+
+
 def glm4v_vision_embeddings_forward(self, hidden_states: torch.FloatTensor):
     hidden_states = self.patch_embed(hidden_states)
     hidden_states = self.post_conv_layernorm(hidden_states)
     return hidden_states
 
+
 class Glm4vVisionEmbeddingsPatcher(ModelPatcher):
     def __init__(
         self,
@@ -6977,7 +6978,7 @@ def __init__(
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
-            
+
 
 class Glm4vVisionEmbMergerPatcher(ModelPatcher):
     def __init__(
@@ -7003,8 +7004,9 @@ def image_embed_forward(
             image_type_ids: torch.Tensor,
             rotary_pos_emb: torch.Tensor,
         ) -> torch.Tensor:
-
-            hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1])
+            hidden_states = self.embeddings(
+                hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
+            )
 
             for blk in self.blocks:
                 hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb)
@@ -7032,4 +7034,4 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.forward = self._model.__orig_forward
         for block in self._model.blocks:
             block.forward = block._orig_forward
-            block.attn.forward = block.attn._orig_forward
\ No newline at end of file
+            block.attn.forward = block.attn._orig_forward
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 497c28eb9f..e32b0309e1 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4348,6 +4348,7 @@ def preprocess_inputs(
         inputs = processor(images=image, text=text_prompt, return_tensors="pt")
         return inputs
 
+
 class _OVGlm4vForCausalLM(_OVQwen2VLForCausalLM):
     additional_parts = ["vision_embeddings_merger"]
 
@@ -4390,7 +4391,7 @@ def __init__(
             raise ValueError(
                 f"Initialization model for {self.config.model_type} required at least transformers >= 4.45"
             )
-            
+
     def get_rope_index(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -4450,6 +4451,7 @@ def get_rope_index(
             mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
         """
         import itertools
+
         spatial_merge_size = self.config.vision_config.spatial_merge_size
         image_token_id = self.config.image_token_id
         video_start_token_id = self.config.video_start_token_id
@@ -4576,7 +4578,7 @@ def get_rope_index(
                 )
 
             return position_ids, mrope_position_deltas
-            
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -4626,7 +4628,6 @@ def _update_model_kwargs_for_generation(
 
         return model_kwargs
 
-
     def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs):
         hidden_states = self.vision_embeddings(pixel_values)[0]
         rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
@@ -4650,10 +4651,16 @@ def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs):
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
 
         res = self.vision_embeddings_merger(
-            pixel_values=hidden_states, image_type_ids=image_type_ids, attention_mask=causal_mask, seqlens=seqlens, grid_thw=grid_thw, rotary_pos_emb=rotary_pos_emb
+            pixel_values=hidden_states,
+            image_type_ids=image_type_ids,
+            attention_mask=causal_mask,
+            seqlens=seqlens,
+            grid_thw=grid_thw,
+            rotary_pos_emb=rotary_pos_emb,
         )[0]
         return res
 
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,

From bfa073df01fbe5ae1ace70f28fc685d5658f8969 Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Mon, 28 Jul 2025 18:01:54 -0700
Subject: [PATCH 6/7] update the glm4v

---
 optimum/exporters/openvino/model_configs.py   |  2 +-
 .../openvino/modeling_visual_language.py      |  2 +-
 tests/openvino/test_modeling.py               |  3 ++
 tests/openvino/test_quantization.py           | 45 +++++++++++++++++++
 tests/openvino/utils_tests.py                 |  1 +
 5 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 168a0195b7..d6319f6eaa 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -159,7 +159,7 @@ def init_model_configs():
         TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
     TasksManager._CUSTOM_CLASSES[("pt", "glm4v", "image-text-to-text")] = (
         "transformers",
-        "Glm4vForConditionalGeneration",
+        "AutoModelForImageTextToText",
     )
     TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = (
         "transformers",
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index e32b0309e1..812228b0a4 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -2611,7 +2611,7 @@ def _update_model_kwargs_for_generation(
 
         return model_kwargs
 
-    # Copied from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1423
+    # Copied from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1423
     def get_rope_index(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 5d4e802708..5d3a6f03d6 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -2446,6 +2446,8 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
     if is_transformers_version(">=", "4.51"):
         SUPPORTED_ARCHITECTURES += ["llama4"]
+    if is_transformers_version(">=", "4.53"):
+        SUPPORTED_ARCHITECTURES += ["glm4v"]
     TASK = "image-text-to-text"
     REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm"]
 
@@ -2467,6 +2469,7 @@ def get_transformer_model_class(self, model_arch):
             "idefics3",
             "smolvlm",
             "llama4",
+            "glm4v",
         ]:
             from transformers import AutoModelForImageTextToText
 
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 1d54ade14f..dc420518f8 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -417,6 +417,32 @@ class OVQuantizerTest(unittest.TestCase):
                 ),
             ]
         )
+    if is_transformers_version(">=", "4.53.0"):
+        SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "glm4v",
+                    OVQuantizationConfig(
+                        bits=8,
+                        dataset="contextual",
+                        num_samples=1,
+                    ),
+                    {
+                        "lm_model": 13,
+                        "text_embeddings_model": 0,
+                        "vision_embeddings_model": 0,
+                        "vision_embeddings_merger_model": 0,
+                    },
+                    {
+                        "lm_model": {"int8": 15},
+                        "text_embeddings_model": {"int8": 1},
+                        "vision_embeddings_model": {"int8": 1},
+                        "vision_embeddings_merger_model": {"int8": 10},
+                    },
+                ),
+            ]
+        )
 
     @staticmethod
     def get_calibration_dataset(
@@ -1002,6 +1028,25 @@ class OVWeightCompressionTest(unittest.TestCase):
                         "vision_embeddings_merger_model": {"int8": 10},
                     },
                 ),
+                (
+                    OVModelForVisualCausalLM,
+                    "glm4v",
+                    False,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                    ),
+                    {
+                        "lm_model": {"int8": 10, "int4": 20},
+                        "text_embeddings_model": {"int8": 1},
+                        "vision_embeddings_model": {"int8": 1},
+                        "vision_embeddings_merger_model": {"int8": 10},
+                    },
+                ),
             ]
         )
 
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 57dfa94fca..f40cb35e6d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -186,6 +186,7 @@
     "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM",
     "xverse": "katuni4ka/tiny-random-xverse",
     "glm4": "snake7gun/tiny-random-glm4",
+    "glm4v": "snake7gun/glm4v-tiny-random",
     "glm": "katuni4ka/tiny-random-glm-edge",
     "open-clip": "hf-internal-testing/tiny-open-clip-model",
     "open-clip-ov": "zofinka/tiny-open-clip-model",

From 53a2bb40cc0aebda7f3e9801159f5e56bffba08a Mon Sep 17 00:00:00 2001
From: ethan <ethan.yang@intel.com>
Date: Wed, 20 Aug 2025 00:38:41 -0700
Subject: [PATCH 7/7] remove glm4v patcher

---
 optimum/exporters/openvino/model_configs.py |   3 +-
 optimum/exporters/openvino/model_patcher.py | 201 --------------------
 2 files changed, 1 insertion(+), 203 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 0817a9938c..2e7cea8f7f 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -96,7 +96,6 @@
     Gemma3LMModelPatcher,
     Glm4vVisionEmbMergerPatcher,
     Glm4vVisionEmbeddingsPatcher,
-    Glm4vLanguageModelPatcher,
     GptBigCodeModelPatcher,
     GptJModelPatcher,
     GptNeoModelPatcher,
@@ -4684,7 +4683,7 @@ def with_behavior(
                 self._orig_config,
                 self.int_dtype,
                 self.float_dtype,
-                model_patcher=Glm4vLanguageModelPatcher,
+                model_patcher=OVDecoderModelPatcher,
                 dummy_input_generator=DummyQwen2VLLMInputGenerator,
                 inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
             )
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 606664fad7..68a1be8f58 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -6806,207 +6806,6 @@ def __exit__(self, exc_type, exc_value, traceback):
             layer.mixer.forward = layer.mixer._orig_forward
 
 
-def _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
-    attention_mask: torch.Tensor,
-    sequence_length: int,
-    target_length: int,
-    dtype: torch.dtype,
-    device: torch.device,
-    cache_position: torch.Tensor,
-    batch_size: int,
-    **kwargs,
-):
-    """
-    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-    Args:
-        attention_mask (`torch.Tensor`):
-            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
-            `(batch_size, 1, query_length, key_value_length)`.
-        sequence_length (`int`):
-            The sequence length being processed.
-        target_length (`int`):
-            The target length: when generating with static cache, the mask should be as long as the static cache,
-            to account for the 0 padding, the part of the cache that is not filled yet.
-        dtype (`torch.dtype`):
-            The dtype to use for the 4D attention mask.
-        device (`torch.device`):
-            The device to place the 4D attention mask on.
-        cache_position (`torch.Tensor`):
-            Indices depicting the position of the input sequence tokens in the sequence.
-        batch_size (`torch.Tensor`):
-            Batch size.
-    """
-    if attention_mask is not None and attention_mask.dim() == 4:
-        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-        causal_mask = attention_mask
-    else:
-        min_dtype = torch.finfo(dtype).min
-        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
-        if sequence_length != 1:
-            causal_mask = torch.triu(causal_mask, diagonal=1)
-        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
-        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-        if attention_mask is not None:
-            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-            mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
-            padding_mask = padding_mask == 0
-            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-                padding_mask, min_dtype
-            )
-
-    return causal_mask
-
-
-def _glm4v_update_causal_mask(
-    attention_mask,
-    input_tensor,
-    cache_position,
-    past_key_values,
-    output_attentions: bool = False,
-):
-    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-    from transformers.cache_utils import StaticCache
-
-    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-    # to infer the attention mask.
-    past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-    using_static_cache = isinstance(past_key_values, StaticCache)
-
-    dtype, device = input_tensor.dtype, input_tensor.device
-    sequence_length = input_tensor.shape[1]
-    if using_static_cache:
-        target_length = past_key_values.get_max_cache_shape()
-    else:
-        target_length = (
-            attention_mask.shape[-1]
-            if isinstance(attention_mask, torch.Tensor)
-            else past_seen_tokens + sequence_length + 1
-        )
-
-    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-    causal_mask = _glm4v_prepare_4d_causal_attention_mask_with_cache_position(
-        attention_mask,
-        sequence_length=sequence_length,
-        target_length=target_length,
-        dtype=dtype,
-        device=device,
-        cache_position=cache_position,
-        batch_size=input_tensor.shape[0],
-    )
-
-    return causal_mask
-
-
-class Glm4vLanguageModelPatcher(DecoderModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Dict[str, Any] = None,
-    ):
-        model.language_model.__orig_forward = model.language_model.forward
-
-        def forward_wrap(
-            self,
-            input_ids,
-            attention_mask,
-            position_ids,
-            past_key_values,
-            inputs_embeds,
-            use_cache,
-            output_attentions,
-            output_hidden_states,
-            cache_position,
-            **kwargs,
-        ) -> Union[tuple, BaseModelOutputWithPast]:
-            from transformers.cache_utils import DynamicCache
-
-            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-            output_hidden_states = (
-                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-            )
-            use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-            if (input_ids is None) ^ (inputs_embeds is not None):
-                raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-            # torch.jit.trace() doesn't support cache objects in the output
-            if use_cache and past_key_values is None and not torch.jit.is_tracing():
-                past_key_values = DynamicCache()
-
-            if inputs_embeds is None:
-                inputs_embeds = self.embed_tokens(input_ids)
-
-            if cache_position is None:
-                past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-                cache_position = torch.arange(
-                    past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-                )
-
-            # the hard coded `3` is for temporal, height and width.
-            if position_ids is None:
-                position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
-            elif position_ids.dim() == 2:
-                position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
-
-            causal_mask = _glm4v_update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-            )
-
-            hidden_states = inputs_embeds
-
-            # create position embeddings to be shared across the decoder layers
-            position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
-            # decoder layers
-            all_hidden_states = () if output_hidden_states else None
-            all_self_attns = () if output_attentions else None
-
-            for decoder_layer in self.layers:
-                if output_hidden_states:
-                    all_hidden_states += (hidden_states,)
-
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    position_embeddings=position_embeddings,
-                    attention_mask=causal_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    **kwargs,
-                )
-
-                hidden_states = layer_outputs[0]
-
-                if output_attentions:
-                    all_self_attns += (layer_outputs[1],)
-
-            hidden_states = self.norm(hidden_states)
-
-            # add hidden states from the last decoder layer
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            return BaseModelOutputWithPast(
-                last_hidden_state=hidden_states,
-                past_key_values=past_key_values if use_cache else None,
-                hidden_states=all_hidden_states,
-                attentions=all_self_attns,
-            )
-
-        model.language_model.forward = types.MethodType(forward_wrap, model.language_model)
-        super().__init__(config, model, model_kwargs)
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        self._model.language_model.forward = self._model.language_model.__orig_forward
-
 
 def glm4v_vision_embeddings_forward(self, hidden_states: torch.FloatTensor):
     hidden_states = self.patch_embed(hidden_states)