diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 9c1684db81..2e7cea8f7f 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -94,6 +94,8 @@ FluxTransfromerModelPatcher, Gemma2ModelPatcher, Gemma3LMModelPatcher, + Glm4vVisionEmbMergerPatcher, + Glm4vVisionEmbeddingsPatcher, GptBigCodeModelPatcher, GptJModelPatcher, GptNeoModelPatcher, @@ -148,6 +150,10 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} + TasksManager._CUSTOM_CLASSES[("pt", "glm4v", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( "transformers", "LlavaForConditionalGeneration", @@ -4525,6 +4531,211 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): return dummy_inputs +class DummyGlm4vVisionEmbedInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "hidden_states", + "seqlens", + "grid_thw", + "attention_mask", + "image_type_ids", + "rotary_pos_emb", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = 1, + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = 420, + height: int = 420, + **kwargs, + ): + self.batch_size = batch_size + self.height = height + self.width = width + self.num_channels = num_channels + self.temporal_patch_size = normalized_config.config.temporal_patch_size + self.patch_size = normalized_config.config.patch_size + if normalized_config.use_embed_dim: + self.embed_dim = ( + normalized_config.config.embed_dim + if hasattr(normalized_config.config, "embed_dim") + else normalized_config.hidden_size + ) + else: + self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size + self.num_heads = normalized_config.config.num_heads + self.spatial_merge_size = None + if hasattr(normalized_config.config, "spatial_merge_size"): + self.spatial_merge_size = normalized_config.config.spatial_merge_size + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size + grid_t = self.batch_size + import torch + + if input_name == "hidden_states": + return self.random_float_tensor( + [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype + ) + + if input_name == "seqlens": + return torch.tensor([grid_t * grid_h * grid_w], dtype=torch.int64) + + if input_name in ["attention_mask", "window_attention_mask"]: + return self.random_mask_tensor( + [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype + ) + + if input_name == "rotary_pos_emb": + dim = self.embed_dim // self.num_heads // 2 + return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype) + + if input_name == "image_type_ids": + return self.random_int_tensor( + [grid_t * grid_h * grid_w, 2], max_value=grid_h, framework=framework, dtype=int_dtype + ) + + if input_name == "grid_thw": + return torch.tensor([[grid_t, grid_h, grid_w]], dtype=torch.int64) + + +@register_in_tasks_manager("glm4v", *["image-text-to-text", "video-text-to-text"], library_name="transformers") +class Glm4vOpenVINOConfig(BaseVLMOpenVINOConfig): + SUPPORTED_BEHAVIORS = 
[model_type.value for model_type in Qwen2VLConfigBehavior] + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + DUMMY_INPUT_GENERATOR_CLASSES = (DummyGlm4vVisionEmbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = version.parse("4.54.0") + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: Qwen2VLConfigBehavior = Qwen2VLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + **kwargs, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + self._behavior = behavior + self._orig_config = config + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = False + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): + behavior = Qwen2VLConfigBehavior(behavior) + + if behavior == Qwen2VLConfigBehavior.LANGUAGE: + return model + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + vision_embeddings = model.visual + vision_embeddings.config = model.config.vision_config + return vision_embeddings + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + vision_emb_merger = model.visual + vision_emb_merger.config = model.config.vision_config + return vision_emb_merger + + if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = ( + model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + ) + text_embedding.config = model.config + return text_embedding + + def with_behavior( + self, + behavior: Union[str, Qwen2VLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. 
+ """ + if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): + behavior = Qwen2VLConfigBehavior(behavior) + + if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + + if behavior == Qwen2VLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen2", + self._orig_config, + self.int_dtype, + self.float_dtype, + model_patcher=OVDecoderModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Glm4vVisionEmbMergerPatcher(self, model, model_kwargs) + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + return Glm4vVisionEmbeddingsPatcher(self, model, model_kwargs=model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: + return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}} + if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return { + "hidden_states": {0: "sequence_length"}, + "seqlens": {0: "sequence_length"}, + "grid_thw": {0: "sequence_length"}, + "attention_mask": {1: "sequence_length", 2: "sequence_length"}, + "image_type_ids": {0: "sequence_length"}, + "rotary_pos_emb": {0: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]: + return {"last_hidden_state": {0: "seq_len"}} + return {} @register_in_tasks_manager("ernie4_5", *["text-generation", "text-generation-with-past"], library_name="transformers") class ErnieOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 2b94d388cc..68a1be8f58 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -6806,6 +6806,84 @@ def __exit__(self, exc_type, exc_value, traceback): layer.mixer.forward = layer.mixer._orig_forward + +def glm4v_vision_embeddings_forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.patch_embed(hidden_states) + hidden_states = self.post_conv_layernorm(hidden_states) + return hidden_states + + +class Glm4vVisionEmbeddingsPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(glm4v_vision_embeddings_forward, model) + 
super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Glm4vVisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + super().__init__(config, model, model_kwargs) + + model.__orig_forward = model.forward + + # Modified from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L405 + # added attention_mask and window_attention_mask inputs instead cu_lens and window_cu_lens processing for its internal calculation model + # (unsupported by tracing due to cycle with dynamic len) + # separated patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, + hidden_states: torch.Tensor, + seqlens: torch.Tensor, + grid_thw: torch.Tensor, + attention_mask: torch.Tensor, + image_type_ids: torch.Tensor, + rotary_pos_emb: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embeddings( + hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1] + ) + + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + + hidden_states = self.post_layernorm(hidden_states) + + hidden_states = hidden_states.view( + -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1] + ) + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size) + + hidden_states = self.merger(hidden_states) + return hidden_states + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + patch_qwen2vl_vision_blocks(self._model, force_new_behaviour=True) + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward # https://github.com/huggingface/transformers/blob/v4.53.0/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L228 def qwen3_moe_forward_patched(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, sequence_length, hidden_dim = hidden_states.shape diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index d1318fc109..ae2b4f492f 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -235,6 +235,7 @@ def get_submodels(model): "phi4mm", "phi4_multimodal", "llama4", + "glm4v", ] SSM_MODELS = ["mamba", "falcon_mamba"] diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index a35c532c54..c67a2b386a 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -2600,7 +2600,7 @@ def _update_model_kwargs_for_generation( return model_kwargs - # Copied from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1423 + # Copied from https://github.com/huggingface/transformers/blob/v4.53.3/src/transformers/models/glm4v/modular_glm4v.py#L1014 def get_rope_index( self, input_ids: 
Optional[torch.LongTensor] = None,
@@ -4338,6 +4338,318 @@ def preprocess_inputs(
         return inputs
 
 
+class _OVGlm4vForCausalLM(_OVQwen2VLForCausalLM):
+    additional_parts = ["vision_embeddings_merger"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = None,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super(_OVQwen2VLForCausalLM, self).__init__(
+            language_model=language_model,
+            text_embeddings=text_embeddings,
+            vision_embeddings=vision_embeddings,
+            config=config,
+            device=device,
+            dynamic_shapes=dynamic_shapes,
+            ov_config=ov_config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+        self.rope_deltas = None  # cache rope_deltas here
+
+        if is_transformers_version(">=", "4.53.0"):
+            from transformers.models.glm4v.modeling_glm4v import (
+                Glm4vVisionRotaryEmbedding,
+            )
+
+            self._rotary_pos_emb = Glm4vVisionRotaryEmbedding(
+                self.config.vision_config.hidden_size // self.config.vision_config.num_heads // 2
+            )
+        else:
+            raise ValueError(
+                f"Initialization of the {self.config.model_type} model requires transformers >= 4.53"
+            )
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+ vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100] + vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + text temporal position_ids: [101, 102, 103, 104, 105] + text height position_ids: [101, 102, 103, 104, 105] + text width position_ids: [101, 102, 103, 104, 105] + Here we calculate the text start position_ids as the max vision position_ids plus 1. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`) + """ + import itertools + + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_start_token_id = self.config.video_start_token_id + video_end_token_id = self.config.video_end_token_id + + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + input_tokens = input_ids.tolist() + + input_token_type = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if token == image_token_id and not video_check_flg: + input_token_type.append("image") + elif token == image_token_id and video_check_flg: + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group = [] + for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]): + group = list(group) + start_index = group[0][0] + end_index = group[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + llm_pos_ids_list = [] + video_frame_num = 1 + + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + + if modality_type == "image": + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, 
llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + image_index += 1 + video_frame_num = 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + video_index += 1 + + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + video_frame_num = 1 + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + w // self.config.vision_config.spatial_merge_size, + self.config.vision_config.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb, pos_ids + + # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602 + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: 
bool = False, + num_new_tokens: int = 1, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + + if getattr(outputs, "rope_deltas", None) is not None: + model_kwargs["rope_deltas"] = outputs.rope_deltas + + return model_kwargs + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = self.vision_embeddings(pixel_values)[0] + rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + # Select dtype based on the following factors: + # - FA2 requires that cu_seqlens_q must have dtype int32 + # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw + # See https://github.com/huggingface/transformers/pull/34852 for more information + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, + image_type_ids=image_type_ids, + attention_mask=causal_mask, + seqlens=seqlens, + grid_thw=grid_thw, + rotary_pos_emb=rotary_pos_emb, + )[0] + return res + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -4356,4 +4668,5 @@ def preprocess_inputs( "phi4mm": _OVPhi4MMForCausalLM, "phi4_multimodal": _OVPhi4MMForCausalLM, "llama4": _OVLlama4ForCausalLM, + "glm4v": _OVGlm4vForCausalLM, } diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index a7a80bde3e..81d834793b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2464,6 +2464,8 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES += ["llama4"] + if is_transformers_version(">=", "4.53"): + SUPPORTED_ARCHITECTURES += ["glm4v"] TASK = "image-text-to-text" REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm"] @@ -2486,6 +2488,7 @@ def get_transformer_model_class(self, model_arch): "idefics3", "smolvlm", "llama4", + "glm4v" ]: from transformers import AutoModelForImageTextToText diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c9497e804e..64df572f53 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -445,6 +445,32 @@ class OVQuantizerTest(unittest.TestCase): ), ] ) + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET.extend( + [ + ( + OVModelForVisualCausalLM, + "glm4v", + OVQuantizationConfig( + bits=8, + dataset="contextual", + num_samples=1, + ), + { + "lm_model": 13, + "text_embeddings_model": 0, + "vision_embeddings_model": 0, + "vision_embeddings_merger_model": 0, + }, + { + "lm_model": {"int8": 15}, + "text_embeddings_model": {"int8": 1}, + 
"vision_embeddings_model": {"int8": 1}, + "vision_embeddings_merger_model": {"int8": 10}, + }, + ), + ] + ) @staticmethod def get_calibration_dataset( @@ -1043,6 +1069,25 @@ class OVWeightCompressionTest(unittest.TestCase): "vision_embeddings_merger_model": {"int8": 10}, }, ), + ( + OVModelForVisualCausalLM, + "glm4v", + False, + dict( + bits=4, + group_size=16, + dataset="contextual", + ratio=0.8, + sensitivity_metric="mean_activation_magnitude", + num_samples=1, + ), + { + "lm_model": {"int8": 10, "int4": 20}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 1}, + "vision_embeddings_merger_model": {"int8": 10}, + }, + ), ] ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 8d8ba3e098..4ff072ca55 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -189,6 +189,7 @@ "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", "xverse": "katuni4ka/tiny-random-xverse", "glm4": "snake7gun/tiny-random-glm4", + "glm4v": "snake7gun/glm4v-tiny-random", "glm": "katuni4ka/tiny-random-glm-edge", "open-clip": "hf-internal-testing/tiny-open-clip-model", "open-clip-ov": "zofinka/tiny-open-clip-model",