Skip to content

Commit 02a4acf

Browse files
fix CI
fix CI
1 parent a96d993 commit 02a4acf

File tree

3 files changed

+27
-18
lines changed

3 files changed

+27
-18
lines changed

optimum/exporters/openvino/model_configs.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2588,6 +2588,8 @@ def __init__(
25882588
)
25892589
self._behavior = behavior
25902590
self._orig_config = config
2591+
model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
2592+
self.model_type = model_mapping[self._orig_config.version]
25912593
if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
25922594
self._config = config.vision_config
25932595
self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
@@ -2604,12 +2606,19 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
26042606
"position_ids": {0: "batch_size", 1: "patch_size"},
26052607
}
26062608
if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
2607-
return {
2608-
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
2609-
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
2610-
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
2611-
"temporal_embed": {0: "patch_size", 1: "batch_size"},
2612-
}
2609+
if self._orig_config.version == 4.5:
2610+
return {
2611+
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
2612+
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
2613+
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
2614+
"temporal_embed": {0: "patch_size", 1: "batch_size"},
2615+
}
2616+
else:
2617+
return {
2618+
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
2619+
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
2620+
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
2621+
}
26132622
return {}
26142623

26152624
@property
@@ -2633,18 +2642,18 @@ def with_behavior(
26332642
"""
26342643
if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
26352644
behavior = MiniCPMVConfigBehavior(behavior)
2636-
model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
2645+
26372646
if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
26382647
return get_vlm_text_embeddings_config(
2639-
model_mapping[self._orig_config.version],
2648+
self.model_type,
26402649
self._orig_config,
26412650
self.int_dtype,
26422651
self.float_dtype,
26432652
)
26442653

26452654
if behavior == MiniCPMVConfigBehavior.LANGUAGE:
26462655
return get_vlm_text_generation_config(
2647-
model_mapping[self._orig_config.version],
2656+
self.model_type,
26482657
self._orig_config,
26492658
self.int_dtype,
26502659
self.float_dtype,

optimum/exporters/openvino/model_patcher.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,13 +3333,11 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask
33333333

33343334

33353335
def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
3336-
bs = image_feature.shape[0]
33373336
image_feature = self.kv_proj(image_feature) # B * L * D
33383337
image_feature = self.ln_kv(image_feature).permute(1, 0, 2) # L * B * D
3339-
image_feature = image_feature + pos_embed
3340-
3341-
image_feature_temporal = image_feature + temporal_embed # [L, bs, D] + [1, bs, D]
3342-
3338+
image_feature_emb = image_feature + pos_embed
3339+
image_feature_temporal = image_feature_emb + temporal_embed # [L, bs, D] + [1, bs, D]
3340+
bs = image_feature_temporal.shape[1]
33433341
q = self.ln_q(self.query) # Q * D
33443342

33453343
q_bs = q.unsqueeze(1).repeat(1, bs, 1)

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1941,6 +1941,8 @@ def __init__(
19411941
def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
19421942
if input_ids is not None and input_ids.shape[1] == 1:
19431943
return None
1944+
1945+
all_temporal_ids = None
19441946
if temporal_ids is not None:
19451947
all_temporal_ids = []
19461948
for t in temporal_ids:
@@ -2020,7 +2022,7 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
20202022

20212023
max_patch_len = torch.max(patch_len)
20222024
key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
2023-
2025+
20242026
temporal_embed = None
20252027
pos_embed = []
20262028
pos_embed_temporal = []
@@ -2039,8 +2041,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
20392041
pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
20402042
1, 0, 2
20412043
) # BLD => L * B * D
2042-
2043-
temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
2044+
if temporal_pos_emb:
2045+
temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
20442046
res = torch.from_numpy(
20452047
self.resampler(
20462048
image_feature=x,
@@ -4483,4 +4485,4 @@ def preprocess_inputs(
44834485
"phi4_multimodal": _OVPhi4MMForCausalLM,
44844486
"llama4": _OVLlama4ForCausalLM,
44854487
"minicpmo": _OVMiniCPMOForCausalLM,
4486-
}
4488+
}

0 commit comments

Comments (0)