Skip to content

Commit b1e9ace

Browse files
fix CI
1 parent a96d993 commit b1e9ace

File tree

2 files changed

+9
-9
lines changed

2 files changed

+9
-9
lines changed

optimum/exporters/openvino/model_patcher.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,13 +3333,11 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask
 
 
 def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
-    bs = image_feature.shape[0]
     image_feature = self.kv_proj(image_feature)  # B * L * D
     image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
-    image_feature = image_feature + pos_embed
-
-    image_feature_temporal = image_feature + temporal_embed  # [L, bs, D] + [1, bs, D]
-
+    image_feature_emb = image_feature + pos_embed
+    image_feature_temporal = image_feature_emb + temporal_embed  # [L, bs, D] + [1, bs, D]
+    bs = image_feature_temporal.shape[1]
     q = self.ln_q(self.query)  # Q * D
 
     q_bs = q.unsqueeze(1).repeat(1, bs, 1)

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1941,6 +1941,8 @@ def __init__(
     def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
+
+        all_temporal_ids = None
         if temporal_ids is not None:
             all_temporal_ids = []
             for t in temporal_ids:
@@ -2020,7 +2022,7 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
 
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
+
         temporal_embed = None
         pos_embed = []
         pos_embed_temporal = []
@@ -2039,8 +2041,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
             1, 0, 2
         )  # BLD => L * B * D
-
-        temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+        if temporal_pos_emb:
+            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
         res = torch.from_numpy(
             self.resampler(
                 image_feature=x,
@@ -4483,4 +4485,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
+}

0 commit comments

Comments (0)