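Fix CI: initialize all_temporal_ids, guard temporal_embed stacking, and derive bs from the permuted features in the minicpmv4_5 resampler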

openvino-dev-samples · openvino-dev-samples · commit b1e9ace18392 · 2025-10-31T01:53:46.000-07:00
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -3333,13 +3333,11 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask
 
 
 def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
-    bs = image_feature.shape[0]
     image_feature = self.kv_proj(image_feature)  # B * L * D
     image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
-    image_feature = image_feature + pos_embed
-
-    image_feature_temporal = image_feature + temporal_embed  # [L, bs, D] + [1, bs, D]
-
+    image_feature_emb = image_feature + pos_embed
+    image_feature_temporal = image_feature_emb + temporal_embed  # [L, bs, D] + [1, bs, D]
+    bs = image_feature_temporal.shape[1]
     q = self.ln_q(self.query)  # Q * D
 
     q_bs = q.unsqueeze(1).repeat(1, bs, 1)
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
@@ -1941,6 +1941,8 @@ def __init__(
     def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
+
+        all_temporal_ids = None
         if temporal_ids is not None:
             all_temporal_ids = []
             for t in temporal_ids:
@@ -2020,7 +2022,7 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
 
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-        
+
         temporal_embed = None
         pos_embed = []
         pos_embed_temporal = []
@@ -2039,8 +2041,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
             1, 0, 2
         )  # BLD => L * B * D
-
-        temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+        if temporal_pos_emb:
+            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
         res = torch.from_numpy(
             self.resampler(
                 image_feature=x,
@@ -4483,4 +4485,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
+}