From 4b70b7f01cb84f57bc42347544ba280798183c24 Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 6 Aug 2025 20:01:46 -0700 Subject: [PATCH 01/11] add support for minicpm4v --- optimum/exporters/openvino/model_configs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 6c152c4a94..252ab6d757 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2822,10 +2822,10 @@ def with_behavior( behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_embeddings_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.LANGUAGE: - return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_generation_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return self.__class__( From 7c64417837761023c40602f8cbe440b2e26bac25 Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 7 Aug 2025 08:32:45 -0700 Subject: [PATCH 02/11] reformat --- optimum/exporters/openvino/model_configs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 252ab6d757..d92eed4f86 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2822,10 +2822,20 @@ def with_behavior( behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_embeddings_config( + "qwen2" if self._orig_config.version == 2.6 else "llama", + self._orig_config, + self.int_dtype, + self.float_dtype, + ) if behavior == MiniCPMVConfigBehavior.LANGUAGE: - return get_vlm_text_generation_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_generation_config( + "qwen2" if self._orig_config.version == 2.6 else "llama", + self._orig_config, + self.int_dtype, + self.float_dtype, + ) if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return self.__class__( From 4b245f14a29f92c97f391c8492c551168ee8309d Mon Sep 17 00:00:00 2001 From: ethan Date: Tue, 12 Aug 2025 01:23:09 -0700 Subject: [PATCH 03/11] add minicpmv4 test case --- tests/openvino/test_exporters_cli.py | 11 +++++++++++ tests/openvino/test_modeling.py | 20 ++++++++++---------- tests/openvino/test_quantization.py | 22 ++++++++++++++++++++++ tests/openvino/utils_tests.py | 7 +++++++ 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 9effd2fc9c..b3bd680eca 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -622,6 +622,17 @@ class OVCLIExportTestCase(unittest.TestCase): "resampler_model": {"int8": 6}, }, ), + ( + "image-text-to-text", + "minicpmv4", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", 
+ { + "lm_model": {"int8": 10, "int4": 20}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 26}, + "resampler_model": {"int8": 6}, + }, + ), ( "image-text-to-text", "minicpmv", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 27426bd1a5..0e9377b80b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2439,7 +2439,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORT_VIDEO.append("llava_next_video") if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4"] SUPPORT_VIDEO.append("qwen2_vl") if is_transformers_version(">=", "4.46.0"): @@ -2454,7 +2454,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES += ["llama4"] TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm"] + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4"] IMAGE = Image.open( requests.get( @@ -2558,8 +2558,8 @@ def test_compare_to_transformers(self, model_arch): ov_model.clear_requests() self._check_device_and_request(ov_model, test_device, False) - # pytorch minicpmv and internvl2 are not designed to be used via forward - if model_arch not in ["minicpmv", "internvl2"]: + # pytorch minicpmv/minicpmv4 and internvl2 are not designed to be used via forward + if model_arch not in ["minicpmv", "minicpmv4", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) @@ -2608,8 +2608,8 @@ def test_compare_to_transformers(self, model_arch): **transformers_inputs, generation_config=gen_config, **additional_inputs ) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "internvl2"]: + # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them + if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( torch.equal(ov_outputs, transformers_outputs), @@ -2634,8 +2634,8 @@ def test_compare_to_transformers(self, model_arch): inputs = ov_model.preprocess_inputs(**preprocessors, text=question, video=input_video) transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "internvl2"]: + # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them + if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] with torch.no_grad(): transformers_outputs = transformers_model.generate( @@ -2652,8 +2652,8 @@ def test_compare_to_transformers(self, model_arch): inputs = ov_model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio]) transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach 
provide them
-        if model_arch in ["minicpmv", "internvl2"]:
+        # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them
+        if model_arch in ["minicpmv", "minicpmv4", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 7714d7bef4..9cc47dd262 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -982,6 +982,27 @@ class OVWeightCompressionTest(unittest.TestCase):
                 "resampler_model": {"int8": 6},
             },
         ),
+        (
+            OVModelForVisualCausalLM,
+            "minicpmv4",
+            True,
+            dict(
+                bits=4,
+                group_size=16,
+                dataset="contextual",
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                num_samples=1,
+                processor=MODEL_NAMES["minicpmv"],
+                trust_remote_code=True,
+            ),
+            {
+                "lm_model": {"int8": 8, "int4": 22},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 26},
+                "resampler_model": {"int8": 6},
+            },
+        ),
         (
             OVModelForVisualCausalLM,
             "internvl2",
@@ -1116,6 +1137,7 @@ class OVWeightCompressionTest(unittest.TestCase):
 
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv4", True))
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False))
 
 SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index b603e0c2c6..7d072f843b 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -114,6 +114,7 @@
     "minicpm": "katuni4ka/tiny-random-minicpm",
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
+    "minicpmv4": "snake7gun/minicpm-v-4-tiny",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",
@@ -282,6 +283,12 @@
         "vision_embeddings_model": 26,
         "resampler_model": 6,
     },
+    "minicpmv4": {
+        "lm_model": 30,
+        "text_embeddings_model": 1,
+        "vision_embeddings_model": 26,
+        "resampler_model": 6,
+    },
     "llava_next_video": {
         "lm_model": 30,
         "text_embeddings_model": 1,

From 81f69be5e0f7fd80aba81b5e5d4c213dfe3e9344 Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 26 Aug 2025 20:48:20 -0700
Subject: [PATCH 04/11] add minicpmv4_5

---
 optimum/exporters/openvino/model_configs.py |  6 +++---
 tests/openvino/test_exporters_cli.py        | 11 +++++++++++
 tests/openvino/test_modeling.py             | 12 ++++++------
 tests/openvino/utils_tests.py               |  1 +
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index d92eed4f86..9cf857bd21 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2820,10 +2820,10 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
-
+        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
             return get_vlm_text_embeddings_config(
-                "qwen2" if self._orig_config.version == 2.6 else "llama",
+                model_mapping[self._orig_config.version],
                 self._orig_config,
self.int_dtype, self.float_dtype, @@ -2831,7 +2831,7 @@ def with_behavior( if behavior == MiniCPMVConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( - "qwen2" if self._orig_config.version == 2.6 else "llama", + model_mapping[self._orig_config.version], self._orig_config, self.int_dtype, self.float_dtype, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b3bd680eca..ffa7f68433 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -633,6 +633,17 @@ class OVCLIExportTestCase(unittest.TestCase): "resampler_model": {"int8": 6}, }, ), + ( + "image-text-to-text", + "minicpmv4_5", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", + { + "lm_model": {"int8": 10, "int4": 20}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 26}, + "resampler_model": {"int8": 6}, + }, + ), ( "image-text-to-text", "minicpmv", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0e9377b80b..f8329c87b8 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2439,7 +2439,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORT_VIDEO.append("llava_next_video") if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4", "minicpmv4_5"] SUPPORT_VIDEO.append("qwen2_vl") if is_transformers_version(">=", "4.46.0"): @@ -2454,7 +2454,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES += ["llama4"] TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4"] + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4", "minicpmv4_5"] IMAGE = Image.open( requests.get( @@ -2559,7 +2559,7 @@ def test_compare_to_transformers(self, model_arch): self._check_device_and_request(ov_model, test_device, False) # pytorch minicpmv/minicpmv4 and internvl2 are not designed to be used via forward - if model_arch not in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) @@ -2609,7 +2609,7 @@ def test_compare_to_transformers(self, model_arch): ) # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( torch.equal(ov_outputs, transformers_outputs), @@ -2635,7 +2635,7 @@ def test_compare_to_transformers(self, model_arch): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] with torch.no_grad(): transformers_outputs = transformers_model.generate( @@ -2653,7 +2653,7 @@ def 
test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmv4", "internvl2"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 7d072f843b..ceaac6a180 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -115,6 +115,7 @@
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
     "minicpmv4": "snake7gun/minicpm-v-4-tiny",
+    "minicpmv4_5": "snake7gun/tiny-minicpmv-4_5",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",

From 6c0c61752235e4c9b00569de88efd79e3328f58b Mon Sep 17 00:00:00 2001
From: ethan
Date: Mon, 20 Oct 2025 19:02:24 -0700
Subject: [PATCH 05/11] update seq2seq test

---
 optimum/intel/openvino/modeling_visual_language.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 6f991ea457..733caddc90 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -2111,6 +2111,9 @@ def preprocess_inputs(
         )
         inputs = processor([prompt], [image], return_tensors="pt")
         inputs.pop("image_sizes", None)
+        # skip temporal_ids, which makes the number of resampler loop iterations data-dependent:
+        # https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
+        inputs.pop("temporal_ids", None)
         return inputs
 

From 05fad58ceadd7135b8909bde0a63d274c7ddd841 Mon Sep 17 00:00:00 2001
From: ethan
Date: Mon, 20 Oct 2025 19:02:49 -0700
Subject: [PATCH 06/11] update seq2seq test

---
 tests/openvino/test_seq2seq.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index ba7d20bbcc..8275bb79f1 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -656,7 +656,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs["past_key_values"] = DynamicCache()
 
         with torch.no_grad():
-            if model_arch in ["minicpmo"]:
+            if model_arch in ["minicpmo", "minicpmv4", "minicpmv4_5"]:
                 # `generate` method for minicpmo requires tokenizer
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
@@ -837,6 +837,9 @@ def test_generate_utils(self, model_arch):
         input_audio = self._generate_random_audio_data()
         question = "Translate this audio to French"
         inputs = model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio])
+        # skip temporal_ids, which makes the number of resampler loop iterations data-dependent:
+        # https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
+        inputs.pop("temporal_ids", None)
         outputs = model.generate(**inputs, max_new_tokens=10)
         # filter out original prompt becuase it may contains out of tokenizer tokens e.g. 
in nanollva text separator = -200 outputs = outputs[:, inputs["input_ids"].shape[1] :] From b4e2ce1b92c67463ab95f85a52f3304884d18bfe Mon Sep 17 00:00:00 2001 From: ethan Date: Tue, 21 Oct 2025 09:18:23 -0700 Subject: [PATCH 07/11] update --- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_quantization.py | 10 +++++----- tests/openvino/test_seq2seq.py | 14 ++++++++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 04572ac17b..c3b474da04 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -788,7 +788,7 @@ def test_filtered_architectures(cls): elif is_transformers_version("<", "4.52"): expected = set() else: - expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo", "minicpmv4", "minicpmv4_5"} all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c7c0145170..086aa36123 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1038,7 +1038,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", num_samples=1, - processor=MODEL_NAMES["minicpmv"], + processor=MODEL_NAMES["minicpmv4"], trust_remote_code=True, ), { @@ -1059,7 +1059,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", num_samples=1, - processor=MODEL_NAMES["minicpmv"], + processor=MODEL_NAMES["minicpmv4_5"], trust_remote_code=True, ), { @@ -1095,8 +1095,8 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForVisualCausalLM, "llava_next_video", False), (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), - (OVModelForVisualCausalLM, "minicpmv4", False), - (OVModelForVisualCausalLM, "minicpmv4_5", False), + (OVModelForVisualCausalLM, "minicpmv4", True), + (OVModelForVisualCausalLM, "minicpmv4_5", True), ] if is_transformers_version("<", "4.54.0"): @@ -1126,7 +1126,7 @@ def test_filtered_architectures(cls): elif is_transformers_version("<", "4.52"): expected = set() else: - expected = {"llava-qwen2", "phi3_v", "minicpmo"} + expected = {"llava-qwen2", "phi3_v", "minicpmo", "minicpmv4", "minicpmv4_5"} all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 8275bb79f1..63b5000134 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -506,7 +506,17 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] + REMOTE_CODE_MODELS = [ + "internvl_chat", + "minicpmv", + "minicpmv4", + "minicpmv4_5", + "minicpmo", + "llava-qwen2", + "phi3_v", + "maira2", + "phi4mm", + ] IMAGE = Image.open( requests.get( @@ -611,7 +621,7 @@ def test_compare_to_transformers(self, model_arch): self._check_device_and_request(ov_model, test_device, False) # pytorch minicpmv and 
internvl_chat are not designed to be used via forward - if model_arch not in ["minicpmv", "minicpmv4","minicpmv4_5","minicpmo", "internvl_chat"]: + if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) From 39223a4070b4e4fe522852d31f69f25f2c18cd45 Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 22 Oct 2025 23:04:40 -0700 Subject: [PATCH 08/11] fix the CI issues for export and quantization --- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_quantization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c3b474da04..4d0ae9a41d 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -782,7 +782,7 @@ def _openvino_export(self, model_name: str, task: str, model_kwargs: Dict = None def test_filtered_architectures(cls): if is_transformers_version("<", "4.49"): - expected = {"llama4", "qwen2_5_vl", "phi4mm"} + expected = {"llama4", "qwen2_5_vl", "phi4mm", "minicpmv4", "minicpmv4_5"} elif is_transformers_version("<", "4.51"): expected = {"llama4", "phi4mm"} elif is_transformers_version("<", "4.52"): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 086aa36123..60140505b2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1120,7 +1120,7 @@ class OVWeightCompressionTest(unittest.TestCase): def test_filtered_architectures(cls): if is_transformers_version("<", "4.49"): - expected = {"llama4", "qwen2_5_vl"} + expected = {"llama4", "qwen2_5_vl", "minicpmv4", "minicpmv4_5"} elif is_transformers_version("<", "4.51"): expected = {"llama4"} elif is_transformers_version("<", "4.52"): From f6d77504e47d807b7ab6063490e6f7b4cc85c1bb Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 30 Oct 2025 23:30:23 -0700 Subject: [PATCH 09/11] fix conflict --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dd024a27d2..d26f56d295 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -6461,7 +6461,6 @@ def patched_forward( # falcon-mamba model has only difference from mamba that is RMS normalization for B, C, and time-step coefficients if model_type == "falcon_mamba": from transformers.models.falcon_mamba.modeling_falcon_mamba import rms_forward - import inspect self.ssm_rms_normalization = rms_forward From a96d993cff10097f6051a46654548877b9430b2b Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 30 Oct 2025 23:56:10 -0700 Subject: [PATCH 10/11] update --- .../openvino/modeling_visual_language.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 5d3ede35ba..926839d837 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -286,7 +286,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} def forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed=None): - self._compile() + self.compile() if temporal_embed is not None: result = self.request( { @@ -2020,7 
+2020,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
+
+        temporal_embed = None
         pos_embed = []
         pos_embed_temporal = []
         for i in range(bs):
@@ -2038,21 +2039,16 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
             pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
                 1, 0, 2
             )  # BLD => L * B * D
-            if pos_embed_temporal:
-                temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
-                res = torch.from_numpy(
-                    self.resampler(
-                        image_feature=x,
-                        pos_embed=pos_embed,
-                        key_padding_mask=key_padding_mask,
-                        temporal_embed=temporal_embed,
-                    )
-                )
-            else:
-                # Print shapes of all inputs to resampler
-                res = torch.from_numpy(
-                    self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask)
+
+            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+            res = torch.from_numpy(
+                self.resampler(
+                    image_feature=x,
+                    pos_embed=pos_embed,
+                    key_padding_mask=key_padding_mask,
+                    temporal_embed=temporal_embed,
                 )
+            )
         return res
 
     def _set_2d_pos_cache(self, max_size):
@@ -4487,4 +4483,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
+}
\ No newline at end of file

From 02a4acf759ea48fe2c9f3acd1f0f4d0c93aaef92 Mon Sep 17 00:00:00 2001
From: ethan
Date: Fri, 31 Oct 2025 01:53:46 -0700
Subject: [PATCH 11/11] fix CI

fix CI
---
 optimum/exporters/openvino/model_configs.py  | 27 ++++++++++++------
 optimum/exporters/openvino/model_patcher.py  |  8 +++---
 .../openvino/modeling_visual_language.py     | 10 ++++---
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index b8027f745b..62f03f8b09 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2588,6 +2588,8 @@ def __init__(
         )
         self._behavior = behavior
         self._orig_config = config
+        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
+        self.model_type = model_mapping[self._orig_config.version]
         if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
             self._config = config.vision_config
             self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
@@ -2604,12 +2606,19 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
             "position_ids": {0: "batch_size", 1: "patch_size"},
         }
         if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
-            return {
-                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
-                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
-                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
-                "temporal_embed": {0: "patch_size", 1: "batch_size"},
-            }
+            if self._orig_config.version == 4.5:
+                return {
+                    "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                    "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                    "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+                    "temporal_embed": {0: "patch_size", 1: "batch_size"},
+                }
+            else:
+                return {
+                    "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                    "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                    "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+                }
         return {}
 
     @property
@@ -2633,10 +2642,10 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, 
MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
-        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
+
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
             return get_vlm_text_embeddings_config(
-                model_mapping[self._orig_config.version],
+                self.model_type,
                 self._orig_config,
                 self.int_dtype,
                 self.float_dtype,
@@ -2644,7 +2653,7 @@ def with_behavior(
 
         if behavior == MiniCPMVConfigBehavior.LANGUAGE:
             return get_vlm_text_generation_config(
-                model_mapping[self._orig_config.version],
+                self.model_type,
                 self._orig_config,
                 self.int_dtype,
                 self.float_dtype,
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index d26f56d295..04a4d2e59e 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3333,13 +3333,11 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask
 
 
 def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
-    bs = image_feature.shape[0]
     image_feature = self.kv_proj(image_feature)  # B * L * D
     image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
-    image_feature = image_feature + pos_embed
-
-    image_feature_temporal = image_feature + temporal_embed  # [L, bs, D] + [1, bs, D]
-
+    image_feature_emb = image_feature + pos_embed
+    image_feature_temporal = image_feature_emb + temporal_embed  # [L, bs, D] + [1, bs, D]
+    bs = image_feature_temporal.shape[1]
     q = self.ln_q(self.query)  # Q * D
     q_bs = q.unsqueeze(1).repeat(1, bs, 1)
 
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 926839d837..d74564da2e 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1941,6 +1941,8 @@ def __init__(
     def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
+
+        all_temporal_ids = None
         if temporal_ids is not None:
             all_temporal_ids = []
             for t in temporal_ids:
@@ -2020,7 +2022,7 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
+
         temporal_embed = None
         pos_embed = []
@@ -2039,8 +2041,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
             pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
                 1, 0, 2
             )  # BLD => L * B * D
-
-            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+            if temporal_pos_emb:
+                temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
             res = torch.from_numpy(
                 self.resampler(
                     image_feature=x,
@@ -4483,4 +4485,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
\ No newline at end of file
+}
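
A note on the temporal_embed plumbing in this series: the upstream MiniCPM-V 4.5 resampler iterates over temporal chunks a data-dependent number of times, which is why preprocess_inputs drops temporal_ids (PATCH 05/06) and why the exported resampler takes a precomputed temporal_embed as an optional graph input instead (PATCH 10/11). The resulting call contract looks roughly like the sketch below; call_resampler is an illustrative helper, not a function from the patches, and simply mirrors how OVResamplerModel.forward conditionally feeds the request:

def call_resampler(resampler, image_feature, pos_embed, key_padding_mask, temporal_embed=None):
    # temporal_embed is only declared as an input on the version-4.5 graph
    # (see the RESAMPLER branch of the `inputs` property in PATCH 11),
    # so it is passed through only when the caller actually has one.
    kwargs = dict(image_feature=image_feature, pos_embed=pos_embed, key_padding_mask=key_padding_mask)
    if temporal_embed is not None:
        kwargs["temporal_embed"] = temporal_embed
    return resampler(**kwargs)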
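
For anyone validating the series end to end, the new export path can be smoke-tested the same way the integration tests above exercise it. A minimal sketch, assuming the tiny test checkpoint registered in utils_tests.py; the image URL, prompt, and generation length are arbitrary choices:

import requests
from io import BytesIO

from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from optimum.intel import OVModelForVisualCausalLM

model_id = "snake7gun/minicpm-v-4-tiny"  # tiny-random MiniCPM-V-4, see MODEL_NAMES
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", timeout=60).content))
inputs = model.preprocess_inputs(
    text="What is in this image?", image=image, processor=processor, tokenizer=tokenizer, config=model.config
)
inputs.pop("temporal_ids", None)  # mirrors PATCH 05/06 for MiniCPM-V 4.5 processors
outputs = model.generate(**inputs, max_new_tokens=10)
# generate() returns prompt + continuation, so strip the prompt tokens before decoding
print(tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0])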