From 4b70b7f01cb84f57bc42347544ba280798183c24 Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 6 Aug 2025 20:01:46 -0700 Subject: [PATCH 01/11] add support for minicpm4v --- optimum/exporters/openvino/model_configs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 6c152c4a94..252ab6d757 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2822,10 +2822,10 @@ def with_behavior( behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_embeddings_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.LANGUAGE: - return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_generation_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return self.__class__( From 7c64417837761023c40602f8cbe440b2e26bac25 Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 7 Aug 2025 08:32:45 -0700 Subject: [PATCH 02/11] reformat --- optimum/exporters/openvino/model_configs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 252ab6d757..d92eed4f86 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2822,10 +2822,20 @@ def with_behavior( behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_embeddings_config( + "qwen2" if self._orig_config.version == 2.6 else "llama", + self._orig_config, + self.int_dtype, + self.float_dtype, + ) if behavior == MiniCPMVConfigBehavior.LANGUAGE: - return get_vlm_text_generation_config("qwen2" if self._orig_config.version == 2.6 else "llama", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_generation_config( + "qwen2" if self._orig_config.version == 2.6 else "llama", + self._orig_config, + self.int_dtype, + self.float_dtype, + ) if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return self.__class__( From 4b245f14a29f92c97f391c8492c551168ee8309d Mon Sep 17 00:00:00 2001 From: ethan Date: Tue, 12 Aug 2025 01:23:09 -0700 Subject: [PATCH 03/11] add minicpmv4 test case --- tests/openvino/test_exporters_cli.py | 11 +++++++++++ tests/openvino/test_modeling.py | 20 ++++++++++---------- tests/openvino/test_quantization.py | 22 ++++++++++++++++++++++ tests/openvino/utils_tests.py | 7 +++++++ 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 9effd2fc9c..b3bd680eca 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -622,6 +622,17 @@ class OVCLIExportTestCase(unittest.TestCase): "resampler_model": {"int8": 6}, }, ), + ( + "image-text-to-text", + "minicpmv4", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", 
+ { + "lm_model": {"int8": 10, "int4": 20}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 26}, + "resampler_model": {"int8": 6}, + }, + ), ( "image-text-to-text", "minicpmv", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 27426bd1a5..0e9377b80b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2439,7 +2439,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORT_VIDEO.append("llava_next_video") if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4"] SUPPORT_VIDEO.append("qwen2_vl") if is_transformers_version(">=", "4.46.0"): @@ -2454,7 +2454,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES += ["llama4"] TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm"] + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4"] IMAGE = Image.open( requests.get( @@ -2558,8 +2558,8 @@ def test_compare_to_transformers(self, model_arch): ov_model.clear_requests() self._check_device_and_request(ov_model, test_device, False) - # pytorch minicpmv and internvl2 are not designed to be used via forward - if model_arch not in ["minicpmv", "internvl2"]: + # pytorch minicpmv/minicpmv4 and internvl2 are not designed to be used via forward + if model_arch not in ["minicpmv", "minicpmv4", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) @@ -2608,8 +2608,8 @@ def test_compare_to_transformers(self, model_arch): **transformers_inputs, generation_config=gen_config, **additional_inputs ) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "internvl2"]: + # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them + if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( torch.equal(ov_outputs, transformers_outputs), @@ -2634,8 +2634,8 @@ def test_compare_to_transformers(self, model_arch): inputs = ov_model.preprocess_inputs(**preprocessors, text=question, video=input_video) transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "internvl2"]: + # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them + if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] with torch.no_grad(): transformers_outputs = transformers_model.generate( @@ -2652,8 +2652,8 @@ def test_compare_to_transformers(self, model_arch): inputs = ov_model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio]) transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) - # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach 
provide them
-        if model_arch in ["minicpmv", "internvl2"]:
+        # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them
+        if model_arch in ["minicpmv", "minicpmv4", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 7714d7bef4..9cc47dd262 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -982,6 +982,27 @@ class OVWeightCompressionTest(unittest.TestCase):
                 "resampler_model": {"int8": 6},
             },
         ),
+        (
+            OVModelForVisualCausalLM,
+            "minicpmv4",
+            True,
+            dict(
+                bits=4,
+                group_size=16,
+                dataset="contextual",
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                num_samples=1,
+                processor=MODEL_NAMES["minicpmv"],
+                trust_remote_code=True,
+            ),
+            {
+                "lm_model": {"int8": 8, "int4": 22},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 26},
+                "resampler_model": {"int8": 6},
+            },
+        ),
         (
             OVModelForVisualCausalLM,
             "internvl2",
@@ -1116,6 +1137,7 @@ class OVWeightCompressionTest(unittest.TestCase):
 
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv4", True))
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen2_vl", False))
 
 SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index b603e0c2c6..7d072f843b 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -114,6 +114,7 @@
     "minicpm": "katuni4ka/tiny-random-minicpm",
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
+    "minicpmv4": "snake7gun/minicpm-v-4-tiny",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",
@@ -282,6 +283,12 @@
         "vision_embeddings_model": 26,
         "resampler_model": 6,
     },
+    "minicpmv4": {
+        "lm_model": 30,
+        "text_embeddings_model": 1,
+        "vision_embeddings_model": 26,
+        "resampler_model": 6,
+    },
     "llava_next_video": {
         "lm_model": 30,
         "text_embeddings_model": 1,

From 81f69be5e0f7fd80aba81b5e5d4c213dfe3e9344 Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 26 Aug 2025 20:48:20 -0700
Subject: [PATCH 04/11] add minicpmv4_5

---
 optimum/exporters/openvino/model_configs.py |  6 +++---
 tests/openvino/test_exporters_cli.py        | 11 +++++++++++
 tests/openvino/test_modeling.py             | 12 ++++++------
 tests/openvino/utils_tests.py               |  1 +
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index d92eed4f86..9cf857bd21 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2820,10 +2820,10 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
-
+        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
             return get_vlm_text_embeddings_config(
-                "qwen2" if self._orig_config.version == 2.6 else "llama",
+                model_mapping[self._orig_config.version],
                 self._orig_config,
self.int_dtype, self.float_dtype, @@ -2831,7 +2831,7 @@ def with_behavior( if behavior == MiniCPMVConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( - "qwen2" if self._orig_config.version == 2.6 else "llama", + model_mapping[self._orig_config.version], self._orig_config, self.int_dtype, self.float_dtype, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b3bd680eca..ffa7f68433 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -633,6 +633,17 @@ class OVCLIExportTestCase(unittest.TestCase): "resampler_model": {"int8": 6}, }, ), + ( + "image-text-to-text", + "minicpmv4_5", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", + { + "lm_model": {"int8": 10, "int4": 20}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 26}, + "resampler_model": {"int8": 6}, + }, + ), ( "image-text-to-text", "minicpmv", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0e9377b80b..f8329c87b8 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2439,7 +2439,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORT_VIDEO.append("llava_next_video") if is_transformers_version(">=", "4.45.0"): - SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4"] + SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v", "qwen2_vl", "minicpmv4", "minicpmv4_5"] SUPPORT_VIDEO.append("qwen2_vl") if is_transformers_version(">=", "4.46.0"): @@ -2454,7 +2454,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES += ["llama4"] TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4"] + REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v", "maira2", "phi4mm", "minicpmv4", "minicpmv4_5"] IMAGE = Image.open( requests.get( @@ -2559,7 +2559,7 @@ def test_compare_to_transformers(self, model_arch): self._check_device_and_request(ov_model, test_device, False) # pytorch minicpmv/minicpmv4 and internvl2 are not designed to be used via forward - if model_arch not in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) @@ -2609,7 +2609,7 @@ def test_compare_to_transformers(self, model_arch): ) # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] self.assertTrue( torch.equal(ov_outputs, transformers_outputs), @@ -2635,7 +2635,7 @@ def test_compare_to_transformers(self, model_arch): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=gen_config) # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them - if model_arch in ["minicpmv", "minicpmv4", "internvl2"]: + if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]: ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :] with torch.no_grad(): transformers_outputs = transformers_model.generate( @@ -2653,7 +2653,7 @@ def 
test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv/minicpmv4, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmv4", "internvl2"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 7d072f843b..ceaac6a180 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -115,6 +115,7 @@
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
     "minicpmv4": "snake7gun/minicpm-v-4-tiny",
+    "minicpmv4_5": "snake7gun/tiny-minicpmv-4_5",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",

From 6c0c61752235e4c9b00569de88efd79e3328f58b Mon Sep 17 00:00:00 2001
From: ethan
Date: Mon, 20 Oct 2025 19:02:24 -0700
Subject: [PATCH 05/11] update seq2seq test

---
 optimum/intel/openvino/modeling_visual_language.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 6f991ea457..733caddc90 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -2111,6 +2111,9 @@ def preprocess_inputs(
         )
         inputs = processor([prompt], [image], return_tensors="pt")
         inputs.pop("image_sizes", None)
+        # skip temporal_ids, which makes the number of resampler loop iterations data-dependent:
+        # https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
+        inputs.pop("temporal_ids", None)
         return inputs
 

From 05fad58ceadd7135b8909bde0a63d274c7ddd841 Mon Sep 17 00:00:00 2001
From: ethan
Date: Mon, 20 Oct 2025 19:02:49 -0700
Subject: [PATCH 06/11] update seq2seq test

---
 tests/openvino/test_seq2seq.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index ba7d20bbcc..8275bb79f1 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -656,7 +656,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs["past_key_values"] = DynamicCache()
 
         with torch.no_grad():
-            if model_arch in ["minicpmo"]:
+            if model_arch in ["minicpmo", "minicpmv4", "minicpmv4_5"]:
                 # `generate` method for minicpmo requires tokenizer
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
@@ -837,6 +837,9 @@ def test_generate_utils(self, model_arch):
         input_audio = self._generate_random_audio_data()
         question = "Translate this audio to French"
         inputs = model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio])
+        # skip temporal_ids, which makes the number of resampler loop iterations data-dependent:
+        # https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
+        inputs.pop("temporal_ids", None)
         outputs = model.generate(**inputs, max_new_tokens=10)
         # filter out original prompt becuase it may contains out of tokenizer tokens e.g. 
in nanollva text separator = -200 outputs = outputs[:, inputs["input_ids"].shape[1] :] From b4e2ce1b92c67463ab95f85a52f3304884d18bfe Mon Sep 17 00:00:00 2001 From: ethan Date: Tue, 21 Oct 2025 09:18:23 -0700 Subject: [PATCH 07/11] update --- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_quantization.py | 10 +++++----- tests/openvino/test_seq2seq.py | 14 ++++++++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 04572ac17b..c3b474da04 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -788,7 +788,7 @@ def test_filtered_architectures(cls): elif is_transformers_version("<", "4.52"): expected = set() else: - expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo", "minicpmv4", "minicpmv4_5"} all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c7c0145170..086aa36123 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1038,7 +1038,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", num_samples=1, - processor=MODEL_NAMES["minicpmv"], + processor=MODEL_NAMES["minicpmv4"], trust_remote_code=True, ), { @@ -1059,7 +1059,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", num_samples=1, - processor=MODEL_NAMES["minicpmv"], + processor=MODEL_NAMES["minicpmv4_5"], trust_remote_code=True, ), { @@ -1095,8 +1095,8 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForVisualCausalLM, "llava_next_video", False), (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), - (OVModelForVisualCausalLM, "minicpmv4", False), - (OVModelForVisualCausalLM, "minicpmv4_5", False), + (OVModelForVisualCausalLM, "minicpmv4", True), + (OVModelForVisualCausalLM, "minicpmv4_5", True), ] if is_transformers_version("<", "4.54.0"): @@ -1126,7 +1126,7 @@ def test_filtered_architectures(cls): elif is_transformers_version("<", "4.52"): expected = set() else: - expected = {"llava-qwen2", "phi3_v", "minicpmo"} + expected = {"llava-qwen2", "phi3_v", "minicpmo", "minicpmv4", "minicpmv4_5"} all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 8275bb79f1..63b5000134 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -506,7 +506,17 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} TASK = "image-text-to-text" - REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] + REMOTE_CODE_MODELS = [ + "internvl_chat", + "minicpmv", + "minicpmv4", + "minicpmv4_5", + "minicpmo", + "llava-qwen2", + "phi3_v", + "maira2", + "phi4mm", + ] IMAGE = Image.open( requests.get( @@ -611,7 +621,7 @@ def test_compare_to_transformers(self, model_arch): self._check_device_and_request(ov_model, test_device, False) # pytorch minicpmv and 
internvl_chat are not designed to be used via forward - if model_arch not in ["minicpmv", "minicpmv4","minicpmv4_5","minicpmo", "internvl_chat"]: + if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]: set_seed(SEED) ov_outputs = ov_model(**inputs) set_seed(SEED) From 39223a4070b4e4fe522852d31f69f25f2c18cd45 Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 22 Oct 2025 23:04:40 -0700 Subject: [PATCH 08/11] fix the CI issues for export and quantization --- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_quantization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c3b474da04..4d0ae9a41d 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -782,7 +782,7 @@ def _openvino_export(self, model_name: str, task: str, model_kwargs: Dict = None def test_filtered_architectures(cls): if is_transformers_version("<", "4.49"): - expected = {"llama4", "qwen2_5_vl", "phi4mm"} + expected = {"llama4", "qwen2_5_vl", "phi4mm", "minicpmv4", "minicpmv4_5"} elif is_transformers_version("<", "4.51"): expected = {"llama4", "phi4mm"} elif is_transformers_version("<", "4.52"): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 086aa36123..60140505b2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1120,7 +1120,7 @@ class OVWeightCompressionTest(unittest.TestCase): def test_filtered_architectures(cls): if is_transformers_version("<", "4.49"): - expected = {"llama4", "qwen2_5_vl"} + expected = {"llama4", "qwen2_5_vl", "minicpmv4", "minicpmv4_5"} elif is_transformers_version("<", "4.51"): expected = {"llama4"} elif is_transformers_version("<", "4.52"): From f6d77504e47d807b7ab6063490e6f7b4cc85c1bb Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 30 Oct 2025 23:30:23 -0700 Subject: [PATCH 09/11] fix conflict --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dd024a27d2..d26f56d295 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -6461,7 +6461,6 @@ def patched_forward( # falcon-mamba model has only difference from mamba that is RMS normalization for B, C, and time-step coefficients if model_type == "falcon_mamba": from transformers.models.falcon_mamba.modeling_falcon_mamba import rms_forward - import inspect self.ssm_rms_normalization = rms_forward From a96d993cff10097f6051a46654548877b9430b2b Mon Sep 17 00:00:00 2001 From: ethan Date: Thu, 30 Oct 2025 23:56:10 -0700 Subject: [PATCH 10/11] update --- .../openvino/modeling_visual_language.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 5d3ede35ba..926839d837 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -286,7 +286,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} def forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed=None): - self._compile() + self.compile() if temporal_embed is not None: result = self.request( { @@ -2020,7 
+2020,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
+
+        temporal_embed = None
         pos_embed = []
         pos_embed_temporal = []
         for i in range(bs):
@@ -2038,21 +2039,16 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
             pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
                 1, 0, 2
             )  # BLD => L * B * D
-            if pos_embed_temporal:
-                temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
-                res = torch.from_numpy(
-                    self.resampler(
-                        image_feature=x,
-                        pos_embed=pos_embed,
-                        key_padding_mask=key_padding_mask,
-                        temporal_embed=temporal_embed,
-                    )
-                )
-            else:
-                # Print shapes of all inputs to resampler
-                res = torch.from_numpy(
-                    self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask)
+
+            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+            res = torch.from_numpy(
+                self.resampler(
+                    image_feature=x,
+                    pos_embed=pos_embed,
+                    key_padding_mask=key_padding_mask,
+                    temporal_embed=temporal_embed,
                 )
+            )
         return res
 
     def _set_2d_pos_cache(self, max_size):
@@ -4487,4 +4483,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
+}
\ No newline at end of file

From 02a4acf759ea48fe2c9f3acd1f0f4d0c93aaef92 Mon Sep 17 00:00:00 2001
From: ethan
Date: Fri, 31 Oct 2025 01:53:46 -0700
Subject: [PATCH 11/11] fix CI

fix CI
---
 optimum/exporters/openvino/model_configs.py  | 27 ++++++++++++------
 optimum/exporters/openvino/model_patcher.py  |  8 +++---
 .../openvino/modeling_visual_language.py     | 10 ++++---
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index b8027f745b..62f03f8b09 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2588,6 +2588,8 @@ def __init__(
         )
         self._behavior = behavior
         self._orig_config = config
+        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
+        self.model_type = model_mapping[self._orig_config.version]
         if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
             self._config = config.vision_config
             self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
@@ -2604,12 +2606,19 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
             "position_ids": {0: "batch_size", 1: "patch_size"},
         }
         if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
-            return {
-                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
-                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
-                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
-                "temporal_embed": {0: "patch_size", 1: "batch_size"},
-            }
+            if self._orig_config.version == 4.5:
+                return {
+                    "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                    "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                    "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+                    "temporal_embed": {0: "patch_size", 1: "batch_size"},
+                }
+            else:
+                return {
+                    "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                    "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                    "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+                }
         return {}
 
     @property
@@ -2633,10 +2642,10 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, 
MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
-        model_mapping = {2.6: "qwen2", 4.0: "llama", 4.5: "qwen3"}
+
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
             return get_vlm_text_embeddings_config(
-                model_mapping[self._orig_config.version],
+                self.model_type,
                 self._orig_config,
                 self.int_dtype,
                 self.float_dtype,
@@ -2644,7 +2653,7 @@ def with_behavior(
 
         if behavior == MiniCPMVConfigBehavior.LANGUAGE:
             return get_vlm_text_generation_config(
-                model_mapping[self._orig_config.version],
+                self.model_type,
                 self._orig_config,
                 self.int_dtype,
                 self.float_dtype,
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index d26f56d295..04a4d2e59e 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3333,13 +3333,11 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask
 
 
 def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
-    bs = image_feature.shape[0]
     image_feature = self.kv_proj(image_feature)  # B * L * D
     image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
-    image_feature = image_feature + pos_embed
-
-    image_feature_temporal = image_feature + temporal_embed  # [L, bs, D] + [1, bs, D]
-
+    image_feature_emb = image_feature + pos_embed
+    image_feature_temporal = image_feature_emb + temporal_embed  # [L, bs, D] + [1, bs, D]
+    bs = image_feature_temporal.shape[1]
     q = self.ln_q(self.query)  # Q * D
     q_bs = q.unsqueeze(1).repeat(1, bs, 1)
 
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 926839d837..d74564da2e 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1941,6 +1941,8 @@ def __init__(
     def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
+
+        all_temporal_ids = None
         if temporal_ids is not None:
             all_temporal_ids = []
             for t in temporal_ids:
@@ -2020,7 +2022,7 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
         max_patch_len = torch.max(patch_len)
         key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
+
         temporal_embed = None
         pos_embed = []
@@ -2039,8 +2041,8 @@ def resampling(self, x, tgt_sizes, temporal_ids=None):
             pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
                 1, 0, 2
             )  # BLD => L * B * D
-
-            temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
+            if temporal_pos_emb:
+                temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
             res = torch.from_numpy(
                 self.resampler(
                     image_feature=x,
@@ -4483,4 +4485,4 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
-}
\ No newline at end of file
+}
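
A note on the temporal_embed plumbing in this series: the upstream MiniCPM-V 4.5 resampler iterates over temporal chunks a data-dependent number of times, which is why preprocess_inputs drops temporal_ids (PATCH 05/06) and why the exported resampler takes a precomputed temporal_embed as an optional graph input instead (PATCH 10/11). The resulting call contract looks roughly like the sketch below; call_resampler is an illustrative helper, not a function from the patches, and simply mirrors how OVResamplerModel.forward conditionally feeds the request:

def call_resampler(resampler, image_feature, pos_embed, key_padding_mask, temporal_embed=None):
    # temporal_embed is only declared as an input on the version-4.5 graph
    # (see the RESAMPLER branch of the `inputs` property in PATCH 11),
    # so it is passed through only when the caller actually has one.
    kwargs = dict(image_feature=image_feature, pos_embed=pos_embed, key_padding_mask=key_padding_mask)
    if temporal_embed is not None:
        kwargs["temporal_embed"] = temporal_embed
    return resampler(**kwargs)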
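
For anyone validating the series end to end, the new export path can be smoke-tested the same way the integration tests above exercise it. A minimal sketch, assuming the tiny test checkpoint registered in utils_tests.py; the image URL, prompt, and generation length are arbitrary choices:

import requests
from io import BytesIO

from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from optimum.intel import OVModelForVisualCausalLM

model_id = "snake7gun/minicpm-v-4-tiny"  # tiny-random MiniCPM-V-4, see MODEL_NAMES
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", timeout=60).content))
inputs = model.preprocess_inputs(
    text="What is in this image?", image=image, processor=processor, tokenizer=tokenizer, config=model.config
)
inputs.pop("temporal_ids", None)  # mirrors PATCH 05/06 for MiniCPM-V 4.5 processors
outputs = model.generate(**inputs, max_new_tokens=10)
# generate() returns prompt + continuation, so strip the prompt tokens before decoding
print(tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0])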