16 changes: 13 additions & 3 deletions optimum/exporters/openvino/model_configs.py
@@ -2625,12 +2625,22 @@ def with_behavior(
"""
if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
behavior = MiniCPMVConfigBehavior(behavior)

model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
Member:
should use str for versions

Contributor Author:
May I understand why? The version in the model's config is a number:
https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/config.json#L3

Member (@IlyasMoutawwakil, Oct 24, 2025):
Ah okay, I see! Thanks for the clarification.
(It's generally a bad idea to use numbers for versions: 4.0 becomes 4, and 4.10 and 4.1 are the same version 😅)
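For illustration only (not part of the PR), a minimal snippet showing the collapse described above, using just Python's standard json module:

```python
import json

# JSON numbers are parsed as floats, so textually different versions collapse:
assert json.loads('{"version": 4.10}')["version"] == json.loads('{"version": 4.1}')["version"]

# and whether 4.0 round-trips as "4.0" or "4" depends on the writer:
print(json.dumps({"version": 4.0}))  # '{"version": 4.0}' with Python's json

# Keeping versions as strings preserves every distinction:
assert "4.10" != "4.1"
```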

Collaborator:
I think it is a bad idea to make decisions about architecture based on the model version in general.
I think you should inspect the loaded model object and use isinstance on its inner objects to make the decision.

Contributor Author:
Yes, it's a better approach in this case, but I don't know if we can access the modeling file at this stage.
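For reference, a rough sketch of the isinstance-based dispatch suggested above — assuming the loaded MiniCPM-V model exposes its text backbone as `model.llm` (as in the remote modeling code) and that the backbone classes are importable from a recent transformers release; whether that object is reachable at export time is exactly the open question here:

```python
from transformers import LlamaForCausalLM, Qwen2ForCausalLM, Qwen3ForCausalLM


def detect_text_backbone(model) -> str:
    # Dispatch on the inner language-model class instead of the numeric `version` field.
    llm = getattr(model, "llm", None)
    if isinstance(llm, Qwen3ForCausalLM):
        return "qwen3"
    if isinstance(llm, Qwen2ForCausalLM):
        return "qwen2"
    if isinstance(llm, LlamaForCausalLM):
        return "llama"
    raise ValueError(f"Unsupported MiniCPM-V text backbone: {type(llm).__name__}")
```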

if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_embeddings_config(
model_mapping[self._orig_config.version],
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.LANGUAGE:
return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_generation_config(
model_mapping[self._orig_config.version],
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
return self.__class__(
3 changes: 3 additions & 0 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -2111,6 +2111,9 @@ def preprocess_inputs(
)
inputs = processor([prompt], [image], return_tensors="pt")
inputs.pop("image_sizes", None)
# skip temporal_ids, which makes the number of loop iterations non-constant:
# https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
inputs.pop("temporal_ids", None)
return inputs


24 changes: 23 additions & 1 deletion tests/openvino/test_exporters_cli.py
@@ -635,6 +635,28 @@ class OVCLIExportTestCase(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4_5",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"internvl_chat",
@@ -766,7 +788,7 @@ def test_filtered_architectures(cls):
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo", "minicpmv4", "minicpmv4_5"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS}
46 changes: 45 additions & 1 deletion tests/openvino/test_quantization.py
@@ -1027,6 +1027,48 @@ class OVWeightCompressionTest(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4_5",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4_5"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
]

# filter model types depending on min/max transformers version
@@ -1053,6 +1095,8 @@ class OVWeightCompressionTest(unittest.TestCase):
(OVModelForVisualCausalLM, "llava_next_video", False),
(OVModelForVisualCausalLM, "minicpmv", True),
(OVModelForVisualCausalLM, "qwen2_vl", False),
(OVModelForVisualCausalLM, "minicpmv4", True),
(OVModelForVisualCausalLM, "minicpmv4_5", True),
]

if is_transformers_version("<", "4.54.0"):
@@ -1082,7 +1126,7 @@ def test_filtered_architectures(cls):
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "minicpmo", "minicpmv4", "minicpmv4_5"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE}
27 changes: 20 additions & 7 deletions tests/openvino/test_seq2seq.py
@@ -497,7 +497,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
if is_transformers_version(">", "4.49"):
SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
if is_transformers_version(">=", "4.51"):
SUPPORTED_ARCHITECTURES += ["llama4"]
SUPPORTED_ARCHITECTURES += ["llama4", "minicpmv4", "minicpmv4_5"]
if is_transformers_version("<", "4.52"):
SUPPORTED_ARCHITECTURES += ["minicpmo"]

@@ -506,7 +506,17 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}

TASK = "image-text-to-text"
REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
REMOTE_CODE_MODELS = [
"internvl_chat",
"minicpmv",
"minicpmv4",
"minicpmv4_5",
"minicpmo",
"llava-qwen2",
"phi3_v",
"maira2",
"phi4mm",
]

IMAGE = Image.open(
requests.get(
@@ -611,7 +621,7 @@ def test_compare_to_transformers(self, model_arch):
self._check_device_and_request(ov_model, test_device, False)

# pytorch minicpmv and internvl_chat are not designed to be used via forward
if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
set_seed(SEED)
ov_outputs = ov_model(**inputs)
set_seed(SEED)
@@ -656,7 +666,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs["past_key_values"] = DynamicCache()

with torch.no_grad():
if model_arch in ["minicpmo"]:
if model_arch in ["minicpmo", "minicpmv4", "minicpmv4_5"]:
# `generate` method for minicpmo requires tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
@@ -670,7 +680,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_outputs = transformers_outputs[1].sequences

# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
self.assertTrue(
torch.equal(ov_outputs, transformers_outputs),
@@ -696,7 +706,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs = copy.deepcopy(inputs)
ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
with torch.no_grad():
transformers_outputs = transformers_model.generate(
@@ -714,7 +724,7 @@ def test_compare_to_transformers(self, model_arch):
transformers_inputs = copy.deepcopy(inputs)
ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
# original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
with torch.no_grad():
transformers_outputs = transformers_model.generate(
@@ -837,6 +847,9 @@ def test_generate_utils(self, model_arch):
input_audio = self._generate_random_audio_data()
question = "Translate this audio to French"
inputs = model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio])
# skip temporal_ids, which makes the number of loop iterations non-constant:
# https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
inputs.pop("temporal_ids", None)
Comment on lines +849 to +851
Collaborator:
I don't quite understand. To me, it degrades the user experience that we make users skip this parameter after preprocessing in order to get results that match the reference.

Contributor Author:
Yes, it will impact the user experience in some cases. I think it's one of the limitations of TorchScript. I will try to find a better solution to handle this parameter if possible.
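As a sketch of what this currently means for users (illustrative only; `inputs` and `ov_model` stand for whatever the processor and exported model return):

```python
# temporal_ids has a data-dependent length, so the traced resampler loop cannot handle it;
# until a better solution lands, it has to be dropped before calling the exported model.
inputs.pop("temporal_ids", None)
outputs = ov_model.generate(**inputs, max_new_tokens=10)
```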

outputs = model.generate(**inputs, max_new_tokens=10)
# filter out original prompt because it may contain out-of-tokenizer tokens, e.g. in nanollava the text separator = -200
outputs = outputs[:, inputs["input_ids"].shape[1] :]
14 changes: 14 additions & 0 deletions tests/openvino/utils_tests.py
@@ -116,6 +116,8 @@
"minicpm": "katuni4ka/tiny-random-minicpm",
"minicpm3": "katuni4ka/tiny-random-minicpm3",
"minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
"minicpmv4": "snake7gun/minicpm-v-4-tiny",
"minicpmv4_5": "tiny-random/minicpm-v-4_5",
"minicpmo": "rkazants/tiny-random-MiniCPM-o-2_6",
"mistral": "echarlaix/tiny-random-mistral",
"mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
@@ -285,6 +287,18 @@
"vision_embeddings_model": 26,
"resampler_model": 6,
},
"minicpmv4": {
"lm_model": 30,
"text_embeddings_model": 1,
"vision_embeddings_model": 14,
"resampler_model": 6,
},
"minicpmv4_5": {
"lm_model": 30,
"text_embeddings_model": 1,
"vision_embeddings_model": 14,
"resampler_model": 6,
},
"llava_next_video": {
"lm_model": 30,
"text_embeddings_model": 1,
Expand Down