39 changes: 31 additions & 8 deletions optimum/exporters/openvino/model_configs.py
@@ -2516,7 +2516,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int


class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")
SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask", "temporal_embed")

def __init__(
self,
@@ -2553,6 +2553,9 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
if input_name == "pos_embed":
return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size])

if input_name == "temporal_embed":
return self.random_float_tensor(shape=[1, self.batch_size, self.hidden_size])


class MiniCPMVConfigBehavior(str, enum.Enum):
RESAMPLER = "resampler"
@@ -2585,6 +2588,8 @@ def __init__(
)
self._behavior = behavior
self._orig_config = config
model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
self.model_type = model_mapping[self._orig_config.version]
if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
self._config = config.vision_config
self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
@@ -2601,11 +2606,19 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
"position_ids": {0: "batch_size", 1: "patch_size"},
}
if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
}
if self._orig_config.version == 4.5:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
"temporal_embed": {0: "patch_size", 1: "batch_size"},
}
else:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
}
return {}

@property
@@ -2631,10 +2644,20 @@ def with_behavior(
behavior = MiniCPMVConfigBehavior(behavior)

if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_embeddings_config(
self.model_type,
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.LANGUAGE:
return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_generation_config(
self.model_type,
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
return self.__class__(
26 changes: 25 additions & 1 deletion optimum/exporters/openvino/model_patcher.py
@@ -3323,6 +3323,27 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask

out = self.attn(q_bs, image_feature + pos_embed, image_feature, key_padding_mask=key_padding_mask)[
0
] # Q * B * D # L * B * D + L * B * D
# out: Q * B * D
x = out.permute(1, 0, 2) # B * Q * D

x = self.ln_post(x)
x = x @ self.proj
return x


def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
image_feature = self.kv_proj(image_feature) # B * L * D
image_feature = self.ln_kv(image_feature).permute(1, 0, 2) # L * B * D
image_feature_emb = image_feature + pos_embed
image_feature_temporal = image_feature_emb + temporal_embed # [L, bs, D] + [1, bs, D]
bs = image_feature_temporal.shape[1]
q = self.ln_q(self.query) # Q * D

q_bs = q.unsqueeze(1).repeat(1, bs, 1)

out = self.attn(q_bs, image_feature_temporal, image_feature, key_padding_mask=key_padding_mask)[
0
] # Q * B * D # L * B * D + L * B * D
# out: Q * B * D
x = out.permute(1, 0, 2) # B * Q * D
@@ -3482,7 +3503,10 @@ def __init__(
model_kwargs: Dict[str, Any],
):
model.__orig_forward = model.forward
model.forward = types.MethodType(_minicpmv_resampler_forward, model)
has_temporal_ids = "temporal_ids" in inspect.signature(model.__orig_forward).parameters
model.forward = types.MethodType(
_minicpmv4_5_resampler_forward if has_temporal_ids else _minicpmv_resampler_forward, model
)

super().__init__(config, model, model_kwargs)

68 changes: 60 additions & 8 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -285,11 +285,21 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}

def forward(self, image_feature, pos_embed, key_padding_mask):
def forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed=None):
self.compile()
result = self.request(
{"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
)[0]
if temporal_embed is not None:
result = self.request(
{
"image_feature": image_feature,
"pos_embed": pos_embed,
"key_padding_mask": key_padding_mask,
"temporal_embed": temporal_embed,
}
)[0]
else:
result = self.request(
{"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
)[0]
return result


@@ -784,6 +794,7 @@ def forward(
audio_embed_sizes=None,
audio_attention_mask=None,
input_mode=None,
temporal_ids=None,
**kwargs,
):
if pixel_values is None:
@@ -809,6 +820,7 @@ def forward(
audio_embed_sizes=audio_embed_sizes,
audio_attention_mask=audio_attention_mask,
input_mode=input_mode,
temporal_ids=temporal_ids,
**kwargs,
)
return self.language_model.forward(
@@ -921,6 +933,7 @@ def prepare_inputs_for_generation(
"input_audio_embeds": kwargs.get("input_audio_embeds", kwargs.get("audio_input_features")),
"audio_embed_sizes": kwargs.get("audio_embed_sizes"),
"input_mode": kwargs.get("input_mode"),
"temporal_ids": kwargs.get("temporal_ids"),
}
)
return model_inputs
@@ -1923,10 +1936,18 @@ def __init__(
max_size = self.config.vision_config.image_size // self.config.vision_config.patch_size
self._pos_embeds = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
self.max_size = (max_size, max_size)
self.max_temporal_size = 72000
Collaborator: Why 72000? Should this value be loaded from the config?
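A minimal illustrative sketch of that suggestion, assuming the exported config carried such a field; the max_temporal_size key below is hypothetical and not a documented MiniCPM-V config attribute:

# Hypothetical: read the temporal cache size from the config when available,
# falling back to the current hard-coded default.
self.max_temporal_size = getattr(self.config, "max_temporal_size", 72000)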


def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
if input_ids is not None and input_ids.shape[1] == 1:
return None

all_temporal_ids = None
if temporal_ids is not None:
all_temporal_ids = []
for t in temporal_ids:
all_temporal_ids.extend(t)
Comment on lines +1936 to +1940
Collaborator (suggested change): replace
    all_temporal_ids = None
    if temporal_ids is not None:
        all_temporal_ids = []
        for t in temporal_ids:
            all_temporal_ids.extend(t)
with
    all_temporal_ids = [t for seq_t in temporal_ids for t in seq_t] if temporal_ids is not None else None
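Both forms flatten one level of nesting; a quick illustrative check with the example values from the code comment below:

temporal_ids = [[-1], [-1], [2, 6, 9]]  # per-image temporal ids
flat = [t for seq_t in temporal_ids for t in seq_t]
assert flat == [-1, -1, 2, 6, 9]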


tgt_sizes = kwargs["tgt_sizes"]
pixel_values_list = pixel_values
vision_hidden_states = []
@@ -1963,7 +1984,7 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
)[0]
)
vision_embedding = self.resampling(vision_embedding, tgt_sizes)
vision_embedding = self.resampling(vision_embedding, tgt_sizes, all_temporal_ids)

start = 0
for pixel_value in pixel_values_list:
@@ -1979,26 +2000,57 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
vision_hidden_states.append(dummy_feature)
return vision_hidden_states

def resampling(self, x, tgt_sizes):
def resampling(self, x, tgt_sizes, temporal_ids=None):
from itertools import chain
Collaborator: Should be imported at the top of the file.

Contributor Author: This import is used by minicpmv only, so I think it can be left here, e.g. https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/modeling_visual_language.py#L1229


bs = x.shape[0]

patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

self._adjust_pos_cache(tgt_sizes)

temporal_pos_emb = False
Collaborator: For me these names are a bit confusing: temporal_pos_emb, pos_embed_temporal, self.temporal_pos_embed, temporal_embed. I would suggest renaming these variables to something more meaningful, for example use_temporal_pos_embed instead of temporal_pos_emb.

Contributor Author: I only created temporal_embed; the others come directly from the original modeling file.

temporal_ids_flatten = None
if temporal_ids is not None:
# example: [[-1], [-1], [2, 6, 9]]
temporal_ids_flatten = list(chain.from_iterable(temporal_ids))
Comment on lines +2040 to +2041
Collaborator: Do we actually need an additional flattening pass here? As I understand, all_temporal_ids is already prepared flattened inside get_vision_embeddings(). If not needed, I'd remove the flattening logic from get_vision_embeddings() and keep it only here.

max_temporal_size = max(temporal_ids_flatten) + 1
if max_temporal_size > -1:
temporal_pos_emb = True
if max_temporal_size > self.max_temporal_size:
self._adjust_temporal_pos_cache(max_temporal_size, "cpu")
Collaborator: I don't see a definition of self._adjust_temporal_pos_cache(). Since the tests pass, this means the code does not reach this point in any of the existing tests. Please clarify this. Ideally, every scenario should be tested.

Contributor Author: Updated to align with the original model.
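For readers following this thread, such a helper would typically mirror the existing 2D _adjust_pos_cache pattern with a 1D sinusoidal table. The sketch below is illustrative only, written under that assumption, and is not the model's actual implementation:

import numpy as np
import torch

def build_1d_sincos_pos_embed(embed_dim, length):
    # Standard 1D sine/cosine positional table of shape [length, embed_dim] (embed_dim assumed even).
    positions = np.arange(length, dtype=np.float64)[:, None]
    omega = 1.0 / (10000 ** (np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2)))
    angles = positions * omega[None, :]
    return torch.from_numpy(np.concatenate([np.sin(angles), np.cos(angles)], axis=1)).float()

def adjust_temporal_pos_cache(self, max_temporal_size, device="cpu"):
    # Grow the cached temporal table only when a larger temporal index is encountered.
    if max_temporal_size > self.max_temporal_size:
        self.max_temporal_size = max_temporal_size
        self.temporal_pos_embed = build_1d_sincos_pos_embed(self.embed_dim, max_temporal_size).to(device)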


max_patch_len = torch.max(patch_len)
key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)

temporal_embed = None
pos_embed = []
pos_embed_temporal = []
for i in range(bs):
tgt_h, tgt_w = tgt_sizes[i]

if temporal_pos_emb:
if temporal_ids_flatten[i] == -1:
pos_embed_temporal.append(torch.zeros(self.embed_dim, dtype=torch.float32, device="cpu"))
else:
pos_embed_temporal.append(self.temporal_pos_embed[temporal_ids_flatten[i]].to(torch.float32)) # D
Collaborator: Where is self.temporal_pos_embed defined?

Contributor Author: Fixed.


pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D
key_padding_mask[i, patch_len[i] :] = True

pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
1, 0, 2
) # BLD => L * B * D
res = torch.from_numpy(self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask))
if temporal_pos_emb:
Collaborator (suggested change): replace
    if temporal_pos_emb:
with
    if len(pos_embed_temporal) > 0:

temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
res = torch.from_numpy(
self.resampler(
image_feature=x,
pos_embed=pos_embed,
key_padding_mask=key_padding_mask,
temporal_embed=temporal_embed,
)
)
return res

def _set_2d_pos_cache(self, max_size):
26 changes: 24 additions & 2 deletions tests/openvino/test_exporters_cli.py
@@ -618,6 +618,28 @@ class OVCLIExportTestCase(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4_5",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"internvl_chat",
@@ -743,13 +765,13 @@ def _openvino_export(self, model_name: str, task: str, model_kwargs: Dict = None

def test_filtered_architectures(cls):
if is_transformers_version("<", "4.49"):
expected = {"llama4", "qwen2_5_vl", "phi4mm"}
expected = {"llama4", "qwen2_5_vl", "phi4mm", "minicpmv4", "minicpmv4_5"}
elif is_transformers_version("<", "4.51"):
expected = {"llama4", "phi4mm"}
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo", "minicpmv4", "minicpmv4_5"}
Comment on lines 767 to +774
Collaborator: From this, I get the understanding that minicpmv4/minicpmv4_5 are supported for transformers 4.49 .. 4.51. Is this correct? If so, please set MIN_TRANSFORMERS_VERSION = "4.49.0" and MAX_TRANSFORMERS_VERSION = "4.51.3" for MiniCPMVOpenVINOConfig.

Contributor Author: I don't see any limitation on these two models. They can share the same version of transformers as MiniCPM-V-2.6.


all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS}
50 changes: 48 additions & 2 deletions tests/openvino/test_quantization.py
@@ -938,6 +938,48 @@ class OVWeightCompressionTest(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4_5",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4_5"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
]

# filter models type depending on min max transformers version
@@ -964,6 +1006,7 @@ class OVWeightCompressionTest(unittest.TestCase):
(OVModelForVisualCausalLM, "llava_next_video", False),
(OVModelForVisualCausalLM, "minicpmv", True),
(OVModelForVisualCausalLM, "qwen2_vl", False),
(OVModelForVisualCausalLM, "minicpmv4", True),
]

if is_transformers_version("<", "4.54.0"):
@@ -972,6 +1015,9 @@ class OVWeightCompressionTest(unittest.TestCase):
if is_transformers_version("<", "4.52.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True))

if is_transformers_version(">=", "4.51.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv4_5", True))

SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
(OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
(OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
@@ -987,13 +1033,13 @@ class OVWeightCompressionTest(unittest.TestCase):

def test_filtered_architectures(cls):
if is_transformers_version("<", "4.49"):
expected = {"llama4", "qwen2_5_vl"}
expected = {"llama4", "qwen2_5_vl", "minicpmv4", "minicpmv4_5"}
elif is_transformers_version("<", "4.51"):
expected = {"llama4"}
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "minicpmo", "minicpmv4", "minicpmv4_5"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE}