39 changes: 31 additions & 8 deletions optimum/exporters/openvino/model_configs.py
@@ -2516,7 +2516,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int


class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")
SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask", "temporal_embed")

def __init__(
self,
@@ -2553,6 +2553,9 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
if input_name == "pos_embed":
return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size])

if input_name == "temporal_embed":
return self.random_float_tensor(shape=[1, self.batch_size, self.hidden_size])


class MiniCPMVConfigBehavior(str, enum.Enum):
RESAMPLER = "resampler"
@@ -2585,6 +2588,8 @@ def __init__(
)
self._behavior = behavior
self._orig_config = config
model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
self.model_type = model_mapping[self._orig_config.version]
if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
self._config = config.vision_config
self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
@@ -2601,11 +2606,19 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
"position_ids": {0: "batch_size", 1: "patch_size"},
}
if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
}
if self._orig_config.version == 4.5:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
"temporal_embed": {0: "patch_size", 1: "batch_size"},
}
else:
return {
"image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
"pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
"key_padding_mask": {0: "batch_size", 1: "patch_size"},
}
return {}

@property
@@ -2631,10 +2644,20 @@ def with_behavior(
behavior = MiniCPMVConfigBehavior(behavior)

if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_embeddings_config(
self.model_type,
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.LANGUAGE:
return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
return get_vlm_text_generation_config(
self.model_type,
self._orig_config,
self.int_dtype,
self.float_dtype,
)

if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
return self.__class__(
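For orientation, a minimal sketch (hypothetical sizes, not taken from this PR) of the tensors the dummy generator above produces for the version-4.5 resampler, matching the dynamic axes declared in `inputs`:

```python
import torch

# Hypothetical sizes for illustration only
batch_size, feat_size, hidden_size = 2, 16, 64

image_feature = torch.randn(batch_size, feat_size, hidden_size)  # B * L * D
pos_embed = torch.randn(feat_size, batch_size, hidden_size)      # L * B * D
key_padding_mask = torch.zeros(batch_size, feat_size)            # B * L
temporal_embed = torch.randn(1, batch_size, hidden_size)         # new input for version 4.5
```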
26 changes: 25 additions & 1 deletion optimum/exporters/openvino/model_patcher.py
@@ -3323,6 +3323,27 @@ def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask

out = self.attn(q_bs, image_feature + pos_embed, image_feature, key_padding_mask=key_padding_mask)[
0
] # Q * B * D # L * B * D + L * B * D
# out: Q * B * D
x = out.permute(1, 0, 2) # B * Q * D

x = self.ln_post(x)
x = x @ self.proj
return x


def _minicpmv4_5_resampler_forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed):
image_feature = self.kv_proj(image_feature) # B * L * D
image_feature = self.ln_kv(image_feature).permute(1, 0, 2) # L * B * D
image_feature_emb = image_feature + pos_embed
image_feature_temporal = image_feature_emb + temporal_embed # [L, bs, D] + [1, bs, D]
bs = image_feature_temporal.shape[1]
q = self.ln_q(self.query) # Q * D

q_bs = q.unsqueeze(1).repeat(1, bs, 1)

out = self.attn(q_bs, image_feature_temporal, image_feature, key_padding_mask=key_padding_mask)[
0
] # Q * B * D # L * B * D + L * B * D
# out: Q * B * D
x = out.permute(1, 0, 2) # B * Q * D
@@ -3482,7 +3503,10 @@ def __init__(
model_kwargs: Dict[str, Any],
):
model.__orig_forward = model.forward
model.forward = types.MethodType(_minicpmv_resampler_forward, model)
has_temporal_ids = "temporal_ids" in inspect.signature(model.__orig_forward).parameters
model.forward = types.MethodType(
_minicpmv4_5_resampler_forward if has_temporal_ids else _minicpmv_resampler_forward, model
)

super().__init__(config, model, model_kwargs)

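The core difference between `_minicpmv_resampler_forward` and `_minicpmv4_5_resampler_forward` is the extra `temporal_embed` term added before attention. A minimal sketch of that broadcast, with illustrative sizes only:

```python
import torch

L, B, D = 16, 2, 8                        # patches, batch, hidden size (illustrative)
image_feature_emb = torch.randn(L, B, D)  # image features + spatial pos_embed
temporal_embed = torch.randn(1, B, D)     # one temporal embedding per batch element

# [L, B, D] + [1, B, D]: the temporal embedding broadcasts across all patches
image_feature_temporal = image_feature_emb + temporal_embed
assert image_feature_temporal.shape == (L, B, D)
```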
68 changes: 60 additions & 8 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -285,11 +285,21 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}

def forward(self, image_feature, pos_embed, key_padding_mask):
def forward(self, image_feature, pos_embed, key_padding_mask, temporal_embed=None):
self.compile()
result = self.request(
{"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
)[0]
if temporal_embed is not None:
result = self.request(
{
"image_feature": image_feature,
"pos_embed": pos_embed,
"key_padding_mask": key_padding_mask,
"temporal_embed": temporal_embed,
}
)[0]
else:
result = self.request(
{"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
)[0]
return result


@@ -784,6 +794,7 @@ def forward(
audio_embed_sizes=None,
audio_attention_mask=None,
input_mode=None,
temporal_ids=None,
**kwargs,
):
if pixel_values is None:
@@ -809,6 +820,7 @@
audio_embed_sizes=audio_embed_sizes,
audio_attention_mask=audio_attention_mask,
input_mode=input_mode,
temporal_ids=temporal_ids,
**kwargs,
)
return self.language_model.forward(
@@ -921,6 +933,7 @@ def prepare_inputs_for_generation(
"input_audio_embeds": kwargs.get("input_audio_embeds", kwargs.get("audio_input_features")),
"audio_embed_sizes": kwargs.get("audio_embed_sizes"),
"input_mode": kwargs.get("input_mode"),
"temporal_ids": kwargs.get("temporal_ids"),
}
)
return model_inputs
@@ -1923,10 +1936,18 @@ def __init__(
max_size = self.config.vision_config.image_size // self.config.vision_config.patch_size
self._pos_embeds = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
self.max_size = (max_size, max_size)
self.max_temporal_size = 72000

def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
def get_vision_embeddings(self, pixel_values, input_ids=None, temporal_ids=None, **kwargs):
if input_ids is not None and input_ids.shape[1] == 1:
return None

all_temporal_ids = None
if temporal_ids is not None:
all_temporal_ids = []
for t in temporal_ids:
all_temporal_ids.extend(t)

tgt_sizes = kwargs["tgt_sizes"]
pixel_values_list = pixel_values
vision_hidden_states = []
@@ -1963,7 +1984,7 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
)[0]
)
vision_embedding = self.resampling(vision_embedding, tgt_sizes)
vision_embedding = self.resampling(vision_embedding, tgt_sizes, all_temporal_ids)

start = 0
for pixel_value in pixel_values_list:
@@ -1979,26 +2000,57 @@ def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
vision_hidden_states.append(dummy_feature)
return vision_hidden_states

def resampling(self, x, tgt_sizes):
def resampling(self, x, tgt_sizes, temporal_ids=None):
from itertools import chain

bs = x.shape[0]

patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

self._adjust_pos_cache(tgt_sizes)

temporal_pos_emb = False
temporal_ids_flatten = None
if temporal_ids is not None:
# example: [[-1], [-1], [2, 6, 9]]
temporal_ids_flatten = list(chain.from_iterable(temporal_ids))
max_temporal_size = max(temporal_ids_flatten) + 1
if max_temporal_size > -1:
temporal_pos_emb = True
if max_temporal_size > self.max_temporal_size:
self._adjust_temporal_pos_cache(max_temporal_size, "cpu")

max_patch_len = torch.max(patch_len)
key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)

temporal_embed = None
pos_embed = []
pos_embed_temporal = []
for i in range(bs):
tgt_h, tgt_w = tgt_sizes[i]

if temporal_pos_emb:
if temporal_ids_flatten[i] == -1:
pos_embed_temporal.append(torch.zeros(self.embed_dim, dtype=torch.float32, device="cpu"))
else:
pos_embed_temporal.append(self.temporal_pos_embed[temporal_ids_flatten[i]].to(torch.float32)) # D

pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D
key_padding_mask[i, patch_len[i] :] = True

pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
1, 0, 2
) # BLD => L * B * D
res = torch.from_numpy(self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask))
if temporal_pos_emb:
temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
res = torch.from_numpy(
self.resampler(
image_feature=x,
pos_embed=pos_embed,
key_padding_mask=key_padding_mask,
temporal_embed=temporal_embed,
)
)
return res

def _set_2d_pos_cache(self, max_size):
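To illustrate the `temporal_ids` handling in `resampling` above: a sketch with the example batch from the inline comment, where two images carry no temporal index (-1) and one video contributes three frames. The `temporal_pos_embed` table here is a stand-in for the cached embedding the model maintains:

```python
from itertools import chain

import torch

embed_dim = 8
temporal_pos_embed = torch.randn(100, embed_dim)  # stand-in for the cached table

temporal_ids = [[-1], [-1], [2, 6, 9]]            # example from the comment above
flat = list(chain.from_iterable(temporal_ids))    # [-1, -1, 2, 6, 9]

# -1 means "no temporal position": a zero vector is used instead of a table lookup
pos_embed_temporal = [
    torch.zeros(embed_dim) if t == -1 else temporal_pos_embed[t].to(torch.float32)
    for t in flat
]

# Stacked to [1, bs, D] so it can broadcast against the [L, bs, D] image features
temporal_embed = torch.stack(pos_embed_temporal, dim=0).unsqueeze(0)
assert temporal_embed.shape == (1, len(flat), embed_dim)
```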
26 changes: 24 additions & 2 deletions tests/openvino/test_exporters_cli.py
@@ -618,6 +618,28 @@ class OVCLIExportTestCase(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"minicpmv4_5",
"int4 --group-size 4 --ratio 0.8 --trust-remote-code",
{
"lm_model": {"int8": 12, "int4": 18},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 14},
"resampler_model": {"int8": 6},
},
),
(
"image-text-to-text",
"internvl_chat",
@@ -743,13 +765,13 @@ def _openvino_export(self, model_name: str, task: str, model_kwargs: Dict = None

def test_filtered_architectures(cls):
if is_transformers_version("<", "4.49"):
expected = {"llama4", "qwen2_5_vl", "phi4mm"}
expected = {"llama4", "qwen2_5_vl", "phi4mm", "minicpmv4", "minicpmv4_5"}
elif is_transformers_version("<", "4.51"):
expected = {"llama4", "phi4mm"}
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo", "minicpmv4", "minicpmv4_5"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS}
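For reference, a hedged sketch of how the CLI options exercised by these tests map to a programmatic export. The checkpoint id and output directory are placeholders, and the tiny group size (4) is only meaningful for the test-sized models:

```python
# Roughly equivalent to:
#   optimum-cli export openvino -m <model_id> --task image-text-to-text \
#       --weight-format int4 --group-size 4 --ratio 0.8 --trust-remote-code <output_dir>
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

model_id = "openbmb/MiniCPM-V-4_5"  # placeholder checkpoint id
quantization_config = OVWeightQuantizationConfig(bits=4, group_size=4, ratio=0.8)

model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,
    trust_remote_code=True,
    quantization_config=quantization_config,
)
model.save_pretrained("minicpmv4_5-int4-ov")
```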
48 changes: 46 additions & 2 deletions tests/openvino/test_quantization.py
@@ -938,6 +938,48 @@ class OVWeightCompressionTest(unittest.TestCase):
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
(
OVModelForVisualCausalLM,
"minicpmv4_5",
True,
dict(
bits=4,
group_size=16,
dataset="contextual",
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
num_samples=1,
processor=MODEL_NAMES["minicpmv4_5"],
trust_remote_code=True,
),
{
"lm_model": {"int8": 8, "int4": 22},
"text_embeddings_model": {"int8": 1},
"vision_embeddings_model": {"int8": 26},
"resampler_model": {"int8": 6},
},
),
]

# filter models type depending on min max transformers version
@@ -964,6 +1006,8 @@ class OVWeightCompressionTest(unittest.TestCase):
(OVModelForVisualCausalLM, "llava_next_video", False),
(OVModelForVisualCausalLM, "minicpmv", True),
(OVModelForVisualCausalLM, "qwen2_vl", False),
(OVModelForVisualCausalLM, "minicpmv4", True),
(OVModelForVisualCausalLM, "minicpmv4_5", True),
]

if is_transformers_version("<", "4.54.0"):
@@ -987,13 +1031,13 @@ class OVWeightCompressionTest(unittest.TestCase):

def test_filtered_architectures(cls):
if is_transformers_version("<", "4.49"):
expected = {"llama4", "qwen2_5_vl"}
expected = {"llama4", "qwen2_5_vl", "minicpmv4", "minicpmv4_5"}
elif is_transformers_version("<", "4.51"):
expected = {"llama4"}
elif is_transformers_version("<", "4.52"):
expected = set()
else:
expected = {"llava-qwen2", "phi3_v", "minicpmo"}
expected = {"llava-qwen2", "phi3_v", "minicpmo", "minicpmv4", "minicpmv4_5"}

all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE}
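The data-aware entries above can be reproduced outside the test suite along these lines; a sketch assuming a placeholder checkpoint id, with the same hyperparameters the test entry names:

```python
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

model_id = "openbmb/MiniCPM-V-4_5"  # placeholder; the tests use tiny random checkpoints

# Mirrors the dict used in the minicpmv4_5 test entry above
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    group_size=16,
    ratio=0.8,
    dataset="contextual",
    sensitivity_metric="mean_activation_magnitude",
    num_samples=1,
    trust_remote_code=True,
)

model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,
    trust_remote_code=True,
    quantization_config=quantization_config,
)
```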