
Commit d260216

update
1 parent e1f75c3 commit d260216

3 files changed: 4 additions & 36 deletions

optimum/exporters/openvino/model_configs.py

Lines changed: 1 addition & 2 deletions
@@ -3819,7 +3819,7 @@ class Qwen3_VLOpenVINOConfig(BaseVLMOpenVINOConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen3VLConfigBehavior]
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,)
-    MIN_TRANSFORMERS_VERSION = version.parse("4.45.0")
+    MIN_TRANSFORMERS_VERSION = version.parse("4.56.0")

     def __init__(
         self,
@@ -3966,7 +3966,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
         }


-
     @property
     def outputs(self) -> Dict[str, Dict[int, str]]:
         if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS:

optimum/exporters/openvino/model_patcher.py

Lines changed: 1 addition & 2 deletions
@@ -4362,9 +4362,8 @@ def lm_forward(self, attention_mask, position_ids, past_key_values, inputs_embed
         deepstack_visual_embeds=deepstack_visual_embeds,
     )
     hidden_states = outputs[0]
-    logits_to_keep = 1
     # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    slice_indices = slice(-1, None)
     logits = self.lm_head(hidden_states[:, slice_indices, :])
     return (logits, outputs.past_key_values.to_legacy_cache())
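For reference, a minimal sketch of what the simplified slice computes (tensor sizes and the stand-in head are hypothetical, not the patched module): only the last sequence position is run through the LM head, which is what the removed logits_to_keep = 1 path also produced.

import torch

# Minimal sketch: keep only the final position before the LM head so logits
# are computed for one token instead of the whole sequence.
hidden_states = torch.randn(1, 10, 16)   # (batch, seq_len, hidden), hypothetical sizes
lm_head = torch.nn.Linear(16, 32)        # stand-in head with a hypothetical vocab of 32
slice_indices = slice(-1, None)
logits = lm_head(hidden_states[:, slice_indices, :])
print(logits.shape)                      # torch.Size([1, 1, 32])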

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 2 additions & 32 deletions
@@ -198,15 +198,13 @@ def prepare_inputs(
         if visual_pos_masks is not None:
             inputs["visual_pos_masks"] = visual_pos_masks
         else:
-            inputs["visual_pos_masks"] = torch.ones(1, 1, dtype=torch.bool)
+            inputs["visual_pos_masks"] = torch.zeros(1, 1, dtype=torch.bool)

         if "deepstack_visual_embeds" in self.input_names:
             if isinstance(deepstack_visual_embeds, list):
                 inputs["deepstack_visual_embeds"] = torch.Tensor(deepstack_visual_embeds)
             else:
-                inputs["deepstack_visual_embeds"] = torch.ones((3, 1, 1), dtype=torch.float32)
-                print(inputs["deepstack_visual_embeds"].shape)
-
+                inputs["deepstack_visual_embeds"] = torch.zeros((3, 1, 1), dtype=torch.float32)
         if "token_type_ids" in self.input_names:
             if token_type_ids is None:
                 token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int)
@@ -216,11 +214,6 @@ def prepare_inputs(
         inputs["beam_idx"] = (
             self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
         )
-        for key, value in inputs.items():
-            if hasattr(value, 'dtype'):
-                print(f"{key}: {value.dtype}")
-            else:
-                print(f"{key}: {type(value)}")
         return inputs

     def forward(
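A rough sketch of why the placeholder defaults above switch from ones to zeros (assumed semantics, hypothetical shapes): positions where visual_pos_masks is True receive visual features, so an all-False mask and zero deepstack embeds inject nothing on text-only decode steps.

import torch

# Assumed semantics, hypothetical shapes: True positions receive visual features,
# so an all-False placeholder mask leaves the hidden states untouched.
hidden = torch.zeros(1, 4, 8)                             # (batch, seq_len, hidden)
visual_pos_masks = torch.zeros(1, 4, dtype=torch.bool)    # placeholder: no visual positions
visual_embeds = torch.randn(int(visual_pos_masks.sum()), 8)
hidden[visual_pos_masks] = hidden[visual_pos_masks] + visual_embeds  # no-op when the mask is all False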
@@ -2549,27 +2542,6 @@ class QWen2VLModelOutputWithPast(ModelOutput):
     rope_deltas: Optional[torch.FloatTensor] = None
     second_per_grid_ts: Optional[torch.FloatTensor] = None

-
-# @dataclass
-# class QWen3VLModelOutputWithPast(ModelOutput):
-#     r"""
-#     past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-#         Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-#         `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
-
-#         Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
-#         `past_key_values` input) to speed up sequential decoding.
-#     rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-#         The rope index difference between sequence length and multimodal rope.
-#     """
-
-#     last_hidden_state: Optional[torch.FloatTensor] = None
-#     past_key_values: Optional[list[torch.FloatTensor]] = None
-#     hidden_states: Optional[tuple[torch.FloatTensor]] = None
-#     attentions: Optional[tuple[torch.FloatTensor]] = None
-#     rope_deltas: Optional[torch.LongTensor] = None
-
-
 class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_embeddings_merger"]

@@ -3855,9 +3827,7 @@ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Op
         image_embeds, deepstack_image_embeds = self.get_vision_embeddings(pixel_values, image_grid_thw)
         image_embeds, deepstack_image_embeds = torch.from_numpy(image_embeds), torch.from_numpy(deepstack_image_embeds)
         deepstack_image_embeds = deepstack_image_embeds.tolist()
-        print(image_grid_thw.prod(-1))
         split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist()
-        print(image_embeds.shape)
         image_embeds = torch.split(image_embeds, split_sizes)
         return image_embeds, deepstack_image_embeds
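As a reference for the get_image_features hunk above, a worked sketch of the split_sizes computation with hypothetical grid values: each image contributes t * h * w patches, reduced by spatial_merge_size ** 2 after spatial merging, and the flat embedding tensor is split back per image.

import torch

# Hypothetical values: two images with (t, h, w) patch grids of (1, 4, 4) and (1, 8, 4).
image_grid_thw = torch.tensor([[1, 4, 4], [1, 8, 4]])
spatial_merge_size = 2
split_sizes = (image_grid_thw.prod(-1) // spatial_merge_size**2).tolist()    # [4, 8]
image_embeds = torch.randn(sum(split_sizes), 16)           # flat embeddings, hypothetical hidden size
per_image_embeds = torch.split(image_embeds, split_sizes)  # tuple of (4, 16) and (8, 16) tensors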
