Skip to content

Commit 6dede8c

Browse files
committed
Import smart_resize from qwen_vl_utils and allow user input for min and max pixels
Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
1 parent 58183e9 commit 6dede8c

File tree

2 files changed

+22
-60
lines changed

2 files changed

+22
-60
lines changed

QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

Lines changed: 13 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import torch
1313
import torch.nn as nn
1414
import torch.nn.functional as F
15+
from qwen_vl_utils import smart_resize
1516
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel
1617
from transformers.cache_utils import Cache
1718
from transformers.modeling_outputs import (
@@ -1026,69 +1027,31 @@ def get_specializations(
10261027
logger.warning(
10271028
f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
10281029
)
1030+
height = [height] if isinstance(height, int) else height
1031+
width = [width] if isinstance(width, int) else width
1032+
10291033
prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
10301034
ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
10311035
channel = 3
10321036
patch_size = self.config.vision_config.patch_size
10331037
temporal_patch_size = self.config.vision_config.temporal_patch_size
10341038

1035-
# Modified from qwen_vl_utils/vision_process.py
10361039
IMAGE_FACTOR = 28
1037-
MAX_RATIO = 200
10381040
IMAGE_MIN_TOKEN_NUM = 4
10391041
IMAGE_MAX_TOKEN_NUM = 16384
1040-
1041-
def round_by_factor(number: int, factor: int) -> int:
1042-
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
1043-
return round(number / factor) * factor
1044-
1045-
def ceil_by_factor(number: int, factor: int) -> int:
1046-
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
1047-
return math.ceil(number / factor) * factor
1048-
1049-
def floor_by_factor(number: int, factor: int) -> int:
1050-
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
1051-
return math.floor(number / factor) * factor
1052-
1053-
def smart_resize(
1054-
height: int,
1055-
width: int,
1056-
factor: int = IMAGE_FACTOR,
1057-
min_pixels: Optional[int] = None,
1058-
max_pixels: Optional[int] = None,
1059-
) -> tuple[int, int]:
1060-
"""
1061-
Rescales the image so that the following conditions are met:
1062-
1063-
1. Both dimensions (height and width) are divisible by 'factor'.
1064-
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
1065-
3. The aspect ratio of the image is maintained as closely as possible.
1066-
"""
1067-
max_pixels = max_pixels if max_pixels is not None else (IMAGE_MAX_TOKEN_NUM * factor ** 2)
1068-
min_pixels = min_pixels if min_pixels is not None else (IMAGE_MIN_TOKEN_NUM * factor ** 2)
1069-
assert max_pixels >= min_pixels, "The max_pixels of image must be greater than or equal to min_pixels."
1070-
if max(height, width) / min(height, width) > MAX_RATIO:
1071-
raise ValueError(
1072-
f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
1073-
)
1074-
h_bar = max(factor, round_by_factor(height, factor))
1075-
w_bar = max(factor, round_by_factor(width, factor))
1076-
if h_bar * w_bar > max_pixels:
1077-
beta = math.sqrt((height * width) / max_pixels)
1078-
h_bar = floor_by_factor(height / beta, factor)
1079-
w_bar = floor_by_factor(width / beta, factor)
1080-
elif h_bar * w_bar < min_pixels:
1081-
beta = math.sqrt(min_pixels / (height * width))
1082-
h_bar = ceil_by_factor(height * beta, factor)
1083-
w_bar = ceil_by_factor(width * beta, factor)
1084-
return h_bar, w_bar
1042+
min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
1043+
max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
1044+
mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
1045+
if mm_processor_kwargs:
1046+
min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
1047+
max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
10851048

10861049
vision = []
10871050
min_vision_size = ctx_len
1088-
height = [height] if isinstance(height, int) else height
1089-
width = [width] if isinstance(width, int) else width
10901051
for h, w in zip(height, width):
1091-
resized_height, resized_width = smart_resize(height=h, width=w)
1052+
resized_height, resized_width = smart_resize(
1053+
height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
1054+
)
10921055
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
10931056
grid_height = grid_h * grid_w
10941057
grid_width = patch_size * patch_size * temporal_patch_size * channel

examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,9 @@
8080
else:
8181
batch_size = 1
8282
ctx_len = 14336
83-
heights = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
84-
widths = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
83+
widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
84+
heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
85+
num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
8586

8687
## Vision + Text ##
8788
qeff_model.compile(
@@ -92,6 +93,11 @@
9293
num_devices=2,
9394
height=heights,
9495
width=widths,
96+
num_frames=max(num_frames),
97+
mm_processor_kwargs={
98+
"min_pixels": 4 * 28 * 28,
99+
"max_pixels": 16384 * 28 * 28,
100+
},
95101
mxfp6_matmul=True,
96102
mxint8_kv_cache=True,
97103
aic_enable_depth_first=True,
@@ -100,12 +106,8 @@
100106

101107
### IMAGE + TEXT ###
102108
image_url = "https://picsum.photos/id/237/536/354"
103-
104109
image = Image.open(requests.get(image_url, stream=True).raw)
105-
106-
## Resize to any dimension present in specializations ##
107-
image = image.resize((360, 120))
108-
110+
image = image.resize((360, 120))  # Resize to any dimension present in specializations (width, height)
109111
messages_1 = [
110112
{
111113
"role": "user",
@@ -115,7 +117,6 @@
115117
],
116118
},
117119
]
118-
119120
messages_2 = [
120121
{
121122
"role": "user",
@@ -125,9 +126,7 @@
125126
],
126127
},
127128
]
128-
129129
messages = [messages_1] * batch_size
130-
131130
texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
132131

133132
image_inputs, video_inputs = process_vision_info(messages)

0 commit comments

Comments
 (0)