Skip to content

Commit a41357e

Browse files
authored
[VLM] Improve consistency between feature size calculation and dummy data for profiling (#6146)
1 parent ae96ef8 commit a41357e

File tree

2 files changed

+18
-26
lines changed

2 files changed

+18
-26
lines changed

vllm/model_executor/models/llava_next.py

Lines changed: 8 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,9 @@
3737
"language_model.model": "language_model",
3838
}
3939

40+
# Result in the max possible feature size (2x2 grid of 336x336px tiles)
41+
MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
42+
4043

4144
class LlavaNextImagePixelInputs(TypedDict):
4245
type: Literal["pixel_values"]
@@ -128,27 +131,19 @@ def get_llava_next_image_feature_size(
128131

129132

130133
def get_max_llava_next_image_tokens(ctx: InputContext):
131-
# Result in the max possible feature size (2x2 grid of 336x336px tiles)
132-
dummy_height = dummy_width = 448
133134

134135
return get_llava_next_image_feature_size(
135136
ctx.get_hf_config(LlavaNextConfig),
136-
input_height=dummy_height,
137-
input_width=dummy_width,
137+
input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
138+
input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
138139
)
139140

140141

141142
def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
142143
hf_config = ctx.get_hf_config(LlavaNextConfig)
143144
vision_config = hf_config.vision_config
144145

145-
# Result in the max possible feature size (2x2 grid of 336x336px tiles)
146-
dummy_height = dummy_width = 448
147-
image_feature_size = get_llava_next_image_feature_size(
148-
hf_config,
149-
input_height=dummy_height,
150-
input_width=dummy_width,
151-
)
146+
image_feature_size = get_max_llava_next_image_tokens(ctx)
152147

153148
if isinstance(vision_config, CLIPVisionConfig):
154149
seq_data = dummy_seq_data_for_clip(
@@ -160,8 +155,8 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
160155

161156
mm_data = dummy_image_for_clip(
162157
vision_config,
163-
image_width_override=dummy_width,
164-
image_height_override=dummy_height,
158+
image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
159+
image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
165160
)
166161

167162
return seq_data, mm_data

vllm/model_executor/models/phi3v.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@
5353
# Cannot find the following 2 numbers from hf config.
5454
_IMAGE_TOKEN_ID = 32044
5555

56+
# Result in the max possible feature size (h:w = 16:1)
57+
MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
58+
MAX_IMAGE_FEATURE_SIZE_WIDTH = 50
59+
5660
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
5761
hidden_act="quick_gelu",
5862
hidden_size=1024,
@@ -322,24 +326,17 @@ def get_phi3v_image_feature_size(
322326

323327

324328
def get_max_phi3v_image_tokens(ctx: InputContext):
325-
# Result in the max possible feature size (h:w = 16:1)
326-
dummy_height, dummy_width = 8000, 50
327329

328330
return get_phi3v_image_feature_size(
329331
ctx.get_hf_config(PretrainedConfig),
330-
input_height=dummy_height,
331-
input_width=dummy_width,
332+
input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
333+
input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
332334
)
333335

334336

335337
def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
336-
# Result in the max possible feature size (h:w = 16:1)
337-
dummy_height, dummy_width = 8000, 50
338-
image_feature_size = get_phi3v_image_feature_size(
339-
ctx.get_hf_config(PretrainedConfig),
340-
input_height=dummy_height,
341-
input_width=dummy_width,
342-
)
338+
339+
image_feature_size = get_max_phi3v_image_tokens(ctx)
343340

344341
seq_data = dummy_seq_data_for_clip(
345342
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
@@ -349,8 +346,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
349346
)
350347
mm_data = dummy_image_for_clip(
351348
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
352-
image_width_override=dummy_width,
353-
image_height_override=dummy_height,
349+
image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
350+
image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
354351
)
355352

356353
return seq_data, mm_data

0 commit comments

Comments
 (0)