# Cannot find the following 2 numbers from hf config.
_IMAGE_TOKEN_ID = 32044

# Dummy image dimensions (in pixels, presumably -- TODO confirm) that
# result in the max possible feature size (h:w = 16:1).
# NOTE(review): the raw values are 160:1; the 16:1 ratio presumably refers to
# the layout after the image processor's resize -- confirm against
# get_phi3v_image_feature_size.
MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
MAX_IMAGE_FEATURE_SIZE_WIDTH = 50

56
60
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig (dropout = 0.0 ,
57
61
hidden_act = "quick_gelu" ,
58
62
hidden_size = 1024 ,
@@ -322,24 +326,17 @@ def get_phi3v_image_feature_size(
322
326
323
327
324
328
def get_max_phi3v_image_tokens(ctx: InputContext):
    """Return the image feature size for the maximum-size dummy image.

    Looks up the HF config from ``ctx`` and evaluates
    ``get_phi3v_image_feature_size`` at the module-level
    ``MAX_IMAGE_FEATURE_SIZE_HEIGHT`` x ``MAX_IMAGE_FEATURE_SIZE_WIDTH``
    dimensions, so the limit is defined in exactly one place.

    Args:
        ctx: Input context used to fetch the model's HF config.

    Returns:
        Whatever ``get_phi3v_image_feature_size`` returns for those
        dimensions (presumably an ``int`` token count -- TODO confirm).
    """
    return get_phi3v_image_feature_size(
        ctx.get_hf_config(PretrainedConfig),
        input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
        input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
    )
333
335
334
336
335
337
def dummy_data_for_phi3v (ctx : InputContext , seq_len : int ):
336
- # Result in the max possible feature size (h:w = 16:1)
337
- dummy_height , dummy_width = 8000 , 50
338
- image_feature_size = get_phi3v_image_feature_size (
339
- ctx .get_hf_config (PretrainedConfig ),
340
- input_height = dummy_height ,
341
- input_width = dummy_width ,
342
- )
338
+
339
+ image_feature_size = get_max_phi3v_image_tokens (ctx )
343
340
344
341
seq_data = dummy_seq_data_for_clip (
345
342
CLIP_VIT_LARGE_PATCH14_336_CONFIG ,
@@ -349,8 +346,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
349
346
)
350
347
mm_data = dummy_image_for_clip (
351
348
CLIP_VIT_LARGE_PATCH14_336_CONFIG ,
352
- image_width_override = dummy_width ,
353
- image_height_override = dummy_height ,
349
+ image_width_override = MAX_IMAGE_FEATURE_SIZE_WIDTH ,
350
+ image_height_override = MAX_IMAGE_FEATURE_SIZE_HEIGHT ,
354
351
)
355
352
356
353
return seq_data , mm_data
0 commit comments