15
15
# limitations under the License.
16
16
import re
17
17
from functools import lru_cache
18
- from typing import (Iterable , List , Literal , Mapping , Optional , Tuple ,
19
- TypedDict , Union )
18
+ from typing import (Any , Dict , Iterable , List , Literal , Mapping , Optional ,
19
+ Tuple , TypedDict , Union )
20
20
21
21
import numpy as np
22
22
import torch
@@ -324,12 +324,12 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
324
324
325
325
# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181
326
326
def get_phi3v_image_feature_size (
327
- hf_config : PretrainedConfig ,
327
+ hf_config : Dict [ str , Any ] ,
328
328
* ,
329
329
input_height : int ,
330
330
input_width : int ,
331
331
) -> int :
332
- num_crops = getattr ( hf_config , "num_crops" , 16 )
332
+ num_crops = hf_config . get ( "num_crops" , 16 )
333
333
new_width , new_height = _calc_hd_transform_size (width = input_width ,
334
334
height = input_height ,
335
335
hd_num = num_crops )
@@ -341,7 +341,7 @@ def get_phi3v_image_feature_size(
341
341
def get_max_phi3v_image_tokens (ctx : InputContext ):
342
342
343
343
return get_phi3v_image_feature_size (
344
- ctx .get_hf_config (),
344
+ ctx .get_hf_image_processor_config (),
345
345
input_height = MAX_IMAGE_FEATURE_SIZE_HEIGHT ,
346
346
input_width = MAX_IMAGE_FEATURE_SIZE_WIDTH ,
347
347
)
@@ -395,7 +395,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
395
395
return llm_inputs
396
396
397
397
model_config = ctx .model_config
398
- hf_config = ctx .get_hf_config ()
398
+ hf_config = ctx .get_hf_image_processor_config ()
399
399
400
400
image_data = multi_modal_data ["image" ]
401
401
if isinstance (image_data , Image .Image ):
0 commit comments