@@ -667,7 +667,7 @@ def smart_resize(
667667 return h_bar , w_bar
668668
669669
670- def fetch_image (ele : dict [str , str | Image .Image ], size_factor : int = IMAGE_FACTOR ) -> Image .Image :
670+ def fetch_image (ele : dict [str , Union [ str , Image .Image ] ], size_factor : int = IMAGE_FACTOR ) -> Image .Image :
671671 if "image" in ele :
672672 image = ele ["image" ]
673673 else :
@@ -715,7 +715,7 @@ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACT
715715def smart_nframes (
716716 ele : dict ,
717717 total_frames : int ,
718- video_fps : int | float ,
718+ video_fps : Union [ int , float ] ,
719719) -> int :
720720 """calculate the number of frames for video used for model inputs.
721721
@@ -850,7 +850,7 @@ def gaussian_kernel_1d(size, sigma):
850850 kernel = np .exp (- x ** 2 / (2 * sigma ** 2 ))
851851 return kernel / kernel .sum ()
852852
853- def fetch_video (ele : dict , image_factor : int = IMAGE_FACTOR ) -> paddle .Tensor | list [Image .Image ]:
853+ def fetch_video (ele : dict , image_factor : int = IMAGE_FACTOR ) -> Union [ paddle .Tensor , list [Image .Image ] ]:
854854 if isinstance (ele ["video" ], str ):
855855 video_reader_backend = get_video_reader_backend ()
856856
@@ -902,7 +902,7 @@ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR) -> paddle.Tensor |
902902 return images
903903
904904
905- def extract_vision_info (conversations : list [dict ] | list [list [dict ]]) -> list [dict ]:
905+ def extract_vision_info (conversations : Union [ list [dict ], list [list [dict ] ]]) -> list [dict ]:
906906 vision_infos = []
907907 if isinstance (conversations [0 ], dict ):
908908 conversations = [conversations ]
@@ -921,8 +921,8 @@ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[di
921921
922922
923923def process_vision_info (
924- conversations : list [dict ] | list [list [dict ]],
925- ) -> tuple [list [Image .Image ] | None , list [paddle .Tensor | list [Image .Image ]] | None ]:
924+ conversations : Union [ list [dict ], list [list [dict ] ]],
925+ ) -> tuple [Union [ list [Image .Image ], None ], Union [ list [Union [ paddle .Tensor , list [Image .Image ]]], None ] ]:
926926 vision_infos = extract_vision_info (conversations )
927927 image_inputs = []
928928 video_inputs = []
0 commit comments