@@ -31,15 +31,15 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
3131 start = time .time ()
3232 filename = sample [self .filename_key ]
3333 filename_without_ext = os .path .splitext (filename )[0 ]
34- if not filename .lower ().endswith (".png" , ".jpeg" , ".jpg" , ".webp" , ".gif" , ".pdf" ):
34+ if not filename .lower ().endswith (( ".png" , ".jpeg" , ".jpg" , ".webp" , ".gif" , ".pdf" ) ):
3535 return sample
3636 try :
3737 filepath = sample [self .filepath_key ]
3838 parse_dir = os .path .join (self .output_dir , filename_without_ext , "vlm" )
3939 pdf_bytes = read_fn (filepath )
4040 total_page = len (PdfReader (filepath ).pages )
4141 content = ""
42- for page in range (total_page , 0 , 10 ):
42+ for page in range (0 , total_page , 10 ):
4343 do_parse (
4444 output_dir = self .output_dir ,
4545 pdf_file_names = [filename_without_ext ],
@@ -48,7 +48,7 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
4848 backend = self .backend ,
4949 server_url = self .server_url ,
5050 start_page_id = page ,
51- end_page_id = min (page + 10 , total_page - 1 ),
51+ end_page_id = min (page + 9 , total_page - 1 ),
5252 )
5353 if os .path .exists (parse_dir ):
5454 content += get_infer_result (".md" , filename_without_ext , parse_dir )
0 commit comments