@@ -1662,8 +1662,10 @@ def extract_text(genparams):
         return ""
 
 def extract_text_from_pdf(docData):
+    import traceback
     global args
 
+    decoded_bytes = None
     try:
         # Add padding if necessary
         padding = len(docData) % 4
@@ -1672,10 +1674,22 @@ def extract_text_from_pdf(docData):
 
         # Decode the Base64 string
         decoded_bytes = base64.b64decode(docData)
+    except Exception as e:
+        print(f"Error decoding text from PDF: {str(e)}")
+        print(traceback.format_exc())
+        return ""
 
+    try:
         return getTextFromPDFEncapsulatedPyMuPdf(decoded_bytes)
     except Exception as e:
-        print(f"Error extracting text: {str(e)}")
+        print(f"Error extracting text with PyMuPdf: {str(e)}")
+        print(traceback.format_exc())
+
+    try:
+        return getTextFromPDFEncapsulated(decoded_bytes)
+    except Exception as e:
+        print(f"Error extracting text with PdfPlumber: {str(e)}")
+        print(traceback.format_exc())
     return ""
 
 # PDF extraction code by sevenof9
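
For context on the two hunks above: the decode step restores stripped base64 padding before decoding, and extraction now falls back from PyMuPDF to pdfplumber instead of giving up on the first failure. A minimal standalone sketch of the padding logic (the sample string is illustrative, not from the source):

```python
# Minimal sketch of the padding fix above: base64 input length must be a
# multiple of 4, so stripped "=" characters are restored before decoding.
# The sample string is illustrative, not from the source.
import base64

def pad_and_decode(data: str) -> bytes:
    padding = len(data) % 4
    if padding:
        data += "=" * (4 - padding)
    return base64.b64decode(data)

print(pad_and_decode("aGVsbG8"))  # b'hello', despite the missing "=" padding
```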
@@ -1779,28 +1793,50 @@ def clamp_bbox(bbox, page_width, page_height):
 
     for idx, table in enumerate(table_json_outputs, start=1):
         page_output += f'"table {idx}":\n{table}\n'
-
-    print(f"Finished processing PDF page {page_number}")
     return page_number, page_output
 
+def process_pages(pagesArgs):
+    pageOutputs = []
+    for i in range(0, len(pagesArgs)):
+        pageOutputs.append(process_page(pagesArgs[i]))
+    return pageOutputs
+
 def run_serial(pages):
-    # Serial execution
-    return [process_page(args) for args in pages]
+    from tqdm.auto import tqdm
+    results = []
+    for i in tqdm(range(len(pages)), desc="Processing pages"):
+        results.append(process_page(pages[i]))
+    return results
+    # return [process_page(args) for args in pages]
 
 def run_parallel(pages):
     from multiprocessing import cpu_count
+    from tqdm.auto import tqdm
 
     # Parallel execution based on either the number of pages or number of CPU cores
     num_cores = min(cpu_count(), len(pages))
     print(f"Started processing PDF document with {len(pages)} pages using {num_cores} cores...")
+
+    total_pages = len(pages)
+    num_cores = min(cpu_count(), os.cpu_count() or 1)
+    chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
+    chunks = []
+    for i in range(0, total_pages, chunk_size):
+        end = min(i + chunk_size, total_pages)
+        chunk = []
+        for pageNo in range(i, end):
+            chunk.append(pages[pageNo])
+        chunks.append(chunk)
+    results = []
     with ThreadPoolExecutor(max_workers=num_cores) as exe:
-        return exe.map(process_page, pages)
-        # exe.submit(cube,2)
-
-    # Maps the method 'cube' with a list of values.
-
-    # with Pool(num_cores) as pool:
-    #     return pool.map(process_page, pages)
+        with tqdm(total=total_pages, desc="Processing pages") as pbar:
+            for chunk_results in exe.map(process_pages, chunks):
+                results.extend(chunk_results)
+                pbar.update(len(chunk_results))
+    return results
+
+    # with ThreadPoolExecutor(max_workers=num_cores) as exe:
+    #     return exe.map(process_page, pages)
 
     decoded_bytes = io.BytesIO(decoded_bytes)
     with pdfplumber.open(decoded_bytes) as pdf:
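
The new `run_parallel` groups pages into chunks so each worker call amortizes per-task overhead and the progress bar advances once per completed chunk. A hedged standalone sketch of that pattern, with a stand-in `process_page`:

```python
# Hedged sketch of the chunked thread-pool pattern introduced above.
# process_page is a stand-in for the real per-page extraction.
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from tqdm.auto import tqdm

def process_page(page):
    return page * 2  # stand-in work

def run_parallel(pages, chunk_factor=8, max_chunk=20):
    num_cores = min(cpu_count(), max(1, len(pages)))
    chunk_size = max(1, min(len(pages) // (num_cores * chunk_factor), max_chunk))
    chunks = [pages[i:i + chunk_size] for i in range(0, len(pages), chunk_size)]
    results = []
    with ThreadPoolExecutor(max_workers=num_cores) as exe:
        with tqdm(total=len(pages), desc="Processing pages") as pbar:
            # each mapped call handles a whole chunk, like process_pages above
            for chunk_results in exe.map(lambda c: [process_page(p) for p in c], chunks):
                results.extend(chunk_results)
                pbar.update(len(chunk_results))
    return results

print(run_parallel(list(range(50)))[:5])  # [0, 2, 4, 6, 8]
```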
@@ -1814,7 +1850,7 @@ def run_parallel(pages):
     }
 
     # Number of pages before multithreading should be used
-    PARALLEL_THRESHOLD = 8
+    PARALLEL_THRESHOLD = 14
 
     pages = [(i, pdf, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]
 
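
Raising `PARALLEL_THRESHOLD` from 8 to 14 keeps short documents on the serial path, where thread-pool startup costs would outweigh any gain. A sketch of the dispatch this value guards (the stand-in callables are hypothetical):

```python
# Sketch of the dispatch guarded by PARALLEL_THRESHOLD (value from this
# commit); run_serial/run_parallel stand in for the functions above.
PARALLEL_THRESHOLD = 14

def extract_all(pages, run_serial, run_parallel):
    if len(pages) > PARALLEL_THRESHOLD:
        return run_parallel(pages)
    return run_serial(pages)

print(extract_all(list(range(10)), lambda p: "serial", lambda p: "parallel"))  # serial
print(extract_all(list(range(40)), lambda p: "serial", lambda p: "parallel"))  # parallel
```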
@@ -1875,10 +1911,19 @@ def sanitize_spans(line):
                 s0["size"],
             ) != (s1["flags"], s1["char_flags"], s1["size"]):
                 continue
-            if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
+
+            dashHandler = False
+            try:
+                if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
+                    dashHandler = True
+            except Exception:
+                print("Failed to check span text for dash handling")
+
+            if dashHandler:
                 s0["text"] = s0["text"][:-1] + s1["text"]
             else:
                 s0["text"] += s1["text"]
+
             s0["bbox"] |= s1["bbox"]
             del line[i]
             line[i - 1] = s0
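
The hunk above wraps the hyphen check defensively: a span ending in `-` followed by a span starting with a letter is treated as line-break hyphenation and joined without the dash; anything else is concatenated as-is. A minimal sketch of that merge rule, with plain strings standing in for span dicts:

```python
# Minimal sketch of the hyphen-merge rule: a trailing "-" followed by a
# letter is treated as line-break hyphenation; anything else concatenates.
def merge_span_text(s0: str, s1: str) -> str:
    if s0.endswith("-") and s1 and s1[0].isalpha():
        return s0[:-1] + s1  # drop the hyphen and join the word halves
    return s0 + s1

print(merge_span_text("exam-", "ple"))   # example
print(merge_span_text("pages 1-", "2"))  # pages 1-2 (digit, hyphen kept)
```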
@@ -1899,8 +1944,11 @@ def sanitize_spans(line):
         sbbox = fitz.Rect(s["bbox"])
         if is_white(s["text"]):
             continue
-        if s["alpha"] == 0 and ignore_invisible:
-            continue
+        try:
+            if s["alpha"] == 0 and ignore_invisible:
+                continue
+        except Exception:
+            print("Failed to check opacity for text on page")
         if abs(sbbox & clip) < abs(sbbox) * 0.8:
             continue
         if s["flags"] & 1 == 1:
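
The try/except here suggests some span dicts can lack the `alpha` key (an assumption based on this commit); the filter then keeps the text rather than crashing. A standalone sketch of that defensive check:

```python
# Standalone sketch of the defensive opacity filter: a missing "alpha"
# key (assumption: some spans lack it) keeps the text instead of crashing.
def is_invisible(span: dict, ignore_invisible: bool = True) -> bool:
    try:
        return ignore_invisible and span["alpha"] == 0
    except (KeyError, TypeError):
        return False  # cannot tell, so keep the span

print(is_invisible({"alpha": 0, "text": "hidden"}))  # True
print(is_invisible({"text": "no alpha key"}))        # False
```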
@@ -1992,7 +2040,7 @@ def extract_page(doc, pageTables, page_number):
     # tables_block = page.get_text("dict")["blocks"]
     # table_rects = []
 
-    tables = page.find_tables()  # pageTables[page_number]
+    tables = {}  # page.find_tables()  # pageTables[page_number]
     table_rects = []
     if tables and tables.tables:
         for table in tables.tables:
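
Replacing `page.find_tables()` with `{}` disables PyMuPDF table detection while keeping the guard below intact: an empty dict is falsy, so `tables.tables` is never evaluated. A tiny demonstration of the short-circuit:

```python
# Why the {} stub works: an empty dict is falsy, so the guard used in
# extract_page short-circuits before touching the .tables attribute.
tables = {}
if tables and tables.tables:  # .tables never evaluated for a falsy {}
    print("processing tables")
else:
    print("table extraction skipped")  # this branch runs
```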
@@ -2053,9 +2101,9 @@ def process_pages(args):
     # for i in tqdm(range(total_pages), desc="Extracting tables"):
     #     pageTables.append({})  # {}
 
-    if total_pages > parallel_threshold and True is False:
+    if total_pages > parallel_threshold:  # and True is False
         num_cores = min(multiprocessing.cpu_count(), os.cpu_count() or 1)
-        chunk_size = max(1, min(total_pages // (num_cores * 2), 20))
+        chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
         chunks = []
         for i in range(0, total_pages, chunk_size):
             end = min(i + chunk_size, total_pages)
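
The divisor change from `num_cores * 2` to `num_cores * 8` produces smaller chunks, which balance load better near the end of a document. Worked arithmetic for the two formulas:

```python
# Worked example of the chunk-size change (num_cores * 8 vs. num_cores * 2)
# for a 200-page document on 8 cores.
total_pages, num_cores = 200, 8
old_chunk = max(1, min(total_pages // (num_cores * 2), 20))  # 200 // 16 -> 12
new_chunk = max(1, min(total_pages // (num_cores * 8), 20))  # 200 // 64 -> 3
print(old_chunk, new_chunk)  # 12 3
```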