@@ -1887,7 +1887,6 @@ def getJsonFromPDFEncapsulatedPyMuPdf(decoded_bytes):
18871887 # Global PDF variables
18881888 CLEAN_PATTERN = re .compile (r"[^\u0000-\uFFFF]" , re .DOTALL )
18891889 WHITE = set (string .whitespace )
1890- parallel_threshold = 14
18911890
18921891 # Functions for text extraction
18931892 def clean_text (text ):
@@ -2107,11 +2106,14 @@ def process_pages(args):
21072106 print (f"Start processing PDF with { total_pages } " )
21082107 results = []
21092108 pageTables = []
2109+
21102110 # for i in tqdm(range(total_pages), desc="Extracting tables"):
21112111 # pageTables.append({}) # {}
21122112
2113- if total_pages > parallel_threshold : # and True is False
2114- num_cores = min (multiprocessing .cpu_count (), os .cpu_count () or 1 )
2113+ num_cores = os .cpu_count ()
2114+ if (num_cores is None ):
2115+ num_cores = 1
2116+ if total_pages > num_cores : # and True is False
21152117 chunk_size = max (1 , min (total_pages // (num_cores * 8 ), 20 ))
21162118 chunks = []
21172119 for i in range (0 , total_pages , chunk_size ):
@@ -2138,9 +2140,6 @@ def getTextFromPDFJsonEncapsulatedPyMuPdf(pages):
21382140 import json
21392141 import textwrap
21402142
2141- # Globals
2142- PAGE_BREAK = "[[PAGE_BREAK]]"
2143-
21442143 def col_widths (rows ):
21452144 w = []
21462145 for row in rows :
@@ -2183,7 +2182,7 @@ def format_table(rows):
21832182 lines = []
21842183 for page_key in sorted (pages , key = lambda k : int (k .split ()[1 ])):
21852184 page_no = page_key .split ()[1 ]
2186- lines .append (f"{ PAGE_BREAK } \n -------- Page { page_no } -------- \n " )
2185+ lines .append (f"\n [PAGE BREAK][ { page_no } ] \n " )
21872186 for blk in pages [page_key ]:
21882187 if blk .get ("type" ) == "paragraph" :
21892188 para = blk ["text" ].rstrip ()
0 commit comments