@@ -1662,8 +1662,10 @@ def extract_text(genparams):
         return ""
 
 def extract_text_from_pdf(docData):
+    import traceback
     global args
 
+    decoded_bytes = None
     try:
         # Add padding if necessary
         padding = len(docData) % 4
@@ -1672,10 +1674,22 @@ def extract_text_from_pdf(docData):
 
         # Decode the Base64 string
         decoded_bytes = base64.b64decode(docData)
+    except Exception as e:
+        print(f"Error decoding text from PDF: {str(e)}")
+        print(traceback.format_exc())
+        return ""
 
+    try:
         return getTextFromPDFEncapsulatedPyMuPdf(decoded_bytes)
     except Exception as e:
-        print(f"Error extracting text: {str(e)}")
+        print(f"Error extracting text with PyMuPdf: {str(e)}")
+        print(traceback.format_exc())
+
+    try:
+        return getTextFromPDFEncapsulated(decoded_bytes)
+    except Exception as e:
+        print(f"Error extracting text with PdfPlumber: {str(e)}")
+        print(traceback.format_exc())
     return ""
 
 # PDF extraction code by sevenof9
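
For context on the two hunks above: the decode step restores stripped base64 padding before decoding, and extraction now falls back from PyMuPDF to pdfplumber instead of giving up on the first failure. A minimal standalone sketch of the padding logic (the sample string is illustrative, not from the source):

```python
# Minimal sketch of the padding fix above: base64 input length must be a
# multiple of 4, so stripped "=" characters are restored before decoding.
# The sample string is illustrative, not from the source.
import base64

def pad_and_decode(data: str) -> bytes:
    padding = len(data) % 4
    if padding:
        data += "=" * (4 - padding)
    return base64.b64decode(data)

print(pad_and_decode("aGVsbG8"))  # b'hello', despite the missing "=" padding
```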
@@ -1779,28 +1793,50 @@ def clamp_bbox(bbox, page_width, page_height):
 
     for idx, table in enumerate(table_json_outputs, start=1):
         page_output += f'"table {idx}":\n{table}\n'
-
-    print(f"Finished processing PDF page {page_number}")
     return page_number, page_output
 
+def process_pages(pagesArgs):
+    pageOutputs = []
+    for i in range(0, len(pagesArgs)):
+        pageOutputs.append(process_page(pagesArgs[i]))
+    return pageOutputs
+
 def run_serial(pages):
-    # Serial execution
-    return [process_page(args) for args in pages]
+    from tqdm.auto import tqdm
+    results = []
+    for i in tqdm(range(len(pages)), desc="Processing pages"):
+        results.append(process_page(pages[i]))
+    return results
+    # return [process_page(args) for args in pages]
 
 def run_parallel(pages):
     from multiprocessing import cpu_count
+    from tqdm.auto import tqdm
 
     # Parallel execution based on either the number of pages or number of CPU cores
     num_cores = min(cpu_count(), len(pages))
     print(f"Started processing PDF document with {len(pages)} pages using {num_cores} cores...")
+
+    total_pages = len(pages)
+    num_cores = min(cpu_count(), os.cpu_count() or 1)
+    chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
+    chunks = []
+    for i in range(0, total_pages, chunk_size):
+        end = min(i + chunk_size, total_pages)
+        chunk = []
+        for pageNo in range(i, end):
+            chunk.append(pages[pageNo])
+        chunks.append(chunk)
+    results = []
     with ThreadPoolExecutor(max_workers=num_cores) as exe:
-        return exe.map(process_page, pages)
-        # exe.submit(cube,2)
-
-    # Maps the method 'cube' with a list of values.
-
-    # with Pool(num_cores) as pool:
-    #     return pool.map(process_page, pages)
+        with tqdm(total=total_pages, desc="Processing pages") as pbar:
+            for chunk_results in exe.map(process_pages, chunks):
+                results.extend(chunk_results)
+                pbar.update(len(chunk_results))
+    return results
+
+    # with ThreadPoolExecutor(max_workers=num_cores) as exe:
+    #     return exe.map(process_page, pages)
 
     decoded_bytes = io.BytesIO(decoded_bytes)
     with pdfplumber.open(decoded_bytes) as pdf:
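
The new `run_parallel` groups pages into chunks so each worker call amortizes per-task overhead and the progress bar advances once per completed chunk. A hedged standalone sketch of that pattern, with a stand-in `process_page`:

```python
# Hedged sketch of the chunked thread-pool pattern introduced above.
# process_page is a stand-in for the real per-page extraction.
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from tqdm.auto import tqdm

def process_page(page):
    return page * 2  # stand-in work

def run_parallel(pages, chunk_factor=8, max_chunk=20):
    num_cores = min(cpu_count(), max(1, len(pages)))
    chunk_size = max(1, min(len(pages) // (num_cores * chunk_factor), max_chunk))
    chunks = [pages[i:i + chunk_size] for i in range(0, len(pages), chunk_size)]
    results = []
    with ThreadPoolExecutor(max_workers=num_cores) as exe:
        with tqdm(total=len(pages), desc="Processing pages") as pbar:
            # each mapped call handles a whole chunk, like process_pages above
            for chunk_results in exe.map(lambda c: [process_page(p) for p in c], chunks):
                results.extend(chunk_results)
                pbar.update(len(chunk_results))
    return results

print(run_parallel(list(range(50)))[:5])  # [0, 2, 4, 6, 8]
```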
@@ -1814,7 +1850,7 @@ def run_parallel(pages):
     }
 
     # Number of pages before multithreading should be used
-    PARALLEL_THRESHOLD = 8
+    PARALLEL_THRESHOLD = 14
 
     pages = [(i, pdf, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]
 
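
Raising `PARALLEL_THRESHOLD` from 8 to 14 keeps short documents on the serial path, where thread-pool startup costs would outweigh any gain. A sketch of the dispatch this value guards (the stand-in callables are hypothetical):

```python
# Sketch of the dispatch guarded by PARALLEL_THRESHOLD (value from this
# commit); run_serial/run_parallel stand in for the functions above.
PARALLEL_THRESHOLD = 14

def extract_all(pages, run_serial, run_parallel):
    if len(pages) > PARALLEL_THRESHOLD:
        return run_parallel(pages)
    return run_serial(pages)

print(extract_all(list(range(10)), lambda p: "serial", lambda p: "parallel"))  # serial
print(extract_all(list(range(40)), lambda p: "serial", lambda p: "parallel"))  # parallel
```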
@@ -1875,10 +1911,19 @@ def sanitize_spans(line):
                 s0["size"],
             ) != (s1["flags"], s1["char_flags"], s1["size"]):
                 continue
-            if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
+
+            dashHandler = False
+            try:
+                if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
+                    dashHandler = True
+            except Exception:
+                print("Failed to check span text for dash handling")
+
+            if dashHandler:
                 s0["text"] = s0["text"][:-1] + s1["text"]
             else:
                 s0["text"] += s1["text"]
+
             s0["bbox"] |= s1["bbox"]
             del line[i]
             line[i - 1] = s0
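
The hunk above wraps the hyphen check defensively: a span ending in `-` followed by a span starting with a letter is treated as line-break hyphenation and joined without the dash; anything else is concatenated as-is. A minimal sketch of that merge rule, with plain strings standing in for span dicts:

```python
# Minimal sketch of the hyphen-merge rule: a trailing "-" followed by a
# letter is treated as line-break hyphenation; anything else concatenates.
def merge_span_text(s0: str, s1: str) -> str:
    if s0.endswith("-") and s1 and s1[0].isalpha():
        return s0[:-1] + s1  # drop the hyphen and join the word halves
    return s0 + s1

print(merge_span_text("exam-", "ple"))   # example
print(merge_span_text("pages 1-", "2"))  # pages 1-2 (digit, hyphen kept)
```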
@@ -1899,8 +1944,11 @@ def sanitize_spans(line):
         sbbox = fitz.Rect(s["bbox"])
         if is_white(s["text"]):
             continue
-        if s["alpha"] == 0 and ignore_invisible:
-            continue
+        try:
+            if s["alpha"] == 0 and ignore_invisible:
+                continue
+        except Exception:
+            print("Failed to check opacity for text on page")
         if abs(sbbox & clip) < abs(sbbox) * 0.8:
             continue
         if s["flags"] & 1 == 1:
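
The try/except here suggests some span dicts can lack the `alpha` key (an assumption based on this commit); the filter then keeps the text rather than crashing. A standalone sketch of that defensive check:

```python
# Standalone sketch of the defensive opacity filter: a missing "alpha"
# key (assumption: some spans lack it) keeps the text instead of crashing.
def is_invisible(span: dict, ignore_invisible: bool = True) -> bool:
    try:
        return ignore_invisible and span["alpha"] == 0
    except (KeyError, TypeError):
        return False  # cannot tell, so keep the span

print(is_invisible({"alpha": 0, "text": "hidden"}))  # True
print(is_invisible({"text": "no alpha key"}))        # False
```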
@@ -1992,7 +2040,7 @@ def extract_page(doc, pageTables, page_number):
     # tables_block = page.get_text("dict")["blocks"]
     # table_rects = []
 
-    tables = page.find_tables()  # pageTables[page_number]
+    tables = {}  # page.find_tables()  # pageTables[page_number]
     table_rects = []
     if tables and tables.tables:
         for table in tables.tables:
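
Replacing `page.find_tables()` with `{}` disables PyMuPDF table detection while keeping the guard below intact: an empty dict is falsy, so `tables.tables` is never evaluated. A tiny demonstration of the short-circuit:

```python
# Why the {} stub works: an empty dict is falsy, so the guard used in
# extract_page short-circuits before touching the .tables attribute.
tables = {}
if tables and tables.tables:  # .tables never evaluated for a falsy {}
    print("processing tables")
else:
    print("table extraction skipped")  # this branch runs
```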
@@ -2053,9 +2101,9 @@ def process_pages(args):
     # for i in tqdm(range(total_pages), desc="Extracting tables"):
     #     pageTables.append({})  # {}
 
-    if total_pages > parallel_threshold and True is False:
+    if total_pages > parallel_threshold:  # and True is False
         num_cores = min(multiprocessing.cpu_count(), os.cpu_count() or 1)
-        chunk_size = max(1, min(total_pages // (num_cores * 2), 20))
+        chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
         chunks = []
         for i in range(0, total_pages, chunk_size):
             end = min(i + chunk_size, total_pages)
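
The divisor change from `num_cores * 2` to `num_cores * 8` produces smaller chunks, which balance load better near the end of a document. Worked arithmetic for the two formulas:

```python
# Worked example of the chunk-size change (num_cores * 8 vs. num_cores * 2)
# for a 200-page document on 8 cores.
total_pages, num_cores = 200, 8
old_chunk = max(1, min(total_pages // (num_cores * 2), 20))  # 200 // 16 -> 12
new_chunk = max(1, min(total_pages // (num_cores * 8), 20))  # 200 // 64 -> 3
print(old_chunk, new_chunk)  # 12 3
```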