feat: Derive PDF parallel-processing threshold from CPU count and simplify the page-break marker

prima · prima · commit 8d56ea24f9eb · 2025-05-24T19:48:14.000+01:00
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -1887,7 +1887,6 @@ def getJsonFromPDFEncapsulatedPyMuPdf(decoded_bytes):
     # Global PDF variables
     CLEAN_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
     WHITE = set(string.whitespace)
-    parallel_threshold = 14
 
     # Functions for text extraction
     def clean_text(text):
@@ -2107,11 +2106,14 @@ def process_pages(args):
         print(f"Start processing PDF with {total_pages}")
         results = []
         pageTables = []
+
         # for i in tqdm(range(total_pages), desc="Extracting tables"):
         #     pageTables.append({}) # {} 
     
-        if total_pages > parallel_threshold: # and True is False
-            num_cores = min(multiprocessing.cpu_count(), os.cpu_count() or 1)
+        num_cores = os.cpu_count()
+        if (num_cores is None):
+            num_cores = 1
+        if total_pages > num_cores: # and True is False
             chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
             chunks = []
             for i in range(0, total_pages, chunk_size):
@@ -2138,9 +2140,6 @@ def getTextFromPDFJsonEncapsulatedPyMuPdf(pages):
     import json
     import textwrap
 
-    # Globals
-    PAGE_BREAK = "[[PAGE_BREAK]]"
-
     def col_widths(rows):
         w = []
         for row in rows:
@@ -2183,7 +2182,7 @@ def format_table(rows):
         lines = []
         for page_key in sorted(pages, key=lambda k: int(k.split()[1])):
             page_no = page_key.split()[1]
-            lines.append(f"{PAGE_BREAK}\n-------- Page {page_no} --------\n")
+            lines.append(f"\n[PAGE BREAK][{page_no}]\n")
             for blk in pages[page_key]:
                 if blk.get("type") == "paragraph":
                     para = blk["text"].rstrip()