Skip to content

Commit 8d56ea2

Browse files
author
prima
committed
feat: Tweaks to parallel threshold and page break for PDF
1 parent 4da48e4 commit 8d56ea2

File tree

1 file changed

+6
-7
lines changed

1 file changed

+6
-7
lines changed

koboldcpp.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1887,7 +1887,6 @@ def getJsonFromPDFEncapsulatedPyMuPdf(decoded_bytes):
18871887
# Global PDF variables
18881888
CLEAN_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
18891889
WHITE = set(string.whitespace)
1890-
parallel_threshold = 14
18911890

18921891
# Functions for text extraction
18931892
def clean_text(text):
@@ -2107,11 +2106,14 @@ def process_pages(args):
21072106
print(f"Start processing PDF with {total_pages}")
21082107
results = []
21092108
pageTables = []
2109+
21102110
# for i in tqdm(range(total_pages), desc="Extracting tables"):
21112111
# pageTables.append({}) # {}
21122112

2113-
if total_pages > parallel_threshold: # and True is False
2114-
num_cores = min(multiprocessing.cpu_count(), os.cpu_count() or 1)
2113+
num_cores = os.cpu_count()
2114+
if (num_cores is None):
2115+
num_cores = 1
2116+
if total_pages > num_cores: # and True is False
21152117
chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
21162118
chunks = []
21172119
for i in range(0, total_pages, chunk_size):
@@ -2138,9 +2140,6 @@ def getTextFromPDFJsonEncapsulatedPyMuPdf(pages):
21382140
import json
21392141
import textwrap
21402142

2141-
# Globals
2142-
PAGE_BREAK = "[[PAGE_BREAK]]"
2143-
21442143
def col_widths(rows):
21452144
w = []
21462145
for row in rows:
@@ -2183,7 +2182,7 @@ def format_table(rows):
21832182
lines = []
21842183
for page_key in sorted(pages, key=lambda k: int(k.split()[1])):
21852184
page_no = page_key.split()[1]
2186-
lines.append(f"{PAGE_BREAK}\n-------- Page {page_no} --------\n")
2185+
lines.append(f"\n[PAGE BREAK][{page_no}]\n")
21872186
for blk in pages[page_key]:
21882187
if blk.get("type") == "paragraph":
21892188
para = blk["text"].rstrip()

0 commit comments

Comments
 (0)