Skip to content

Commit 60fb5f5

Browse files
author
prima
committed
fix: Further fixes attempted for PyMuPDF; fallback implemented as well
1 parent 6d1edb1 commit 60fb5f5

File tree

1 file changed

+67
-19
lines changed

1 file changed

+67
-19
lines changed

koboldcpp.py

Lines changed: 67 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,8 +1662,10 @@ def extract_text(genparams):
16621662
return ""
16631663

16641664
def extract_text_from_pdf(docData):
1665+
import traceback
16651666
global args
16661667

1668+
decoded_bytes = None
16671669
try:
16681670
# Add padding if necessary
16691671
padding = len(docData) % 4
@@ -1672,10 +1674,22 @@ def extract_text_from_pdf(docData):
16721674

16731675
# Decode the Base64 string
16741676
decoded_bytes = base64.b64decode(docData)
1677+
except Exception as e:
1678+
print(f"Error decoding text from PDF: {str(e)}")
1679+
print(traceback.format_exc())
1680+
return ""
16751681

1682+
try:
16761683
return getTextFromPDFEncapsulatedPyMuPdf(decoded_bytes)
16771684
except Exception as e:
1678-
print(f"Error extracting text: {str(e)}")
1685+
print(f"Error extracting text with PyMuPdf: {str(e)}")
1686+
print(traceback.format_exc())
1687+
1688+
try:
1689+
return getTextFromPDFEncapsulated(decoded_bytes)
1690+
except Exception as e:
1691+
print(f"Error extracting text with PdfPlumber: {str(e)}")
1692+
print(traceback.format_exc())
16791693
return ""
16801694

16811695
# PDF extraction code by sevenof9
@@ -1779,28 +1793,50 @@ def clamp_bbox(bbox, page_width, page_height):
17791793

17801794
for idx, table in enumerate(table_json_outputs, start=1):
17811795
page_output += f'"table {idx}":\n{table}\n'
1782-
1783-
print(f"Finished processing PDF page {page_number}")
17841796
return page_number, page_output
17851797

1798+
def process_pages(pagesArgs):
    """Process a batch of page-argument tuples serially.

    Helper for the chunked parallel path: each worker receives one chunk
    of page args and returns the corresponding list of process_page
    results, preserving the order of the input chunk.
    """
    # Idiomatic comprehension instead of an index loop over range(len(...)).
    return [process_page(pageArgs) for pageArgs in pagesArgs]
17861804
def run_serial(pages):
    """Process every page serially, with a tqdm progress bar.

    Used when the document is small enough that thread-pool overhead
    outweighs any benefit from parallelism. Returns the per-page outputs
    in document order.
    """
    from tqdm.auto import tqdm

    # Iterate the pages directly instead of indexing via range(len(pages));
    # the dead commented-out variant from the previous revision is removed.
    return [process_page(page) for page in tqdm(pages, desc="Processing pages")]
17891811

17901812
def run_parallel(pages):
    """Extract text from PDF pages in parallel using a thread pool.

    Pages are grouped into chunks so each worker task handles several
    pages, reducing scheduling overhead. Chunk results are re-assembled
    in submission order, so the returned list stays in page order.
    """
    from multiprocessing import cpu_count
    from tqdm.auto import tqdm

    total_pages = len(pages)
    if total_pages == 0:
        # Guard: ThreadPoolExecutor raises on max_workers=0.
        return []

    # Single, consistent worker count: capped by both the CPU count and
    # the number of pages. (Previously this was computed twice with
    # different formulas, so the printed value did not match the value
    # actually used.)
    num_cores = min(cpu_count(), total_pages)
    print(f"Started processing PDF document with {total_pages} pages using {num_cores} cores...")

    # Each task carries up to 20 pages; aim for ~8 chunks per core so the
    # progress bar updates smoothly and load stays balanced.
    chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
    chunks = [pages[i:i + chunk_size] for i in range(0, total_pages, chunk_size)]

    results = []
    with ThreadPoolExecutor(max_workers=num_cores) as exe:
        with tqdm(total=total_pages, desc="Processing pages") as pbar:
            # exe.map yields chunk results in submission order.
            for chunk_results in exe.map(process_pages, chunks):
                results.extend(chunk_results)
                pbar.update(len(chunk_results))
    return results
18041840

18051841
decoded_bytes = io.BytesIO(decoded_bytes)
18061842
with pdfplumber.open(decoded_bytes) as pdf:
@@ -1814,7 +1850,7 @@ def run_parallel(pages):
18141850
}
18151851

18161852
# Number of pages before multithreading should be used
1817-
PARALLEL_THRESHOLD = 8
1853+
PARALLEL_THRESHOLD = 14
18181854

18191855
pages = [(i, pdf, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]
18201856

@@ -1875,10 +1911,19 @@ def sanitize_spans(line):
18751911
s0["size"],
18761912
) != (s1["flags"], s1["char_flags"], s1["size"]):
18771913
continue
1878-
if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
1914+
1915+
dashHandler = False
1916+
try:
1917+
if s0["text"].endswith("-") and s1["text"] and s1["text"][0].isalpha():
1918+
dashHandler = True
1919+
except Exception:
1920+
print(f"Failed to check opacity for dash handler on page")
1921+
1922+
if dashHandler:
18791923
s0["text"] = s0["text"][:-1] + s1["text"]
18801924
else:
18811925
s0["text"] += s1["text"]
1926+
18821927
s0["bbox"] |= s1["bbox"]
18831928
del line[i]
18841929
line[i - 1] = s0
@@ -1899,8 +1944,11 @@ def sanitize_spans(line):
18991944
sbbox = fitz.Rect(s["bbox"])
19001945
if is_white(s["text"]):
19011946
continue
1902-
if s["alpha"] == 0 and ignore_invisible:
1903-
continue
1947+
try:
1948+
if s["alpha"] == 0 and ignore_invisible:
1949+
continue
1950+
except Exception:
1951+
print(f"Failed to check opacity for text on page")
19041952
if abs(sbbox & clip) < abs(sbbox) * 0.8:
19051953
continue
19061954
if s["flags"] & 1 == 1:
@@ -1992,7 +2040,7 @@ def extract_page(doc, pageTables, page_number):
19922040
# tables_block = page.get_text("dict")["blocks"]
19932041
# table_rects = []
19942042

1995-
tables = page.find_tables() # pageTables[page_number]
2043+
tables = {} # page.find_tables() # pageTables[page_number]
19962044
table_rects = []
19972045
if tables and tables.tables:
19982046
for table in tables.tables:
@@ -2053,9 +2101,9 @@ def process_pages(args):
20532101
# for i in tqdm(range(total_pages), desc="Extracting tables"):
20542102
# pageTables.append({}) # {}
20552103

2056-
if total_pages > parallel_threshold and True is False:
2104+
if total_pages > parallel_threshold: # and True is False
20572105
num_cores = min(multiprocessing.cpu_count(), os.cpu_count() or 1)
2058-
chunk_size = max(1, min(total_pages // (num_cores * 2), 20))
2106+
chunk_size = max(1, min(total_pages // (num_cores * 8), 20))
20592107
chunks = []
20602108
for i in range(0, total_pages, chunk_size):
20612109
end = min(i + chunk_size, total_pages)

0 commit comments

Comments
 (0)