Commit 079cfef

Ditch pdfplumber
won't compile on Windows
1 parent 9e2821c commit 079cfef

3 files changed: +6 -181 lines changed

koboldcpp.py

Lines changed: 5 additions & 172 deletions
@@ -34,10 +34,6 @@
 from datetime import datetime, timezone
 from typing import Tuple

-# PDF extraction logic
-import pdfplumber
-import logging
-import io

 # constants
 sampler_order_max = 7
@@ -2344,9 +2340,11 @@ def extract_text(genparams):
         docData = genparams.get("docData", "")
         if docData.startswith("data:text"):
             docData = docData.split(",", 1)[1]
-        elif docData.startswith("data:application/pdf"):
-            docData = docData.split(",", 1)[1]
-            return extract_text_from_pdf(docData)
+
+        # elif docData.startswith("data:application/pdf"):
+        #     docData = docData.split(",", 1)[1]
+        #     return extract_text_from_pdf(docData)
+
         elif docData.startswith("data:audio"):
             genparams["audio_data"] = docData
             return whisper_generate(genparams)
@@ -2367,172 +2365,7 @@ def extract_text(genparams):
         print(f"Error extracting text: {str(e)}")
         return ""

-def extract_text_from_pdf(docData):
-    global args
-
-    try:
-        # Add padding if necessary
-        padding = len(docData) % 4
-        if padding != 0:
-            docData += '=' * (4 - padding)
-
-        # Decode the Base64 string
-        decoded_bytes = base64.b64decode(docData)
-
-        return getTextFromPDFEncapsulated(decoded_bytes)
-    except Exception as e:
-        print(f"Error extracting text: {str(e)}")
-        return ""
-
-# PDF extraction code by sevenof9
-def getTextFromPDFEncapsulated(decoded_bytes):
-    """
-    Processes a page based on the page number, content and text settings being passed in.
-    Returns the page number and the text content
-    """
-    def process_page(args):
-        import pdfplumber
-        import json
-        from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-
-        # Ensure logging is only at error level (as this could be running in multiple threads)
-        for logger_name in [
-            "pdfminer",
-            "pdfminer.pdfparser",
-            "pdfminer.pdfdocument",
-            "pdfminer.pdfpage",
-            "pdfminer.converter",
-            "pdfminer.layout",
-            "pdfminer.cmapdb",
-            "pdfminer.utils"
-        ]:
-            logging.getLogger(logger_name).setLevel(logging.ERROR)
-
-        def clean_cell_text(text):
-            if not isinstance(text, str):
-                return ""
-            text = text.replace("-\n", "").replace("\n", " ")
-            return " ".join(text.split())
-
-        def safe_join(row):
-            return [clean_cell_text(str(cell)) if cell is not None else "" for cell in row]
-
-        def clamp_bbox(bbox, page_width, page_height):
-            x0, top, x1, bottom = bbox
-            x0 = max(0, min(x0, page_width))
-            x1 = max(0, min(x1, page_width))
-            top = max(0, min(top, page_height))
-            bottom = max(0, min(bottom, page_height))
-            return (x0, top, x1, bottom)
-
-        page_number, pdf, text_settings = args

-        page = pdf.pages[page_number]
-        page_output = f"Page {page_number + 1}\n"
-        page_width = page.width
-        page_height = page.height
-
-        filtered_page = page
-        table_bbox_list = []
-        table_json_outputs = []
-
-        # Table extraction
-        for table in page.find_tables():
-            bbox = clamp_bbox(table.bbox, page_width, page_height)
-            table_bbox_list.append(bbox)
-
-            if not page.crop(bbox).chars:
-                continue
-
-            filtered_page = filtered_page.filter(
-                lambda obj: get_bbox_overlap(obj_to_bbox(obj), bbox) is None
-            )
-
-            table_data = table.extract()
-            if table_data and len(table_data) >= 1:
-                headers = safe_join(table_data[0])
-                rows = [safe_join(row) for row in table_data[1:]]
-                json_table = [dict(zip(headers, row)) for row in rows]
-                table_json_outputs.append(json.dumps(json_table, indent=1, ensure_ascii=False))
-
-        # Text extraction based on bounding boxes
-        chars_outside_tables = [
-            word for word in page.extract_words(**TEXT_EXTRACTION_SETTINGS)
-            if not any(
-                bbox[0] <= float(word['x0']) <= bbox[2] and
-                bbox[1] <= float(word['top']) <= bbox[3]
-                for bbox in table_bbox_list
-            )
-        ]
-
-        current_y = None
-        line = []
-        text_content = ""
-
-        for word in chars_outside_tables:
-            if current_y is None or abs(word['top'] - current_y) > 10:
-                if line:
-                    text_content += " ".join(line) + "\n"
-                line = [word['text']]
-                current_y = word['top']
-            else:
-                line.append(word['text'])
-        if line:
-            text_content += " ".join(line) + "\n"
-
-        page_output += text_content.strip() + "\n"
-
-        for idx, table in enumerate(table_json_outputs, start=1):
-            page_output += f'"table {idx}":\n{table}\n'
-
-        return page_number, page_output
-
-    def run_serial(pages):
-        # Serial execution
-        return [process_page(args) for args in pages]
-
-    def run_parallel(pages):
-        from multiprocessing import cpu_count
-
-        # Parallel execution based on either the number of pages or number of CPU cores
-        num_cores = min(cpu_count(), len(pages))
-        print(f"Started processing PDF with {num_cores} cores...")
-        with ThreadPoolExecutor(max_workers=5) as exe:
-            return exe.map(process_page, pages)
-            # exe.submit(cube,2)
-
-            # Maps the method 'cube' with a list of values.
-
-        # with Pool(num_cores) as pool:
-        #     return pool.map(process_page, pages)
-
-    decoded_bytes = io.BytesIO(decoded_bytes)
-    with pdfplumber.open(decoded_bytes) as pdf:
-        num_pages = len(pdf.pages)
-
-        TEXT_EXTRACTION_SETTINGS = {
-            "x_tolerance": 2,
-            "y_tolerance": 5,
-            "keep_blank_chars": False,
-            "use_text_flow": True
-        }
-
-        # Number of pages before multithreading should be used
-        PARALLEL_THRESHOLD = 8
-
-        pages = [(i, pdf, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]
-
-        if num_pages <= PARALLEL_THRESHOLD:
-            results = run_serial(pages)
-        else:
-            results = run_parallel(pages)
-
-        # Sorting results by their page number
-        sorted_results = sorted(results, key=lambda x: x[0])
-        final_output = "\n".join(page_output for _, page_output in sorted_results)
-
-        return final_output
-    return ""

 def whisper_generate(genparams):
     global args
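
The branch commented out in extract_text() above used to hand data:application/pdf payloads to the now-removed extract_text_from_pdf(), whose first step was plain base64 handling before any pdfplumber call. A minimal standard-library sketch of that decode step (the helper name decode_data_uri is illustrative and not part of koboldcpp.py):

import base64

def decode_data_uri(doc_data: str) -> bytes:
    # Illustrative helper, not in koboldcpp.py: decode a data: URI payload,
    # tolerating the missing '=' padding the removed code also compensated for.
    if "," in doc_data:
        doc_data = doc_data.split(",", 1)[1]  # drop the "data:<mime>;base64," header
    padding = len(doc_data) % 4
    if padding != 0:
        doc_data += "=" * (4 - padding)  # base64 length must be a multiple of 4
    return base64.b64decode(doc_data)

print(decode_data_uri("data:text/plain;base64,aGVsbG8"))  # b'hello'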

make_pyinstaller_Fks_Cublas_12.8_Pascal_Turing_and_beyond.bat

Lines changed: 1 addition & 1 deletion
@@ -3,5 +3,5 @@ call create_ver_file.bat
 copy "P:\NVIDIAGPUCT\CUDA\v12.8\bin\cudart64_12.dll" .\ /Y
 copy "P:\NVIDIAGPUCT\CUDA\v12.8\bin\cublasLt64_12.dll" .\ /Y
 copy "P:\NVIDIAGPUCT\CUDA\v12.8\bin\cublas64_12.dll" .\ /Y
-PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --collect-all psutil --collect-all pdfplumber --icon "./nikogreen.ico" --add-data "./simpleclinfo.exe;." --add-data "./aria2c-win.exe;." --add-data "./OpenCL.dll;." --add-data "./kcpp_adapters;./kcpp_adapters" --add-data "./gguf-py;./gguf-py" --add-data "./koboldcpp.py;." --add-data "./json_to_gbnf.py;." --add-data "./klite.embd;." --add-data "./kcpp_docs.embd;." --add-data "./kcpp_sdui.embd;." --add-data "./taesd.embd;." --add-data "./taesd_xl.embd;." --add-data "./taesd_f.embd;." --add-data "./taesd_3.embd;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublasLt64_12.dll;." --add-data "./cublas64_12.dll;." --add-data "./cudart64_12.dll;." --add-data "./msvcp140.dll;." --add-data "./msvcp140_codecvt_ids.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." --version-file "./version.txt" "./koboldcpp.py" -n "croco.cpp_fks_cuda_12.8_Pascal_Turing_and_beyond.exe"
+PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --collect-all psutil --collect-all multiprocess --icon "./nikogreen.ico" --add-data "./simpleclinfo.exe;." --add-data "./aria2c-win.exe;." --add-data "./OpenCL.dll;." --add-data "./kcpp_adapters;./kcpp_adapters" --add-data "./gguf-py;./gguf-py" --add-data "./koboldcpp.py;." --add-data "./json_to_gbnf.py;." --add-data "./klite.embd;." --add-data "./kcpp_docs.embd;." --add-data "./kcpp_sdui.embd;." --add-data "./taesd.embd;." --add-data "./taesd_xl.embd;." --add-data "./taesd_f.embd;." --add-data "./taesd_3.embd;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublasLt64_12.dll;." --add-data "./cublas64_12.dll;." --add-data "./cudart64_12.dll;." --add-data "./msvcp140.dll;." --add-data "./msvcp140_codecvt_ids.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." --version-file "./version.txt" "./koboldcpp.py" -n "croco.cpp_fks_cuda_12.8_Pascal_Turing_and_beyond.exe"
 pause

requirements.txt

Lines changed: 0 additions & 8 deletions
@@ -5,11 +5,3 @@ gguf~=0.13.0
 customtkinter>=5.2.2
 protobuf>=4.21.12
 psutil>=6.1.1
-pdfplumber>=0.11.6
-multiprocess>=0.70.16
-expecttest>=0.3.0
-onnx>=1.17.0
-onnxscript>=0.2.5
-pdfminer.six==20250327
-Pillow>=9.1
-pypdfium2>=4.18.0
