3434from datetime import datetime , timezone
3535from typing import Tuple
3636
37- # PDF extraction logic
38- import pdfplumber
39- import logging
40- import io
4137
# constants
# NOTE(review): presumably the maximum number of sampler-order entries — confirm against sampler config usage
sampler_order_max = 7
@@ -2344,9 +2340,11 @@ def extract_text(genparams):
23442340 docData = genparams .get ("docData" , "" )
23452341 if docData .startswith ("data:text" ):
23462342 docData = docData .split ("," , 1 )[1 ]
2347- elif docData .startswith ("data:application/pdf" ):
2348- docData = docData .split ("," , 1 )[1 ]
2349- return extract_text_from_pdf (docData )
2343+
2344+ # elif docData.startswith("data:application/pdf"):
2345+ # docData = docData.split(",", 1)[1]
2346+ # return extract_text_from_pdf(docData)
2347+
23502348 elif docData .startswith ("data:audio" ):
23512349 genparams ["audio_data" ] = docData
23522350 return whisper_generate (genparams )
@@ -2367,172 +2365,7 @@ def extract_text(genparams):
23672365 print (f"Error extracting text: { str (e )} " )
23682366 return ""
23692367
def extract_text_from_pdf(docData):
    """Decode a base64-encoded PDF payload and extract its text.

    Args:
        docData: Base64 string (data-URI prefix already stripped) holding
            the PDF bytes. Missing '=' padding is tolerated and restored
            before decoding.

    Returns:
        The extracted text, or "" if decoding/extraction fails for any reason.
    """
    try:
        # base64 input length must be a multiple of 4; restore any
        # stripped '=' padding before decoding.
        padding = len(docData) % 4
        if padding != 0:
            docData += '=' * (4 - padding)

        decoded_bytes = base64.b64decode(docData)

        return getTextFromPDFEncapsulated(decoded_bytes)
    except Exception as e:
        # Best-effort: callers expect an empty string on failure, not an exception.
        print(f"Error extracting text: {str(e)}")
        return ""
2386-
# PDF extraction code by sevenof9
def getTextFromPDFEncapsulated(decoded_bytes):
    """Extract text and tables from a PDF given as raw bytes.

    Pages are processed serially for small documents and in a thread pool
    for larger ones; results are re-assembled in page order.

    Args:
        decoded_bytes: The PDF file content as bytes.

    Returns:
        A single string with one "Page N" section per page, containing the
        page text followed by any extracted tables rendered as JSON.
    """

    def process_page(page_args):
        """Process one (page_number, pdf, text_settings) tuple.

        Returns (page_number, page_text) so results can be re-sorted after
        parallel execution.
        """
        import pdfplumber
        import json
        from pdfplumber.utils import get_bbox_overlap, obj_to_bbox

        # Silence pdfminer chatter; this may run from multiple threads.
        for logger_name in [
            "pdfminer",
            "pdfminer.pdfparser",
            "pdfminer.pdfdocument",
            "pdfminer.pdfpage",
            "pdfminer.converter",
            "pdfminer.layout",
            "pdfminer.cmapdb",
            "pdfminer.utils",
        ]:
            logging.getLogger(logger_name).setLevel(logging.ERROR)

        def clean_cell_text(text):
            # Join hyphenated line breaks and collapse runs of whitespace.
            if not isinstance(text, str):
                return ""
            text = text.replace("-\n", "").replace("\n", " ")
            return " ".join(text.split())

        def safe_join(row):
            return [clean_cell_text(str(cell)) if cell is not None else "" for cell in row]

        def clamp_bbox(bbox, page_width, page_height):
            # Keep bounding boxes inside the page to avoid crop errors.
            x0, top, x1, bottom = bbox
            x0 = max(0, min(x0, page_width))
            x1 = max(0, min(x1, page_width))
            top = max(0, min(top, page_height))
            bottom = max(0, min(bottom, page_height))
            return (x0, top, x1, bottom)

        # Renamed from 'args' to avoid shadowing the module-level 'args' global.
        page_number, pdf, text_settings = page_args

        page = pdf.pages[page_number]
        page_output = f"Page {page_number + 1}\n"
        page_width = page.width
        page_height = page.height

        filtered_page = page
        table_bbox_list = []
        table_json_outputs = []

        # Table extraction: record each table's bbox and render it as JSON.
        for table in page.find_tables():
            bbox = clamp_bbox(table.bbox, page_width, page_height)
            table_bbox_list.append(bbox)

            # Skip tables whose cropped region contains no characters.
            if not page.crop(bbox).chars:
                continue

            filtered_page = filtered_page.filter(
                lambda obj: get_bbox_overlap(obj_to_bbox(obj), bbox) is None
            )

            table_data = table.extract()
            if table_data and len(table_data) >= 1:
                headers = safe_join(table_data[0])
                rows = [safe_join(row) for row in table_data[1:]]
                json_table = [dict(zip(headers, row)) for row in rows]
                table_json_outputs.append(json.dumps(json_table, indent=1, ensure_ascii=False))

        # Text extraction: keep only words falling outside every table bbox.
        # Uses the settings passed in the page tuple (same dict as the closure
        # variable the original read directly).
        chars_outside_tables = [
            word for word in page.extract_words(**text_settings)
            if not any(
                bbox[0] <= float(word['x0']) <= bbox[2] and
                bbox[1] <= float(word['top']) <= bbox[3]
                for bbox in table_bbox_list
            )
        ]

        current_y = None
        line = []
        text_content = ""

        # Group words into lines: a vertical jump of >10 units starts a new line.
        for word in chars_outside_tables:
            if current_y is None or abs(word['top'] - current_y) > 10:
                if line:
                    text_content += " ".join(line) + "\n"
                line = [word['text']]
                current_y = word['top']
            else:
                line.append(word['text'])
        if line:
            text_content += " ".join(line) + "\n"

        page_output += text_content.strip() + "\n"

        for idx, table in enumerate(table_json_outputs, start=1):
            page_output += f'"table {idx}":\n{table}\n'

        return page_number, page_output

    def run_serial(pages):
        # Serial execution, one page at a time.
        return [process_page(page_args) for page_args in pages]

    def run_parallel(pages):
        from multiprocessing import cpu_count

        # Worker count capped by page count and CPU cores.
        # Fix: the original computed num_cores but hard-coded max_workers=5,
        # contradicting its own log message.
        num_cores = min(cpu_count(), len(pages))
        print(f"Started processing PDF with {num_cores} cores...")
        with ThreadPoolExecutor(max_workers=num_cores) as exe:
            return list(exe.map(process_page, pages))

    decoded_bytes = io.BytesIO(decoded_bytes)
    with pdfplumber.open(decoded_bytes) as pdf:
        num_pages = len(pdf.pages)

        TEXT_EXTRACTION_SETTINGS = {
            "x_tolerance": 2,
            "y_tolerance": 5,
            "keep_blank_chars": False,
            "use_text_flow": True
        }

        # Number of pages before multithreading should be used
        PARALLEL_THRESHOLD = 8

        pages = [(i, pdf, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]

        if num_pages <= PARALLEL_THRESHOLD:
            results = run_serial(pages)
        else:
            results = run_parallel(pages)

        # Reassemble in page order (parallel results may complete out of order).
        sorted_results = sorted(results, key=lambda x: x[0])
        return "\n".join(page_output for _, page_output in sorted_results)
25362369
25372370def whisper_generate (genparams ):
25382371 global args
0 commit comments