|
27 | 27 | import time |
28 | 28 | import urlvalidator as uv |
29 | 29 | from typing import Callable |
| 30 | +import pdfmagic as mPdf |
30 | 31 |
|
31 | 32 |
|
32 | 33 | gMe = { |
@@ -136,7 +137,7 @@ def do_GET(self): |
136 | 137 | case '/urltext': |
137 | 138 | self.auth_and_run(pr, handle_urltext) |
138 | 139 | case '/pdf2text': |
139 | | - self.auth_and_run(pr, handle_pdf2text) |
| 140 | + self.auth_and_run(pr, mPdf.handle_pdf2text) |
140 | 141 | case '/aum': |
141 | 142 | handle_aum(self, pr) |
142 | 143 | case _: |
@@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult): |
358 | 359 | ph.send_error(502, f"WARN:UrlTextFailed:{exc}") |
359 | 360 |
|
360 | 361 |
|
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract plain text from a range of pages of the pdf file named by url.

    The url is first checked with uv.validate_url; if that check fails, its
    status code and message are returned without touching the file. Otherwise
    the path component of the url is opened as a local file and parsed with
    pypdf.

    startPN and endPN are 0-based, inclusive indexes into the pdf's pages.
    A negative startPN means "from the first page"; a negative or too large
    endPN means "till the last page". If the resulting range is empty, the
    returned text is empty.

    Returns a dict with 'status' and 'msg' keys, plus a 'data' key holding
    the extracted text when status is 200.

    Errors from opening/reading the file or from pypdf are not caught here
    and propagate to the caller.
    """
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Use a context manager so the file handle is released even if the
    # read fails; the pdf is parsed from the in-memory copy afterwards.
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= len(oPdf.pages)):
        endPN = len(oPdf.pages)-1
    # Join once instead of growing a string page by page.
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
380 | | - |
381 | | - |
def _pdf2text_page_number(queryParams: dict, name: str) -> int:
    """
    Return the page number given by the query parameter name, or -1 if the
    parameter is missing or blank. Raises ValueError if it is not an integer.
    """
    value = queryParams.get(name, [''])[0]
    return int(value) if value else -1


def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    Query parameters:
    * url: the pdf to process (required; 400 is sent if it is missing).
    * startPageNumber, endPageNumber: optional page range. A missing or
      blank value is passed on as -1; a non integer value results in 400.

    On success the extracted text is written as the utf-8 encoded response
    body. If process_pdf2text reports a non 200 status, that status and
    message are sent as the error; if it raises, 502 is sent.
    """
    # parse_qs drops blank values by default, so a missing or empty
    # parameter shows up as an absent key; look keys up with get() rather
    # than indexing, which would raise KeyError.
    queryParams = urllib.parse.parse_qs(pr.query)
    urls = queryParams.get('url')
    if not urls:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    url = urls[0]
    try:
        startP = _pdf2text_page_number(queryParams, 'startPageNumber')
        endP = _pdf2text_page_number(queryParams, 'endPageNumber')
    except ValueError as exc:
        ph.send_error(400, f"WARN:HandlePdf2Text:BadPageNumber:{exc}")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    try:
        gotP2T = process_pdf2text(url, startP, endP)
    except Exception as exc:
        # Unreadable file or unparsable pdf; report it to the client in the
        # same way the urltext handler reports its failures.
        ph.send_error(502, f"WARN:Pdf2TextFailed:{exc}")
        return
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type, 'text/plain'
    # is presumably what was meant; left unchanged so existing clients are
    # not affected - confirm before changing.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
411 | | - |
412 | | - |
413 | 362 |
|
414 | 363 | def load_config(): |
415 | 364 | """ |
|
0 commit comments