Skip to content

Commit ca10e8d

Browse files
committed
SimpleChatTC:SimpleProxy:Move pdf logic into its own module
1 parent b1c45c3 commit ca10e8d

File tree

2 files changed

+60
-53
lines changed

2 files changed

+60
-53
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Helper to manage pdf related requests
2+
# by Humans for All
3+
4+
import urllib.parse
5+
import urlvalidator as uv
6+
import simpleproxy as root
7+
8+
9+
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract plain text from a range of pages of a pdf file.

    url: checked through uv.validate_url first; only its path component
        is then used, and that path is opened as a local file.
    startPN: zero-based index of the first page to extract; negative
        values are treated as 0.
    endPN: zero-based index of the last page to extract (inclusive);
        negative or past-the-end values are clamped to the last page.

    Returns a dict with 'status' and 'msg' keys. On success the status
    is 200 and the extracted text is available under 'data'. If the url
    fails validation, the validator's status code and message are
    returned and no 'data' key is present. A startPN beyond the
    (clamped) endPN yields empty text.

    Errors raised while opening/reading the file or parsing the pdf are
    not caught here and propagate to the caller.
    """
    # Kept function-local as in the original; presumably so that pypdf is
    # needed only when a pdf request is actually served.
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # NOTE(review): the path is used as-is (not percent-decoded); decoding
    # it here would have to be matched by what validate_url checked.
    # The context manager guarantees the file handle is released even if
    # the read fails.
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # Join once instead of growing the string page by page.
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
28+
29+
30+
def handle_pdf2text(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    ph: the request handler used to send the response (status line,
        headers and body).
    pr: the parsed request url; its query string supplies
        * url: the pdf to process (required),
        * startPageNumber / endPageNumber: optional page bounds; when
          missing or blank, -1 is passed on, which process_pdf2text
          treats as "from the first page" / "till the last page".

    Sends a 400 error if url is missing or a page number is not an
    integer, the status reported by process_pdf2text if that fails, and
    otherwise a 200 response whose body is the utf-8 encoded text.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs drops blank values by default, so a missing or empty
    # parameter simply has no key; look keys up defensively instead of
    # indexing, which would raise KeyError.
    urls = queryParams.get('url')
    if not urls:
        ph.send_error(400, "WARN:HandlePdf2Text:Missing url query parameter")
        return
    url = urls[0]
    pageNums = {}
    for key in ('startPageNumber', 'endPageNumber'):
        raw = queryParams.get(key, [''])[0]
        try:
            pageNums[key] = int(raw) if raw else -1
        except ValueError:
            # The raw value is deliberately not echoed back, as the
            # message ends up in the http status line.
            ph.send_error(400, f"WARN:HandlePdf2Text:Invalid {key}")
            return
    startP = pageNums['startPageNumber']
    endP = pageNums['endPageNumber']
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type
    # ('text/plain' is); left unchanged in case a client depends on it.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))

tools/server/public_simplechat/local.tools/simpleproxy.py

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import time
2828
import urlvalidator as uv
2929
from typing import Callable
30+
import pdfmagic as mPdf
3031

3132

3233
gMe = {
@@ -136,7 +137,7 @@ def do_GET(self):
136137
case '/urltext':
137138
self.auth_and_run(pr, handle_urltext)
138139
case '/pdf2text':
139-
self.auth_and_run(pr, handle_pdf2text)
140+
self.auth_and_run(pr, mPdf.handle_pdf2text)
140141
case '/aum':
141142
handle_aum(self, pr)
142143
case _:
@@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
358359
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
359360

360361

361-
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract plain text from a range of pages of a pdf file.

    url: checked through uv.validate_url first; only its path component
        is then used, and that path is opened as a local file.
    startPN: zero-based index of the first page to extract; negative
        values are treated as 0.
    endPN: zero-based index of the last page to extract (inclusive);
        negative or past-the-end values are clamped to the last page.

    Returns a dict with 'status' and 'msg' keys. On success the status
    is 200 and the extracted text is available under 'data'. If the url
    fails validation, the validator's status code and message are
    returned and no 'data' key is present. A startPN beyond the
    (clamped) endPN yields empty text.

    Errors raised while opening/reading the file or parsing the pdf are
    not caught here and propagate to the caller.
    """
    # Kept function-local as in the original; presumably so that pypdf is
    # needed only when a pdf request is actually served.
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # The context manager guarantees the file handle is released even if
    # the read fails.
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # Join once instead of growing the string page by page.
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
380-
381-
382-
def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    ph: the request handler used to send the response (status line,
        headers and body).
    pr: the parsed request url; its query string supplies
        * url: the pdf to process (required),
        * startPageNumber / endPageNumber: optional page bounds; when
          missing or blank, -1 is passed on, which process_pdf2text
          treats as "from the first page" / "till the last page".

    Sends a 400 error if url is missing or a page number is not an
    integer, the status reported by process_pdf2text if that fails, and
    otherwise a 200 response whose body is the utf-8 encoded text.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs drops blank values by default, so a missing or empty
    # parameter simply has no key; look keys up defensively instead of
    # indexing, which would raise KeyError.
    urls = queryParams.get('url')
    if not urls:
        ph.send_error(400, "WARN:HandlePdf2Text:Missing url query parameter")
        return
    url = urls[0]
    pageNums = {}
    for key in ('startPageNumber', 'endPageNumber'):
        raw = queryParams.get(key, [''])[0]
        try:
            pageNums[key] = int(raw) if raw else -1
        except ValueError:
            # The raw value is deliberately not echoed back, as the
            # message ends up in the http status line.
            ph.send_error(400, f"WARN:HandlePdf2Text:Invalid {key}")
            return
    startP = pageNums['startPageNumber']
    endP = pageNums['endPageNumber']
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type
    # ('text/plain' is); left unchanged in case a client depends on it.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
411-
412-
413362

414363
def load_config():
415364
"""

0 commit comments

Comments
 (0)