
Commit ed7fdb3

SimpleChatTC:SimpleProxy: Move web requests to its own module
1 parent ca10e8d commit ed7fdb3

File tree

2 files changed: +184 -178 lines changed


tools/server/public_simplechat/local.tools/simpleproxy.py

Lines changed: 3 additions & 178 deletions
@@ -21,13 +21,11 @@
 import sys
 import http.server
 import urllib.parse
-import urllib.request
-from dataclasses import dataclass
-import html.parser
 import time
 import urlvalidator as uv
 from typing import Callable
 import pdfmagic as mPdf
+import webmagic as mWeb


 gMe = {
@@ -133,9 +131,9 @@ def do_GET(self):
         print(f"DBUG:ProxyHandler:GET:{pr}")
         match pr.path:
             case '/urlraw':
-                self.auth_and_run(pr, handle_urlraw)
+                self.auth_and_run(pr, mWeb.handle_urlraw)
             case '/urltext':
-                self.auth_and_run(pr, handle_urltext)
+                self.auth_and_run(pr, mWeb.handle_urltext)
             case '/pdf2text':
                 self.auth_and_run(pr, mPdf.handle_pdf2text)
             case '/aum':
@@ -175,18 +173,6 @@ def handle_aum(ph: ProxyHandler, pr: urllib.parse.ParseResult):
     ph.end_headers()


-@dataclass(frozen=True)
-class UrlReqResp:
-    """
-    Used to return result wrt urlreq helper below.
-    """
-    callOk: bool
-    httpStatus: int
-    httpStatusMsg: str = ""
-    contentType: str = ""
-    contentData: str = ""
-
-
 def debug_dump(meta: dict, data: dict):
     if not gMe['--debug']:
         return
@@ -199,167 +185,6 @@ def debug_dump(meta: dict, data: dict):
         f.write(f"\n\n\n\n{k}:{data[k]}\n\n\n\n")


-def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
-    """
-    Common part of the url request handling used by both urlraw and urltext.
-
-    Verify the url being requested is allowed.
-
-    Include User-Agent, Accept-Language and Accept in the generated request using
-    equivalent values got in the request being proxied, so as to try mimic the
-    real client, whose request we are proxying. In case a header is missing in the
-    got request, fallback to using some possibly ok enough defaults.
-
-    Fetch the requested url.
-    """
-    tag=f"UrlReq:{tag}"
-    queryParams = urllib.parse.parse_qs(pr.query)
-    url = queryParams['url']
-    print(f"DBUG:{tag}:Url:{url}")
-    url = url[0]
-    gotVU = uv.validate_url(url, tag)
-    if not gotVU.callOk:
-        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
-    try:
-        hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
-        hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
-        hA = ph.headers.get('Accept', "text/html,*/*")
-        headers = {
-            'User-Agent': hUA,
-            'Accept': hA,
-            'Accept-Language': hAL
-        }
-        req = urllib.request.Request(url, headers=headers)
-        # Get requested url
-        print(f"DBUG:{tag}:Req:{req.full_url}:{req.headers}")
-        with urllib.request.urlopen(req, timeout=10) as response:
-            contentData = response.read().decode('utf-8')
-            statusCode = response.status or 200
-            contentType = response.getheader('Content-Type') or 'text/html'
-        debug_dump({ 'url': req.full_url, 'headers': req.headers, 'ctype': contentType }, { 'cdata': contentData })
-        return UrlReqResp(True, statusCode, "", contentType, contentData)
-    except Exception as exc:
-        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")
-
-
-def handle_urlraw(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlRaw")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(got.contentData.encode('utf-8'))
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")
-
-
-class TextHtmlParser(html.parser.HTMLParser):
-    """
-    A simple minded logic used to strip html content of
-    * all the html tags as well as
-    * all the contents belonging to below predefined tags like script, style, header, ...
-
-    NOTE: if the html content/page uses any javascript for client side manipulation/generation of
-    html content, that logic wont be triggered, so also such client side dynamic content wont be
-    got.
-
-    This helps return a relatively clean textual representation of the html file/content being parsed.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.inside = {
-            'body': False,
-            'script': False,
-            'style': False,
-            'header': False,
-            'footer': False,
-            'nav': False
-        }
-        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
-        self.bCapture = False
-        self.text = ""
-        self.textStripped = ""
-
-    def do_capture(self):
-        """
-        Helps decide whether to capture contents or discard them.
-        """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
-            return True
-        return False
-
-    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        if tag in self.monitored:
-            self.inside[tag] = True
-
-    def handle_endtag(self, tag: str):
-        if tag in self.monitored:
-            self.inside[tag] = False
-
-    def handle_data(self, data: str):
-        if self.do_capture():
-            self.text += f"{data}\n"
-
-    def syncup(self):
-        self.textStripped = self.text
-
-    def strip_adjacent_newlines(self):
-        oldLen = -99
-        newLen = len(self.textStripped)
-        aStripped = self.textStripped;
-        while oldLen != newLen:
-            oldLen = newLen
-            aStripped = aStripped.replace("\n\n\n","\n")
-            newLen = len(aStripped)
-        self.textStripped = aStripped
-
-    def strip_whitespace_lines(self):
-        aLines = self.textStripped.splitlines()
-        self.textStripped = ""
-        for line in aLines:
-            if (len(line.strip())==0):
-                self.textStripped += "\n"
-                continue
-            self.textStripped += f"{line}\n"
-
-    def get_stripped_text(self):
-        self.syncup()
-        self.strip_whitespace_lines()
-        self.strip_adjacent_newlines()
-        return self.textStripped
-
-
-def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlText")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Extract Text
-        textHtml = TextHtmlParser()
-        textHtml.feed(got.contentData)
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
-        debug_dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
-
-
-
 def load_config():
     """
     Allow loading of a json based config file
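The handle_urlreq helper removed above (and re-added in the new module below) mirrors the proxied client's User-Agent, Accept and Accept-Language headers, with fallbacks, and then fetches the target url with a timeout. A minimal standalone sketch of that fetch pattern, independent of the proxy classes; the function name and fallback values here are illustrative only, not part of the commit:

import urllib.request

def fetch_like_proxy(url: str, client_headers: dict[str, str]) -> tuple[int, str, str]:
    # Mirror a few headers from the incoming request, falling back to defaults,
    # then fetch the target url with a timeout (same shape as handle_urlreq).
    headers = {
        'User-Agent': client_headers.get('User-Agent', 'Mozilla/5.0'),
        'Accept': client_headers.get('Accept', 'text/html,*/*'),
        'Accept-Language': client_headers.get('Accept-Language', 'en-US,en;q=0.9'),
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=10) as response:
        body = response.read().decode('utf-8')
        return (response.status or 200,
                response.getheader('Content-Type') or 'text/html',
                body)

# e.g. status, ctype, body = fetch_like_proxy("https://example.com/", {})
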
tools/server/public_simplechat/local.tools/webmagic.py (new file)

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
+# Helper to manage web related requests
+# by Humans for All
+
+import urllib.parse
+import urllib.request
+import simpleproxy as root
+import urlvalidator as uv
+from dataclasses import dataclass
+import html.parser
+
+
+@dataclass(frozen=True)
+class UrlReqResp:
+    """
+    Used to return result wrt urlreq helper below.
+    """
+    callOk: bool
+    httpStatus: int
+    httpStatusMsg: str = ""
+    contentType: str = ""
+    contentData: str = ""
+
+
+def handle_urlreq(ph: root.ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
+    """
+    Common part of the url request handling used by both urlraw and urltext.
+
+    Verify the url being requested is allowed.
+
+    Include User-Agent, Accept-Language and Accept in the generated request using
+    equivalent values got in the request being proxied, so as to try mimic the
+    real client, whose request we are proxying. In case a header is missing in the
+    got request, fallback to using some possibly ok enough defaults.
+
+    Fetch the requested url.
+    """
+    tag=f"UrlReq:{tag}"
+    queryParams = urllib.parse.parse_qs(pr.query)
+    url = queryParams['url']
+    print(f"DBUG:{tag}:Url:{url}")
+    url = url[0]
+    gotVU = uv.validate_url(url, tag)
+    if not gotVU.callOk:
+        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
+    try:
+        hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
+        hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
+        hA = ph.headers.get('Accept', "text/html,*/*")
+        headers = {
+            'User-Agent': hUA,
+            'Accept': hA,
+            'Accept-Language': hAL
+        }
+        req = urllib.request.Request(url, headers=headers)
+        # Get requested url
+        print(f"DBUG:{tag}:Req:{req.full_url}:{req.headers}")
+        with urllib.request.urlopen(req, timeout=10) as response:
+            contentData = response.read().decode('utf-8')
+            statusCode = response.status or 200
+            contentType = response.getheader('Content-Type') or 'text/html'
+        root.debug_dump({ 'url': req.full_url, 'headers': req.headers, 'ctype': contentType }, { 'cdata': contentData })
+        return UrlReqResp(True, statusCode, "", contentType, contentData)
+    except Exception as exc:
+        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")
+
+
+def handle_urlraw(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
+    try:
+        # Get requested url
+        got = handle_urlreq(ph, pr, "HandleUrlRaw")
+        if not got.callOk:
+            ph.send_error(got.httpStatus, got.httpStatusMsg)
+            return
+        # Send back to client
+        ph.send_response(got.httpStatus)
+        ph.send_header('Content-Type', got.contentType)
+        # Add CORS for browser fetch, just in case
+        ph.send_header('Access-Control-Allow-Origin', '*')
+        ph.end_headers()
+        ph.wfile.write(got.contentData.encode('utf-8'))
+    except Exception as exc:
+        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")
+
+
+class TextHtmlParser(html.parser.HTMLParser):
+    """
+    A simple minded logic used to strip html content of
+    * all the html tags as well as
+    * all the contents belonging to below predefined tags like script, style, header, ...
+
+    NOTE: if the html content/page uses any javascript for client side manipulation/generation of
+    html content, that logic wont be triggered, so also such client side dynamic content wont be
+    got.
+
+    This helps return a relatively clean textual representation of the html file/content being parsed.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.inside = {
+            'body': False,
+            'script': False,
+            'style': False,
+            'header': False,
+            'footer': False,
+            'nav': False
+        }
+        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
+        self.bCapture = False
+        self.text = ""
+        self.textStripped = ""
+
+    def do_capture(self):
+        """
+        Helps decide whether to capture contents or discard them.
+        """
+        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
+            return True
+        return False
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
+        if tag in self.monitored:
+            self.inside[tag] = True
+
+    def handle_endtag(self, tag: str):
+        if tag in self.monitored:
+            self.inside[tag] = False
+
+    def handle_data(self, data: str):
+        if self.do_capture():
+            self.text += f"{data}\n"
+
+    def syncup(self):
+        self.textStripped = self.text
+
+    def strip_adjacent_newlines(self):
+        oldLen = -99
+        newLen = len(self.textStripped)
+        aStripped = self.textStripped;
+        while oldLen != newLen:
+            oldLen = newLen
+            aStripped = aStripped.replace("\n\n\n","\n")
+            newLen = len(aStripped)
+        self.textStripped = aStripped
+
+    def strip_whitespace_lines(self):
+        aLines = self.textStripped.splitlines()
+        self.textStripped = ""
+        for line in aLines:
+            if (len(line.strip())==0):
+                self.textStripped += "\n"
+                continue
+            self.textStripped += f"{line}\n"
+
+    def get_stripped_text(self):
+        self.syncup()
+        self.strip_whitespace_lines()
+        self.strip_adjacent_newlines()
+        return self.textStripped
+
+
+def handle_urltext(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
+    try:
+        # Get requested url
+        got = handle_urlreq(ph, pr, "HandleUrlText")
+        if not got.callOk:
+            ph.send_error(got.httpStatus, got.httpStatusMsg)
+            return
+        # Extract Text
+        textHtml = TextHtmlParser()
+        textHtml.feed(got.contentData)
+        # Send back to client
+        ph.send_response(got.httpStatus)
+        ph.send_header('Content-Type', got.contentType)
+        # Add CORS for browser fetch, just in case
+        ph.send_header('Access-Control-Allow-Origin', '*')
+        ph.end_headers()
+        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
+        root.debug_dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
+    except Exception as exc:
+        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
