@@ -21,13 +21,11 @@
 import sys
 import http.server
 import urllib.parse
-import urllib.request
-from dataclasses import dataclass
-import html.parser
 import time
 import urlvalidator as uv
 from typing import Callable
 import pdfmagic as mPdf
+import webmagic as mWeb


 gMe = {
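
Net effect of this first hunk: the in-file url fetching and html-to-text helpers give way to a new webmagic module. That module is not part of this diff, so the following is only an inferred sketch of the surface the remaining hunks rely on; the bodies presumably match the code removed further below.

    # webmagic.py -- inferred public surface, not shown in this commit
    import urllib.parse

    def handle_urlraw(ph, pr: urllib.parse.ParseResult):
        """Fetch the url named in pr's query string and relay it as-is."""
        ...

    def handle_urltext(ph, pr: urllib.parse.ParseResult):
        """Fetch the url, reduce the html to plain text, relay that."""
        ...
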
@@ -133,9 +131,9 @@ def do_GET(self):
         print(f"DBUG:ProxyHandler:GET:{pr}")
         match pr.path:
             case '/urlraw':
-                self.auth_and_run(pr, handle_urlraw)
+                self.auth_and_run(pr, mWeb.handle_urlraw)
             case '/urltext':
-                self.auth_and_run(pr, handle_urltext)
+                self.auth_and_run(pr, mWeb.handle_urltext)
             case '/pdf2text':
                 self.auth_and_run(pr, mPdf.handle_pdf2text)
             case '/aum':
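
The endpoints keep their request shape; only the handlers' home module changes. For orientation, a client call against /urltext looks roughly like the sketch below. The host and port are assumptions, and any token that auth_and_run checks is omitted, since the server config sits outside this section; the removed handle_urlreq further down parses the url query parameter with urllib.parse.parse_qs, hence the quoting.

    import urllib.parse
    import urllib.request

    # Hypothetical proxy address; the real one comes from the server config.
    base = "http://127.0.0.1:3128"
    target = urllib.parse.quote("https://example.com/", safe="")
    with urllib.request.urlopen(f"{base}/urltext?url={target}", timeout=10) as resp:
        print(resp.read().decode("utf-8"))
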
@@ -175,18 +173,6 @@ def handle_aum(ph: ProxyHandler, pr: urllib.parse.ParseResult):
     ph.end_headers()


-@dataclass(frozen=True)
-class UrlReqResp:
-    """
-    Used to return result wrt urlreq helper below.
-    """
-    callOk: bool
-    httpStatus: int
-    httpStatusMsg: str = ""
-    contentType: str = ""
-    contentData: str = ""
-
-
 def debug_dump(meta: dict, data: dict):
     if not gMe['--debug']:
         return
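
UrlReqResp, removed just above, is the result carrier the url helpers below hand back; frozen=True keeps a result immutable once built. It presumably travels to webmagic along with its users. Going by the returns in handle_urlreq below, success and failure values look like this:

    # Shapes taken from the returns in handle_urlreq below.
    ok = UrlReqResp(True, 200, "", "text/html", "<html>...</html>")
    bad = UrlReqResp(False, 502, "WARN:UrlReq:HandleUrlRaw:Failed:timed out")
    # frozen=True: assigning, e.g. ok.httpStatus = 404, raises
    # dataclasses.FrozenInstanceError instead of silently mutating.
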
@@ -199,167 +185,6 @@ def debug_dump(meta: dict, data: dict):
             f.write(f"\n\n\n\n{k}:{data[k]}\n\n\n\n")


-def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
-    """
-    Common part of the url request handling used by both urlraw and urltext.
-
-    Verify the url being requested is allowed.
-
-    Include User-Agent, Accept-Language and Accept in the generated request, using
-    the equivalent values from the request being proxied, so as to try to mimic the
-    real client whose request we are proxying. If a header is missing from the
-    received request, fall back to some hopefully good enough defaults.
-
-    Fetch the requested url.
-    """
-    tag = f"UrlReq:{tag}"
-    queryParams = urllib.parse.parse_qs(pr.query)
-    url = queryParams['url']
-    print(f"DBUG:{tag}:Url:{url}")
-    url = url[0]
-    gotVU = uv.validate_url(url, tag)
-    if not gotVU.callOk:
-        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
-    try:
-        hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
-        hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
-        hA = ph.headers.get('Accept', "text/html,*/*")
-        headers = {
-            'User-Agent': hUA,
-            'Accept': hA,
-            'Accept-Language': hAL
-        }
-        req = urllib.request.Request(url, headers=headers)
-        # Get requested url
-        print(f"DBUG:{tag}:Req:{req.full_url}:{req.headers}")
-        with urllib.request.urlopen(req, timeout=10) as response:
-            contentData = response.read().decode('utf-8')
-            statusCode = response.status or 200
-            contentType = response.getheader('Content-Type') or 'text/html'
-            debug_dump({'url': req.full_url, 'headers': req.headers, 'ctype': contentType}, {'cdata': contentData})
-            return UrlReqResp(True, statusCode, "", contentType, contentData)
-    except Exception as exc:
-        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")
-
-
-def handle_urlraw(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlRaw")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(got.contentData.encode('utf-8'))
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")
-
-
-class TextHtmlParser(html.parser.HTMLParser):
-    """
-    A simple minded logic used to strip html content of
-    * all the html tags, as well as
-    * all the contents belonging to the predefined tags below, like script, style, header, ...
-
-    NOTE: if the html content/page uses any javascript for client side manipulation/generation
-    of html content, that logic won't be triggered, so such client side dynamic content won't
-    be captured either.
-
-    This helps return a relatively clean textual representation of the html file/content being parsed.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.inside = {
-            'body': False,
-            'script': False,
-            'style': False,
-            'header': False,
-            'footer': False,
-            'nav': False
-        }
-        self.monitored = ['body', 'script', 'style', 'header', 'footer', 'nav']
-        self.bCapture = False
-        self.text = ""
-        self.textStripped = ""
-
-    def do_capture(self):
-        """
-        Helps decide whether to capture contents or discard them.
-        """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
-            return True
-        return False
-
-    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        if tag in self.monitored:
-            self.inside[tag] = True
-
-    def handle_endtag(self, tag: str):
-        if tag in self.monitored:
-            self.inside[tag] = False
-
-    def handle_data(self, data: str):
-        if self.do_capture():
-            self.text += f"{data}\n"
-
-    def syncup(self):
-        self.textStripped = self.text
-
-    def strip_adjacent_newlines(self):
-        oldLen = -99
-        newLen = len(self.textStripped)
-        aStripped = self.textStripped
-        while oldLen != newLen:
-            oldLen = newLen
-            aStripped = aStripped.replace("\n\n\n", "\n")
-            newLen = len(aStripped)
-        self.textStripped = aStripped
-
-    def strip_whitespace_lines(self):
-        aLines = self.textStripped.splitlines()
-        self.textStripped = ""
-        for line in aLines:
-            if len(line.strip()) == 0:
-                self.textStripped += "\n"
-                continue
-            self.textStripped += f"{line}\n"
-
-    def get_stripped_text(self):
-        self.syncup()
-        self.strip_whitespace_lines()
-        self.strip_adjacent_newlines()
-        return self.textStripped
-
-
-def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlText")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Extract Text
-        textHtml = TextHtmlParser()
-        textHtml.feed(got.contentData)
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
-        debug_dump({'RawText': 'yes', 'StrippedText': 'yes'}, {'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text()})
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
-
-
-
 def load_config():
     """
     Allow loading of a json based config file
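
The html-to-text logic removed above is fully visible in this diff, so a standalone usage sketch is easy to give. The import assumes TextHtmlParser moves to webmagic unchanged, which this commit implies but does not show:

    from webmagic import TextHtmlParser  # assumed new home of the class

    doc = """<html><head><style>p { color: red }</style></head>
    <body><nav>menu</nav><p>Hello,</p><p>world.</p><footer>(c) nobody</footer></body></html>"""

    parser = TextHtmlParser()
    parser.feed(doc)
    # Expect roughly "Hello," and "world." on their own lines: the style,
    # nav and footer content is dropped, whitespace-only lines are blanked
    # and runs of adjacent newlines are collapsed.
    print(parser.get_stripped_text())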