2626import html .parser
2727import re
2828import time
29+ import urlvalidator as uv
2930
3031
3132gMe = {
4041 '--port' : 'int' ,
4142 '--config' : 'str' ,
4243 '--debug' : 'bool' ,
44+ '--allowed.schemes' : 'list' ,
4345 '--allowed.domains' : 'list' ,
4446 '--bearer.insecure' : 'str'
4547}
4648
47- gConfigNeeded = [ '--allowed.domains' , '--bearer.insecure' ]
49+ gConfigNeeded = [ '--allowed.schemes' , '--allowed.domains' , '--bearer.insecure' ]
4850
4951gAllowedCalls = [ "urltext" , "urlraw" , "pdf2text" ]
5052
@@ -195,27 +197,6 @@ def debug_dump(meta: dict, data: dict):
195197 f .write (f"\n \n \n \n { k } :{ data [k ]} \n \n \n \n " )
196198
197199
198- def validate_url (url : str , tag : str ):
199- """
200- Implement a re based filter logic on the specified url.
201- """
202- tag = f"VU:{ tag } "
203- if (not gMe .get ('--allowed.domains' )):
204- return UrlReqResp (False , 400 , f"DBUG:{ tag } :MissingAllowedDomains" )
205- urlParts = urllib .parse .urlparse (url )
206- print (f"DBUG:ValidateUrl:{ urlParts } , { urlParts .hostname } " )
207- urlHName = urlParts .hostname
208- if not urlHName :
209- return UrlReqResp (False , 400 , f"WARN:{ tag } :Missing hostname in Url" )
210- bMatched = False
211- for filter in gMe ['--allowed.domains' ]:
212- if re .match (filter , urlHName ):
213- bMatched = True
214- if not bMatched :
215- return UrlReqResp (False , 400 , f"WARN:{ tag } :requested hostname not allowed" )
216- return UrlReqResp (True , 200 )
217-
218-
219200def handle_urlreq (ph : ProxyHandler , pr : urllib .parse .ParseResult , tag : str ):
220201 """
221202 Common part of the url request handling used by both urlraw and urltext.
@@ -234,11 +215,9 @@ def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
234215 url = queryParams ['url' ]
235216 print (f"DBUG:{ tag } :Url:{ url } " )
236217 url = url [0 ]
237- if (not url ) or (len (url ) == 0 ):
238- return UrlReqResp (False , 400 , f"WARN:{ tag } :MissingUrl" )
239- gotVU = validate_url (url , tag )
218+ gotVU = uv .validate_url (url , tag )
240219 if not gotVU .callOk :
241- return gotVU
220+ return UrlReqResp ( gotVU . callOk , gotVU . statusCode , gotVU . statusMsg )
242221 try :
243222 hUA = ph .headers .get ('User-Agent' , 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0' )
244223 hAL = ph .headers .get ('Accept-Language' , "en-US,en;q=0.9" )
@@ -381,10 +360,11 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
381360def process_pdf2text (url : str , startPN : int , endPN : int ):
382361 import pypdf
383362 import io
384- urlParts = url .split ('://' ,1 )
385- if not (urlParts [0 ] in gAllowedPdfUrlTypes ):
386- return { 'status' : 403 , 'msg' : f"WARN:HandlePdf2Text:ForbiddedUrlType:{ urlParts [0 ]} :AllowedUrlTypes:{ gAllowedPdfUrlTypes } " }
387- fPdf = open (urlParts [1 ], 'rb' )
363+ gotVU = uv .validate_url (url , "HandlePdf2Text" )
364+ if not gotVU .callOk :
365+ return { 'status' : gotVU .statusCode , 'msg' : gotVU .statusMsg }
366+ urlParts = urllib .parse .urlparse (url )
367+ fPdf = open (urlParts .path , 'rb' )
388368 dPdf = fPdf .read ()
389369 tPdf = ""
390370 oPdf = pypdf .PdfReader (io .BytesIO (dPdf ))
@@ -398,20 +378,13 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
398378 return { 'status' : 200 , 'msg' : "Pdf2Text Response follows" , 'data' : tPdf }
399379
400380
401- gAllowedPdfUrlTypes = [ "file" , "http" , "https" ]
402-
403381def handle_pdf2text (ph : ProxyHandler , pr : urllib .parse .ParseResult ):
404382 """
405383 Handle requests to pdf2text path, which is used to extract plain text
406384 from the specified pdf file.
407385 """
408386 queryParams = urllib .parse .parse_qs (pr .query )
409- url = queryParams ['url' ]
410- print (f"DBUG:HandlePdf2Text:Url:{ url } " )
411- url = url [0 ]
412- if (not url ) or (len (url ) == 0 ):
413- ph .send_error (400 , f"WARN:HandlePdf2Text:MissingUrl!" )
414- return
387+ url = queryParams ['url' ][0 ]
415388 startP = queryParams ['startPageNumber' ][0 ]
416389 if startP :
417390 startP = int (startP )
@@ -422,7 +395,7 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
422395 endP = int (endP )
423396 else :
424397 endP = - 1
425- print (f"INFO:HandlePdf2Text:Processing:{ url } ..." )
398+ print (f"INFO:HandlePdf2Text:Processing:{ url } : { startP } : { endP } ..." )
426399 gotP2T = process_pdf2text (url , startP , endP )
427400 if (gotP2T ['status' ] != 200 ):
428401 ph .send_error (gotP2T ['status' ], gotP2T ['msg' ] )
@@ -509,6 +482,7 @@ def process_args(args: list[str]):
509482 if gMe .get (k ) == None :
510483 print (f"ERRR:ProcessArgs:{ k } :missing, did you forget to pass the config file..." )
511484 exit (104 )
485+ uv .validator_setup (gMe ['--allowed.schemes' ], gMe ['--allowed.domains' ])
512486
513487
514488def run ():
0 commit comments