Skip to content

Commit dfbfbd9

Browse files
committed
SimpleChatTC:SimpleProxy: Use urlvalidator
Add --allowed.schemes config entry as a needed config. Set up the url validator. Use this wrt urltext, urlraw and pdf2text. This allows the user to control whether local file access is enabled or not. By default, in the sample simpleproxy.json config file, local file access is allowed.
1 parent ed3e514 commit dfbfbd9

File tree

3 files changed

+31
-39
lines changed

3 files changed

+31
-39
lines changed

tools/server/public_simplechat/local.tools/simpleproxy.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
{
2+
"allowed.schemes": [
3+
"file",
4+
"http",
5+
"https"
6+
],
27
"allowed.domains": [
38
".*\\.wikipedia\\.org$",
49
".*\\.bing\\.com$",

tools/server/public_simplechat/local.tools/simpleproxy.py

Lines changed: 13 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import html.parser
2727
import re
2828
import time
29+
import urlvalidator as uv
2930

3031

3132
gMe = {
@@ -40,11 +41,12 @@
4041
'--port': 'int',
4142
'--config': 'str',
4243
'--debug': 'bool',
44+
'--allowed.schemes': 'list',
4345
'--allowed.domains': 'list',
4446
'--bearer.insecure': 'str'
4547
}
4648

47-
gConfigNeeded = [ '--allowed.domains', '--bearer.insecure' ]
49+
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
4850

4951
gAllowedCalls = [ "urltext", "urlraw", "pdf2text" ]
5052

@@ -195,27 +197,6 @@ def debug_dump(meta: dict, data: dict):
195197
f.write(f"\n\n\n\n{k}:{data[k]}\n\n\n\n")
196198

197199

198-
def validate_url(url: str, tag: str):
199-
"""
200-
Implement a re based filter logic on the specified url.
201-
"""
202-
tag=f"VU:{tag}"
203-
if (not gMe.get('--allowed.domains')):
204-
return UrlReqResp(False, 400, f"DBUG:{tag}:MissingAllowedDomains")
205-
urlParts = urllib.parse.urlparse(url)
206-
print(f"DBUG:ValidateUrl:{urlParts}, {urlParts.hostname}")
207-
urlHName = urlParts.hostname
208-
if not urlHName:
209-
return UrlReqResp(False, 400, f"WARN:{tag}:Missing hostname in Url")
210-
bMatched = False
211-
for filter in gMe['--allowed.domains']:
212-
if re.match(filter, urlHName):
213-
bMatched = True
214-
if not bMatched:
215-
return UrlReqResp(False, 400, f"WARN:{tag}:requested hostname not allowed")
216-
return UrlReqResp(True, 200)
217-
218-
219200
def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
220201
"""
221202
Common part of the url request handling used by both urlraw and urltext.
@@ -234,11 +215,9 @@ def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
234215
url = queryParams['url']
235216
print(f"DBUG:{tag}:Url:{url}")
236217
url = url[0]
237-
if (not url) or (len(url) == 0):
238-
return UrlReqResp(False, 400, f"WARN:{tag}:MissingUrl")
239-
gotVU = validate_url(url, tag)
218+
gotVU = uv.validate_url(url, tag)
240219
if not gotVU.callOk:
241-
return gotVU
220+
return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
242221
try:
243222
hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
244223
hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
@@ -381,10 +360,11 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
381360
def process_pdf2text(url: str, startPN: int, endPN: int):
382361
import pypdf
383362
import io
384-
urlParts = url.split('://',1)
385-
if not (urlParts[0] in gAllowedPdfUrlTypes):
386-
return { 'status': 403, 'msg': f"WARN:HandlePdf2Text:ForbiddedUrlType:{urlParts[0]}:AllowedUrlTypes:{gAllowedPdfUrlTypes}" }
387-
fPdf = open(urlParts[1], 'rb')
363+
gotVU = uv.validate_url(url, "HandlePdf2Text")
364+
if not gotVU.callOk:
365+
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
366+
urlParts = urllib.parse.urlparse(url)
367+
fPdf = open(urlParts.path, 'rb')
388368
dPdf = fPdf.read()
389369
tPdf = ""
390370
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
@@ -398,20 +378,13 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
398378
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
399379

400380

401-
gAllowedPdfUrlTypes = [ "file", "http", "https" ]
402-
403381
def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
404382
"""
405383
Handle requests to pdf2text path, which is used to extract plain text
406384
from the specified pdf file.
407385
"""
408386
queryParams = urllib.parse.parse_qs(pr.query)
409-
url = queryParams['url']
410-
print(f"DBUG:HandlePdf2Text:Url:{url}")
411-
url = url[0]
412-
if (not url) or (len(url) == 0):
413-
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
414-
return
387+
url = queryParams['url'][0]
415388
startP = queryParams['startPageNumber'][0]
416389
if startP:
417390
startP = int(startP)
@@ -422,7 +395,7 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
422395
endP = int(endP)
423396
else:
424397
endP = -1
425-
print(f"INFO:HandlePdf2Text:Processing:{url}...")
398+
print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
426399
gotP2T = process_pdf2text(url, startP, endP)
427400
if (gotP2T['status'] != 200):
428401
ph.send_error(gotP2T['status'], gotP2T['msg'] )
@@ -509,6 +482,7 @@ def process_args(args: list[str]):
509482
if gMe.get(k) == None:
510483
print(f"ERRR:ProcessArgs:{k}:missing, did you forget to pass the config file...")
511484
exit(104)
485+
uv.validator_setup(gMe['--allowed.schemes'], gMe['--allowed.domains'])
512486

513487

514488
def run():

tools/server/public_simplechat/local.tools/urlvalidator.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
}
1111

1212

13+
def validator_setup(allowedSchemes: list[str], allowedDomains: list[str]):
14+
global gMe
15+
gMe['--allowed.schemes'] = allowedSchemes
16+
gMe['--allowed.domains'] = allowedDomains
17+
18+
1319
@dataclass(frozen=True)
1420
class UrlVResponse:
1521
"""
@@ -21,6 +27,9 @@ class UrlVResponse:
2127

2228

2329
def validator_ok(tag: str):
30+
"""
31+
Cross check validator is setup as needed
32+
"""
2433
if (not gMe.get('--allowed.domains')):
2534
return UrlVResponse(False, 400, f"DBUG:{tag}:MissingAllowedDomains")
2635
if (not gMe.get('--allowed.schemes')):
@@ -29,6 +38,8 @@ def validator_ok(tag: str):
2938

3039

3140
def validate_fileurl(urlParts: urllib.parse.ParseResult, tag: str):
41+
if urlParts.netloc != '':
42+
return UrlVResponse(False, 400, f"WARN:{tag}:Malformed file url")
3243
return UrlVResponse(True, 100)
3344

3445

@@ -54,6 +65,8 @@ def validate_url(url: str, tag: str):
5465
vok = validator_ok(tag)
5566
if (not vok.callOk):
5667
return vok
68+
if (not url):
69+
return UrlVResponse(False, 400, f"WARN:{tag}:Missing url")
5770
urlParts = urllib.parse.urlparse(url)
5871
print(f"DBUG:{tag}:{urlParts}, {urlParts.hostname}")
5972
# Cross check scheme

0 commit comments

Comments
 (0)