@@ -1278,18 +1278,37 @@ def websearch(query):
     import urllib.request
     import difflib
     from html.parser import HTMLParser
+    from concurrent.futures import ThreadPoolExecutor
     num_results = 3
     searchresults = []

     def fetch_searched_webpage(url):
+        if args.debugmode:
+            utfprint(f"WebSearch URL: {url}")
         try:
-            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
-            with urllib.request.urlopen(req) as response:
+            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
+            with urllib.request.urlopen(req, timeout=15) as response:
                 html_content = response.read().decode('utf-8', errors='ignore')
                 return html_content
+        except urllib.error.HTTPError: #we got blocked? try 1 more time with a different user agent
+            try:
+                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'})
+                with urllib.request.urlopen(req, timeout=15) as response:
+                    html_content = response.read().decode('utf-8', errors='ignore')
+                    return html_content
+            except Exception as e:
+                if args.debugmode != -1 and not args.quiet:
+                    print(f"Error fetching text from URL {url}: {e}")
+                return ""
         except Exception as e:
-            print(f"Error fetching text from URL {url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching text from URL {url}: {e}")
             return ""
+    def fetch_webpages_parallel(urls):
+        with ThreadPoolExecutor() as executor:
+            # Submit tasks and gather results
+            results = list(executor.map(fetch_searched_webpage, urls))
+            return results

     class VisibleTextParser(HTMLParser):
         def __init__(self):
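Note: the retry path above swaps in a desktop-browser User-Agent only after an HTTPError, and fetch_webpages_parallel depends on ThreadPoolExecutor.map returning results in the same order as its input, which is what lets the loop further below index fetchedcontent[i] against searchurls[i]. A minimal standalone sketch of the same pattern (names like fetch and AGENTS, and the example.com URLs, are illustrations, not part of this commit):

import urllib.request, urllib.error
from concurrent.futures import ThreadPoolExecutor

AGENTS = ('Mozilla/5.0 (compatible; Googlebot/2.1)',    # primary agent
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')  # fallback after an HTTP error

def fetch(url):
    for ua in AGENTS:
        try:
            req = urllib.request.Request(url, headers={'User-Agent': ua})
            with urllib.request.urlopen(req, timeout=15) as resp:
                return resp.read().decode('utf-8', errors='ignore')
        except urllib.error.HTTPError:
            continue    # likely blocked: retry with the next agent
        except Exception:
            return ""   # network failure etc.: give up on this URL
    return ""

urls = ["https://example.com/a", "https://example.com/b"]  # placeholders
with ThreadPoolExecutor() as executor:
    pages = list(executor.map(fetch, urls))  # pages[i] corresponds to urls[i]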
@@ -1361,6 +1380,7 @@ def handle_data(self, data):
     titles = parser.titles[:num_results]
     searchurls = parser.urls[:num_results]
     descs = parser.descs[:num_results]
+    fetchedcontent = fetch_webpages_parallel(searchurls)
     for i in range(len(descs)):
         # dive into the results to try and get even more details
         title = titles[i]
@@ -1369,13 +1389,13 @@ def handle_data(self, data):
         pagedesc = ""
         try:
             desclen = len(desc)
-            html_content = fetch_searched_webpage(url)
+            html_content = fetchedcontent[i]
             parser2 = VisibleTextParser()
             parser2.feed(html_content)
             scraped = parser2.get_text().strip()
-            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
+            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
             matches = s.find_longest_match(0, len(scraped), 0, desclen)
-            if matches.size > 100 and desclen - matches.size < 50: #good enough match
+            if matches.size > 100 and desclen - matches.size < 100: #good enough match
                 # expand description by some chars both sides
                 expandamtbefore = 250
                 expandamtafter = 750
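autojunk=False matters here: SequenceMatcher's default heuristic marks any character occurring in more than 1% of the second sequence as junk once that sequence reaches 200 elements, and a long lowercased description trips it, collapsing find_longest_match. A small illustration with invented strings:

import difflib

desc = "the quick brown fox jumps over the lazy dog. " * 5   # over 200 chars
scraped = "site navigation text... " + desc + " ...footer text"

loose = difflib.SequenceMatcher(None, scraped, desc)                  # default autojunk=True
strict = difflib.SequenceMatcher(None, scraped, desc, autojunk=False)

# With autojunk on, the popular-element heuristic junks most characters of
# desc and the reported longest match is typically tiny; with it off, the
# full verbatim occurrence of desc inside scraped is found.
print(loose.find_longest_match(0, len(scraped), 0, len(desc)).size)
print(strict.find_longest_match(0, len(scraped), 0, len(desc)).size)  # == len(desc)

The relaxed threshold (desclen - matches.size < 100 instead of < 50) then accepts a scraped match that misses up to 99 characters of the search-result description.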
@@ -1388,7 +1408,8 @@ def handle_data(self, data):
             searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})

         except Exception as e:
-            print(f"Error fetching URL {search_url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching URL {search_url}: {e}")
             return ""
     return searchresults

@@ -2146,13 +2167,27 @@ def do_GET(self):

         elif self.path.startswith(("/websearch")):
             if args.websearch:
-                parsed_url = urlparse.urlparse(self.path)
-                parsed_dict = urlparse.parse_qs(parsed_url.query)
-                searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
-                if args.debugmode:
-                    print(f"Searching web for: {searchstr}")
-                searchres = websearch(searchstr)
-                response_body = (json.dumps(searchres).encode())
+                # ensure authorized
+                auth_ok = True
+                if password and password != "":
+                    auth_header = None
+                    auth_ok = False
+                    if 'Authorization' in self.headers:
+                        auth_header = self.headers['Authorization']
+                    elif 'authorization' in self.headers:
+                        auth_header = self.headers['authorization']
+                    if auth_header is not None and auth_header.startswith('Bearer '):
+                        token = auth_header[len('Bearer '):].strip()
+                        if token == password:
+                            auth_ok = True
+                if auth_ok:
+                    parsed_url = urlparse.urlparse(self.path)
+                    parsed_dict = urlparse.parse_qs(parsed_url.query)
+                    searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
+                    searchres = websearch(searchstr)
+                    response_body = (json.dumps(searchres).encode())
+                else:
+                    response_body = (json.dumps([]).encode())
             else:
                 response_body = (json.dumps([]).encode())

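With this change, /websearch honors the server password: when one is set, the request must carry a matching Bearer token or the handler responds with an empty JSON list. The gate reduces to something like this sketch (is_authorized is a hypothetical helper, not a function from this commit):

def is_authorized(headers, password):
    if not password:
        return True   # no password configured: the endpoint stays open
    auth_header = headers.get('Authorization') or headers.get('authorization')
    if auth_header is not None and auth_header.startswith('Bearer '):
        return auth_header[len('Bearer '):].strip() == password
    return False

assert is_authorized({'Authorization': 'Bearer hunter2'}, 'hunter2')
assert not is_authorized({'Authorization': 'Bearer wrong'}, 'hunter2')
assert not is_authorized({}, 'hunter2')

(http.server exposes headers through an email.message.Message, whose lookups are case-insensitive, so the explicit lowercase 'authorization' check should be redundant but harmless.)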
@@ -4721,6 +4756,9 @@ def main(launch_args,start_server=True):
         print("==========")
         time.sleep(1)

+    if args.password and args.password != "":
+        password = args.password.strip()
+
     #handle loading text model
     if args.model_param:
         if not os.path.exists(args.model_param):
@@ -4766,9 +4804,6 @@ def main(launch_args,start_server=True):
         args.mmproj = os.path.abspath(args.mmproj)
         mmprojpath = args.mmproj

-    if args.password and args.password != "":
-        password = args.password.strip()
-
     if not args.blasthreads or args.blasthreads <= 0:
         args.blasthreads = args.threads
