Commit 709dab6

improved websearch endpoint

1 parent 5451a8e


koboldcpp.py

Lines changed: 52 additions & 17 deletions
@@ -1278,18 +1278,37 @@ def websearch(query):
     import urllib.request
     import difflib
     from html.parser import HTMLParser
+    from concurrent.futures import ThreadPoolExecutor
     num_results = 3
     searchresults = []

     def fetch_searched_webpage(url):
+        if args.debugmode:
+            utfprint(f"WebSearch URL: {url}")
         try:
-            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
-            with urllib.request.urlopen(req) as response:
+            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
+            with urllib.request.urlopen(req, timeout=15) as response:
                 html_content = response.read().decode('utf-8', errors='ignore')
                 return html_content
+        except urllib.error.HTTPError: #we got blocked? try 1 more time with a different user agent
+            try:
+                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'})
+                with urllib.request.urlopen(req, timeout=15) as response:
+                    html_content = response.read().decode('utf-8', errors='ignore')
+                    return html_content
+            except Exception as e:
+                if args.debugmode != -1 and not args.quiet:
+                    print(f"Error fetching text from URL {url}: {e}")
+                return ""
         except Exception as e:
-            print(f"Error fetching text from URL {url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching text from URL {url}: {e}")
             return ""
+    def fetch_webpages_parallel(urls):
+        with ThreadPoolExecutor() as executor:
+            # Submit tasks and gather results
+            results = list(executor.map(fetch_searched_webpage, urls))
+        return results

     class VisibleTextParser(HTMLParser):
         def __init__(self):
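The new fetch_webpages_parallel() leans on ThreadPoolExecutor.map(), which fans the blocking urllib fetches out across worker threads while preserving input order, so the i-th result always lines up with the i-th URL. A minimal standalone sketch of the same pattern, independent of the commit (the fetch helper and URLs below are illustrative stand-ins):

# Minimal sketch of the fan-out pattern. executor.map() preserves input
# order, so results[i] always corresponds to urls[i], which is what lets
# the caller index fetchedcontent[i] later.
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def fetch(url): # hypothetical stand-in for fetch_searched_webpage
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read().decode('utf-8', errors='ignore')
    except Exception:
        return "" # a failed fetch yields an empty page rather than an exception

urls = ["https://example.com", "https://example.org", "https://example.net"]
with ThreadPoolExecutor() as executor:
    results = list(executor.map(fetch, urls))
print([len(r) for r in results])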
@@ -1361,6 +1380,7 @@ def handle_data(self, data):
     titles = parser.titles[:num_results]
     searchurls = parser.urls[:num_results]
     descs = parser.descs[:num_results]
+    fetchedcontent = fetch_webpages_parallel(searchurls)
     for i in range(len(descs)):
         # dive into the results to try and get even more details
         title = titles[i]
@@ -1369,13 +1389,13 @@ def handle_data(self, data):
         pagedesc = ""
         try:
             desclen = len(desc)
-            html_content = fetch_searched_webpage(url)
+            html_content = fetchedcontent[i]
             parser2 = VisibleTextParser()
             parser2.feed(html_content)
             scraped = parser2.get_text().strip()
-            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
+            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
             matches = s.find_longest_match(0, len(scraped), 0, desclen)
-            if matches.size > 100 and desclen-matches.size < 50: #good enough match
+            if matches.size > 100 and desclen-matches.size < 100: #good enough match
                 # expand description by some chars both sides
                 expandamtbefore = 250
                 expandamtafter = 750
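The autojunk=False flag matters here: by default, SequenceMatcher treats characters occurring in more than 1% of a sequence of 200+ elements as junk, which can silently ruin matching against long scraped pages. A small sketch of the match test with made-up strings (the commit's actual thresholds are a match longer than 100 characters covering all but fewer than 100 characters of the snippet):

# Find the longest common contiguous block between page text and snippet.
import difflib

scraped = "site header ... the quick brown fox jumps over the lazy dog, say the reports ... footer"
desc = "the quick brown fox jumps over the lazy dog"

s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
m = s.find_longest_match(0, len(scraped), 0, len(desc))
# m.size == len(desc) here because the snippet occurs verbatim in the page;
# the commit expands the description only when m.size > 100 and
# len(desc) - m.size < 100, i.e. a long match covering nearly all of desc.
print(m.size, len(desc) - m.size)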
@@ -1388,7 +1408,8 @@ def handle_data(self, data):
             searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})

         except Exception as e:
-            print(f"Error fetching URL {search_url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching URL {search_url}: {e}")
             return ""
     return searchresults

@@ -2146,13 +2167,27 @@ def do_GET(self):

        elif self.path.startswith(("/websearch")):
            if args.websearch:
-               parsed_url = urlparse.urlparse(self.path)
-               parsed_dict = urlparse.parse_qs(parsed_url.query)
-               searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
-               if args.debugmode:
-                   print(f"Searching web for: {searchstr}")
-               searchres = websearch(searchstr)
-               response_body = (json.dumps(searchres).encode())
+               # ensure authorized
+               auth_ok = True
+               if password and password !="":
+                   auth_header = None
+                   auth_ok = False
+                   if 'Authorization' in self.headers:
+                       auth_header = self.headers['Authorization']
+                   elif 'authorization' in self.headers:
+                       auth_header = self.headers['authorization']
+                   if auth_header is not None and auth_header.startswith('Bearer '):
+                       token = auth_header[len('Bearer '):].strip()
+                       if token==password:
+                           auth_ok = True
+               if auth_ok:
+                   parsed_url = urlparse.urlparse(self.path)
+                   parsed_dict = urlparse.parse_qs(parsed_url.query)
+                   searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
+                   searchres = websearch(searchstr)
+                   response_body = (json.dumps(searchres).encode())
+               else:
+                   response_body = (json.dumps([]).encode())
            else:
                response_body = (json.dumps([]).encode())

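With this change, a client must present the server password as a Bearer token to use /websearch. A hedged usage sketch, assuming a server launched with --websearch and --password mysecret on KoboldCpp's default port 5001 (the password, host, and query are illustrative, not from the commit):

# Call the protected endpoint with the password as a Bearer token.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:5001/websearch?q=koboldcpp",
    headers={"Authorization": "Bearer mysecret"}) # token must equal --password
with urllib.request.urlopen(req, timeout=15) as response:
    results = json.loads(response.read().decode("utf-8"))
print([r["title"] for r in results]) # a missing or wrong token yields [] instead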
@@ -4721,6 +4756,9 @@ def main(launch_args,start_server=True):
        print("==========")
        time.sleep(1)

+    if args.password and args.password!="":
+        password = args.password.strip()
+
    #handle loading text model
    if args.model_param:
        if not os.path.exists(args.model_param):
@@ -4766,9 +4804,6 @@ def main(launch_args,start_server=True):
        args.mmproj = os.path.abspath(args.mmproj)
        mmprojpath = args.mmproj

-    if args.password and args.password!="":
-        password = args.password.strip()
-
    if not args.blasthreads or args.blasthreads <= 0:
        args.blasthreads = args.threads

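Taken together, the last two hunks move the password = args.password.strip() assignment earlier in main(), ahead of model loading, without changing it, presumably so the global is already populated when handlers such as the /websearch Bearer check above read it.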