Commit 709dab6

improved websearch endpoint

1 parent 5451a8e


koboldcpp.py

Lines changed: 52 additions & 17 deletions
@@ -1278,18 +1278,37 @@ def websearch(query):
     import urllib.request
     import difflib
     from html.parser import HTMLParser
+    from concurrent.futures import ThreadPoolExecutor
     num_results = 3
     searchresults = []

     def fetch_searched_webpage(url):
+        if args.debugmode:
+            utfprint(f"WebSearch URL: {url}")
         try:
-            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
-            with urllib.request.urlopen(req) as response:
+            req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
+            with urllib.request.urlopen(req, timeout=15) as response:
                 html_content = response.read().decode('utf-8', errors='ignore')
                 return html_content
+        except urllib.error.HTTPError: #we got blocked? try 1 more time with a different user agent
+            try:
+                req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'})
+                with urllib.request.urlopen(req, timeout=15) as response:
+                    html_content = response.read().decode('utf-8', errors='ignore')
+                    return html_content
+            except Exception as e:
+                if args.debugmode != -1 and not args.quiet:
+                    print(f"Error fetching text from URL {url}: {e}")
+                return ""
         except Exception as e:
-            print(f"Error fetching text from URL {url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching text from URL {url}: {e}")
             return ""
+    def fetch_webpages_parallel(urls):
+        with ThreadPoolExecutor() as executor:
+            # Submit tasks and gather results
+            results = list(executor.map(fetch_searched_webpage, urls))
+        return results

     class VisibleTextParser(HTMLParser):
         def __init__(self):
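The new fetch_webpages_parallel() leans on ThreadPoolExecutor.map(), which fans the blocking urllib fetches out across worker threads while preserving input order, so the i-th result always lines up with the i-th URL. A minimal standalone sketch of the same pattern, independent of the commit (the fetch helper and URLs below are illustrative stand-ins):

# Minimal sketch of the fan-out pattern. executor.map() preserves input
# order, so results[i] always corresponds to urls[i], which is what lets
# the caller index fetchedcontent[i] later.
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def fetch(url): # hypothetical stand-in for fetch_searched_webpage
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read().decode('utf-8', errors='ignore')
    except Exception:
        return "" # a failed fetch yields an empty page rather than an exception

urls = ["https://example.com", "https://example.org", "https://example.net"]
with ThreadPoolExecutor() as executor:
    results = list(executor.map(fetch, urls))
print([len(r) for r in results])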
@@ -1361,6 +1380,7 @@ def handle_data(self, data):
     titles = parser.titles[:num_results]
     searchurls = parser.urls[:num_results]
     descs = parser.descs[:num_results]
+    fetchedcontent = fetch_webpages_parallel(searchurls)
     for i in range(len(descs)):
         # dive into the results to try and get even more details
         title = titles[i]
@@ -1369,13 +1389,13 @@ def handle_data(self, data):
         pagedesc = ""
         try:
             desclen = len(desc)
-            html_content = fetch_searched_webpage(url)
+            html_content = fetchedcontent[i]
             parser2 = VisibleTextParser()
             parser2.feed(html_content)
             scraped = parser2.get_text().strip()
-            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
+            s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
             matches = s.find_longest_match(0, len(scraped), 0, desclen)
-            if matches.size > 100 and desclen-matches.size < 50: #good enough match
+            if matches.size > 100 and desclen-matches.size < 100: #good enough match
                 # expand description by some chars both sides
                 expandamtbefore = 250
                 expandamtafter = 750
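The autojunk=False flag matters here: by default, SequenceMatcher treats characters occurring in more than 1% of a sequence of 200+ elements as junk, which can silently ruin matching against long scraped pages. A small sketch of the match test with made-up strings (the commit's actual thresholds are a match longer than 100 characters covering all but fewer than 100 characters of the snippet):

# Find the longest common contiguous block between page text and snippet.
import difflib

scraped = "site header ... the quick brown fox jumps over the lazy dog, say the reports ... footer"
desc = "the quick brown fox jumps over the lazy dog"

s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower(), autojunk=False)
m = s.find_longest_match(0, len(scraped), 0, len(desc))
# m.size == len(desc) here because the snippet occurs verbatim in the page;
# the commit expands the description only when m.size > 100 and
# len(desc) - m.size < 100, i.e. a long match covering nearly all of desc.
print(m.size, len(desc) - m.size)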
@@ -1388,7 +1408,8 @@ def handle_data(self, data):
             searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})

         except Exception as e:
-            print(f"Error fetching URL {search_url}: {e}")
+            if args.debugmode != -1 and not args.quiet:
+                print(f"Error fetching URL {search_url}: {e}")
             return ""
     return searchresults

@@ -2146,13 +2167,27 @@ def do_GET(self):

        elif self.path.startswith(("/websearch")):
            if args.websearch:
-               parsed_url = urlparse.urlparse(self.path)
-               parsed_dict = urlparse.parse_qs(parsed_url.query)
-               searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
-               if args.debugmode:
-                   print(f"Searching web for: {searchstr}")
-               searchres = websearch(searchstr)
-               response_body = (json.dumps(searchres).encode())
+               # ensure authorized
+               auth_ok = True
+               if password and password !="":
+                   auth_header = None
+                   auth_ok = False
+                   if 'Authorization' in self.headers:
+                       auth_header = self.headers['Authorization']
+                   elif 'authorization' in self.headers:
+                       auth_header = self.headers['authorization']
+                   if auth_header is not None and auth_header.startswith('Bearer '):
+                       token = auth_header[len('Bearer '):].strip()
+                       if token==password:
+                           auth_ok = True
+               if auth_ok:
+                   parsed_url = urlparse.urlparse(self.path)
+                   parsed_dict = urlparse.parse_qs(parsed_url.query)
+                   searchstr = (parsed_dict['q'][0]) if 'q' in parsed_dict else ""
+                   searchres = websearch(searchstr)
+                   response_body = (json.dumps(searchres).encode())
+               else:
+                   response_body = (json.dumps([]).encode())
            else:
                response_body = (json.dumps([]).encode())

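With this change, a client must present the server password as a Bearer token to use /websearch. A hedged usage sketch, assuming a server launched with --websearch and --password mysecret on KoboldCpp's default port 5001 (the password, host, and query are illustrative, not from the commit):

# Call the protected endpoint with the password as a Bearer token.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:5001/websearch?q=koboldcpp",
    headers={"Authorization": "Bearer mysecret"}) # token must equal --password
with urllib.request.urlopen(req, timeout=15) as response:
    results = json.loads(response.read().decode("utf-8"))
print([r["title"] for r in results]) # a missing or wrong token yields [] instead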
@@ -4721,6 +4756,9 @@ def main(launch_args,start_server=True):
        print("==========")
        time.sleep(1)

+    if args.password and args.password!="":
+        password = args.password.strip()
+
    #handle loading text model
    if args.model_param:
        if not os.path.exists(args.model_param):
@@ -4766,9 +4804,6 @@ def main(launch_args,start_server=True):
        args.mmproj = os.path.abspath(args.mmproj)
        mmprojpath = args.mmproj

-    if args.password and args.password!="":
-        password = args.password.strip()
-
    if not args.blasthreads or args.blasthreads <= 0:
        args.blasthreads = args.threads

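Taken together, the last two hunks move the password = args.password.strip() assignment earlier in main(), ahead of model loading, without changing it, presumably so the global is already populated when handlers such as the /websearch Bearer check above read it.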