Skip to content

Commit 2de1975

Browse files
committed
improve websearch api
1 parent baaecd1 commit 2de1975

File tree

1 file changed

+74
-17
lines changed

1 file changed

+74
-17
lines changed

koboldcpp.py

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,16 +1276,47 @@ def websearch(query):
12761276
return []
12771277
import urllib.parse
12781278
import urllib.request
1279+
import difflib
12791280
from html.parser import HTMLParser
12801281
num_results = 3
12811282
searchresults = []
1283+
1284+
def fetch_searched_webpage(url, timeout=15):
    """Fetch a web page and return its raw HTML as text.

    Args:
        url: The absolute URL to retrieve.
        timeout: Seconds to wait for the server before giving up. Prevents
            a dead or slow host from hanging the whole search (the original
            call had no timeout and could block indefinitely).

    Returns:
        The decoded HTML body, or "" if the request fails for any reason.
    """
    try:
        # Some sites reject requests that lack a browser-like User-Agent.
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # Tolerate pages that are not valid UTF-8 rather than crashing.
            return response.read().decode('utf-8', errors='ignore')
    except Exception as e:
        # Best-effort fetch: report the failure but never let it propagate.
        print(f"Error fetching text from URL {url}: {e}")
        return ""
1293+
1294+
class VisibleTextParser(HTMLParser):
    """Extracts the human-visible text of an HTML document.

    Text found inside <script> or <style> elements is suppressed; every
    other non-blank data fragment is collected (whitespace-stripped) and
    returned by get_text() joined with single spaces.
    """

    _SUPPRESSED_TAGS = frozenset(("script", "style"))

    def __init__(self):
        super().__init__()
        self._fragments = []
        self._suppressing = False

    def handle_starttag(self, tag, attrs):
        # Entering a script/style element: stop collecting text.
        if tag in self._SUPPRESSED_TAGS:
            self._suppressing = True

    def handle_endtag(self, tag):
        # Leaving a script/style element: resume collecting text.
        if tag in self._SUPPRESSED_TAGS:
            self._suppressing = False

    def handle_data(self, data):
        stripped = data.strip()
        # Keep only non-blank fragments that are outside script/style.
        if stripped and not self._suppressing:
            self._fragments.append(stripped)

    def get_text(self):
        return ' '.join(self._fragments)
1310+
12821311
class ExtractResultsParser(HTMLParser):
12831312
def __init__(self):
12841313
super().__init__()
1285-
self.results = []
1314+
self.titles = []
1315+
self.urls = []
1316+
self.descs = []
12861317
self.recordingTitle = False
1318+
self.recordingUrl = False
12871319
self.recordingDesc = False
1288-
self.currentrytxt = ""
12891320
self.currsegmenttxt = ""
12901321

12911322
def handle_starttag(self, tag, attrs):
@@ -1294,10 +1325,9 @@ def handle_starttag(self, tag, attrs):
12941325
for attr_name, attr_value in attrs:
12951326
if not self.recordingTitle and attr_name == "class" and "result__a" in attr_value.split():
12961327
self.recordingTitle = True
1297-
self.currentrytxt = ""
12981328
self.currsegmenttxt = ""
1299-
if not self.recordingTitle and attr_name == "class" and "result__url" in attr_value.split():
1300-
self.recordingTitle = True
1329+
if not self.recordingUrl and attr_name == "class" and "result__url" in attr_value.split():
1330+
self.recordingUrl = True
13011331
self.currsegmenttxt = ""
13021332
if not self.recordingDesc and attr_name == "class" and "result__snippet" in attr_value.split():
13031333
self.recordingDesc = True
@@ -1306,30 +1336,57 @@ def handle_starttag(self, tag, attrs):
13061336
def handle_endtag(self, tag):
13071337
if tag == "a" and self.recordingTitle:
13081338
self.recordingTitle = False
1309-
self.currentrytxt += self.currsegmenttxt.strip() + "\n"
1339+
self.titles.append(self.currsegmenttxt.strip())
1340+
self.currsegmenttxt = ""
1341+
if tag == "a" and self.recordingUrl:
1342+
self.recordingUrl = False
1343+
self.urls.append(f"https://{self.currsegmenttxt.strip()}")
13101344
self.currsegmenttxt = ""
13111345
if tag == "a" and self.recordingDesc:
13121346
self.recordingDesc = False
1313-
self.currentrytxt += self.currsegmenttxt.strip()
1347+
self.descs.append(self.currsegmenttxt.strip())
13141348
self.currsegmenttxt = ""
1315-
if self.currentrytxt != "":
1316-
self.results.append(self.currentrytxt.strip())
1317-
self.currentrytxt = ""
13181349

13191350
def handle_data(self, data):
1320-
if self.recordingTitle or self.recordingDesc:
1351+
if self.recordingTitle or self.recordingDesc or self.recordingUrl:
13211352
self.currsegmenttxt += data
13221353

13231354
encoded_query = urllib.parse.quote(query)
13241355
search_url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
13251356

13261357
try:
1327-
req = urllib.request.Request(search_url, headers={'User-Agent': 'Mozilla/5.0'})
1328-
with urllib.request.urlopen(req) as response:
1329-
search_html = response.read().decode('utf-8', errors='ignore')
1330-
parser = ExtractResultsParser()
1331-
parser.feed(search_html)
1332-
searchresults = parser.results[:num_results]
1358+
search_html = fetch_searched_webpage(search_url)
1359+
parser = ExtractResultsParser()
1360+
parser.feed(search_html)
1361+
titles = parser.titles[:num_results]
1362+
searchurls = parser.urls[:num_results]
1363+
descs = parser.descs[:num_results]
1364+
for i in range(len(descs)):
1365+
# dive into the results to try and get even more details
1366+
title = titles[i]
1367+
url = searchurls[i]
1368+
desc = descs[i]
1369+
pagedesc = ""
1370+
try:
1371+
desclen = len(desc)
1372+
html_content = fetch_searched_webpage(url)
1373+
parser2 = VisibleTextParser()
1374+
parser2.feed(html_content)
1375+
scraped = parser2.get_text().strip()
1376+
s = difflib.SequenceMatcher(None, scraped.lower(), desc.lower())
1377+
matches = s.find_longest_match(0, len(scraped), 0, desclen)
1378+
if matches.size > 100 and desclen-matches.size < 50: #good enough match
1379+
# expand description by some chars both sides
1380+
expandamtbefore = 250
1381+
expandamtafter = 600
1382+
startpt = matches.a - expandamtbefore
1383+
startpt = 0 if startpt < 0 else startpt
1384+
endpt = matches.a + expandamtafter + desclen
1385+
pagedesc = scraped[startpt:endpt]
1386+
except Exception:
1387+
pass
1388+
searchresults.append({"title":title,"url":url,"desc":desc,"content":pagedesc})
1389+
13331390
except Exception as e:
13341391
print(f"Error fetching URL {search_url}: {e}")
13351392
return ""

0 commit comments

Comments
 (0)