@@ -1276,16 +1276,47 @@ def websearch(query):
12761276 return []
12771277 import urllib .parse
12781278 import urllib .request
1279+ import difflib
12791280 from html .parser import HTMLParser
12801281 num_results = 3
12811282 searchresults = []
1283+
def fetch_searched_webpage(url):
    """Download *url* and return its HTML as text.

    Sends a browser-like User-Agent because some sites reject urllib's
    default agent string.  On any failure (bad URL, network error,
    HTTP error) the error is printed and "" is returned so callers can
    treat the page as simply unavailable.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # timeout added: without it a stalled server hangs the whole search
        with urllib.request.urlopen(req, timeout=10) as response:
            return response.read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(f"Error fetching text from URL {url}: {e}")
        return ""
1293+
class VisibleTextParser(HTMLParser):
    """HTML parser that keeps only the text a browser would display.

    Everything inside <script> and <style> elements is discarded; every
    other non-blank text node is stripped and collected in document
    order.  get_text() returns the fragments joined by single spaces.
    """

    def __init__(self):
        super().__init__()
        self.texts = []                   # visible text fragments, in order
        self.is_script_or_style = False   # currently inside <script>/<style>?

    def handle_starttag(self, tag, attrs):
        # Entering invisible content: flip the skip flag on.
        self.is_script_or_style = self.is_script_or_style or tag in ('script', 'style')

    def handle_endtag(self, tag):
        # Leaving invisible content: flip the skip flag back off.
        if tag == 'script' or tag == 'style':
            self.is_script_or_style = False

    def handle_data(self, data):
        if self.is_script_or_style:
            return
        fragment = data.strip()
        if fragment:
            self.texts.append(fragment)

    def get_text(self):
        return ' '.join(self.texts)
1310+
# NOTE(review): this span is a unified-diff fragment ('-' = removed line,
# '+' = added line, fused numbers = context).  Hunk headers hide some original
# lines (e.g. old 1292-1293 and 1304-1305), so the class is documented only
# from what is visible here.
# Parses DuckDuckGo's html.duckduckgo.com results markup, collecting each
# result's title, URL and snippet into the parallel lists titles/urls/descs.
12821311 class ExtractResultsParser (HTMLParser ):
12831312 def __init__ (self ):
12841313 super ().__init__ ()
1285- self .results = []
# The single combined-text results list was replaced by three parallel lists
# (one entry per result field); callers index them together.
1314+ self .titles = []
1315+ self .urls = []
1316+ self .descs = []
# recording* flags mark which field's text is currently being accumulated.
12861317 self .recordingTitle = False
1318+ self .recordingUrl = False
12871319 self .recordingDesc = False
1288- self .currentrytxt = ""
# Scratch buffer for the character data of the segment being recorded.
12891320 self .currsegmenttxt = ""
12901321
# Start recording when an element carries one of DDG's result CSS classes.
# NOTE(review): only attrs are inspected in this hunk — any tag check that may
# exist on the hidden lines (old 1292-1293) is not visible here; each flag
# guards against re-entering while already recording that field.
12911322 def handle_starttag (self , tag , attrs ):
@@ -1294,10 +1325,9 @@ def handle_starttag(self, tag, attrs):
12941325 for attr_name , attr_value in attrs :
12951326 if not self .recordingTitle and attr_name == "class" and "result__a" in attr_value .split ():
12961327 self .recordingTitle = True
1297- self .currentrytxt = ""
12981328 self .currsegmenttxt = ""
1299- if not self .recordingTitle and attr_name == "class" and "result__url" in attr_value .split ():
1300- self .recordingTitle = True
# Fix in this diff: result__url previously reused the title flag; it now has
# its own recordingUrl flag so URLs land in their own list.
1329+ if not self .recordingUrl and attr_name == "class" and "result__url" in attr_value .split ():
1330+ self .recordingUrl = True
13011331 self .currsegmenttxt = ""
13021332 if not self .recordingDesc and attr_name == "class" and "result__snippet" in attr_value .split ():
13031333 self .recordingDesc = True
# A closing </a> finishes whichever segment is being recorded and appends the
# stripped text to the matching list; result__url text is a bare host/path so
# it is prefixed with https:// to form a fetchable URL.
13061336 def handle_endtag (self , tag ):
13071337 if tag == "a" and self .recordingTitle :
13081338 self .recordingTitle = False
1309- self .currentrytxt += self .currsegmenttxt .strip () + "\n "
1339+ self .titles .append (self .currsegmenttxt .strip ())
1340+ self .currsegmenttxt = ""
1341+ if tag == "a" and self .recordingUrl :
1342+ self .recordingUrl = False
1343+ self .urls .append (f"https://{ self .currsegmenttxt .strip ()} " )
13101344 self .currsegmenttxt = ""
13111345 if tag == "a" and self .recordingDesc :
13121346 self .recordingDesc = False
1313- self .currentrytxt += self .currsegmenttxt .strip ()
1347+ self .descs . append ( self .currsegmenttxt .strip () )
13141348 self .currsegmenttxt = ""
1315- if self .currentrytxt != "" :
1316- self .results .append (self .currentrytxt .strip ())
1317- self .currentrytxt = ""
13181349
# Accumulate character data only while one of the recording flags is set.
13191350 def handle_data (self , data ):
1320- if self .recordingTitle or self .recordingDesc :
1351+ if self .recordingTitle or self .recordingDesc or self . recordingUrl :
13211352 self .currsegmenttxt += data
13221353
# --- websearch continues: fetch the DDG results page, then enrich results ---
# Build the html.duckduckgo.com query URL from the (percent-quoted) query.
13231354 encoded_query = urllib .parse .quote (query )
13241355 search_url = f"https://html.duckduckgo.com/html/?q={ encoded_query } "
13251356
13261357 try :
# Old inline fetch ('-' lines) replaced by the fetch_searched_webpage helper.
1327- req = urllib .request .Request (search_url , headers = {'User-Agent' : 'Mozilla/5.0' })
1328- with urllib .request .urlopen (req ) as response :
1329- search_html = response .read ().decode ('utf-8' , errors = 'ignore' )
1330- parser = ExtractResultsParser ()
1331- parser .feed (search_html )
1332- searchresults = parser .results [:num_results ]
1358+ search_html = fetch_searched_webpage (search_url )
1359+ parser = ExtractResultsParser ()
1360+ parser .feed (search_html )
# Keep only the first num_results entries of each parallel list.
1361+ titles = parser .titles [:num_results ]
1362+ searchurls = parser .urls [:num_results ]
1363+ descs = parser .descs [:num_results ]
# For each top result, fetch the target page itself and try to locate the
# snippet inside the page's visible text so it can be expanded with
# surrounding context (stored under "content"; "" when no good match).
1364+ for i in range (len (descs )):
1365+ # dive into the results to try and get even more details
1366+ title = titles [i ]
1367+ url = searchurls [i ]
1368+ desc = descs [i ]
1369+ pagedesc = ""
# Best-effort enrichment: any failure here leaves pagedesc as "".
1370+ try :
1371+ desclen = len (desc )
1372+ html_content = fetch_searched_webpage (url )
1373+ parser2 = VisibleTextParser ()
1374+ parser2 .feed (html_content )
1375+ scraped = parser2 .get_text ().strip ()
# Case-insensitive longest-common-substring match between the scraped page
# text and the snippet; a long match anchors where the snippet appears.
1376+ s = difflib .SequenceMatcher (None , scraped .lower (), desc .lower ())
1377+ matches = s .find_longest_match (0 , len (scraped ), 0 , desclen )
1378+ if matches .size > 100 and desclen - matches .size < 50 : #good enough match
1379+ # expand description by some chars both sides
1380+ expandamtbefore = 250
1381+ expandamtafter = 600
1382+ startpt = matches .a - expandamtbefore
1383+ startpt = 0 if startpt < 0 else startpt
1384+ endpt = matches .a + expandamtafter + desclen
1385+ pagedesc = scraped [startpt :endpt ]
1386+ except Exception :
1387+ pass
1388+ searchresults .append ({"title" :title ,"url" :url ,"desc" :desc ,"content" :pagedesc })
1389+
# NOTE(review): on failure this returns "" while the success path returns a
# list of dicts (and the earlier guard returns []) — inconsistent return
# types; confirm callers handle the "" case.
13331390 except Exception as e :
13341391 print (f"Error fetching URL { search_url } : { e } " )
13351392 return ""
0 commit comments