"""
Research_web module
"""
import re
from typing import List

import requests
from bs4 import BeautifulSoup
from googlesearch import search as google_search
from langchain_community.tools import DuckDuckGoSearchResults
def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080) -> List[str]:
    """
    Search the web for a given query using the specified search engine.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Which search engine to use; one of
            'Google', 'DuckDuckGo', 'Bing', or 'SearXNG' (case-insensitive).
            Default is 'Google'.
        max_results (int, optional): Maximum number of search results to return.
        port (int, optional): Port of the local SearXNG instance, used only
            when search_engine is 'SearXNG'. Default is 8080.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the query is empty/non-string or the search engine
            specified is not supported.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """
    # Fail fast on bad input rather than sending it to a search backend.
    if not query or not isinstance(query, str):
        raise ValueError("Query must be a non-empty string")

    # Normalize once so each branch compares against a lowercase name.
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, num_results=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool returns one formatted string; extract the URLs from it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        search_url = f"https://www.bing.com/search?q={query}"
        # timeout guards against hanging forever on an unresponsive server
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return [result.find('a')['href']
                for result in soup.find_all('li', class_='b_algo', limit=max_results)]

    if engine == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json"}
        response = requests.get(url, params=params, timeout=10)
        # Surface HTTP errors explicitly instead of failing on JSON decode.
        response.raise_for_status()
        # Fix: return the URL strings, not the raw result dicts, so the
        # declared List[str] contract matches the other branches.
        return [result["url"]
                for result in response.json().get("results", [])[:max_results]]

    raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")