11"""
2- research_web module
2+ search_web module
33"""
44import re
55from typing import List
99from bs4 import BeautifulSoup
1010
1111def search_on_web (query : str , search_engine : str = "Google" ,
12- max_results : int = 10 , port : int = 8080 ,
13- timeout : int = 10 ) -> List [str ]:
12+ max_results : int = 10 , port : int = 8080 ,
13+ timeout : int = 10 , proxies : dict = None ) -> List [str ]:
1414 """
1515 Searches the web for a given query using specified search
1616 engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
2323 port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
2424 timeout (int, optional): The number of seconds to wait
2525 for a response from a request. Default is 10 seconds.
26+ proxies (dict, optional): Proxy mapping (URL scheme -> proxy URL) forwarded to the underlying search request. Default is None.
2627
2728 Returns:
2829 List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -32,7 +33,8 @@ def search_on_web(query: str, search_engine: str = "Google",
3233 requests.exceptions.Timeout: If the request times out.
3334
3435 Example:
35- >>> search_on_web("example query", search_engine="Google", max_results=5)
36+ >>> search_on_web("example query", search_engine="Google", max_results=5,
37+ proxies={"http": "http://proxy.example.com:8080"})
3638 ['http://example.com', 'http://example.org', ...]
3739 """
3840
@@ -50,7 +52,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
5052
5153 if search_engine .lower () == "google" :
5254 res = []
53- for url in google_search (query , stop = max_results ):
55+ google_search_params = {"stop" : max_results }
56+
57+ if proxies :
58+ google_search_params ["proxies" ] = proxies
59+
60+ for url in google_search (query , ** google_search_params ):
5461 res .append (url )
5562 return filter_pdf_links (res )
5663
@@ -66,7 +73,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
6673 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
6774 }
6875 search_url = f"https://www.bing.com/search?q={ query } "
69- response = requests .get (search_url , headers = headers , timeout = timeout )
76+ response = requests .get (search_url , headers = headers , timeout = timeout , proxies = proxies )
7077 response .raise_for_status ()
7178 soup = BeautifulSoup (response .text , "html.parser" )
7279
@@ -79,7 +86,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
7986 elif search_engine .lower () == "searxng" :
8087 url = f"http://localhost:{ port } "
8188 params = {"q" : query , "format" : "json" , "engines" : "google,duckduckgo,brave,qwant,bing" }
82- response = requests .get (url , params = params , timeout = timeout )
89+ response = requests .get (url , params = params , timeout = timeout , proxies = proxies )
8390 data = response .json ()
8491 limited_results = [result ['url' ] for result in data ["results" ][:max_results ]]
8592 return filter_pdf_links (limited_results )
0 commit comments