9 | 9 | from bs4 import BeautifulSoup |
10 | 10 | |
11 | 11 | def search_on_web(query: str, search_engine: str = "Google", |
12 | | - max_results: int = 10, port: int = 8080, |
| 12 | + max_results: int = 10, port: int = 8080, |
13 | 13 | timeout: int = 10, proxy: str | dict = None) -> List[str]: |
| 14 | + """Search web function with improved error handling and validation""" |
| 15 | + |
| 16 | + # Input validation |
| 17 | + if not query or not isinstance(query, str): |
| 18 | + raise ValueError("Query must be a non-empty string") |
| 19 | + |
| 20 | + search_engine = search_engine.lower() |
| 21 | + valid_engines = {"google", "duckduckgo", "bing", "searxng"} |
| 22 | + if search_engine not in valid_engines: |
| 23 | + raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") |
| 24 | + |
| 25 | + # Format proxy once |
| 26 | + formatted_proxy = None |
| 27 | + if proxy: |
| 28 | + formatted_proxy = format_proxy(proxy) |
| 29 | + |
| 30 | + try: |
| 31 | + results = [] |
| 32 | + if search_engine == "google": |
| 33 | + results = list(google_search(query, num_results=max_results, proxy=formatted_proxy)) |
| 34 | + |
| 35 | + elif search_engine == "duckduckgo": |
| 36 | + research = DuckDuckGoSearchResults(max_results=max_results) |
| 37 | + res = research.run(query) |
| 38 | + results = re.findall(r'https?://[^\s,\]]+', res) |
| 39 | + |
| 40 | + elif search_engine == "bing": |
| 41 | + results = _search_bing(query, max_results, timeout, formatted_proxy) |
| 42 | + |
| 43 | + elif search_engine == "searxng": |
| 44 | + results = _search_searxng(query, max_results, port, timeout) |
| 45 | + |
| 46 | + return filter_pdf_links(results) |
| 47 | + |
| 48 | + except requests.Timeout: |
| 49 | + raise TimeoutError(f"Search request timed out after {timeout} seconds") |
| 50 | + except requests.RequestException as e: |
| 51 | + raise RuntimeError(f"Search request failed: {str(e)}") |
| 52 | + |
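For reference, a minimal usage sketch of the refactored entry point; the query and proxy values below are placeholders, and the proxy dict keys follow `format_proxy` further down:

```python
# Placeholder query and proxy values; the engine name is lowercased and
# checked against {"google", "duckduckgo", "bing", "searxng"}.
urls = search_on_web("graph neural networks", search_engine="DuckDuckGo", max_results=5)

urls = search_on_web(
    "graph neural networks",
    search_engine="Google",
    proxy={"server": "1.2.3.4:8080", "username": "user", "password": "pass"},
)
```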
| 53 | +def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]: |
| 54 | + """Helper function for Bing search""" |
| 55 | + headers = { |
| 56 | + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" |
| 57 | + } |
| 58 | + search_url = "https://www.bing.com/search"  # requests URL-encodes the query via params |
| 59 | + |
| 60 | + proxies = {"http": proxy, "https": proxy} if proxy else None |
| 61 | + response = requests.get(search_url, headers=headers, params={"q": query}, timeout=timeout, proxies=proxies) |
| 62 | + response.raise_for_status() |
| 63 | + |
| 64 | + soup = BeautifulSoup(response.text, "html.parser") |
| 65 | + return [a['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results) if (a := result.find('a'))] |
| 66 | + |
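The `b_algo` selector is a scraping assumption about Bing's current markup; a hypothetical fragment in the shape `_search_bing` parses:

```python
from bs4 import BeautifulSoup

# Illustrative SERP fragment; Bing can change this markup at any time,
# so treat the class name as fragile.
html = '<ol><li class="b_algo"><h2><a href="https://example.com/">Example</a></h2></li></ol>'
soup = BeautifulSoup(html, "html.parser")
links = [a['href'] for li in soup.find_all('li', class_='b_algo') if (a := li.find('a'))]
assert links == ["https://example.com/"]
```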
| 67 | +def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: |
| 68 | + """Helper function for SearXNG search""" |
| 69 | + url = f"http://localhost:{port}" |
| 70 | + params = { |
| 71 | + "q": query, |
| 72 | + "format": "json", |
| 73 | + "engines": "google,duckduckgo,brave,qwant,bing" |
| 74 | + } |
| 75 | + response = requests.get(url, params=params, timeout=timeout) |
| 76 | + response.raise_for_status() |
| 77 | + return [result['url'] for result in response.json().get("results", [])[:max_results]] |
| 78 | + |
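`_search_searxng` assumes a local instance with JSON output enabled (SearXNG only serves `format=json` when it is listed under `search.formats` in `settings.yml`); the response it consumes looks roughly like:

```python
# Abbreviated, illustrative shape of the SearXNG JSON response; only the
# "url" field of each result entry is used by _search_searxng.
{
    "query": "graph neural networks",
    "results": [
        {"url": "https://example.com/", "title": "Example", "engine": "google"},
    ],
}
```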
| 79 | +def format_proxy(proxy): |
| 80 | + if isinstance(proxy, dict): |
| 81 | + server = proxy.get('server') |
| 82 | + username = proxy.get('username') |
| 83 | + password = proxy.get('password') |
| 84 | + |
| 85 | + if all([username, password, server]): |
| 86 | + proxy_url = f"http://{username}:{password}@{server}" |
| 87 | + return proxy_url |
| 88 | + else: |
| 89 | + raise ValueError("Proxy dictionary is missing required fields.") |
| 90 | + elif isinstance(proxy, str): |
| 91 | + return proxy # "https://username:password@ip:port" |
| 92 | + else: |
| 93 | + raise TypeError("Proxy should be a dictionary or a string.") |
| 94 | + |
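Both proxy shapes `format_proxy` accepts, with placeholder credentials:

```python
# Placeholder credentials; a dict is normalized into a proxy URL,
# a string is passed through unchanged.
format_proxy({"server": "1.2.3.4:8080", "username": "user", "password": "pass"})
# -> "http://user:pass@1.2.3.4:8080"
format_proxy("https://user:pass@1.2.3.4:8080")
# -> "https://user:pass@1.2.3.4:8080"
```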
| 95 | +def filter_pdf_links(links: List[str]) -> List[str]: |
14 | 96 | """ |
15 | | - Searches the web for a given query using specified search |
16 | | - engine options and filters out PDF links. |
| 97 | + Filters out any links that point to PDF files. |
17 | 98 | |
18 | 99 | Args: |
19 | | - query (str): The search query to find on the internet. |
20 | | - search_engine (str, optional): Specifies the search engine to use, |
21 | | - options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. |
22 | | - max_results (int, optional): The maximum number of search results to return. |
23 | | - port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. |
24 | | - timeout (int, optional): The number of seconds to wait |
25 | | - for a response from a request. Default is 10 seconds. |
26 | | - proxy (dict or string, optional): The proxy server to use for the request. Default is None. |
| 100 | + links (List[str]): A list of URLs as strings. |
27 | 101 | |
28 | 102 | Returns: |
29 | | - List[str]: A list of URLs as strings that are the search results, excluding any PDF links. |
30 | | - |
31 | | - Raises: |
32 | | - ValueError: If the search engine specified is not supported. |
33 | | - requests.exceptions.Timeout: If the request times out. |
34 | | - |
35 | | - Example: |
36 | | - >>> search_on_web("example query", search_engine="Google", max_results=5) |
37 | | - ['http://example.com', 'http://example.org', ...] |
| 103 | + List[str]: A list of URLs excluding any that end with '.pdf'. |
38 | 104 | """ |
39 | | - |
40 | | - def format_proxy(proxy): |
41 | | - if isinstance(proxy, dict): |
42 | | - server = proxy.get('server') |
43 | | - username = proxy.get('username') |
44 | | - password = proxy.get('password') |
45 | | - |
46 | | - if all([username, password, server]): |
47 | | - proxy_url = f"http://{username}:{password}@{server}" |
48 | | - return proxy_url |
49 | | - else: |
50 | | - raise ValueError("Proxy dictionary is missing required fields.") |
51 | | - elif isinstance(proxy, str): |
52 | | - return proxy # "https://username:password@ip:port" |
53 | | - else: |
54 | | - raise TypeError("Proxy should be a dictionary or a string.") |
55 | | - |
56 | | - def filter_pdf_links(links: List[str]) -> List[str]: |
57 | | - """ |
58 | | - Filters out any links that point to PDF files. |
59 | | - |
60 | | - Args: |
61 | | - links (List[str]): A list of URLs as strings. |
62 | | - |
63 | | - Returns: |
64 | | - List[str]: A list of URLs excluding any that end with '.pdf'. |
65 | | - """ |
66 | | - return [link for link in links if not link.lower().endswith('.pdf')] |
67 | | - |
68 | | - if proxy: |
69 | | - proxy = format_proxy(proxy) |
70 | | - |
71 | | - if search_engine.lower() == "google": |
72 | | - res = [] |
73 | | - for url in google_search(query, num_results=max_results, proxy=proxy): |
74 | | - res.append(url) |
75 | | - return filter_pdf_links(res) |
76 | | - |
77 | | - elif search_engine.lower() == "duckduckgo": |
78 | | - research = DuckDuckGoSearchResults(max_results=max_results) |
79 | | - res = research.run(query) |
80 | | - links = re.findall(r'https?://[^\s,\]]+', res) |
81 | | - return filter_pdf_links(links) |
82 | | - |
83 | | - elif search_engine.lower() == "bing": |
84 | | - headers = { |
85 | | - "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64) |
86 | | - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36""" |
87 | | - } |
88 | | - search_url = f"https://www.bing.com/search?q={query}" |
89 | | - response = requests.get(search_url, headers=headers, timeout=timeout) |
90 | | - response.raise_for_status() |
91 | | - soup = BeautifulSoup(response.text, "html.parser") |
92 | | - |
93 | | - search_results = [] |
94 | | - for result in soup.find_all('li', class_='b_algo', limit=max_results): |
95 | | - link = result.find('a')['href'] |
96 | | - search_results.append(link) |
97 | | - return filter_pdf_links(search_results) |
98 | | - |
99 | | - elif search_engine.lower() == "searxng": |
100 | | - url = f"http://localhost:{port}" |
101 | | - params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"} |
102 | | - response = requests.get(url, params=params, timeout=timeout) |
103 | | - data = response.json() |
104 | | - limited_results = [result['url'] for result in data["results"][:max_results]] |
105 | | - return filter_pdf_links(limited_results) |
106 | | - |
107 | | - else: |
108 | | - raise ValueError("""The only search engines available are |
109 | | - DuckDuckGo, Google, Bing, or SearXNG""") |
| 105 | + return [link for link in links if not link.lower().endswith('.pdf')] |
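The filter lowercases each link before matching, so uppercase extensions are caught too:

```python
assert filter_pdf_links(["https://a.org/paper.PDF", "https://b.org/page"]) == ["https://b.org/page"]
```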