diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 278f81ab..18f14bce 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -41,19 +41,11 @@ def __init__( self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None) self.search_engine = ( node_config["search_engine"] if node_config.get("search_engine") else "google" ) - - self.serper_api_key = ( - node_config["serper_api_key"] - if node_config.get("serper_api_key") - else None - ) - self.max_results = node_config.get("max_results", 3) def execute(self, state: dict) -> dict: @@ -102,10 +94,10 @@ def execute(self, state: dict) -> dict: self.logger.info(f"Search Query: {search_query}") answer = search_on_web(query=search_query, max_results=self.max_results, - search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key) + search_engine=self.search_engine) if len(answer) == 0: raise ValueError("Zero results found for the search query.") state.update({self.output[0]: answer}) - return state + return state \ No newline at end of file diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index f2d0c254..fa38910b 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,5 +1,5 @@ """ -research_web module +Research_web module """ import re from typing import List @@ -7,123 +7,67 @@ from googlesearch import search as google_search import requests from bs4 import BeautifulSoup -import json -def search_on_web(query: str, search_engine: str = "Google", - max_results: int = 10, port: int = 8080, - timeout: int = 10, proxy: str | dict = None, - serper_api_key: str = None) -> List[str]: - """Search web function with improved error handling and validation""" - - # Input validation - if not query or not isinstance(query, str): - raise ValueError("Query must be a non-empty string") - - search_engine = search_engine.lower() - valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"} - if search_engine not in valid_engines: - raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") +def search_on_web(query: str, search_engine: str = "Google", + max_results: int = 10, port: int = 8080) -> List[str]: + """ + Searches the web for a given query using specified search engine options. - # Format proxy once - formatted_proxy = None - if proxy: - formatted_proxy = format_proxy(proxy) - - try: - results = [] - if search_engine == "google": - results = list(google_search(query, num_results=max_results, proxy=formatted_proxy)) - - elif search_engine == "duckduckgo": - research = DuckDuckGoSearchResults(max_results=max_results) - res = research.run(query) - results = re.findall(r'https?://[^\s,\]]+', res) - - elif search_engine == "bing": - results = _search_bing(query, max_results, timeout, formatted_proxy) - - elif search_engine == "searxng": - results = _search_searxng(query, max_results, port, timeout) - - elif search_engine.lower() == "serper": - results = _search_serper(query, max_results, serper_api_key, timeout) - - return filter_pdf_links(results) - - except requests.Timeout: - raise TimeoutError(f"Search request timed out after {timeout} seconds") - except requests.RequestException as e: - raise RuntimeError(f"Search request failed: {str(e)}") + Args: + query (str): The search query to find on the internet. + search_engine (str, optional): Specifies the search engine to use, + options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. + max_results (int, optional): The maximum number of search results to return. + port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. -def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]: - """Helper function for Bing search""" - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - search_url = f"https://www.bing.com/search?q={query}" - - proxies = {"http": proxy, "https": proxy} if proxy else None - response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies) - response.raise_for_status() - - soup = BeautifulSoup(response.text, "html.parser") - return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)] + Returns: + List[str]: A list of URLs as strings that are the search results. -def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: - """Helper function for SearXNG search""" - url = f"http://localhost:{port}" - params = { - "q": query, - "format": "json", - "engines": "google,duckduckgo,brave,qwant,bing" - } - response = requests.get(url, params=params, timeout=timeout) - response.raise_for_status() - return [result['url'] for result in response.json().get("results", [])[:max_results]] + Raises: + ValueError: If the search engine specified is not supported. -def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]: - """Helper function for serper api""" - if not serper_api_key: - raise ValueError("API key is required for serper api.") - - url = "https://google.serper.dev/search" - payload = json.dumps({ - "q": query, - "num": max_results - }) - headers = { - 'X-API-KEY': serper_api_key, - 'Content-Type': 'application/json' - } - response = requests.post(url, headers=headers, data=payload, timeout=timeout) - response.raise_for_status() - return [result.get("link") for result in response.json().get("organic", [])] + Example: + >>> search_on_web("example query", search_engine="Google", max_results=5) + ['http://example.com', 'http://example.org', ...] + """ + if search_engine.lower() == "google": + res = [] + for url in google_search(query, num_results=max_results): + res.append(url) + return res -def format_proxy(proxy): - if isinstance(proxy, dict): - server = proxy.get('server') - username = proxy.get('username') - password = proxy.get('password') + elif search_engine.lower() == "duckduckgo": + research = DuckDuckGoSearchResults(max_results=max_results) + res = research.run(query) + links = re.findall(r'https?://[^\s,\]]+', res) + return links - if all([username, password, server]): - proxy_url = f"http://{username}:{password}@{server}" - return proxy_url - else: - raise ValueError("Proxy dictionary is missing required fields.") - elif isinstance(proxy, str): - return proxy # "https://username:password@ip:port" - else: - raise TypeError("Proxy should be a dictionary or a string.") - -def filter_pdf_links(links: List[str]) -> List[str]: - """ - Filters out any links that point to PDF files. + elif search_engine.lower() == "bing": + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + search_url = f"https://www.bing.com/search?q={query}" + response = requests.get(search_url, headers=headers) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") - Args: - links (List[str]): A list of URLs as strings. + search_results = [] + for result in soup.find_all('li', class_='b_algo', limit=max_results): + link = result.find('a')['href'] + search_results.append(link) + return search_results - Returns: - List[str]: A list of URLs excluding any that end with '.pdf'. - """ - return [link for link in links if not link.lower().endswith('.pdf')] \ No newline at end of file + elif search_engine.lower() == "searxng": + url = f"http://localhost:{port}" + params = {"q": query, "format": "json"} + + # Send the GET request to the server + response = requests.get(url, params=params) + + data = response.json() + limited_results = data["results"][:max_results] + return limited_results + + else: + raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") \ No newline at end of file