Commit b470c61

feat: add proxies to google search
1 parent 12f2b99 commit b470c61

3 files changed: +20 −9 lines changed

scrapegraphai/graphs/search_graph.py

Lines changed: 3 additions & 1 deletion

@@ -47,7 +47,8 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None
 
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)
-        self.considered_urls = []  # New attribute to store URLs
+        self.considered_urls = []
+        self.proxies = config.get("proxies")
 
         super().__init__(prompt, config, schema)
 
@@ -65,6 +66,7 @@ def _create_graph(self) -> BaseGraph:
             node_config={
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
+                "proxies": self.proxies,
                 "search_engine": self.copy_config.get("search_engine")
             }
         )
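For context, a minimal usage sketch (not part of the commit) of how the new "proxies" key travels from the user-facing config into SearchGraph. The llm settings and the proxy URL are placeholders; the mapping follows the requests convention of {scheme: proxy_url}.

from scrapegraphai.graphs import SearchGraph

# Hypothetical config: "proxies" is read via config.get("proxies") in __init__
# and forwarded to the search node. The llm block is a placeholder for whatever
# provider settings you actually use.
config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_API_KEY"},  # placeholder
    "max_results": 5,
    "proxies": {
        "http": "http://proxy.example.com:8080",    # placeholder proxy URL
        "https": "http://proxy.example.com:8080",
    },
}

graph = SearchGraph(prompt="What are the best Python web scraping libraries?",
                    config=config)
print(graph.run())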

scrapegraphai/nodes/search_internet_node.py

Lines changed: 3 additions & 1 deletion

@@ -47,6 +47,7 @@ def __init__(
             else "google"
         )
         self.max_results = node_config.get("max_results", 3)
+        self.proxies = node_config.get("proxies")
 
     def execute(self, state: dict) -> dict:
         """
@@ -94,7 +95,8 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine)
+                               search_engine=self.search_engine,
+                               proxies=self.proxies)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
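The node reads "proxies" out of the same node_config dictionary that the graph assembles in _create_graph(). A sketch of that dictionary with placeholder values, assuming the wiring shown in the search_graph.py diff above:

# Hypothetical node_config, mirroring what SearchGraph._create_graph() builds.
# "llm_model" is filled in by the graph at runtime (shown as None here);
# the proxy URL is a placeholder.
node_config = {
    "llm_model": None,               # supplied by the graph (self.llm_model)
    "max_results": 3,                # also the node's default
    "proxies": {"http": "http://proxy.example.com:8080"},  # placeholder
    "search_engine": "google",
}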

scrapegraphai/utils/research_web.py

Lines changed: 14 additions & 7 deletions

@@ -1,5 +1,5 @@
 """
-research_web module
+search_web module
 """
 import re
 from typing import List
@@ -9,8 +9,8 @@
 from bs4 import BeautifulSoup
 
 def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
-                  timeout: int = 10) -> List[str]:
+                  max_results: int = 10, port: int = 8080,
+                  timeout: int = 10, proxies: dict = None) -> List[str]:
     """
     Searches the web for a given query using specified search
     engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
         timeout (int, optional): The number of seconds to wait
         for a response from a request. Default is 10 seconds.
+        proxies (dict, optional): Dictionary containing proxy settings.
 
     Returns:
         List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -32,7 +33,8 @@ def search_on_web(query: str, search_engine: str = "Google",
        requests.exceptions.Timeout: If the request times out.
 
     Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        >>> search_on_web("example query", search_engine="Google", max_results=5,
+                          proxies={"http": "http://proxy.example.com:8080"})
        ['http://example.com', 'http://example.org', ...]
     """
 
@@ -50,7 +52,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
 
     if search_engine.lower() == "google":
         res = []
-        for url in google_search(query, stop=max_results):
+        google_search_params = {"stop": max_results}
+
+        if proxies:
+            google_search_params["proxies"] = proxies
+
+        for url in google_search(query, **google_search_params):
             res.append(url)
         return filter_pdf_links(res)
 
@@ -66,7 +73,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
         AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
     }
     search_url = f"https://www.bing.com/search?q={query}"
-    response = requests.get(search_url, headers=headers, timeout=timeout)
+    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -79,7 +86,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
     elif search_engine.lower() == "searxng":
         url = f"http://localhost:{port}"
         params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
-        response = requests.get(url, params=params, timeout=timeout)
+        response = requests.get(url, params=params, timeout=timeout, proxies=proxies)
         data = response.json()
         limited_results = [result['url'] for result in data["results"][:max_results]]
         return filter_pdf_links(limited_results)
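A short sketch (not from the commit) of calling the updated helper directly, assuming the module keeps its current import path. The proxies mapping uses the requests convention of {scheme: proxy_url}: it is forwarded to requests.get() in the Bing and SearXNG branches and passed into google_search() for Google. The proxy URL below is a placeholder.

from scrapegraphai.utils.research_web import search_on_web

proxies = {
    "http": "http://proxy.example.com:8080",    # placeholder proxy endpoint
    "https": "http://proxy.example.com:8080",
}

# Route a Bing search through the proxy; PDF links are filtered out of the results.
urls = search_on_web("open source scraping frameworks",
                     search_engine="bing", max_results=5, proxies=proxies)
print(urls)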
