Commit b470c61

feat: add proxies to google search
1 parent 12f2b99 commit b470c61

3 files changed: +20 −9 lines changed

scrapegraphai/graphs/search_graph.py

Lines changed: 3 additions & 1 deletion

@@ -47,7 +47,8 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None
 
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)
-        self.considered_urls = []  # New attribute to store URLs
+        self.considered_urls = []
+        self.proxies = config.get("proxies")
 
         super().__init__(prompt, config, schema)
 
@@ -65,6 +66,7 @@ def _create_graph(self) -> BaseGraph:
             node_config={
                 "llm_model": self.llm_model,
                 "max_results": self.max_results,
+                "proxies": self.proxies,
                 "search_engine": self.copy_config.get("search_engine")
             }
         )
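For context, a minimal usage sketch (not part of the commit) of how the new "proxies" key travels from the user-facing config into SearchGraph. The llm settings and the proxy URL are placeholders; the mapping follows the requests convention of {scheme: proxy_url}.

from scrapegraphai.graphs import SearchGraph

# Hypothetical config: "proxies" is read via config.get("proxies") in __init__
# and forwarded to the search node. The llm block is a placeholder for whatever
# provider settings you actually use.
config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_API_KEY"},  # placeholder
    "max_results": 5,
    "proxies": {
        "http": "http://proxy.example.com:8080",    # placeholder proxy URL
        "https": "http://proxy.example.com:8080",
    },
}

graph = SearchGraph(prompt="What are the best Python web scraping libraries?",
                    config=config)
print(graph.run())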

scrapegraphai/nodes/search_internet_node.py

Lines changed: 3 additions & 1 deletion

@@ -47,6 +47,7 @@ def __init__(
             else "google"
         )
         self.max_results = node_config.get("max_results", 3)
+        self.proxies = node_config.get("proxies")
 
     def execute(self, state: dict) -> dict:
         """
@@ -94,7 +95,8 @@ def execute(self, state: dict) -> dict:
         self.logger.info(f"Search Query: {search_query}")
 
         answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine)
+                               search_engine=self.search_engine,
+                               proxies=self.proxies)
 
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
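The node reads "proxies" out of the same node_config dictionary that the graph assembles in _create_graph(). A sketch of that dictionary with placeholder values, assuming the wiring shown in the search_graph.py diff above:

# Hypothetical node_config, mirroring what SearchGraph._create_graph() builds.
# "llm_model" is filled in by the graph at runtime (shown as None here);
# the proxy URL is a placeholder.
node_config = {
    "llm_model": None,               # supplied by the graph (self.llm_model)
    "max_results": 3,                # also the node's default
    "proxies": {"http": "http://proxy.example.com:8080"},  # placeholder
    "search_engine": "google",
}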

scrapegraphai/utils/research_web.py

Lines changed: 14 additions & 7 deletions

@@ -1,5 +1,5 @@
 """
-research_web module
+search_web module
 """
 import re
 from typing import List
@@ -9,8 +9,8 @@
 from bs4 import BeautifulSoup
 
 def search_on_web(query: str, search_engine: str = "Google",
-                  max_results: int = 10, port: int = 8080,
-                  timeout: int = 10) -> List[str]:
+                  max_results: int = 10, port: int = 8080,
+                  timeout: int = 10, proxies: dict = None) -> List[str]:
     """
     Searches the web for a given query using specified search
     engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
         timeout (int, optional): The number of seconds to wait
         for a response from a request. Default is 10 seconds.
+        proxies (dict, optional): Dictionary containing proxy settings.
 
     Returns:
         List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -32,7 +33,8 @@ def search_on_web(query: str, search_engine: str = "Google",
        requests.exceptions.Timeout: If the request times out.
 
     Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        >>> search_on_web("example query", search_engine="Google", max_results=5,
+                          proxies={"http": "http://proxy.example.com:8080"})
        ['http://example.com', 'http://example.org', ...]
     """
 
@@ -50,7 +52,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
 
     if search_engine.lower() == "google":
         res = []
-        for url in google_search(query, stop=max_results):
+        google_search_params = {"stop": max_results}
+
+        if proxies:
+            google_search_params["proxies"] = proxies
+
+        for url in google_search(query, **google_search_params):
             res.append(url)
         return filter_pdf_links(res)
 
@@ -66,7 +73,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
         AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
     }
     search_url = f"https://www.bing.com/search?q={query}"
-    response = requests.get(search_url, headers=headers, timeout=timeout)
+    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -79,7 +86,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
     elif search_engine.lower() == "searxng":
         url = f"http://localhost:{port}"
         params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
-        response = requests.get(url, params=params, timeout=timeout)
+        response = requests.get(url, params=params, timeout=timeout, proxies=proxies)
         data = response.json()
         limited_results = [result['url'] for result in data["results"][:max_results]]
         return filter_pdf_links(limited_results)
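A short sketch (not from the commit) of calling the updated helper directly, assuming the module keeps its current import path. The proxies mapping uses the requests convention of {scheme: proxy_url}: it is forwarded to requests.get() in the Bing and SearXNG branches and passed into google_search() for Google. The proxy URL below is a placeholder.

from scrapegraphai.utils.research_web import search_on_web

proxies = {
    "http": "http://proxy.example.com:8080",    # placeholder proxy endpoint
    "https": "http://proxy.example.com:8080",
}

# Route a Bing search through the proxy; PDF links are filtered out of the results.
urls = search_on_web("open source scraping frameworks",
                     search_engine="bing", max_results=5, proxies=proxies)
print(urls)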
