4 changes: 3 additions & 1 deletion scrapegraphai/graphs/search_graph.py
@@ -47,7 +47,8 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None

self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
self.considered_urls = [] # New attribute to store URLs
self.considered_urls = []
self.proxies = config.get("proxies")

super().__init__(prompt, config, schema)

@@ -65,6 +66,7 @@ def _create_graph(self) -> BaseGraph:
node_config={
"llm_model": self.llm_model,
"max_results": self.max_results,
"proxies": self.proxies,
"search_engine": self.copy_config.get("search_engine")
}
)
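For context (not part of the diff), a minimal usage sketch of how the new `proxies` key would be supplied through the graph config. The LLM settings, model name, and proxy URLs below are placeholder assumptions; the only part this change actually reads is the `proxies` entry, via `config.get("proxies")`.

```python
# Hypothetical usage sketch: the "proxies" key is picked up by SearchGraph.__init__
# and forwarded to SearchInternetNode through node_config.
from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_API_KEY"},  # placeholder LLM settings
    "proxies": {  # requests-style proxy mapping; this is the key the new code reads
        "http": "http://proxy.example.com:8080",
        "https": "http://proxy.example.com:8080",
    },
}

search_graph = SearchGraph(
    prompt="List the top open-source web scraping libraries",
    config=graph_config,
)
result = search_graph.run()
print(result)
```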
4 changes: 3 additions & 1 deletion scrapegraphai/nodes/search_internet_node.py
@@ -47,6 +47,7 @@ def __init__(
else "google"
)
self.max_results = node_config.get("max_results", 3)
self.proxies = node_config.get("proxies")

def execute(self, state: dict) -> dict:
"""
@@ -94,7 +95,8 @@ def execute(self, state: dict) -> dict:
self.logger.info(f"Search Query: {search_query}")

answer = search_on_web(query=search_query, max_results=self.max_results,
search_engine=self.search_engine)
search_engine=self.search_engine,
proxies=self.proxies)

if len(answer) == 0:
raise ValueError("Zero results found for the search query.")
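For reference, a sketch of the `node_config` shape the node now consumes, mirroring what `SearchGraph._create_graph` passes in. The values are placeholders; `llm_model` is normally the chat-model instance supplied by the parent graph, and the proxy mapping follows the requests-style convention used by the Bing/SearXNG code paths.

```python
# Illustrative node_config only; omit "proxies" (or leave it None) to search without a proxy.
node_config = {
    "llm_model": None,  # placeholder: SearchGraph injects the real chat-model instance here
    "max_results": 3,
    "search_engine": "google",
    "proxies": {
        "http": "http://proxy.example.com:8080",
        "https": "http://proxy.example.com:8080",
    },
}
```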
21 changes: 14 additions & 7 deletions scrapegraphai/utils/research_web.py
@@ -1,5 +1,5 @@
"""
research_web module
search_web module
"""
import re
from typing import List
@@ -9,8 +9,8 @@
from bs4 import BeautifulSoup

def search_on_web(query: str, search_engine: str = "Google",
max_results: int = 10, port: int = 8080,
timeout: int = 10) -> List[str]:
max_results: int = 10, port: int = 8080,
timeout: int = 10, proxies: dict = None) -> List[str]:
"""
Searches the web for a given query using specified search
engine options and filters out PDF links.
@@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
timeout (int, optional): The number of seconds to wait
for a response from a request. Default is 10 seconds.
proxies (dict, optional): Dictionary containing proxy settings. Default is None.

Returns:
List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
@@ -32,7 +33,8 @@ def search_on_web(query: str, search_engine: str = "Google",
requests.exceptions.Timeout: If the request times out.

Example:
>>> search_on_web("example query", search_engine="Google", max_results=5)
>>> search_on_web("example query", search_engine="Google", max_results=5,
proxies={"http": "http://proxy.example.com:8080"})
['http://example.com', 'http://example.org', ...]
"""

@@ -50,7 +52,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:

if search_engine.lower() == "google":
res = []
for url in google_search(query, stop=max_results):
google_search_params = {"stop": max_results}

if proxies:
google_search_params["proxies"] = proxies

for url in google_search(query, **google_search_params):
res.append(url)
return filter_pdf_links(res)

@@ -66,7 +73,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
}
search_url = f"https://www.bing.com/search?q={query}"
response = requests.get(search_url, headers=headers, timeout=timeout)
response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

@@ -79,7 +86,7 @@ def filter_pdf_links(links: List[str]) -> List[str]:
elif search_engine.lower() == "searxng":
url = f"http://localhost:{port}"
params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
response = requests.get(url, params=params, timeout=timeout)
response = requests.get(url, params=params, timeout=timeout, proxies=proxies)
data = response.json()
limited_results = [result['url'] for result in data["results"][:max_results]]
return filter_pdf_links(limited_results)
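Finally, a minimal sketch of calling the updated utility directly (again, not part of the PR). The proxy URLs are placeholders; with `proxies=None` the behaviour is unchanged, and the Bing/SearXNG branches pass the mapping straight to `requests.get`.

```python
# Minimal direct call with a proxy mapping; the proxy URLs are placeholders.
from scrapegraphai.utils.research_web import search_on_web

urls = search_on_web(
    "open-source web scraping frameworks",
    search_engine="bing",  # this branch forwards proxies to requests.get
    max_results=5,
    proxies={
        "http": "http://proxy.example.com:8080",
        "https": "http://proxy.example.com:8080",
    },
)
print(urls)
```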