# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

# NOTE: this component is exported lazily from ``haystack.components.fetchers``: the package
# ``__init__`` registers it as ``"serpex": ["SerpexWebSearch"]`` in ``_import_structure`` and
# imports it under ``TYPE_CHECKING`` next to ``LinkContentFetcher``.

from typing import Any, Dict, List, Optional

import httpx
from tenacity import retry, retry_if_exception, retry_if_exception_type, stop_after_attempt, wait_exponential

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import Document

logger = logging.getLogger(__name__)

_SERPEX_SEARCH_URL = "https://api.serpex.dev/api/search"

# Status codes (besides 5xx) for which a later attempt can reasonably succeed.
_RETRYABLE_STATUS_CODES = frozenset({408, 429})


def _is_retryable_status_error(error: BaseException) -> bool:
    """Return True for HTTP status errors that are transient (408, 429 or any 5xx)."""
    if not isinstance(error, httpx.HTTPStatusError):
        return False
    status = error.response.status_code
    return status in _RETRYABLE_STATUS_CODES or status >= 500


@component
class SerpexWebSearch:
    """
    Fetches web search results from the SERPEX API.

    SERPEX provides web search results from multiple search engines including Google, Bing, DuckDuckGo, and more.
    Use it to retrieve organic search results, snippets, and metadata for search queries.

    ### Usage example

    ```python
    from haystack.components.fetchers import SerpexWebSearch

    fetcher = SerpexWebSearch(api_key="your-serpex-api-key")
    results = fetcher.run(query="What is Haystack?")

    documents = results["documents"]
    for doc in documents:
        print(f"Title: {doc.meta['title']}")
        print(f"URL: {doc.meta['url']}")
        print(f"Snippet: {doc.content}")
    ```
    """

    def __init__(
        self,
        api_key: str,
        engine: str = "google",
        num_results: int = 10,
        timeout: int = 10,
        retry_attempts: int = 2,
    ):
        """
        Initializes the SerpexWebSearch component.

        :param api_key: SERPEX API key for authentication. Get yours at https://serpex.dev
        :param engine: Search engine to use. Options: "auto", "google", "bing", "duckduckgo",
            "brave", "yahoo", "yandex". Defaults to "google".
        :param num_results: Number of search results to return. Defaults to 10.
        :param timeout: Timeout in seconds for the API request. Defaults to 10.
        :param retry_attempts: Maximum number of attempts per request, counting the first one
            (it is passed to tenacity's `stop_after_attempt`). Defaults to 2, i.e. one retry.
        """
        # NOTE(review): the key is kept as a plain string and is written out verbatim by `to_dict`.
        # Consider a secret type that serializes only a reference (e.g. an environment variable name).
        self.api_key = api_key
        self.engine = engine
        self.num_results = num_results
        self.timeout = timeout
        self.retry_attempts = retry_attempts

        self._client = httpx.Client(timeout=timeout, follow_redirects=True)

        # The retry policy depends on instance configuration, so the decorated callable is built here.
        # Only transient failures are retried: transport-level errors and 408/429/5xx responses.
        # Other 4xx responses (bad key, bad request) cannot succeed on a second attempt, so they are
        # raised immediately instead of waiting through the backoff.
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type(httpx.RequestError) | retry_if_exception(_is_retryable_status_error)),
        )
        def make_request(url: str, headers: Dict[str, str], params: Dict[str, Any]) -> httpx.Response:
            response = self._client.get(url, headers=headers, params=params)
            response.raise_for_status()
            return response

        self._make_request = make_request

    def __del__(self):
        """
        Clean up resources when the component is deleted.

        Closes the HTTP client to prevent resource leaks.
        """
        # `_client` is missing when `__init__` failed before creating it.
        client = getattr(self, "_client", None)
        if client is None:
            return
        try:
            client.close()
        except Exception:
            # Deliberately best-effort: nothing useful can be done with an error raised
            # while the object is being garbage-collected.
            pass

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        The API key is included as plain text in the returned dictionary.

        :returns: Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            api_key=self.api_key,
            engine=self.engine,
            num_results=self.num_results,
            timeout=self.timeout,
            retry_attempts=self.retry_attempts,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SerpexWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data: Dictionary to deserialize from.
        :returns: Deserialized component.
        """
        return default_from_dict(cls, data)

    @staticmethod
    def _to_document(result: Dict[str, Any], query: str, engine: str) -> Document:
        """Converts one entry of the API's `results` list into a Document."""
        return Document(
            content=result.get("snippet", ""),
            meta={
                "title": result.get("title", ""),
                "url": result.get("url", ""),  # API uses 'url' not 'link'
                "position": result.get("position", 0),
                "query": query,
                "engine": engine,
            },
        )

    @component.output_types(documents=List[Document])
    def run(
        self,
        query: str,
        engine: Optional[str] = None,
        num_results: Optional[int] = None,
        time_range: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Fetches web search results for the given query.

        :param query: The search query string.
        :param engine: Override the default search engine. If None, uses the engine from initialization.
        :param num_results: Override the default number of results. If None, uses num_results from initialization.
        :param time_range: Time range filter for results. Options: "all", "day", "week", "month", "year".
            Defaults to None, in which case no time filter is sent.
        :returns: Dictionary with a `documents` key holding one Document per search result. The snippet is the
            document content; title, url, position, query and engine are stored in `meta`.
        :raises httpx.HTTPStatusError: If the API answers with an error status.
        :raises httpx.RequestError: If the request could not be completed.
        """
        # Resolved once so the request and the document metadata always agree.
        selected_engine = engine or self.engine

        params: Dict[str, Any] = {
            "q": query,
            "engine": selected_engine,
            "num": num_results or self.num_results,
            "category": "web",
        }
        if time_range:
            params["time_range"] = time_range

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

        try:
            response = self._make_request(_SERPEX_SEARCH_URL, headers, params)
            data = response.json()
        except httpx.HTTPStatusError as e:
            logger.error(
                "HTTP error occurred while fetching SERPEX results: {status} - {detail}",
                status=e.response.status_code,
                detail=str(e),
            )
            raise
        except httpx.RequestError as e:
            logger.error("Request error occurred while fetching SERPEX results: {error}", error=str(e))
            raise
        except Exception as e:
            # E.g. a body that is not valid JSON; logged for context and propagated unchanged.
            logger.error("Unexpected error occurred while fetching SERPEX results: {error}", error=str(e))
            raise

        # The API returns 'results' (not 'organic_results'); anything else is treated as "no results".
        results = data.get("results") if isinstance(data, dict) else None
        if not isinstance(results, list):
            logger.warning("No results found in SERPEX API response for query: {query}", query=query)
            return {"documents": []}

        # Entries that are not JSON objects cannot be mapped to a Document and are skipped.
        documents = [self._to_document(entry, query, selected_engine) for entry in results if isinstance(entry, dict)]

        logger.info(
            "Successfully fetched {count} search results for query: {query}", count=len(documents), query=query
        )
        return {"documents": documents}
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

# Release note shipped with this change
# (releasenotes/notes/add-serpex-web-search-fetcher-a1b2c3d4e5f6g7h8.yaml):
#
#   enhancements:
#     - |
#       Add SerpexWebSearch component to the fetchers module. This new component enables fetching organic web
#       search results from multiple search engines (Google, Bing, DuckDuckGo, Brave, Yahoo, Yandex) via the
#       SERPEX API. The component supports configurable search engines, result counts, time range filtering,
#       and automatic retry logic. Search results are returned as Haystack Document objects with rich metadata
#       including titles, URLs, positions, and snippets.

import os
from unittest.mock import Mock, patch

import httpx
import pytest

from haystack.components.fetchers.serpex import SerpexWebSearch
from haystack.dataclasses import Document

CLIENT_GET = "haystack.components.fetchers.serpex.httpx.Client.get"
SEARCH_URL = "https://api.serpex.dev/api/search"

requires_serpex_key = pytest.mark.skipif(not os.environ.get("SERPEX_API_KEY"), reason="SERPEX_API_KEY not set")


def _ok_response(payload):
    """Build a mocked 200 response whose JSON body is `payload`."""
    response = Mock(status_code=200)
    response.json.return_value = payload
    return response


@pytest.fixture
def mock_serpex_response():
    """Mock SERPEX API response with results"""
    return {
        "results": [
            {
                "title": "Haystack - Open Source LLM Framework",
                "url": "https://haystack.deepset.ai/",
                "snippet": "Haystack is an open-source framework for building production-ready LLM applications.",
                "position": 1,
            },
            {
                "title": "Haystack Documentation",
                "url": "https://docs.haystack.deepset.ai/",
                "snippet": "Complete documentation for Haystack framework.",
                "position": 2,
            },
            {
                "title": "Haystack GitHub Repository",
                "url": "https://github.com/deepset-ai/haystack",
                "snippet": "Official Haystack GitHub repository with source code.",
                "position": 3,
            },
        ]
    }


@pytest.fixture
def mock_empty_serpex_response():
    """Mock SERPEX API response with no results"""
    return {"results": []}


@pytest.fixture
def mock_get():
    """Patch `httpx.Client.get` as seen by the component module and yield the mock."""
    with patch(CLIENT_GET) as mocked:
        yield mocked


class TestSerpexWebSearch:
    def test_init_default_params(self):
        """Test initialization with default parameters"""
        fetcher = SerpexWebSearch(api_key="test-api-key")
        assert fetcher.api_key == "test-api-key"
        assert fetcher.engine == "google"
        assert fetcher.num_results == 10
        assert fetcher.timeout == 10
        assert fetcher.retry_attempts == 2
        assert hasattr(fetcher, "_client")
        assert isinstance(fetcher._client, httpx.Client)

    def test_init_custom_params(self):
        """Test initialization with custom parameters"""
        fetcher = SerpexWebSearch(api_key="test-key", engine="bing", num_results=5, timeout=20, retry_attempts=3)
        assert fetcher.api_key == "test-key"
        assert fetcher.engine == "bing"
        assert fetcher.num_results == 5
        assert fetcher.timeout == 20
        assert fetcher.retry_attempts == 3

    def test_to_dict(self):
        """Test serialization to dictionary"""
        fetcher = SerpexWebSearch(api_key="test-api-key", engine="duckduckgo", num_results=15)
        init_parameters = fetcher.to_dict()["init_parameters"]
        assert init_parameters["api_key"] == "test-api-key"
        assert init_parameters["engine"] == "duckduckgo"
        assert init_parameters["num_results"] == 15
        assert init_parameters["timeout"] == 10
        assert init_parameters["retry_attempts"] == 2

    def test_from_dict(self):
        """Test deserialization from dictionary"""
        data = {
            "type": "haystack.components.fetchers.serpex.SerpexWebSearch",
            "init_parameters": {
                "api_key": "test-key",
                "engine": "brave",
                "num_results": 20,
                "timeout": 15,
                "retry_attempts": 1,
            },
        }
        fetcher = SerpexWebSearch.from_dict(data)
        assert fetcher.api_key == "test-key"
        assert fetcher.engine == "brave"
        assert fetcher.num_results == 20
        assert fetcher.timeout == 15
        assert fetcher.retry_attempts == 1

    def test_run_with_mock_response(self, mock_get, mock_serpex_response):
        """Test run method with mocked successful API response"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key")
        documents = fetcher.run(query="What is Haystack?")["documents"]

        assert len(documents) == 3

        first = documents[0]
        assert isinstance(first, Document)
        assert first.content == "Haystack is an open-source framework for building production-ready LLM applications."
        assert first.meta["title"] == "Haystack - Open Source LLM Framework"
        assert first.meta["url"] == "https://haystack.deepset.ai/"
        assert first.meta["position"] == 1
        assert first.meta["query"] == "What is Haystack?"
        assert first.meta["engine"] == "google"

        assert documents[1].meta["title"] == "Haystack Documentation"
        assert documents[1].meta["position"] == 2

        assert documents[2].meta["title"] == "Haystack GitHub Repository"
        assert documents[2].meta["position"] == 3

    def test_run_with_empty_results(self, mock_get, mock_empty_serpex_response):
        """Test run method with empty results"""
        mock_get.return_value = _ok_response(mock_empty_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key")
        result = fetcher.run(query="nonexistent query")

        assert len(result["documents"]) == 0

    def test_run_with_engine_override(self, mock_get, mock_serpex_response):
        """Test run method with engine parameter override"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key", engine="google")
        documents = fetcher.run(query="test query", engine="bing")["documents"]

        # The override must reach both the outgoing request and the document metadata.
        assert mock_get.call_args[1]["params"]["engine"] == "bing"
        assert len(documents) == 3
        assert documents[0].meta["engine"] == "bing"

    def test_run_with_num_results_override(self, mock_get, mock_serpex_response):
        """Test run method with num_results parameter override"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key", num_results=10)
        fetcher.run(query="test query", num_results=5)

        assert mock_get.call_args[1]["params"]["num"] == 5

    def test_run_with_time_range(self, mock_get, mock_serpex_response):
        """Test run method with time_range parameter"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key")
        fetcher.run(query="test query", time_range="week")

        assert mock_get.call_args[1]["params"]["time_range"] == "week"

    def test_run_with_http_error(self, mock_get):
        """Test run method with HTTP error"""
        response = Mock(status_code=401)
        response.raise_for_status.side_effect = httpx.HTTPStatusError(
            "401 Unauthorized", request=Mock(), response=response
        )
        mock_get.return_value = response

        # A single attempt keeps the unit test from sleeping through the retry backoff.
        fetcher = SerpexWebSearch(api_key="invalid-key", retry_attempts=1)
        with pytest.raises(httpx.HTTPStatusError):
            fetcher.run(query="test query")
        assert mock_get.call_count == 1

    def test_run_with_network_error(self, mock_get):
        """Test run method with network error"""
        mock_get.side_effect = httpx.RequestError("Connection failed", request=Mock())

        # A single attempt keeps the unit test from sleeping through the retry backoff.
        fetcher = SerpexWebSearch(api_key="test-api-key", retry_attempts=1)
        with pytest.raises(httpx.RequestError):
            fetcher.run(query="test query")
        assert mock_get.call_count == 1

    def test_run_verifies_api_key_in_headers(self, mock_get, mock_serpex_response):
        """Test that API key is properly included in request headers"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="secret-key-123")
        fetcher.run(query="test query")

        headers = mock_get.call_args[1]["headers"]
        assert headers["Authorization"] == "Bearer secret-key-123"
        assert headers["Content-Type"] == "application/json"

    def test_run_uses_correct_api_endpoint(self, mock_get, mock_serpex_response):
        """Test that the correct SERPEX API endpoint is used"""
        mock_get.return_value = _ok_response(mock_serpex_response)

        fetcher = SerpexWebSearch(api_key="test-api-key")
        fetcher.run(query="test query")

        assert mock_get.call_args[0][0] == SEARCH_URL

    @pytest.mark.integration
    @requires_serpex_key
    def test_run_with_real_api(self):
        """Integration test with real SERPEX API"""
        fetcher = SerpexWebSearch(api_key=os.environ.get("SERPEX_API_KEY"))

        documents = fetcher.run(query="Haystack LLM framework")["documents"]

        assert len(documents) > 0
        assert all(isinstance(doc, Document) for doc in documents)
        assert all("title" in doc.meta for doc in documents)
        assert all("url" in doc.meta for doc in documents)
        # The component falls back to an empty snippet, so only the type is guaranteed for live results.
        assert all(isinstance(doc.content, str) for doc in documents)

    @pytest.mark.integration
    @requires_serpex_key
    def test_run_with_different_engines(self):
        """Integration test with different search engines"""
        api_key = os.environ.get("SERPEX_API_KEY")

        for engine in ("google", "duckduckgo"):
            fetcher = SerpexWebSearch(api_key=api_key, engine=engine)
            result = fetcher.run(query="Python programming")
            assert len(result["documents"]) > 0, engine