diff --git a/.env.test b/.env.test index 4ef1b503cf..d83bacc87c 100644 --- a/.env.test +++ b/.env.test @@ -48,6 +48,7 @@ OPENAI_API_BASE=https://api.openai.com/v1 # Search & Scraping Tool API Keys # ----------------------------------------------------------------------------- SERPER_API_KEY=fake-serper-key +CLORO_API_KEY=fake-cloro-key EXA_API_KEY=fake-exa-key BRAVE_API_KEY=fake-brave-key FIRECRAWL_API_KEY=fake-firecrawl-key diff --git a/docs/en/tools/web-scraping/cloro-tool.mdx b/docs/en/tools/web-scraping/cloro-tool.mdx new file mode 100644 index 0000000000..13a41ec225 --- /dev/null +++ b/docs/en/tools/web-scraping/cloro-tool.mdx @@ -0,0 +1,88 @@ +--- +title: CloroTool +description: Use the `CloroTool` to scrape AI models using the cloro API. +icon: flask +mode: "wide" +--- + +# `CloroTool` + +## Description + +Use the `CloroTool` to scrape AI models using the cloro API. Supports engines: google, chatgpt, gemini, copilot, perplexity, aimode. + +## Installation + +```shell +pip install 'crewai[tools]' +``` + +## Environment Variables + +- `CLORO_API_KEY` (required) + +Get the credentials by creating a [cloro account](https://dashboard.cloro.dev). + +## Example + +```python Code +from crewai_tools import CloroTool + +# make sure CLORO_API_KEY variable is set +tool = CloroTool() + +result = tool.run(search_query="latest news about AI agents") + +print(result) +``` + +## Arguments + +- `api_key` (str, optional): cloro API key. +- `engine` (str, optional): The engine to use for the query. Options are `google`, `chatgpt`, `gemini`, `copilot`, `perplexity`, `aimode`. Defaults to `google`. +- `country` (str, optional): The ISO 3166-1 alpha-2 country code for localized results (e.g., "US", "BR"). For a full list of supported country codes, refer to the [cloro API /v1/countries endpoint](https://docs.cloro.dev/api-reference/endpoint/countries). Defaults to "US". +- `device` (str, optional): The device type for Google search results (`desktop` or `mobile`). 
Defaults to "desktop". +- `pages` (int, optional): The number of pages to retrieve for Google search results. Defaults to 1. +- `save_file` (bool, optional): Whether to save the search results to a file. Defaults to `False`. + +## Response Format + +The tool returns a structured dictionary containing different fields depending on the selected engine. + +### Google Engine + +- `organic`: List of organic search results with title, link, snippet, etc. +- `peopleAlsoAsk`: List of related questions. +- `relatedSearches`: List of related search queries. +- `ai_overview`: Google AI Overview data (if available). + +### LLM Engines (ChatGPT, Perplexity, Gemini, etc.) + +- `text`: The main response text from the model. +- `sources`: List of sources cited by the model (if available). +- `shopping_cards`: List of product/shopping cards with prices and offers (if available). +- `hotels`: List of hotel results (if available). +- `places`: List of places/locations (if available). +- `videos`: List of video results (if available). +- `images`: List of image results (if available). +- `related_queries`: List of related follow-up queries (if available). +- `entities`: List of extracted entities (if available). + +## Advanced example + +Check out the cloro [documentation](https://docs.cloro.dev/api-reference/introduction) to get the full list of parameters. 
+ +```python Code +from crewai_tools import CloroTool + +# make sure CLORO_API_KEY variable is set +tool = CloroTool( + engine="chatgpt", + country="BR", + save_file=True +) + +result = tool.run(search_query="Say 'Hello, Brazil!'") + +print(result) +``` diff --git a/docs/en/tools/web-scraping/overview.mdx b/docs/en/tools/web-scraping/overview.mdx index 0031cf33e9..2d813eef89 100644 --- a/docs/en/tools/web-scraping/overview.mdx +++ b/docs/en/tools/web-scraping/overview.mdx @@ -14,53 +14,109 @@ These tools enable your agents to interact with the web, extract data from websi General-purpose web scraping tool for extracting content from any website. - - Target specific elements on web pages with precision scraping capabilities. - - - - Crawl entire websites systematically with Firecrawl's powerful engine. - - - - High-performance web scraping with Firecrawl's advanced capabilities. - - - - Search and extract specific content using Firecrawl's search features. - - - - Browser automation and scraping with Selenium WebDriver capabilities. - - - - Professional web scraping with ScrapFly's premium scraping service. - - - - Graph-based web scraping for complex data relationships. - - - - Comprehensive web crawling and data extraction capabilities. - - - - Cloud-based browser automation with BrowserBase infrastructure. - - - - Fast browser interactions with HyperBrowser's optimized engine. - - - - Intelligent browser automation with natural language commands. - - - - Access web data at scale with Oxylabs. - +{" "} + + Target specific elements on web pages with precision scraping capabilities. + + +{" "} + + Crawl entire websites systematically with Firecrawl's powerful engine. + + +{" "} + + High-performance web scraping with Firecrawl's advanced capabilities. + + +{" "} + + Search and extract specific content using Firecrawl's search features. + + +{" "} + + Browser automation and scraping with Selenium WebDriver capabilities. 
+ + +{" "} + + Professional web scraping with ScrapFly's premium scraping service. + + +{" "} + + Graph-based web scraping for complex data relationships. + + +{" "} + + Comprehensive web crawling and data extraction capabilities. + + +{" "} + + Scrape the user interface of major LLMs via cloro API. + + +{" "} + + Fast browser interactions with HyperBrowser's optimized engine. + + +{" "} + + Intelligent browser automation with natural language commands. + + +{" "} + + Access web data at scale with Oxylabs. + SERP search, Web Unlocker, and Dataset API integrations. diff --git a/lib/crewai-tools/src/crewai_tools/__init__.py b/lib/crewai-tools/src/crewai_tools/__init__.py index df69905734..7c5a455345 100644 --- a/lib/crewai-tools/src/crewai_tools/__init__.py +++ b/lib/crewai-tools/src/crewai_tools/__init__.py @@ -21,6 +21,7 @@ from crewai_tools.tools.browserbase_load_tool.browserbase_load_tool import ( BrowserbaseLoadTool, ) +from crewai_tools.tools.cloro_tool.cloro_tool import CloroTool from crewai_tools.tools.code_docs_search_tool.code_docs_search_tool import ( CodeDocsSearchTool, ) @@ -205,6 +206,7 @@ "BrightDataSearchTool", "BrightDataWebUnlockerTool", "BrowserbaseLoadTool", + "CloroTool", "CSVSearchTool", "CodeDocsSearchTool", "CodeInterpreterTool", diff --git a/lib/crewai-tools/src/crewai_tools/tools/__init__.py b/lib/crewai-tools/src/crewai_tools/tools/__init__.py index 51d32ddc25..95d9e89fd0 100644 --- a/lib/crewai-tools/src/crewai_tools/tools/__init__.py +++ b/lib/crewai-tools/src/crewai_tools/tools/__init__.py @@ -10,6 +10,7 @@ from crewai_tools.tools.browserbase_load_tool.browserbase_load_tool import ( BrowserbaseLoadTool, ) +from crewai_tools.tools.cloro_tool.cloro_tool import CloroTool from crewai_tools.tools.code_docs_search_tool.code_docs_search_tool import ( CodeDocsSearchTool, ) @@ -190,6 +191,7 @@ "BrightDataSearchTool", "BrightDataWebUnlockerTool", "BrowserbaseLoadTool", + "CloroTool", "CSVSearchTool", "CodeDocsSearchTool", "CodeInterpreterTool", 
diff --git a/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/README.md b/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/README.md new file mode 100644 index 0000000000..9f326be576 --- /dev/null +++ b/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/README.md @@ -0,0 +1,75 @@ +# CloroTool + +Use the `CloroTool` to search the web or query AI models using the cloro API. + +## Installation + +```shell +pip install 'crewai[tools]' +``` + +## Example + +```python +from crewai_tools import CloroTool + +# make sure CLORO_API_KEY variable is set +tool = CloroTool() + +result = tool.run(search_query="latest news about AI agents") + +print(result) +``` + +## Arguments + +- `api_key` (str, optional): cloro API key. +- `engine` (str, optional): The engine to use for the query. Options are `google`, `chatgpt`, `gemini`, `copilot`, `perplexity`, `aimode`. Defaults to `google`. +- `country` (str, optional): The ISO 3166-1 alpha-2 country code for localized results (e.g., "US", "BR"). For a full list of supported country codes, refer to the [cloro API /v1/countries endpoint](https://docs.cloro.dev/api-reference/endpoint/countries). Defaults to "US". +- `device` (str, optional): The device type for Google search results (`desktop` or `mobile`). Defaults to "desktop". +- `pages` (int, optional): The number of pages to retrieve for Google search results. Defaults to 1. +- `save_file` (bool, optional): Whether to save the search results to a file. Defaults to `False`. + +Get the credentials by creating a [cloro account](https://dashboard.cloro.dev). + +## Response Format + +The tool returns a structured dictionary containing different fields depending on the selected engine. + +### Google Engine + +- `organic`: List of organic search results with title, link, snippet, etc. +- `peopleAlsoAsk`: List of related questions. +- `relatedSearches`: List of related search queries. +- `ai_overview`: Google AI Overview data (if available). 
+ +### LLM Engines (ChatGPT, Perplexity, Gemini, etc.) + +- `text`: The main response text from the model. +- `sources`: List of sources cited by the model (if available). +- `shopping_cards`: List of product/shopping cards with prices and offers (if available). +- `hotels`: List of hotel results (if available). +- `places`: List of places/locations (if available). +- `videos`: List of video results (if available). +- `images`: List of image results (if available). +- `related_queries`: List of related follow-up queries (if available). +- `entities`: List of extracted entities (if available). + +## Advanced example + +Check out the cloro [documentation](https://docs.cloro.dev/api-reference/introduction) to get the full list of parameters. + +```python +from crewai_tools import CloroTool + +# make sure CLORO_API_KEY variable is set +tool = CloroTool( + engine="chatgpt", + country="BR", + save_file=True +) + +result = tool.run(search_query="Say 'Hello, Brazil!'") + +print(result) +``` diff --git a/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/__init__.py b/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/cloro_tool.py b/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/cloro_tool.py new file mode 100644 index 0000000000..6a4d67a4ef --- /dev/null +++ b/lib/crewai-tools/src/crewai_tools/tools/cloro_tool/cloro_tool.py @@ -0,0 +1,204 @@ +import datetime +import json +import logging +import os +from typing import Any, Literal, TypedDict + +import requests +from crewai.tools import BaseTool, EnvVar +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + + +class FormattedResults(TypedDict, total=False): + """Formatted search results from Cloro API.""" + + # Google / Search + organic: list[dict[str, Any]] + peopleAlsoAsk: list[dict[str, Any]] + relatedSearches: list[dict[str, Any]] + ai_overview: dict[str, Any] + + # 
LLM / Common + text: str + sources: list[dict[str, Any]] + + # Rich Content (Perplexity / ChatGPT) + shopping_cards: list[dict[str, Any]] + hotels: list[dict[str, Any]] + places: list[dict[str, Any]] + videos: list[dict[str, Any]] + images: list[dict[str, Any]] + related_queries: list[str] + entities: list[dict[str, Any]] + + credits: int + + +def _save_results_to_file(content: str) -> None: + """Saves the search results to a file.""" + try: + filename = f"cloro_results_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json" + with open(filename, "w") as file: + file.write(content) + logger.info(f"Results saved to {filename}") + except IOError as e: + logger.error(f"Failed to save results to file: {e}") + raise + + +class CloroDevToolSchema(BaseModel): + """Input for CloroDevTool.""" + + search_query: str = Field( + ..., description="Mandatory query/prompt you want to use to search/query the model" + ) + + +class CloroTool(BaseTool): + name: str = "Search/Query with Cloro" + description: str = ( + "A tool that can be used to search the internet or query LLMs using cloro API. " + "Supports engines: google, chatgpt, gemini, copilot, perplexity, aimode." 
+ ) + args_schema: type[BaseModel] = CloroDevToolSchema + base_url: str = "https://api.cloro.dev/v1/monitor" + engine: Literal[ + "google", + "chatgpt", + "gemini", + "copilot", + "perplexity", + "aimode", + ] = "google" + country: str = "US" + device: str = "desktop" + pages: int = 1 + save_file: bool = False + api_key: str | None = Field(None, description="cloro API key") + env_vars: list[EnvVar] = Field( + default_factory=lambda: [ + EnvVar( + name="CLORO_API_KEY", description="API key for cloro", required=True + ), + ] + ) + + def __init__(self, api_key: str | None = None, **kwargs): + super().__init__(**kwargs) + if api_key: + self.api_key = api_key + + def _get_api_key(self) -> str: + if self.api_key: + return self.api_key + env_key = os.environ.get("CLORO_API_KEY") + if env_key: + return env_key + raise ValueError("cloro API key not found. Set CLORO_API_KEY environment variable or pass 'api_key' to constructor.") + + def _get_endpoint(self) -> str: + return f"{self.base_url}/{self.engine}" + + def _make_api_request(self, query: str) -> dict[str, Any]: + endpoint = self._get_endpoint() + + payload: dict[str, Any] = { + "country": self.country, + } + + if self.engine == "google": + payload["query"] = query + payload["device"] = self.device + payload["pages"] = self.pages + payload["include"] = { + "html": False, + "aioverview": {"markdown": True} + } + else: + payload["prompt"] = query + + if self.engine in ["chatgpt", "gemini", "copilot", "perplexity", "aimode"]: + payload["include"] = {"markdown": True} + + headers = { + "Authorization": f"Bearer {self._get_api_key()}", + "Content-Type": "application/json", + } + + response = None + try: + response = requests.post( + endpoint, headers=headers, json=payload, timeout=60 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + error_msg = f"Error making request to cloro API ({self.engine}): {e}" + if response is not None and hasattr(response, "content"): + 
error_msg += f"\nResponse content: {response.content.decode('utf-8', errors='replace')}" + logger.error(error_msg) + raise + + def _run(self, **kwargs: Any) -> FormattedResults: + """Execute the search/query operation.""" + search_query: str | None = kwargs.get("search_query") or kwargs.get("query") + save_file = kwargs.get("save_file", self.save_file) + + if not search_query: + raise ValueError("search_query is required") + + api_response = self._make_api_request(search_query) + + if not api_response.get("success"): + raise ValueError(f"cloro API returned unsuccessful response: {api_response}") + + result = api_response.get("result", {}) + formatted_results: FormattedResults = {} # type: ignore + + # Process Google Search Results + if self.engine == "google": + if "organicResults" in result: + formatted_results["organic"] = result["organicResults"] + if "peopleAlsoAsk" in result: + formatted_results["peopleAlsoAsk"] = result["peopleAlsoAsk"] + if "relatedSearches" in result: + formatted_results["relatedSearches"] = result["relatedSearches"] + if "aioverview" in result: + formatted_results["ai_overview"] = result["aioverview"] + + # Process LLM Results + else: + if "text" in result: + formatted_results["text"] = result["text"] + if "sources" in result: + formatted_results["sources"] = result["sources"] + + # Map rich content if available + if "shopping_cards" in result: + formatted_results["shopping_cards"] = result["shopping_cards"] + elif "shoppingCards" in result: + formatted_results["shopping_cards"] = result["shoppingCards"] + + if "hotels" in result: + formatted_results["hotels"] = result["hotels"] + if "places" in result: + formatted_results["places"] = result["places"] + if "videos" in result: + formatted_results["videos"] = result["videos"] + if "images" in result: + formatted_results["images"] = result["images"] + + if "related_queries" in result: + formatted_results["related_queries"] = result["related_queries"] + elif "relatedQueries" in result: + 
formatted_results["related_queries"] = result["relatedQueries"] + + if "entities" in result: + formatted_results["entities"] = result["entities"] + + if save_file: + _save_results_to_file(json.dumps(formatted_results, indent=2)) + + return formatted_results \ No newline at end of file diff --git a/lib/crewai-tools/tests/tools/cloro_tool_test.py b/lib/crewai-tools/tests/tools/cloro_tool_test.py new file mode 100644 index 0000000000..75fdc2db95 --- /dev/null +++ b/lib/crewai-tools/tests/tools/cloro_tool_test.py @@ -0,0 +1,195 @@ +import os +import pytest +from unittest.mock import patch, MagicMock +from crewai_tools.tools.cloro_tool.cloro_tool import CloroTool + +@pytest.fixture(autouse=True) +def mock_cloro_api_key(): + with patch.dict(os.environ, {"CLORO_API_KEY": "test_key"}): + yield + +@patch("requests.post") +def test_cloro_tool_google_search(mock_post): + tool = CloroTool(engine="google") + mock_response = { + "success": True, + "result": { + "organicResults": [ + { + "title": "Test Title", + "link": "http://test.com", + "snippet": "Test Snippet" + } + ], + "aioverview": {"markdown": "**AI Overview**"} + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = tool.run(search_query="test query") + + assert "organic" in result + assert result["organic"][0]["title"] == "Test Title" + assert "ai_overview" in result + assert result["ai_overview"]["markdown"] == "**AI Overview**" + + # Check payload + called_payload = mock_post.call_args.kwargs["json"] + assert "query" in called_payload + assert called_payload["query"] == "test query" + assert "include" in called_payload + assert called_payload["include"].get("aioverview", {}).get("markdown") is True + + +@patch("requests.post") +def test_cloro_tool_chatgpt_query(mock_post): + tool = CloroTool(engine="chatgpt") + mock_response = { + "success": True, + "result": { + "text": "ChatGPT response", + "markdown": "**ChatGPT response**", + "shoppingCards": 
[{"title": "Product 1", "price": "$10"}] + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = tool.run(search_query="test prompt") + + assert "text" in result + assert result["text"] == "ChatGPT response" + + # Verify rich content processing (camelCase normalization) + assert "shopping_cards" in result + assert result["shopping_cards"][0]["title"] == "Product 1" + + # Check payload + called_payload = mock_post.call_args.kwargs["json"] + assert "prompt" in called_payload + assert called_payload["prompt"] == "test prompt" + + +@patch("requests.post") +def test_cloro_tool_gemini_query(mock_post): + tool = CloroTool(engine="gemini") + mock_response = { + "success": True, + "result": { + "text": "Gemini response", + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = tool.run(search_query="gemini prompt") + + assert "text" in result + assert result["text"] == "Gemini response" + + +@patch("requests.post") +def test_cloro_tool_copilot_query(mock_post): + tool = CloroTool(engine="copilot") + mock_response = { + "success": True, + "result": { + "text": "Copilot response", + "sources": [{"title": "Source 1", "link": "http://source1.com"}] + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = tool.run(search_query="copilot prompt") + + assert "text" in result + assert "sources" in result + assert result["sources"][0]["title"] == "Source 1" + + +@patch("requests.post") +def test_cloro_tool_perplexity_query(mock_post): + tool = CloroTool(engine="perplexity") + mock_response = { + "success": True, + "result": { + "text": "Perplexity response", + "shopping_cards": [{"title": "Product 2", "price": "$20"}], + "related_queries": ["query 1", "query 2"] + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = 
tool.run(search_query="perplexity prompt") + + assert "text" in result + + # Verify rich content processing (snake_case) + assert "shopping_cards" in result + assert result["shopping_cards"][0]["title"] == "Product 2" + assert "related_queries" in result + assert len(result["related_queries"]) == 2 + + +@patch("requests.post") +def test_cloro_tool_aimode_query(mock_post): + tool = CloroTool(engine="aimode") + mock_response = { + "success": True, + "result": { + "text": "AI Mode response" + } + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + result = tool.run(search_query="aimode prompt") + + assert "text" in result + + +@patch("requests.post") +def test_api_error_handling(mock_post): + tool = CloroTool() + mock_post.side_effect = Exception("API Error") + + with pytest.raises(Exception) as exc_info: + tool.run(search_query="test") + assert "API Error" in str(exc_info.value) + +@patch("requests.post") +def test_unsuccessful_response(mock_post): + tool = CloroTool() + mock_response = {"success": False} + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + with pytest.raises(ValueError) as exc_info: + tool.run(search_query="test") + assert "cloro API returned unsuccessful response" in str(exc_info.value) + +def test_save_file(): + tool = CloroTool(save_file=True) + + with patch("requests.post") as mock_post, \ + patch("builtins.open", new_callable=MagicMock) as mock_open: + + mock_response = { + "success": True, + "result": {"organicResults": []} + } + mock_post.return_value.json.return_value = mock_response + mock_post.return_value.status_code = 200 + + tool.run(search_query="test") + + # Verify open was called + mock_open.assert_called() + + # Verify write was called on the file handle + # open() returns a context manager, __enter__ returns the file handle + mock_file_handle = mock_open.return_value.__enter__.return_value + 
mock_file_handle.write.assert_called() diff --git a/lib/crewai-tools/tool.specs.json b/lib/crewai-tools/tool.specs.json index ea2cef07a3..325fe57f08 100644 --- a/lib/crewai-tools/tool.specs.json +++ b/lib/crewai-tools/tool.specs.json @@ -940,6 +940,119 @@ "type": "object" } }, + { + "description": "A tool that scrape the user interface LLMs using cloro API. Support Google, ChatGPT, Gemini, Copilot, Perplexity and AI Mode engines", + "env_vars": [ + { + "default": null, + "description": "API key for cloro", + "name": "CLORO_API_KEY", + "required": true + } + ], + "humanized_name": "Prompt with cloro", + "init_params_schema": { + "$defs": { + "EnvVar": { + "properties": { + "default": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Default" + }, + "description": { + "title": "Description", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "required": { + "default": true, + "title": "Required", + "type": "boolean" + } + }, + "required": ["name", "description"], + "title": "EnvVar", + "type": "object" + } + }, + "description": "CloroTool - A tool for scraping LLMs via the cloro API.\n\nAttributes:\n name (str): Tool name.\n description (str): Tool description.\n args_schema (Type[BaseModel]): Pydantic schema for input arguments.\n api_key (Optional[str]): cloro API key.", + "properties": { + "api_key": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Api Key" + }, + "country": { + "default": "US", + "title": "Country", + "type": "string" + }, + "device": { + "default": "desktop", + "title": "Device", + "type": "string" + }, + "engine": { + "default": "google", + "enum": [ + "google", + "chatgpt", + "gemini", + "copilot", + "perplexity", + "aimode" + ], + "title": "Engine", + "type": "string" + }, + "pages": { + "default": 1, + "title": "Pages", + "type": "integer" + }, + "save_file": { + "default": false, + "title": "Save File", 
+ "type": "boolean" + } + }, + "title": "CloroTool", + "type": "object" + }, + "name": "CloroTool", + "package_dependencies": [], + "run_params_schema": { + "description": "Input for CloroDevTool.", + "properties": { + "search_query": { + "description": "Mandatory query/prompt you want to use to search/query the model", + "title": "Search Query", + "type": "string" + } + }, + "required": ["search_query"], + "title": "CloroDevToolSchema", + "type": "object" + } + }, { "description": "A tool that can be used to semantic search a query from a CSV's content.", "env_vars": [],