Skip to content

Commit 4ebb511

Browse files
authored
Fix Firecrawl tools & adding tests (#3810)
* fix: fix Firecrawl Scrape tool
* fix: fix Firecrawl Search tool
* fix: fix Firecrawl Website tool
* tests: adding tests for Firecrawl
1 parent 70b0839 commit 4ebb511

File tree

9 files changed

+2873
-45
lines changed

9 files changed

+2873
-45
lines changed

lib/crewai-tools/src/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,23 @@ class FirecrawlCrawlWebsiteToolSchema(BaseModel):
2222

2323

2424
class FirecrawlCrawlWebsiteTool(BaseTool):
25-
"""Tool for crawling websites using Firecrawl. To run this tool, you need to have a Firecrawl API key.
25+
"""Tool for crawling websites using Firecrawl v2 API. To run this tool, you need to have a Firecrawl API key.
2626
2727
Args:
2828
api_key (str): Your Firecrawl API key.
29-
config (dict): Optional. It contains Firecrawl API parameters.
29+
config (dict): Optional. It contains Firecrawl v2 API parameters.
3030
31-
Default configuration options:
32-
max_depth (int): Maximum depth to crawl. Default: 2
31+
Default configuration options (Firecrawl v2 API):
32+
max_discovery_depth (int): Maximum depth for discovering pages. Default: 2
3333
ignore_sitemap (bool): Whether to ignore sitemap. Default: True
34-
limit (int): Maximum number of pages to crawl. Default: 100
35-
allow_backward_links (bool): Allow crawling backward links. Default: False
34+
limit (int): Maximum number of pages to crawl. Default: 10
3635
allow_external_links (bool): Allow crawling external links. Default: False
37-
scrape_options (ScrapeOptions): Options for scraping content
38-
- formats (list[str]): Content formats to return. Default: ["markdown", "screenshot", "links"]
36+
allow_subdomains (bool): Allow crawling subdomains. Default: False
37+
delay (int): Delay between requests in seconds. Default: None
38+
scrape_options (dict): Options for scraping content
39+
- formats (list[str]): Content formats to return. Default: ["markdown"]
3940
- only_main_content (bool): Only return main content. Default: True
40-
- timeout (int): Timeout in milliseconds. Default: 30000
41+
- timeout (int): Timeout in milliseconds. Default: 10000
4142
"""
4243

4344
model_config = ConfigDict(
@@ -49,14 +50,15 @@ class FirecrawlCrawlWebsiteTool(BaseTool):
4950
api_key: str | None = None
5051
config: dict[str, Any] | None = Field(
5152
default_factory=lambda: {
52-
"maxDepth": 2,
53-
"ignoreSitemap": True,
53+
"max_discovery_depth": 2,
54+
"ignore_sitemap": True,
5455
"limit": 10,
55-
"allowBackwardLinks": False,
56-
"allowExternalLinks": False,
57-
"scrapeOptions": {
58-
"formats": ["markdown", "screenshot", "links"],
59-
"onlyMainContent": True,
56+
"allow_external_links": False,
57+
"allow_subdomains": False,
58+
"delay": None,
59+
"scrape_options": {
60+
"formats": ["markdown"],
61+
"only_main_content": True,
6062
"timeout": 10000,
6163
},
6264
}
@@ -107,7 +109,7 @@ def _run(self, url: str):
107109
if not self._firecrawl:
108110
raise RuntimeError("FirecrawlApp not properly initialized")
109111

110-
return self._firecrawl.crawl_url(url, poll_interval=2, params=self.config)
112+
return self._firecrawl.crawl(url=url, poll_interval=2, **self.config)
111113

112114

113115
try:

lib/crewai-tools/src/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,27 @@ class FirecrawlScrapeWebsiteToolSchema(BaseModel):
2222

2323

2424
class FirecrawlScrapeWebsiteTool(BaseTool):
25-
"""Tool for scraping webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
25+
"""Tool for scraping webpages using Firecrawl v2 API. To run this tool, you need to have a Firecrawl API key.
2626
2727
Args:
2828
api_key (str): Your Firecrawl API key.
29-
config (dict): Optional. It contains Firecrawl API parameters.
29+
config (dict): Optional. It contains Firecrawl v2 API parameters.
3030
31-
Default configuration options:
31+
Default configuration options (Firecrawl v2 API):
3232
formats (list[str]): Content formats to return. Default: ["markdown"]
33-
onlyMainContent (bool): Only return main content. Default: True
34-
includeTags (list[str]): Tags to include. Default: []
35-
excludeTags (list[str]): Tags to exclude. Default: []
36-
headers (dict): Headers to include. Default: {}
37-
waitFor (int): Time to wait for page to load in ms. Default: 0
38-
json_options (dict): Options for JSON extraction. Default: None
33+
only_main_content (bool): Only return main content excluding headers, navs, footers, etc. Default: True
34+
include_tags (list[str]): Tags to include in the output. Default: []
35+
exclude_tags (list[str]): Tags to exclude from the output. Default: []
36+
max_age (int): Return a cached version of the page if it is younger than this age, in milliseconds. Default: 172800000 (2 days)
37+
headers (dict): Headers to send with the request (e.g., cookies, user-agent). Default: {}
38+
wait_for (int): Delay in milliseconds before fetching content. Default: 0
39+
mobile (bool): Emulate scraping from a mobile device. Default: False
40+
skip_tls_verification (bool): Skip TLS certificate verification. Default: True
41+
timeout (int): Request timeout in milliseconds. Default: None
42+
remove_base64_images (bool): Remove base64 images from output. Default: True
43+
block_ads (bool): Enable ad-blocking and cookie popup blocking. Default: True
44+
proxy (str): Proxy type ("basic", "stealth", "auto"). Default: "auto"
45+
store_in_cache (bool): Store page in Firecrawl index and cache. Default: True
3946
"""
4047

4148
model_config = ConfigDict(
@@ -48,11 +55,18 @@ class FirecrawlScrapeWebsiteTool(BaseTool):
4855
config: dict[str, Any] = Field(
4956
default_factory=lambda: {
5057
"formats": ["markdown"],
51-
"onlyMainContent": True,
52-
"includeTags": [],
53-
"excludeTags": [],
58+
"only_main_content": True,
59+
"include_tags": [],
60+
"exclude_tags": [],
61+
"max_age": 172800000, # 2 days cache
5462
"headers": {},
55-
"waitFor": 0,
63+
"wait_for": 0,
64+
"mobile": False,
65+
"skip_tls_verification": True,
66+
"remove_base64_images": True,
67+
"block_ads": True,
68+
"proxy": "auto",
69+
"store_in_cache": True,
5670
}
5771
)
5872

@@ -95,7 +109,7 @@ def _run(self, url: str):
95109
if not self._firecrawl:
96110
raise RuntimeError("FirecrawlApp not properly initialized")
97111

98-
return self._firecrawl.scrape_url(url, params=self.config)
112+
return self._firecrawl.scrape(url=url, **self.config)
99113

100114

101115
try:

lib/crewai-tools/src/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,24 @@ class FirecrawlSearchToolSchema(BaseModel):
2323

2424

2525
class FirecrawlSearchTool(BaseTool):
26-
"""Tool for searching webpages using Firecrawl. To run this tool, you need to have a Firecrawl API key.
26+
"""Tool for searching webpages using Firecrawl v2 API. To run this tool, you need to have a Firecrawl API key.
2727
2828
Args:
2929
api_key (str): Your Firecrawl API key.
30-
config (dict): Optional. It contains Firecrawl API parameters.
31-
32-
Default configuration options:
33-
limit (int): Maximum number of pages to crawl. Default: 5
34-
tbs (str): Time before search. Default: None
35-
lang (str): Language. Default: "en"
36-
country (str): Country. Default: "us"
37-
location (str): Location. Default: None
38-
timeout (int): Timeout in milliseconds. Default: 60000
30+
config (dict): Optional. It contains Firecrawl v2 API parameters.
31+
32+
Default configuration options (Firecrawl v2 API):
33+
limit (int): Maximum number of search results to return. Default: 5
34+
tbs (str): Time-based search filter (e.g., "qdr:d" for past day). Default: None
35+
location (str): Location for search results. Default: None
36+
timeout (int): Request timeout in milliseconds. Default: None
37+
scrape_options (dict): Options for scraping the search results. Per-option defaults are listed below.
38+
- formats (list[str]): Content formats to return. Default: ["markdown"]
39+
- only_main_content (bool): Only return main content. Default: True
40+
- include_tags (list[str]): Tags to include. Default: []
41+
- exclude_tags (list[str]): Tags to exclude. Default: []
42+
- wait_for (int): Delay before fetching content in ms. Default: 0
43+
- timeout (int): Request timeout in milliseconds. Default: None
3944
"""
4045

4146
model_config = ConfigDict(
@@ -49,10 +54,15 @@ class FirecrawlSearchTool(BaseTool):
4954
default_factory=lambda: {
5055
"limit": 5,
5156
"tbs": None,
52-
"lang": "en",
53-
"country": "us",
5457
"location": None,
55-
"timeout": 60000,
58+
"timeout": None,
59+
"scrape_options": {
60+
"formats": ["markdown"],
61+
"only_main_content": True,
62+
"include_tags": [],
63+
"exclude_tags": [],
64+
"wait_for": 0,
65+
},
5666
}
5767
)
5868
_firecrawl: FirecrawlApp | None = PrivateAttr(None)
@@ -106,7 +116,7 @@ def _run(
106116

107117
return self._firecrawl.search(
108118
query=query,
109-
params=self.config,
119+
**self.config,
110120
)
111121

112122

0 commit comments

Comments
 (0)