Commit 21f79fe

Release v0.7.3: Merge release branch
- Merge release/v0.7.3 into main
- Version: 0.7.3
- Ready for tag and publication
2 parents 18504d7 + a9a2d79 · commit 21f79fe

70 files changed: +7577 / -1454 lines changed


CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- **Flexible LLM Provider Configuration** (Docker):
+  - Support for `LLM_PROVIDER` environment variable to override default provider
+  - Per-request provider override via optional `provider` parameter in API endpoints
+  - Automatic provider validation with clear error messages
+  - Updated Docker documentation and examples
+
+### Changed
+- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
+  - `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
+  - Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
+  - `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
+  - All existing code using `WebScrapingStrategy` continues to work without modification
+  - Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance
+
 ### Added
 - **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
   - Discover URLs from sitemaps and Common Crawl index
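
A minimal sketch of the backward-compatibility claim above: code that imports and configures `WebScrapingStrategy` keeps working, since the name now resolves to the lxml-based implementation. The URL and the explicit strategy arguments below are illustrative only.

```python
# Sketch: WebScrapingStrategy remains importable and usable after the refactor,
# now backed by the lxml implementation per the changelog entry above.
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    LXMLWebScrapingStrategy,
    WebScrapingStrategy,  # backward-compatibility alias
)

# Old-style config keeps working unchanged...
legacy_config = CrawlerRunConfig(scraping_strategy=WebScrapingStrategy())
# ...and is equivalent to configuring the current default strategy explicitly.
current_config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy())

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=legacy_config)
        print(result.success)

if __name__ == "__main__":
    asyncio.run(main())
```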

README.md

Lines changed: 2 additions & 1 deletion
@@ -121,7 +121,8 @@ Crawl4AI is the #1 trending open-source web crawler on GitHub. Your support keep
 - **🌱 Believer ($5/mo)** — Join the movement for data democratization
 - **🚀 Builder ($50/mo)** — Priority support & early access to features
 - **💼 Growing Team ($500/mo)** — Bi-weekly syncs & optimization help
-- **🏢 Data Infrastructure Partner ($2000/mo)** — Full partnership with dedicated support
+- **🏢 Data Infrastructure Partner ($2000/mo)** — Full partnership with dedicated support
+  *Custom arrangements available - see [SPONSORS.md](SPONSORS.md) for details & contact*
 
 **Why sponsor?**
 No rate-limited APIs. No lock-in. Build and own your data pipeline with direct guidance from the creator of Crawl4AI.

SPONSORS.md

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ Crawl4AI is the #1 trending open-source web crawler. We're building the future o
 
 Building data extraction at scale? Need dedicated support or infrastructure? Let's talk about a custom partnership.
 
-📧 Contact: [[email protected]](mailto:[email protected])
+📧 Contact: [[email protected]](mailto:[email protected]) | 📅 [Schedule a call](https://calendar.app.google/rEpvi2UBgUQjWHfJ9)
 
 ---
 

crawl4ai/__init__.py

Lines changed: 14 additions & 2 deletions
@@ -3,12 +3,12 @@
 
 from .async_webcrawler import AsyncWebCrawler, CacheMode
 # MODIFIED: Add SeedingConfig and VirtualScrollConfig here
-from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
 
 from .content_scraping_strategy import (
     ContentScrapingStrategy,
-    WebScrapingStrategy,
     LXMLWebScrapingStrategy,
+    WebScrapingStrategy,  # Backward compatibility alias
 )
 from .async_logger import (
     AsyncLoggerBase,
@@ -88,6 +88,13 @@
     ErrorDetail
 )
 
+# Browser Adapters
+from .browser_adapter import (
+    BrowserAdapter,
+    PlaywrightAdapter,
+    UndetectedAdapter
+)
+
 from .utils import (
     start_colab_display_server,
     setup_colab_environment
@@ -132,6 +139,7 @@
     "CrawlResult",
     "CrawlerHub",
     "CacheMode",
+    "MatchMode",
     "ContentScrapingStrategy",
     "WebScrapingStrategy",
     "LXMLWebScrapingStrategy",
@@ -173,6 +181,10 @@
     "CompilationResult",
     "ValidationResult",
     "ErrorDetail",
+    # Browser Adapters
+    "BrowserAdapter",
+    "PlaywrightAdapter",
+    "UndetectedAdapter",
     "LinkPreviewConfig"
 ]
 
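The newly exported adapter classes can now be imported straight from the package root. The sketch below shows one plausible way to plug `UndetectedAdapter` into a crawl; note that the `AsyncPlaywrightCrawlerStrategy` import path and its `browser_adapter`/`browser_config` parameters are assumptions not shown in this commit.

```python
# Sketch only: the package-level exports (BrowserAdapter, PlaywrightAdapter,
# UndetectedAdapter) come from this commit; the strategy wiring below is an
# assumption about how an adapter is consumed, not confirmed by this diff.
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy  # assumed path

async def main():
    browser_config = BrowserConfig(headless=True)
    strategy = AsyncPlaywrightCrawlerStrategy(
        browser_config=browser_config,
        browser_adapter=UndetectedAdapter(),  # assumed parameter name
    )
    async with AsyncWebCrawler(crawler_strategy=strategy, config=browser_config) as crawler:
        result = await crawler.arun("https://example.com", config=CrawlerRunConfig())
        print(result.success)

if __name__ == "__main__":
    asyncio.run(main())
```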

crawl4ai/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # crawl4ai/__version__.py
 
 # This is the version that will be used for stable releases
-__version__ = "0.7.2"
+__version__ = "0.7.3"
 
 
 __nightly_version__ = None

crawl4ai/async_configs.py

Lines changed: 80 additions & 3 deletions
@@ -18,17 +18,24 @@
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 
 from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
-from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
+from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
 from .deep_crawling import DeepCrawlStrategy
 
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
 
-from typing import Union, List
+from typing import Union, List, Callable
 import inspect
 from typing import Any, Dict, Optional
 from enum import Enum
 
+# Type alias for URL matching
+UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
+
+class MatchMode(Enum):
+    OR = "or"
+    AND = "and"
+
 # from .proxy_strategy import ProxyConfig
 
 
@@ -383,6 +390,8 @@ class BrowserConfig:
         light_mode (bool): Disables certain background features for performance gains. Default: False.
         extra_args (list): Additional command-line arguments passed to the browser.
             Default: [].
+        enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
+            Cannot be used with use_undetected browser mode. Default: False.
     """
 
     def __init__(
@@ -423,6 +432,7 @@ def __init__(
         extra_args: list = None,
         debugging_port: int = 9222,
         host: str = "localhost",
+        enable_stealth: bool = False,
     ):
         self.browser_type = browser_type
         self.headless = headless
@@ -463,6 +473,7 @@ def __init__(
         self.verbose = verbose
         self.debugging_port = debugging_port
         self.host = host
+        self.enable_stealth = enable_stealth
 
         fa_user_agenr_generator = ValidUAGenerator()
         if self.user_agent_mode == "random":
@@ -494,6 +505,13 @@ def __init__(
         # If persistent context is requested, ensure managed browser is enabled
         if self.use_persistent_context:
            self.use_managed_browser = True
+
+        # Validate stealth configuration
+        if self.enable_stealth and self.use_managed_browser and self.browser_mode == "builtin":
+            raise ValueError(
+                "enable_stealth cannot be used with browser_mode='builtin'. "
+                "Stealth mode requires a dedicated browser instance."
+            )
 
     @staticmethod
     def from_kwargs(kwargs: dict) -> "BrowserConfig":
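
The new flag is an ordinary `BrowserConfig` constructor argument, validated as shown in the hunk above. A minimal sketch of turning it on (the target URL is a placeholder):

```python
# Sketch: enable_stealth is a plain BrowserConfig flag introduced in this commit;
# per the validation above it cannot be combined with browser_mode="builtin".
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig

async def main():
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,  # applies playwright-stealth evasions
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com")
        print(result.success)

if __name__ == "__main__":
    asyncio.run(main())
```
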
@@ -530,6 +548,7 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig":
             extra_args=kwargs.get("extra_args", []),
             debugging_port=kwargs.get("debugging_port", 9222),
             host=kwargs.get("host", "localhost"),
+            enable_stealth=kwargs.get("enable_stealth", False),
         )
 
     def to_dict(self):
@@ -564,6 +583,7 @@ def to_dict(self):
             "verbose": self.verbose,
             "debugging_port": self.debugging_port,
             "host": self.host,
+            "enable_stealth": self.enable_stealth,
         }
 
 

@@ -862,7 +882,7 @@ class CrawlerRunConfig():
         parser_type (str): Type of parser to use for HTML parsing.
             Default: "lxml".
         scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
-            Default: WebScrapingStrategy.
+            Default: LXMLWebScrapingStrategy.
         proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
             If None, no additional proxy config. Default: None.
 
@@ -1113,6 +1133,9 @@ def __init__(
         link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
         # Virtual Scroll Parameters
         virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
+        # URL Matching Parameters
+        url_matcher: Optional[UrlMatcher] = None,
+        match_mode: MatchMode = MatchMode.OR,
         # Experimental Parameters
         experimental: Dict[str, Any] = None,
     ):
@@ -1266,6 +1289,10 @@ def __init__(
         else:
             raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
 
+        # URL Matching Parameters
+        self.url_matcher = url_matcher
+        self.match_mode = match_mode
+
         # Experimental Parameters
         self.experimental = experimental or {}
 
@@ -1321,6 +1348,51 @@ def _compile_c4a_script(self):
             if "compilation error" not in str(e).lower():
                 raise ValueError(f"Failed to compile C4A script: {str(e)}")
             raise
+
+    def is_match(self, url: str) -> bool:
+        """Check if this config matches the given URL.
+
+        Args:
+            url: The URL to check against this config's matcher
+
+        Returns:
+            bool: True if this config should be used for the URL or if no matcher is set.
+        """
+        if self.url_matcher is None:
+            return True
+
+        if callable(self.url_matcher):
+            # Single function matcher
+            return self.url_matcher(url)
+
+        elif isinstance(self.url_matcher, str):
+            # Single pattern string
+            from fnmatch import fnmatch
+            return fnmatch(url, self.url_matcher)
+
+        elif isinstance(self.url_matcher, list):
+            # List of mixed matchers
+            if not self.url_matcher:  # Empty list
+                return False
+
+            results = []
+            for matcher in self.url_matcher:
+                if callable(matcher):
+                    results.append(matcher(url))
+                elif isinstance(matcher, str):
+                    from fnmatch import fnmatch
+                    results.append(fnmatch(url, matcher))
+                else:
+                    # Skip invalid matchers
+                    continue
+
+            # Apply match mode logic
+            if self.match_mode == MatchMode.OR:
+                return any(results) if results else False
+            else:  # AND mode
+                return all(results) if results else False
+
+        return False
 
 
     def __getattr__(self, name):
@@ -1443,6 +1515,9 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
             # Link Extraction Parameters
             link_preview_config=kwargs.get("link_preview_config"),
             url=kwargs.get("url"),
+            # URL Matching Parameters
+            url_matcher=kwargs.get("url_matcher"),
+            match_mode=kwargs.get("match_mode", MatchMode.OR),
             # Experimental Parameters
             experimental=kwargs.get("experimental"),
         )
@@ -1540,6 +1615,8 @@ def to_dict(self):
             "deep_crawl_strategy": self.deep_crawl_strategy,
             "link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
             "url": self.url,
+            "url_matcher": self.url_matcher,
+            "match_mode": self.match_mode,
             "experimental": self.experimental,
         }
 
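Taken together, the `url_matcher`/`match_mode` parameters and the `is_match()` helper added above let a `CrawlerRunConfig` declare which URLs it applies to, using glob patterns (via `fnmatch`), predicate functions, or a mixed list of both. A small sketch exercising `is_match()` directly; the patterns and URLs are illustrative:

```python
# Minimal sketch of the new URL matching behaviour, based on the is_match()
# implementation added in this commit (glob patterns and/or predicates).
from crawl4ai import CrawlerRunConfig, MatchMode

# Single glob pattern: matches any URL ending in .pdf.
pdf_config = CrawlerRunConfig(url_matcher="*.pdf")

# Mixed matchers combined with AND: must be under docs.example.com AND use https.
docs_config = CrawlerRunConfig(
    url_matcher=[
        "https://docs.example.com/*",
        lambda url: url.startswith("https://"),
    ],
    match_mode=MatchMode.AND,
)

print(pdf_config.is_match("https://example.com/report.pdf"))   # True
print(pdf_config.is_match("https://example.com/index.html"))   # False
print(docs_config.is_match("https://docs.example.com/intro"))  # True

# A config with no url_matcher matches every URL.
print(CrawlerRunConfig().is_match("https://anything.example"))  # True
```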
