|
18 | 18 | from .chunking_strategy import ChunkingStrategy, RegexChunking |
19 | 19 |
|
20 | 20 | from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator |
21 | | -from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy |
| 21 | +from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy |
22 | 22 | from .deep_crawling import DeepCrawlStrategy |
23 | 23 |
|
24 | 24 | from .cache_context import CacheMode |
25 | 25 | from .proxy_strategy import ProxyRotationStrategy |
26 | 26 |
|
27 | | -from typing import Union, List |
| 27 | +from typing import Union, List, Callable |
28 | 28 | import inspect |
29 | 29 | from typing import Any, Dict, Optional |
30 | 30 | from enum import Enum |
31 | 31 |
|
| 32 | +# Type alias for URL matching |
| 33 | +UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]] |
| 34 | + |
| 35 | +class MatchMode(Enum): |
| 36 | + OR = "or" |
| 37 | + AND = "and" |
| 38 | + |
32 | 39 | # from .proxy_strategy import ProxyConfig |
33 | 40 |
|
34 | 41 |
|
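The new `UrlMatcher` alias accepts a glob-style string, a predicate over the URL, or a list mixing both; `MatchMode` controls how a list is combined. A minimal sketch of the three shapes (the pattern and lambda values are illustrative, not from this diff):

```python
from typing import Callable

# Glob-style string, tested with fnmatch in is_match() further down
glob_matcher: str = "*://docs.example.com/*"

# Predicate over the raw URL string
fn_matcher: Callable[[str], bool] = lambda url: url.startswith("https://")

# Mixed list, combined per MatchMode (OR by default; AND requires all to pass)
mixed_matcher = [glob_matcher, fn_matcher]
```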
@@ -383,6 +390,8 @@ class BrowserConfig: |
383 | 390 | light_mode (bool): Disables certain background features for performance gains. Default: False. |
384 | 391 | extra_args (list): Additional command-line arguments passed to the browser. |
385 | 392 | Default: []. |
| 393 | + enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection. |
| 394 | +            Cannot be combined with browser_mode="builtin". Default: False.
386 | 395 | """ |
387 | 396 |
|
388 | 397 | def __init__( |
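A minimal usage sketch for the new flag, assuming `BrowserConfig` is importable from the package root; the actual playwright-stealth patching happens in the browser layer, outside this diff:

```python
from crawl4ai import BrowserConfig  # assumes the package-root re-export

# Stealth on a regular headless Chromium instance
config = BrowserConfig(
    browser_type="chromium",
    headless=True,
    enable_stealth=True,  # applies playwright-stealth per the docstring above
)
```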
@@ -423,6 +432,7 @@ def __init__( |
423 | 432 | extra_args: list = None, |
424 | 433 | debugging_port: int = 9222, |
425 | 434 | host: str = "localhost", |
| 435 | + enable_stealth: bool = False, |
426 | 436 | ): |
427 | 437 | self.browser_type = browser_type |
428 | 438 | self.headless = headless |
@@ -463,6 +473,7 @@ def __init__( |
463 | 473 | self.verbose = verbose |
464 | 474 | self.debugging_port = debugging_port |
465 | 475 | self.host = host |
| 476 | + self.enable_stealth = enable_stealth |
466 | 477 |
|
467 | 478 | fa_user_agenr_generator = ValidUAGenerator() |
468 | 479 | if self.user_agent_mode == "random": |
@@ -494,6 +505,13 @@ def __init__( |
494 | 505 | # If persistent context is requested, ensure managed browser is enabled |
495 | 506 | if self.use_persistent_context: |
496 | 507 | self.use_managed_browser = True |
| 508 | + |
| 509 | + # Validate stealth configuration |
| 510 | + if self.enable_stealth and self.use_managed_browser and self.browser_mode == "builtin": |
| 511 | + raise ValueError( |
| 512 | + "enable_stealth cannot be used with browser_mode='builtin'. " |
| 513 | + "Stealth mode requires a dedicated browser instance." |
| 514 | + ) |
497 | 515 |
|
498 | 516 | @staticmethod |
499 | 517 | def from_kwargs(kwargs: dict) -> "BrowserConfig": |
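The guard above only rejects the three-way combination of stealth, a managed browser, and the builtin mode. A sketch of both sides of the check, assuming `browser_mode` is an existing `BrowserConfig` parameter as the validation implies:

```python
# Rejected: all three conditions of the guard hold
try:
    BrowserConfig(
        enable_stealth=True,
        use_managed_browser=True,
        browser_mode="builtin",
    )
except ValueError as e:
    print(e)  # "enable_stealth cannot be used with browser_mode='builtin'. ..."

# Accepted: stealth with a managed browser in a non-builtin mode
# ("dedicated" is an assumed non-builtin mode value for illustration)
ok = BrowserConfig(enable_stealth=True, use_managed_browser=True, browser_mode="dedicated")
```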
@@ -530,6 +548,7 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig": |
530 | 548 | extra_args=kwargs.get("extra_args", []), |
531 | 549 | debugging_port=kwargs.get("debugging_port", 9222), |
532 | 550 | host=kwargs.get("host", "localhost"), |
| 551 | + enable_stealth=kwargs.get("enable_stealth", False), |
533 | 552 | ) |
534 | 553 |
|
535 | 554 | def to_dict(self): |
@@ -564,6 +583,7 @@ def to_dict(self): |
564 | 583 | "verbose": self.verbose, |
565 | 584 | "debugging_port": self.debugging_port, |
566 | 585 | "host": self.host, |
| 586 | + "enable_stealth": self.enable_stealth, |
567 | 587 | } |
568 | 588 |
|
569 | 589 |
|
@@ -862,7 +882,7 @@ class CrawlerRunConfig(): |
862 | 882 | parser_type (str): Type of parser to use for HTML parsing. |
863 | 883 | Default: "lxml". |
864 | 884 | scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. |
865 | | - Default: WebScrapingStrategy. |
| 885 | + Default: LXMLWebScrapingStrategy. |
866 | 886 | proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. |
867 | 887 | If None, no additional proxy config. Default: None. |
868 | 888 |
|
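With `WebScrapingStrategy` dropped from the imports above, `LXMLWebScrapingStrategy` is now the documented default. Spelling that default out explicitly (the import path assumes a package-root re-export):

```python
from crawl4ai import CrawlerRunConfig, LXMLWebScrapingStrategy

# Equivalent to omitting scraping_strategy entirely under the new default
run_cfg = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy())
```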
@@ -1113,6 +1133,9 @@ def __init__( |
1113 | 1133 | link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None, |
1114 | 1134 | # Virtual Scroll Parameters |
1115 | 1135 | virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None, |
| 1136 | + # URL Matching Parameters |
| 1137 | + url_matcher: Optional[UrlMatcher] = None, |
| 1138 | + match_mode: MatchMode = MatchMode.OR, |
1116 | 1139 | # Experimental Parameters |
1117 | 1140 | experimental: Dict[str, Any] = None, |
1118 | 1141 | ): |
@@ -1266,6 +1289,10 @@ def __init__( |
1266 | 1289 | else: |
1267 | 1290 | raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict") |
1268 | 1291 |
|
| 1292 | + # URL Matching Parameters |
| 1293 | + self.url_matcher = url_matcher |
| 1294 | + self.match_mode = match_mode |
| 1295 | + |
1269 | 1296 | # Experimental Parameters |
1270 | 1297 | self.experimental = experimental or {} |
1271 | 1298 |
|
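The two attributes stored here drive `is_match()` in the next hunk. A sketch of per-URL configs, with illustrative patterns:

```python
# OR (default): use this config when the URL looks like docs OR ends in .pdf
docs_cfg = CrawlerRunConfig(
    url_matcher=["*docs*", lambda u: u.endswith(".pdf")],
)

# AND: every matcher in the list must pass
api_cfg = CrawlerRunConfig(
    url_matcher=["https://*", lambda u: "/api/" in u],
    match_mode=MatchMode.AND,
)
```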
@@ -1321,6 +1348,51 @@ def _compile_c4a_script(self): |
1321 | 1348 | if "compilation error" not in str(e).lower(): |
1322 | 1349 | raise ValueError(f"Failed to compile C4A script: {str(e)}") |
1323 | 1350 | raise |
| 1351 | + |
| 1352 | + def is_match(self, url: str) -> bool: |
| 1353 | + """Check if this config matches the given URL. |
| 1354 | + |
| 1355 | + Args: |
| 1356 | + url: The URL to check against this config's matcher |
| 1357 | + |
| 1358 | + Returns: |
| 1359 | + bool: True if this config should be used for the URL or if no matcher is set. |
| 1360 | + """ |
| 1361 | + if self.url_matcher is None: |
| 1362 | + return True |
| 1363 | + |
| 1364 | + if callable(self.url_matcher): |
| 1365 | + # Single function matcher |
| 1366 | + return self.url_matcher(url) |
| 1367 | + |
| 1368 | + elif isinstance(self.url_matcher, str): |
| 1369 | + # Single pattern string |
| 1370 | + from fnmatch import fnmatch |
| 1371 | + return fnmatch(url, self.url_matcher) |
| 1372 | + |
| 1373 | + elif isinstance(self.url_matcher, list): |
| 1374 | + # List of mixed matchers |
| 1375 | + if not self.url_matcher: # Empty list |
| 1376 | + return False |
| 1377 | + |
| 1378 | + results = [] |
| 1379 | + for matcher in self.url_matcher: |
| 1380 | + if callable(matcher): |
| 1381 | + results.append(matcher(url)) |
| 1382 | + elif isinstance(matcher, str): |
| 1383 | + from fnmatch import fnmatch |
| 1384 | + results.append(fnmatch(url, matcher)) |
| 1385 | + else: |
| 1386 | + # Skip invalid matchers |
| 1387 | + continue |
| 1388 | + |
| 1389 | + # Apply match mode logic |
| 1390 | + if self.match_mode == MatchMode.OR: |
| 1391 | + return any(results) if results else False |
| 1392 | + else: # AND mode |
| 1393 | + return all(results) if results else False |
| 1394 | + |
| 1395 | + return False |
1324 | 1396 |
|
1325 | 1397 |
|
1326 | 1398 | def __getattr__(self, name): |
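How the branches above play out under the default OR mode (URLs are illustrative):

```python
cfg = CrawlerRunConfig(
    url_matcher=["*python.org*", lambda u: u.startswith("https://")],
)

cfg.is_match("https://docs.python.org/3/")  # True: both matchers pass
cfg.is_match("http://python.org")           # True: the glob passes; OR needs only one
cfg.is_match("http://example.com")          # False: no matcher passes

# No matcher set: is_match() is a match-all, so the config applies to every URL
CrawlerRunConfig().is_match("http://example.com")  # True
```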
@@ -1443,6 +1515,9 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": |
1443 | 1515 | # Link Extraction Parameters |
1444 | 1516 | link_preview_config=kwargs.get("link_preview_config"), |
1445 | 1517 | url=kwargs.get("url"), |
| 1518 | + # URL Matching Parameters |
| 1519 | + url_matcher=kwargs.get("url_matcher"), |
| 1520 | + match_mode=kwargs.get("match_mode", MatchMode.OR), |
1446 | 1521 | # Experimental Parameters |
1447 | 1522 | experimental=kwargs.get("experimental"), |
1448 | 1523 | ) |
@@ -1540,6 +1615,8 @@ def to_dict(self): |
1540 | 1615 | "deep_crawl_strategy": self.deep_crawl_strategy, |
1541 | 1616 | "link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None, |
1542 | 1617 | "url": self.url, |
| 1618 | + "url_matcher": self.url_matcher, |
| 1619 | + "match_mode": self.match_mode, |
1543 | 1620 | "experimental": self.experimental, |
1544 | 1621 | } |
1545 | 1622 |
|
|