Commit 47bc688

Author: dylan.min
feat: optimize resource filtering, enhance ad/tracker blocking and cleanup docs
- Refactor BrowserManager to dynamically block resources based on avoid_css and text_mode
- Align text_mode behavior with community standards (no forced CSS blocking)
- Add Top 20 curated ad and tracker patterns for performance
- Restore and translate permanent browser logs in crawler_pool.py
- Clean up models.py schema annotations and server.py docstrings
- Add unit and functional tests for filtering flags
1 parent: a774f8d
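The flags combine on BrowserConfig and are exercised end to end by the functional tests added in this commit; as a minimal usage sketch (the URL and flag combination mirror those tests):

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main() -> None:
    # avoid_css blocks stylesheets, text_mode blocks heavy static assets,
    # and avoid_ads aborts requests matching the curated tracker patterns.
    browser_config = BrowserConfig(headless=True, avoid_ads=True,
                                   avoid_css=True, text_mode=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com",
                                    config=CrawlerRunConfig(cache_mode="bypass"))
        print(result.success)

asyncio.run(main())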

File tree

6 files changed: +119 -74 lines

crawl4ai/browser_manager.py

Lines changed: 26 additions & 71 deletions
@@ -878,79 +878,29 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
         }
         proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

-        blocked_extensions = [
+        # Define resource categories
+        css_extensions = ["css", "less", "scss", "sass"]
+        static_extensions = [
             # Images
-            "jpg",
-            "jpeg",
-            "png",
-            "gif",
-            "webp",
-            "svg",
-            "ico",
-            "bmp",
-            "tiff",
-            "psd",
+            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff",
             # Fonts
-            "woff",
-            "woff2",
-            "ttf",
-            "otf",
-            "eot",
-            # Styles
-            "css", "less", "scss", "sass",
+            "woff", "woff2", "ttf", "otf", "eot",
             # Media
-            "mp4",
-            "webm",
-            "ogg",
-            "avi",
-            "mov",
-            "wmv",
-            "flv",
-            "m4v",
-            "mp3",
-            "wav",
-            "aac",
-            "m4a",
-            "opus",
-            "flac",
-            # Documents
-            "pdf",
-            "doc",
-            "docx",
-            "xls",
-            "xlsx",
-            "ppt",
-            "pptx",
-            # Archives
-            "zip",
-            "rar",
-            "7z",
-            "tar",
-            "gz",
-            # Scripts and data
-            "xml",
-            "swf",
-            "wasm",
+            "mp4", "webm", "ogg", "mp3", "wav", "aac", "flac",
+            # Documents & Archives
+            "pdf", "doc", "docx", "xls", "xlsx", "zip", "rar", "7z", "tar", "gz",
+            # Other
+            "xml", "swf", "wasm"
         ]

-        # Ad and Tracker patterns
+        # Ad and Tracker patterns (Top 20 curated from uBlock sources for performance)
         ad_tracker_patterns = [
-            "**/google-analytics.com/**",
-            "**/googletagmanager.com/**",
-            "**/googlesyndication.com/**",
-            "**/doubleclick.net/**",
-            "**/adservice.google.com/**",
-            "**/adsystem.com/**",
-            "**/adzerk.net/**",
-            "**/adnxs.com/**",
-            "**/ads.linkedin.com/**",
-            "**/facebook.net/**",
-            "**/analytics.twitter.com/**",
-            "**/t.co/**",
-            "**/hotjar.com/**",
-            "**/clarity.ms/**",
-            "**/scorecardresearch.com/**",
-            "**/pixel.wp.com/**",
+            "**/google-analytics.com/**", "**/googletagmanager.com/**", "**/googlesyndication.com/**",
+            "**/doubleclick.net/**", "**/adservice.google.com/**", "**/adsystem.com/**",
+            "**/adzerk.net/**", "**/adnxs.com/**", "**/ads.linkedin.com/**", "**/facebook.net/**",
+            "**/analytics.twitter.com/**", "**/t.co/**", "**/ads-twitter.com/**",
+            "**/hotjar.com/**", "**/clarity.ms/**", "**/scorecardresearch.com/**", "**/pixel.wp.com/**",
+            "**/amazon-adsystem.com/**", "**/mixpanel.com/**", "**/segment.com/**"
         ]

         # Common context settings
@@ -1006,10 +956,15 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
         # Create and return the context with all settings
         context = await self.browser.new_context(**context_settings)

-        # Apply resource filtering based on config
-        if self.config.avoid_css or self.config.text_mode:
-            # Create and apply route patterns for each extension
-            for ext in blocked_extensions:
+        # Apply resource filtering based on config (Dynamic addition)
+        to_block = []
+        if self.config.avoid_css:
+            to_block += css_extensions
+        if self.config.text_mode:
+            to_block += static_extensions
+
+        if to_block:
+            for ext in to_block:
                 await context.route(f"**/*.{ext}", lambda route: route.abort())

         if self.config.avoid_ads:
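
For reference, the route-based blocking boils down to a small Playwright pattern. A standalone sketch under stated assumptions (the demo function name and the abbreviated extension lists are illustrative, not part of this commit):

import asyncio
from playwright.async_api import async_playwright

CSS_EXTENSIONS = ["css", "less", "scss", "sass"]
STATIC_EXTENSIONS = ["jpg", "png", "woff", "mp4", "pdf", "zip"]  # abbreviated

async def demo(avoid_css: bool = True, text_mode: bool = True) -> None:
    # Build the block list dynamically, as the refactored code does.
    to_block = []
    if avoid_css:
        to_block += CSS_EXTENSIONS
    if text_mode:
        to_block += STATIC_EXTENSIONS
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        # One glob route per extension; abort() cancels matching requests.
        for ext in to_block:
            await context.route(f"**/*.{ext}", lambda route: route.abort())
        page = await context.new_page()
        await page.goto("https://example.com")
        await browser.close()

asyncio.run(demo())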

crawl4ai/models.py

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ class CrawlResult(BaseModel):
     redirected_url: Optional[str] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
-    tables: List[Dict] = Field(default_factory=list)  # NEW – [{headers,rows,caption,summary}]
+    tables: List[Dict] = Field(default_factory=list)

     model_config = ConfigDict(arbitrary_types_allowed=True)

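The deleted annotation documented the shape of each table entry; a hedged consumer sketch assuming that {headers, rows, caption, summary} shape (summarize_tables is a hypothetical helper, not part of the library):

from typing import Any, Dict, List

def summarize_tables(tables: List[Dict[str, Any]]) -> None:
    # Each entry is assumed to follow the {headers, rows, caption, summary}
    # shape noted in the comment this commit removes.
    for table in tables:
        headers = table.get("headers", [])
        print(table.get("caption") or "(untitled table)")
        for row in table.get("rows", []):
            print(dict(zip(headers, row)))
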
deploy/docker/crawler_pool.py

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
     if not PERMANENT_BROWSER_DISABLED and PERMANENT and _is_default_config(sig):
         LAST_USED[sig] = time.time()
         USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
-        # logger.info("🔥 Using permanent browser")
+        logger.info("🔥 Using permanent browser")
         return PERMANENT

     # Check hot pool
@@ -172,7 +172,7 @@ async def janitor():
     mem_pct = get_container_memory_percent()

     # Adaptive intervals and TTLs
-    # 严格遵循 BASE_IDLE_TTL,不再做 hot_ttl = ttl * 2 的放大
+    # Strictly follow BASE_IDLE_TTL without multipliers
     if mem_pct > 80:
         interval, cold_ttl, hot_ttl = 10, 30, 60
     elif mem_pct > 60:
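
The janitor derives its sweep cadence from container memory pressure. A sketch of the tiering (only the mem_pct > 80 tier is visible in this hunk; pick_intervals and the lower-tier values below are hypothetical):

def pick_intervals(mem_pct: float) -> tuple[int, int, int]:
    """Return (sweep interval, cold-pool TTL, hot-pool TTL) in seconds."""
    if mem_pct > 80:        # heavy pressure: sweep often, expire fast (values from the diff)
        return 10, 30, 60
    if mem_pct > 60:        # moderate pressure (hypothetical values)
        return 30, 60, 120
    return 60, 120, 300     # relaxed baseline (hypothetical values)
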
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import asyncio
+import os
+import sys
+import pytest
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+@pytest.mark.asyncio
+async def test_resource_filtering_launch():
+    """Functional test to ensure browser launches correctly with filtering flags enabled."""
+    browser_config = BrowserConfig(
+        headless=True,
+        avoid_ads=True,
+        avoid_css=True,
+        text_mode=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Simple crawl to verify functionality
+        result = await crawler.arun(
+            url="https://example.com",
+            config=CrawlerRunConfig(cache_mode="bypass")
+        )
+        assert result.success
+        logger.success("Browser launched and crawled successfully with filtering flags")
+
+@pytest.mark.asyncio
+async def test_avoid_css_only():
+    """Test avoid_css without text_mode."""
+    browser_config = BrowserConfig(
+        headless=True,
+        avoid_css=True,
+        text_mode=False
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=CrawlerRunConfig(cache_mode="bypass")
+        )
+        assert result.success
+        logger.success("Browser launched and crawled successfully with avoid_css only")
+
+if __name__ == "__main__":
+    asyncio.run(test_resource_filtering_launch())
+    asyncio.run(test_avoid_css_only())

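Note: the @pytest.mark.asyncio markers require the pytest-asyncio plugin when running under pytest; the __main__ block above also lets the file run standalone via asyncio.run.
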
tests/general/test_cache_context.py

Lines changed: 2 additions & 0 deletions
@@ -1,7 +1,9 @@
 import asyncio
+import pytest
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from playwright.async_api import Page, BrowserContext

+@pytest.mark.asyncio
 async def test_reuse_context_by_config():
     # We will store each context ID in these maps to confirm reuse
     context_ids_for_A = []

tests/unit/test_config_flags.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import pytest
+from crawl4ai.async_configs import BrowserConfig
+
+def test_browser_config_filtering_flags():
+    """Test that BrowserConfig correctly stores the new filtering flags."""
+    # Default values
+    config = BrowserConfig()
+    assert config.avoid_ads is False
+    assert config.avoid_css is False
+
+    # Custom values
+    config = BrowserConfig(avoid_ads=True, avoid_css=True)
+    assert config.avoid_ads is True
+    assert config.avoid_css is True
+
+    # Check to_dict / from_kwargs parity
+    config_dict = config.to_dict()
+    assert config_dict["avoid_ads"] is True
+    assert config_dict["avoid_css"] is True
+
+    new_config = BrowserConfig.from_kwargs(config_dict)
+    assert new_config.avoid_ads is True
+    assert new_config.avoid_css is True
+
+def test_browser_config_clone():
+    """Test that cloning BrowserConfig preserves the new flags."""
+    config = BrowserConfig(avoid_ads=True, avoid_css=False)
+    cloned = config.clone(avoid_css=True)
+
+    assert cloned.avoid_ads is True
+    assert cloned.avoid_css is True
+    assert config.avoid_css is False  # Original remains unchanged
