10 changes: 10 additions & 0 deletions crawl4ai/async_configs.py
@@ -418,6 +418,8 @@ class BrowserConfig:
Default: [].
enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
Cannot be used with use_undetected browser mode. Default: False.
avoid_ads (bool): If True, attempts to block ad-related network requests. Default: False.
avoid_css (bool): If True, blocks loading of CSS files for faster and leaner crawling. Default: False.
"""

def __init__(
@@ -459,6 +461,8 @@ def __init__(
debugging_port: int = 9222,
host: str = "localhost",
enable_stealth: bool = False,
avoid_ads: bool = False,
avoid_css: bool = False,
):

self.browser_type = browser_type
@@ -514,6 +518,8 @@ def __init__(
self.debugging_port = debugging_port
self.host = host
self.enable_stealth = enable_stealth
self.avoid_ads = avoid_ads
self.avoid_css = avoid_css

fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random":
@@ -589,6 +595,8 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig":
debugging_port=kwargs.get("debugging_port", 9222),
host=kwargs.get("host", "localhost"),
enable_stealth=kwargs.get("enable_stealth", False),
avoid_ads=kwargs.get("avoid_ads", False),
avoid_css=kwargs.get("avoid_css", False),
)

def to_dict(self):
@@ -624,6 +632,8 @@ def to_dict(self):
"debugging_port": self.debugging_port,
"host": self.host,
"enable_stealth": self.enable_stealth,
"avoid_ads": self.avoid_ads,
"avoid_css": self.avoid_css,
}


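Reviewer note: a minimal usage sketch of the two new flags, assuming the public AsyncWebCrawler entry points (arun, result.markdown) are unchanged by this PR; the target URL is illustrative.

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig

async def main():
    browser_cfg = BrowserConfig(
        headless=True,
        avoid_ads=True,   # block the curated ad/tracker request patterns
        avoid_css=True,   # skip stylesheet requests for leaner crawls
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun("https://example.com")
        print(result.markdown[:300])

asyncio.run(main())

The two flags are independent of each other and of text_mode, so avoid_css can also be enabled on its own for style-free but otherwise full-fidelity fetches.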
33 changes: 27 additions & 6 deletions crawl4ai/browser_manager.py
@@ -878,7 +878,9 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
}
proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

blocked_extensions = [
# Define resource categories
css_extensions = ["css", "less", "scss", "sass"]
static_extensions = [
# Images
"jpg",
"jpeg",
@@ -896,8 +898,6 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
"ttf",
"otf",
"eot",
# Styles
# 'css', 'less', 'scss', 'sass',
# Media
"mp4",
"webm",
@@ -933,6 +933,16 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
"wasm",
]

# Ad and Tracker patterns (Top 20 curated from uBlock sources for performance)
ad_tracker_patterns = [
"**/google-analytics.com/**", "**/googletagmanager.com/**", "**/googlesyndication.com/**",
"**/doubleclick.net/**", "**/adservice.google.com/**", "**/adsystem.com/**",
"**/adzerk.net/**", "**/adnxs.com/**", "**/ads.linkedin.com/**", "**/facebook.net/**",
"**/analytics.twitter.com/**", "**/t.co/**", "**/ads-twitter.com/**",
"**/hotjar.com/**", "**/clarity.ms/**", "**/scorecardresearch.com/**", "**/pixel.wp.com/**",
"**/amazon-adsystem.com/**", "**/mixpanel.com/**", "**/segment.com/**"
]

# Common context settings
context_settings = {
"user_agent": user_agent,
@@ -986,11 +996,22 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
# Create and return the context with all settings
context = await self.browser.new_context(**context_settings)

# Apply text mode settings if enabled
# Apply resource filtering based on config (Dynamic addition)
to_block = []
if self.config.avoid_css:
to_block += css_extensions
if self.config.text_mode:
# Create and apply route patterns for each extension
for ext in blocked_extensions:
to_block += static_extensions

if to_block:
for ext in to_block:
await context.route(f"**/*.{ext}", lambda route: route.abort())

if self.config.avoid_ads:
# Apply ad/tracker blocking
for pattern in ad_tracker_patterns:
await context.route(pattern, lambda route: route.abort())

return context

def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
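Reviewer note: a standalone sketch of the same Playwright route-blocking technique used above, handy for checking the glob patterns outside the crawler; the extension list and the single tracker pattern mirror css_extensions and ad_tracker_patterns, and the target URL is illustrative.

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        # Mirror avoid_css: abort any stylesheet request by extension.
        for ext in ("css", "less", "scss", "sass"):
            await context.route(f"**/*.{ext}", lambda route: route.abort())
        # Mirror avoid_ads: abort requests matching one tracker glob.
        await context.route("**/google-analytics.com/**", lambda route: route.abort())
        page = await context.new_page()
        await page.goto("https://example.com")
        print(await page.title())
        await browser.close()

asyncio.run(main())

Registering one route per pattern keeps the change simple; if the pattern list grows, a single route with a matcher callback would reduce the number of handlers.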
53 changes: 24 additions & 29 deletions deploy/docker/api.py
@@ -67,7 +67,8 @@ async def handle_llm_qa(
config: dict
) -> str:
"""Process QA using LLM with crawled content as context."""
from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = None
try:
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
url = 'https://' + url
@@ -121,6 +122,9 @@ async def handle_llm_qa(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
finally:
if crawler:
await release_crawler(crawler)

async def process_llm_extraction(
redis: aioredis.Redis,
@@ -249,6 +253,7 @@ async def handle_markdown_request(
base_url: Optional[str] = None
) -> str:
"""Handle markdown generation requests."""
crawler = None
try:
# Validate provider if using LLM filter
if filter_type == FilterType.LLM:
@@ -282,7 +287,7 @@

cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY

from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
from utils import load_config as _load_config
_cfg = _load_config()
browser_cfg = BrowserConfig(
@@ -315,6 +320,9 @@
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
finally:
if crawler:
await release_crawler(crawler)

async def handle_llm_request(
redis: aioredis.Redis,
@@ -481,6 +489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
"""Stream results with heartbeats and completion markers."""
import json
from utils import datetime_handler
from crawler_pool import release_crawler

try:
async for result in results_gen:
@@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
except asyncio.CancelledError:
logger.warning("Client disconnected during streaming")
finally:
# try:
# await crawler.close()
# except Exception as e:
# logger.error(f"Crawler cleanup error: {e}")
pass
if crawler:
await release_crawler(crawler)

async def handle_crawl_request(
urls: List[str],
@@ -523,6 +529,7 @@
"""Handle non-streaming crawl requests with optional hooks."""
# Track request start
request_id = f"req_{uuid4().hex[:8]}"
crawler = None
try:
from monitor import get_monitor
await get_monitor().track_request_start(
@@ -549,12 +556,9 @@
) if config["crawler"]["rate_limiter"]["enabled"] else None
)

from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = await get_crawler(browser_config)

# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()

# Attach hooks if provided
hooks_status = {}
if hooks_config:
@@ -589,8 +593,6 @@
if not isinstance(results, list):
results = [results]

# await crawler.close()

end_mem_mb = _get_memory_mb() # <--- Get memory after
end_time = time.time()

@@ -689,13 +691,6 @@ async def handle_crawl_request(
except:
pass

if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {str(e)}")

# Measure memory even on error if possible
end_mem_mb_error = _get_memory_mb()
if start_mem_mb is not None and end_mem_mb_error is not None:
@@ -709,6 +704,9 @@
"server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0)
})
)
finally:
if crawler:
await release_crawler(crawler)

async def handle_stream_crawl_request(
urls: List[str],
@@ -719,6 +717,7 @@
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
crawler = None
try:
browser_config = BrowserConfig.load(browser_config)
# browser_config.verbose = True # Set to False or remove for production stress testing
Expand All @@ -734,7 +733,7 @@ async def handle_stream_crawl_request(
)
)

from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = await get_crawler(browser_config)

# crawler = AsyncWebCrawler(config=browser_config)
@@ -763,13 +762,9 @@
return crawler, results_gen, hooks_info

except Exception as e:
# Make sure to close crawler if started during an error here
if 'crawler' in locals() and crawler.ready:
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
# Make sure to release crawler if started during an error here
if crawler:
await release_crawler(crawler)
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
# Raising HTTPException here will prevent streaming response
raise HTTPException(
@@ -852,4 +847,4 @@ async def _runner():
)

background_tasks.add_task(_runner)
return {"task_id": task_id}
return {"task_id": task_id}