diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 10cc48d08..3d683a06c 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -418,6 +418,8 @@ class BrowserConfig: Default: []. enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection. Cannot be used with use_undetected browser mode. Default: False. + avoid_ads (bool): If True, attempts to block ad-related network requests. Default: False. + avoid_css (bool): If True, blocks loading of CSS files for faster and leaner crawling. Default: False. """ def __init__( @@ -459,6 +461,8 @@ def __init__( debugging_port: int = 9222, host: str = "localhost", enable_stealth: bool = False, + avoid_ads: bool = False, + avoid_css: bool = False, ): self.browser_type = browser_type @@ -514,6 +518,8 @@ def __init__( self.debugging_port = debugging_port self.host = host self.enable_stealth = enable_stealth + self.avoid_ads = avoid_ads + self.avoid_css = avoid_css fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -589,6 +595,8 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig": debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), enable_stealth=kwargs.get("enable_stealth", False), + avoid_ads=kwargs.get("avoid_ads", False), + avoid_css=kwargs.get("avoid_css", False), ) def to_dict(self): @@ -624,6 +632,8 @@ def to_dict(self): "debugging_port": self.debugging_port, "host": self.host, "enable_stealth": self.enable_stealth, + "avoid_ads": self.avoid_ads, + "avoid_css": self.avoid_css, } diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 3ca96aed4..9f2efd3d6 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -878,7 +878,9 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None } proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - blocked_extensions = [ + # Define resource categories + css_extensions = ["css", "less", "scss", "sass"] + static_extensions = [ # Images "jpg", "jpeg", @@ -896,8 +898,6 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None "ttf", "otf", "eot", - # Styles - # 'css', 'less', 'scss', 'sass', # Media "mp4", "webm", @@ -933,6 +933,16 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None "wasm", ] + # Ad and Tracker patterns (Top 20 curated from uBlock sources for performance) + ad_tracker_patterns = [ + "**/google-analytics.com/**", "**/googletagmanager.com/**", "**/googlesyndication.com/**", + "**/doubleclick.net/**", "**/adservice.google.com/**", "**/adsystem.com/**", + "**/adzerk.net/**", "**/adnxs.com/**", "**/ads.linkedin.com/**", "**/facebook.net/**", + "**/analytics.twitter.com/**", "**/t.co/**", "**/ads-twitter.com/**", + "**/hotjar.com/**", "**/clarity.ms/**", "**/scorecardresearch.com/**", "**/pixel.wp.com/**", + "**/amazon-adsystem.com/**", "**/mixpanel.com/**", "**/segment.com/**" + ] + # Common context settings context_settings = { "user_agent": user_agent, @@ -986,11 +996,22 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None # Create and return the context with all settings context = await self.browser.new_context(**context_settings) - # Apply text mode settings if enabled + # Apply resource filtering based on config (Dynamic addition) + to_block = [] + if self.config.avoid_css: + to_block += css_extensions if self.config.text_mode: - # Create and apply route patterns for each extension - 
for ext in blocked_extensions: + to_block += static_extensions + + if to_block: + for ext in to_block: await context.route(f"**/*.{ext}", lambda route: route.abort()) + + if self.config.avoid_ads: + # Apply ad/tracker blocking + for pattern in ad_tracker_patterns: + await context.route(pattern, lambda route: route.abort()) + return context def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 81cd312ab..5892b8019 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -67,7 +67,8 @@ async def handle_llm_qa( config: dict ) -> str: """Process QA using LLM with crawled content as context.""" - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")): url = 'https://' + url @@ -121,6 +122,9 @@ async def handle_llm_qa( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def process_llm_extraction( redis: aioredis.Redis, @@ -249,6 +253,7 @@ async def handle_markdown_request( base_url: Optional[str] = None ) -> str: """Handle markdown generation requests.""" + crawler = None try: # Validate provider if using LLM filter if filter_type == FilterType.LLM: @@ -282,7 +287,7 @@ async def handle_markdown_request( cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler from utils import load_config as _load_config _cfg = _load_config() browser_cfg = BrowserConfig( @@ -315,6 +320,9 @@ async def handle_markdown_request( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_llm_request( redis: aioredis.Redis, @@ -481,6 +489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) """Stream results with heartbeats and completion markers.""" import json from utils import datetime_handler + from crawler_pool import release_crawler try: async for result in results_gen: @@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) except asyncio.CancelledError: logger.warning("Client disconnected during streaming") finally: - # try: - # await crawler.close() - # except Exception as e: - # logger.error(f"Crawler cleanup error: {e}") - pass + if crawler: + await release_crawler(crawler) async def handle_crawl_request( urls: List[str], @@ -523,6 +529,7 @@ async def handle_crawl_request( """Handle non-streaming crawl requests with optional hooks.""" # Track request start request_id = f"req_{uuid4().hex[:8]}" + crawler = None try: from monitor import get_monitor await get_monitor().track_request_start( @@ -549,12 +556,9 @@ async def handle_crawl_request( ) if config["crawler"]["rate_limiter"]["enabled"] else None ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) - # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) - # await crawler.start() - # Attach hooks if provided hooks_status = {} if hooks_config: @@ -589,8 +593,6 @@ async def handle_crawl_request( if not isinstance(results, list): results = [results] - # await crawler.close() - end_mem_mb = _get_memory_mb() # <--- Get memory after end_time = time.time() @@ -689,13 +691,6 @@ 
async def handle_crawl_request( except: pass - if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {str(e)}") - # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() if start_mem_mb is not None and end_mem_mb_error is not None: @@ -709,6 +704,9 @@ async def handle_crawl_request( "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) }) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_stream_crawl_request( urls: List[str], @@ -719,6 +717,7 @@ async def handle_stream_crawl_request( ) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]: """Handle streaming crawl requests with optional hooks.""" hooks_info = None + crawler = None try: browser_config = BrowserConfig.load(browser_config) # browser_config.verbose = True # Set to False or remove for production stress testing @@ -734,7 +733,7 @@ async def handle_stream_crawl_request( ) ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) # crawler = AsyncWebCrawler(config=browser_config) @@ -763,13 +762,9 @@ async def handle_stream_crawl_request( return crawler, results_gen, hooks_info except Exception as e: - # Make sure to close crawler if started during an error here - if 'crawler' in locals() and crawler.ready: - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {str(e)}") + # Make sure to release crawler if started during an error here + if crawler: + await release_crawler(crawler) logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( @@ -852,4 +847,4 @@ async def _runner(): ) background_tasks.add_task(_runner) - return {"task_id": task_id} \ No newline at end of file + return {"task_id": task_id} diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 509cbba92..681022c98 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -1,5 +1,6 @@ # crawler_pool.py - Smart browser pool with tiered management -import asyncio, json, hashlib, time +import asyncio, json, hashlib, time, os +import psutil from contextlib import suppress from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig @@ -13,6 +14,7 @@ PERMANENT: Optional[AsyncWebCrawler] = None # Always-ready default browser HOT_POOL: Dict[str, AsyncWebCrawler] = {} # Frequent configs COLD_POOL: Dict[str, AsyncWebCrawler] = {} # Rare configs +RETIRED_POOL: Dict[str, AsyncWebCrawler] = {} # Browsers marked for retirement LAST_USED: Dict[str, float] = {} USAGE_COUNT: Dict[str, int] = {} LOCK = asyncio.Lock() @@ -22,6 +24,15 @@ BASE_IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 300) DEFAULT_CONFIG_SIG = None # Cached sig for default config +# Retirement Config (from env) +RETIREMENT_ENABLED = os.getenv("CRAWL4AI_BROWSER_RETIREMENT_ENABLED", "false").lower() == "true" +POOL_AUDIT_ENABLED = os.getenv("CRAWL4AI_POOL_AUDIT_ENABLED", "false").lower() == "true" +PERMANENT_BROWSER_DISABLED = 
os.getenv("CRAWL4AI_PERMANENT_BROWSER_DISABLED", "false").lower() == "true" + +MAX_USAGE_COUNT = int(os.getenv("CRAWL4AI_BROWSER_MAX_USAGE", "100")) +MEMORY_RETIRE_THRESHOLD = int(os.getenv("CRAWL4AI_MEMORY_RETIRE_THRESHOLD", "75")) +MEMORY_RETIRE_MIN_USAGE = int(os.getenv("CRAWL4AI_MEMORY_RETIRE_MIN_USAGE", "10")) + def _sig(cfg: BrowserConfig) -> str: """Generate config signature.""" payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) @@ -35,8 +46,8 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: """Get crawler from pool with tiered strategy.""" sig = _sig(cfg) async with LOCK: - # Check permanent browser for default config - if PERMANENT and _is_default_config(sig): + # Use permanent browser if not disabled and config matches + if not PERMANENT_BROWSER_DISABLED and PERMANENT and _is_default_config(sig): LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 logger.info("πŸ”₯ Using permanent browser") @@ -44,12 +55,36 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: # Check hot pool if sig in HOT_POOL: - LAST_USED[sig] = time.time() - USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 - logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})") - return HOT_POOL[sig] - - # Check cold pool (promote to hot if used 3+ times) + crawler = HOT_POOL[sig] + usage = USAGE_COUNT.get(sig, 0) + + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + + should_retire = False + if RETIREMENT_ENABLED: + if usage >= MAX_USAGE_COUNT: + should_retire = True + logger.info(f"πŸ‘΄ Retirement time for browser {sig[:8]}: Max usage reached ({usage})") + elif usage >= MEMORY_RETIRE_MIN_USAGE: + try: + mem_percent = psutil.virtual_memory().percent + if mem_percent > MEMORY_RETIRE_THRESHOLD: + should_retire = True + logger.info(f"πŸ‘΄ Retirement time for browser {sig[:8]}: Memory high ({mem_percent}%)") + except Exception as e: + logger.warning(f"Failed to check memory for retirement: {e}") + + if should_retire: + RETIRED_POOL[sig] = HOT_POOL.pop(sig) + else: + LAST_USED[sig] = time.time() + USAGE_COUNT[sig] = usage + 1 + crawler.active_requests += 1 + logger.info(f"♨️ Using hot pool browser (sig={sig[:8]}, usage={USAGE_COUNT[sig]}, active={crawler.active_requests})") + return crawler + + # Check cold pool if sig in COLD_POOL: LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 @@ -57,18 +92,18 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if USAGE_COUNT[sig] >= 3: logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})") HOT_POOL[sig] = COLD_POOL.pop(sig) - - # Track promotion in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]}) - except: - pass - - return HOT_POOL[sig] + crawler = HOT_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 + return crawler logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})") - return COLD_POOL[sig] + crawler = COLD_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 + return crawler # Memory check before creating new mem_pct = get_container_memory_percent() @@ -80,18 +115,36 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: logger.info(f"πŸ†• Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)") crawler = AsyncWebCrawler(config=cfg, thread_safe=False) await crawler.start() + 
crawler.active_requests = 1 COLD_POOL[sig] = crawler LAST_USED[sig] = time.time() USAGE_COUNT[sig] = 1 return crawler +async def release_crawler(crawler: AsyncWebCrawler): + """Decrement active request count for a crawler.""" + async with LOCK: + if hasattr(crawler, 'active_requests'): + crawler.active_requests -= 1 + if crawler.active_requests < 0: + crawler.active_requests = 0 + async def init_permanent(cfg: BrowserConfig): """Initialize permanent default browser.""" global PERMANENT, DEFAULT_CONFIG_SIG + + # Log retirement status once on startup + if RETIREMENT_ENABLED: + logger.info(f"βœ… Browser retirement enabled (Max Usage: {MAX_USAGE_COUNT}, Mem Threshold: {MEMORY_RETIRE_THRESHOLD}%)") + else: + logger.info("ℹ️ Browser retirement disabled") + async with LOCK: - if PERMANENT: - return DEFAULT_CONFIG_SIG = _sig(cfg) + if PERMANENT_BROWSER_DISABLED: + logger.info("ℹ️ Permanent browser is DISABLED via config") + return + logger.info("πŸ”₯ Creating permanent default browser") PERMANENT = AsyncWebCrawler(config=cfg, thread_safe=False) await PERMANENT.start() @@ -102,69 +155,87 @@ async def close_all(): """Close all browsers.""" async with LOCK: tasks = [] - if PERMANENT: - tasks.append(PERMANENT.close()) tasks.extend([c.close() for c in HOT_POOL.values()]) tasks.extend([c.close() for c in COLD_POOL.values()]) + tasks.extend([c.close() for c in RETIRED_POOL.values()]) await asyncio.gather(*tasks, return_exceptions=True) HOT_POOL.clear() COLD_POOL.clear() + RETIRED_POOL.clear() LAST_USED.clear() USAGE_COUNT.clear() async def janitor(): """Adaptive cleanup based on memory pressure.""" + last_audit_time = 0 while True: mem_pct = get_container_memory_percent() # Adaptive intervals and TTLs + # Strictly follow BASE_IDLE_TTL without multipliers if mem_pct > 80: - interval, cold_ttl, hot_ttl = 10, 30, 120 + interval, cold_ttl, hot_ttl = 10, 30, 60 elif mem_pct > 60: - interval, cold_ttl, hot_ttl = 30, 60, 300 + interval, cold_ttl, hot_ttl = 30, 60, 120 else: - interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL * 2 + interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL await asyncio.sleep(interval) now = time.time() async with LOCK: + # [Audit Log] Every 5 minutes + if POOL_AUDIT_ENABLED and now - last_audit_time >= 300: + def _pool_info(pool): + res = [] + for s, c in pool.items(): + req = getattr(c, 'active_requests', 0) + u_count = USAGE_COUNT.get(s, 0) + res.append(f"{s[:8]}(req={req}, usage={u_count})") + return res + + logger.info( + f"🧐 [Pool Audit]\n" + f" - PERMANENT: {'Active' if PERMANENT else 'None/Disabled'}\n" + f" - HOT_POOL: {len(HOT_POOL)} {_pool_info(HOT_POOL)}\n" + f" - COLD_POOL: {len(COLD_POOL)} {_pool_info(COLD_POOL)}\n" + f" - RETIRED_POOL: {len(RETIRED_POOL)} {_pool_info(RETIRED_POOL)}\n" + f" - System Memory: {mem_pct:.1f}%" + ) + last_audit_time = now + # Clean cold pool for sig in list(COLD_POOL.keys()): if now - LAST_USED.get(sig, now) > cold_ttl: - idle_time = now - LAST_USED[sig] - logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)") - with suppress(Exception): - await COLD_POOL[sig].close() - COLD_POOL.pop(sig, None) - LAST_USED.pop(sig, None) - USAGE_COUNT.pop(sig, None) - - # Track in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl}) - except: - pass - - # Clean hot pool (more conservative) + crawler = COLD_POOL[sig] + if not hasattr(crawler, 'active_requests') or crawler.active_requests == 0: + 
logger.info(f"🧹 Closing cold browser (idle, sig={sig[:8]})") + with suppress(Exception): + await crawler.close() + COLD_POOL.pop(sig, None) + LAST_USED.pop(sig, None) + USAGE_COUNT.pop(sig, None) + + # Clean hot pool for sig in list(HOT_POOL.keys()): if now - LAST_USED.get(sig, now) > hot_ttl: - idle_time = now - LAST_USED[sig] - logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)") + crawler = HOT_POOL[sig] + if not hasattr(crawler, 'active_requests') or crawler.active_requests == 0: + logger.info(f"🧹 Closing hot browser (idle={now - LAST_USED[sig]:.0f}s, sig={sig[:8]})") + with suppress(Exception): + await crawler.close() + HOT_POOL.pop(sig, None) + LAST_USED.pop(sig, None) + USAGE_COUNT.pop(sig, None) + + # Clean retired pool + for sig in list(RETIRED_POOL.keys()): + crawler = RETIRED_POOL[sig] + if hasattr(crawler, 'active_requests') and crawler.active_requests == 0: + logger.info(f"πŸ’€ Janitor closing retired browser (sig={sig[:8]})") with suppress(Exception): - await HOT_POOL[sig].close() - HOT_POOL.pop(sig, None) - LAST_USED.pop(sig, None) - USAGE_COUNT.pop(sig, None) + await crawler.close() + RETIRED_POOL.pop(sig, None) - # Track in monitor - try: - from monitor import get_monitor - await get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl}) - except: - pass - - # Log pool stats - if mem_pct > 60: - logger.info(f"πŸ“Š Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, mem={mem_pct:.1f}%") + if mem_pct > 60 or len(RETIRED_POOL) > 0: + logger.info(f"πŸ“Š Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, retired={len(RETIRED_POOL)}, mem={mem_pct:.1f}%") diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 62e4e4413..353d99353 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -7,7 +7,7 @@ """ # ── stdlib & 3rd‑party imports ─────────────────────────────── -from crawler_pool import get_crawler, close_all, janitor +from crawler_pool import get_crawler, release_crawler, close_all, janitor from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from auth import create_access_token, get_token_dependency, TokenRequest from pydantic import BaseModel @@ -337,8 +337,9 @@ async def generate_html( Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML. Use when you need sanitized HTML structures for building schemas or further processing. """ - from crawler_pool import get_crawler - cfg = CrawlerRunConfig() + from crawler_pool import get_crawler, release_crawler + cfg = get_default_crawler_config() + crawler = None try: crawler = await get_crawler(get_default_browser_config()) results = await crawler.arun(url=body.url, config=cfg) @@ -351,6 +352,9 @@ async def generate_html( return JSONResponse({"html": processed_html, "url": body.url, "success": True}) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # Screenshot endpoint @@ -368,7 +372,8 @@ async def generate_screenshot( Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot. Then in result instead of the screenshot you will get a path to the saved file. 
""" - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) crawler = await get_crawler(get_default_browser_config()) @@ -385,6 +390,9 @@ async def generate_screenshot( return {"success": True, "screenshot": screenshot_data} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # PDF endpoint @@ -402,7 +410,8 @@ async def generate_pdf( Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF. Then in result instead of the PDF you will get a path to the saved file. """ - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(pdf=True) crawler = await get_crawler(get_default_browser_config()) @@ -419,6 +428,9 @@ async def generate_pdf( return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.post("/execute_js") @@ -457,24 +469,11 @@ class CrawlResult(BaseModel): metadata: Optional[dict] = None error_message: Optional[str] = None session_id: Optional[str] = None - response_headers: Optional[dict] = None - status_code: Optional[int] = None - ssl_certificate: Optional[SSLCertificate] = None - dispatch_result: Optional[DispatchResult] = None - redirected_url: Optional[str] = None - network_requests: Optional[List[Dict[str, Any]]] = None - console_messages: Optional[List[Dict[str, Any]]] = None - - class MarkdownGenerationResult(BaseModel): - raw_markdown: str - markdown_with_citations: str - references_markdown: str - fit_markdown: Optional[str] = None - fit_html: Optional[str] = None + # ... 
``` - """ - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: cfg = CrawlerRunConfig(js_code=body.scripts) crawler = await get_crawler(get_default_browser_config()) @@ -485,6 +484,9 @@ class MarkdownGenerationResult(BaseModel): return JSONResponse(data) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.get("/llm/{url:path}") @@ -806,7 +808,7 @@ async def get_context( base_url=f"http://{config['app']['host']}:{config['app']['port']}" ) -# ────────────────────────── cli ────────────────────────────── +# ── cli ────────────────────────────── if __name__ == "__main__": import uvicorn uvicorn.run( diff --git a/tests/browser/test_resource_filtering.py b/tests/browser/test_resource_filtering.py new file mode 100644 index 000000000..a38782c84 --- /dev/null +++ b/tests/browser/test_resource_filtering.py @@ -0,0 +1,55 @@ +import asyncio +import os +import sys +import pytest + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +@pytest.mark.asyncio +async def test_resource_filtering_launch(): + """Functional test to ensure browser launches correctly with filtering flags enabled.""" + browser_config = BrowserConfig( + headless=True, + avoid_ads=True, + avoid_css=True, + text_mode=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Simple crawl to verify functionality + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(cache_mode="bypass") + ) + assert result.success + logger.success("Browser launched and crawled successfully with filtering flags") + +@pytest.mark.asyncio +async def test_avoid_css_only(): + """Test avoid_css without text_mode.""" + browser_config = BrowserConfig( + headless=True, + avoid_css=True, + text_mode=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(cache_mode="bypass") + ) + assert result.success + logger.success("Browser launched and crawled successfully with avoid_css only") + +if __name__ == "__main__": + asyncio.run(test_resource_filtering_launch()) + asyncio.run(test_avoid_css_only()) + diff --git a/tests/general/test_cache_context.py b/tests/general/test_cache_context.py index 0f42f9fdd..4d4049f70 100644 --- a/tests/general/test_cache_context.py +++ b/tests/general/test_cache_context.py @@ -1,7 +1,9 @@ import asyncio +import pytest from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from playwright.async_api import Page, BrowserContext +@pytest.mark.asyncio async def test_reuse_context_by_config(): # We will store each context ID in these maps to confirm reuse context_ids_for_A = [] diff --git a/tests/unit/test_config_flags.py b/tests/unit/test_config_flags.py new file mode 100644 index 000000000..8685d2417 --- /dev/null +++ b/tests/unit/test_config_flags.py @@ -0,0 +1,33 @@ +import pytest +from crawl4ai.async_configs import BrowserConfig + +def test_browser_config_filtering_flags(): + """Test that BrowserConfig correctly stores the new filtering flags.""" + # Default values + config = 
BrowserConfig() + assert config.avoid_ads is False + assert config.avoid_css is False + + # Custom values + config = BrowserConfig(avoid_ads=True, avoid_css=True) + assert config.avoid_ads is True + assert config.avoid_css is True + + # Check to_dict / from_kwargs parity + config_dict = config.to_dict() + assert config_dict["avoid_ads"] is True + assert config_dict["avoid_css"] is True + + new_config = BrowserConfig.from_kwargs(config_dict) + assert new_config.avoid_ads is True + assert new_config.avoid_css is True + +def test_browser_config_clone(): + """Test that cloning BrowserConfig preserves the new flags.""" + config = BrowserConfig(avoid_ads=True, avoid_css=False) + cloned = config.clone(avoid_css=True) + + assert cloned.avoid_ads is True + assert cloned.avoid_css is True + assert config.avoid_css is False # Original remains unchanged +
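
A minimal usage sketch of the new `avoid_ads` / `avoid_css` flags added above (it mirrors `tests/browser/test_resource_filtering.py`; the target URL is illustrative only):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


async def main():
    # Block ad/tracker requests and skip stylesheet downloads for a leaner crawl.
    browser_config = BrowserConfig(
        headless=True,
        avoid_ads=True,   # requests matching the curated ad/tracker patterns are aborted
        avoid_css=True,   # css/less/scss/sass requests are aborted
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode="bypass"),
        )
        print(result.success)


if __name__ == "__main__":
    asyncio.run(main())
```

On the server side, the pool behaviour introduced here is opt-in via environment variables (`CRAWL4AI_BROWSER_RETIREMENT_ENABLED`, `CRAWL4AI_POOL_AUDIT_ENABLED`, `CRAWL4AI_PERMANENT_BROWSER_DISABLED`, `CRAWL4AI_BROWSER_MAX_USAGE`, `CRAWL4AI_MEMORY_RETIRE_THRESHOLD`, `CRAWL4AI_MEMORY_RETIRE_MIN_USAGE`), and every `get_crawler()` call in the API handlers is now paired with `release_crawler()` in a `finally` block, so the janitor only closes browsers with no in-flight requests.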