5 changes: 5 additions & 0 deletions crawl4ai/__init__.py
@@ -10,6 +10,9 @@
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
)

from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
@@ -128,6 +131,8 @@
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"PDFCrawlerStrategy",
"PDFContentScrapingStrategy",
"FilterChain",
"URLPatternFilter",
"ContentTypeFilter",
40 changes: 36 additions & 4 deletions deploy/docker/api.py
@@ -13,8 +13,12 @@
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from fastapi.encoders import jsonable_encoder

from redis import asyncio as aioredis

from utils import is_pdf_url

from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -31,6 +35,10 @@
BM25ContentFilter,
LLMContentFilter
)

from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from crawl4ai.async_configs import to_serializable_dict

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

@@ -431,6 +439,18 @@ async def handle_crawl_request(
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
# Default strategy if not set.
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=False,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
@@ -440,7 +460,7 @@
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
@@ -476,7 +496,8 @@
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)

processed_results.append(to_serializable_dict(result_dict))

return {
"success": True,
@@ -521,8 +542,19 @@ async def handle_stream_crawl_request(
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
crawler_config.stream = True

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
# Default strategy if not set
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=True,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
@@ -532,7 +564,7 @@
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
28 changes: 20 additions & 8 deletions deploy/docker/crawler_pool.py
@@ -1,11 +1,11 @@
# crawler_pool.py (new file)
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict
from typing import Dict, Optional
from crawl4ai import AsyncWebCrawler, BrowserConfig
from typing import Dict
from utils import load_config

🛠️ Refactor suggestion | 🟠 Major

🧩 Analysis chain

Verify import path reliability for utils.

Absolute import from utils import load_config may break if deploy/docker isn't on sys.path. Prefer a package-relative import or confirm deploy/docker is a package with __init__.py and the entrypoint runs from this dir.

Run:


🏁 Script executed:

#!/bin/bash
# Check if deploy/docker is a package and whether utils is importable.
set -euo pipefail
if [ -f "deploy/docker/__init__.py" ]; then
  echo "deploy/docker is a package."
else
  echo "deploy/docker is NOT a package; relative imports won't work."
fi
echo "Listing files next to crawler_pool.py:"
ls -la deploy/docker || true
echo "Searching for demo entrypoint:"
fd -a --exact-path demo_docker_api.py || true

Length of output: 1558


Convert utils import to package-relative

  • Add an empty deploy/docker/__init__.py to make it a package
  • In deploy/docker/crawler_pool.py, change
    from utils import load_config
    to
    from .utils import load_config
🤖 Prompt for AI Agents
In deploy/docker/crawler_pool.py around line 6, the import uses a top-level
module import "from utils import load_config" which breaks package-relative
resolution; add an empty deploy/docker/__init__.py to make the directory a
package and change the import to use a relative import (from .utils import
load_config) so the module is resolved within the package namespace.
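
A minimal sketch of the suggested change (an illustration only; it assumes the service is launched so that deploy.docker is imported as a package, e.g. via python -m, otherwise the existing absolute import must stay as it is):

# deploy/docker/__init__.py  (new, empty file that marks the directory as a package)

# deploy/docker/crawler_pool.py
from .utils import load_config  # package-relative; replaces "from utils import load_config"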



CONFIG = load_config()

POOL: Dict[str, AsyncWebCrawler] = {}
@@ -15,20 +15,31 @@
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min

def _sig(cfg: BrowserConfig) -> str:
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
return hashlib.sha1(payload.encode()).hexdigest()
def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str:
"""
Generate a unique signature for a crawler based on browser config
and optional crawler strategy. This ensures that crawlers with
different strategies (e.g., PDF) are stored separately in the pool.
"""
payload = cfg.to_dict()

if crawler_strategy is not None:
payload["strategy"] = crawler_strategy.__class__.__name__

Comment on lines +26 to +28

⚠️ Potential issue | 🟠 Major

Include strategy configuration in the pool signature.

Keying pooled crawlers only by crawler_strategy.__class__.__name__ collapses all instances of the same strategy class into a single slot. If two callers supply the same strategy class with different constructor arguments (e.g., PDFCrawlerStrategy(ocr=True) vs PDFCrawlerStrategy(ocr=False)), the second caller will be handed the already-started crawler that still holds the first instance, so their requested configuration is silently ignored. That’s a functional bug for any strategy with mutable/per-instance configuration. Please extend the signature to encode the strategy’s effective settings (e.g., module-qualified name plus a stable serialization of its config) so distinct instances don’t collide.

🤖 Prompt for AI Agents
In deploy/docker/crawler_pool.py around lines 25 to 27, the pool signature only
keys strategies by crawler_strategy.__class__.__name__, causing different
instances of the same class with different constructor arguments to collide; fix
this by including a stable, deterministic encoding of the instance configuration
in the payload key: derive the module-qualified class name (e.g.,
strategy.__class__.__module__ + "." + strategy.__class__.__name__) and a
serializable representation of the strategy's effective settings (prefer a
to_dict()/asdict() method on the strategy or fall back to
vars()/inspect.getfullargspec/__init__ args), then JSON-serialize that config with
sort_keys=True (and avoid non-serializable members) and add it to payload (e.g.,
payload["strategy"] = "<module.ClassName>", payload["strategy_config"] =
"<stable-json>") so distinct configured instances do not collide.

json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":"))
return hashlib.sha1(json_payload.encode()).hexdigest()
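
To illustrate the reviewer's point above, here is a rough sketch of a signature that also encodes the strategy's per-instance settings. This is only a sketch under assumptions: the helper name _strategy_fingerprint is made up, and it assumes a strategy instance either exposes a to_dict() method or carries JSON-friendly attributes in vars(); the actual strategy classes may need a different serialization hook.

import hashlib
import json
from typing import Any, Dict, Optional

def _strategy_fingerprint(strategy: Optional[object]) -> Dict[str, Any]:
    """Describe a strategy instance as stable, JSON-serializable data."""
    if strategy is None:
        return {}
    cls = strategy.__class__
    to_dict = getattr(strategy, "to_dict", None)
    if callable(to_dict):
        config = to_dict()
    else:
        # Fall back to simple, serializable instance attributes.
        config = {k: v for k, v in vars(strategy).items()
                  if isinstance(v, (str, int, float, bool, type(None)))}
    return {"strategy": f"{cls.__module__}.{cls.__name__}", "strategy_config": config}

def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str:
    payload: Dict[str, Any] = cfg.to_dict()
    payload.update(_strategy_fingerprint(crawler_strategy))
    json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str)
    return hashlib.sha1(json_payload.encode()).hexdigest()

With this shape, two PDFCrawlerStrategy instances constructed with different settings would hash to different pool keys instead of colliding.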


async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler:
try:
sig = _sig(cfg)
sig = _sig(cfg, crawler_strategy=crawler_strategy)
async with LOCK:
if sig in POOL:
LAST_USED[sig] = time.time();
return POOL[sig]
if psutil.virtual_memory().percent >= MEM_LIMIT:
raise MemoryError("RAM pressure – new browser denied")
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy)
await crawler.start()
POOL[sig] = crawler; LAST_USED[sig] = time.time()
return crawler
@@ -44,6 +55,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
# If we failed to start the browser, we should remove it from the pool

async def close_all():
async with LOCK:
await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
31 changes: 30 additions & 1 deletion deploy/docker/utils.py
@@ -2,6 +2,7 @@
import logging
import yaml
import os
import httpx
from datetime import datetime
from enum import Enum
from pathlib import Path
@@ -124,4 +125,32 @@ def verify_email_domain(email: str) -> bool:
records = dns.resolver.resolve(domain, 'MX')
return True if records else False
except Exception as e:
return False
return False

async def is_pdf_url(url: str) -> bool:
"""
Check if a URL points to a PDF using httpx:
- Check extension
- Check Content-Type via HEAD request
- Check first 5 bytes (magic number) if needed
"""
if url.lower().endswith(".pdf"):
return True

try:
async with httpx.AsyncClient(follow_redirects=True) as client:
# HEAD request to check Content-Type
head_resp = await client.head(url)
content_type = head_resp.headers.get("content-type", "").lower()
if "application/pdf" in content_type:
return True

# Fallback: GET first 5 bytes to check PDF magic number
get_resp = await client.get(url, headers={"Range": "bytes=0-4"})
if get_resp.status_code in (200, 206): # 206 Partial Content
return get_resp.content.startswith(b"%PDF-")
except Exception:
return False

return False
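
For reference, a minimal usage sketch of this helper (an illustration only; _check_urls is a hypothetical name, it assumes is_pdf_url is importable from deploy/docker/utils, and the arXiv URL is just the demo PDF used elsewhere in this PR):

import asyncio

async def _check_urls() -> None:
    urls = ["https://arxiv.org/pdf/2310.06825", "https://example.com/"]
    # Probe all URLs concurrently, mirroring how handle_crawl_request uses this helper.
    flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
    print(dict(zip(urls, flags)))  # expected roughly: arXiv PDF -> True, example.com -> False

asyncio.run(_check_urls())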

69 changes: 69 additions & 0 deletions docs/examples/docker/demo_docker_api.py
@@ -27,6 +27,7 @@
FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
PYTHON_URL = "https://python.org" # For deeper crawl
PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo
# Use the same sample site as deep crawl tests for consistency
DEEP_CRAWL_BASE_URL = os.getenv(
"DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
@@ -1261,6 +1262,71 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient):
console.print(
f"[bold red]Unexpected error during invalid test:[/] {e}")

# 10. Crawl PDF

async def demo_pdf_crawl(client: httpx.AsyncClient):
payload = {
"urls": [PDF_URL],
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"scraping_strategy": {
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

resp = await client.post("/crawl", json=payload)
resp.raise_for_status()
data = resp.json()
print("=== Demo: PDF Crawl ===")
print("Success:", data.get("success"))
print("Number of results:", len(data.get("results", [])))
if data.get("results"):
first = data["results"][0]
text_snippet = (first.get("text") or "")[:500]
print("Extracted text (first 500 chars):")
print(text_snippet)

# 11. Crawl PDF stream

async def demo_pdf_crawl_stream(client: httpx.AsyncClient):
"""
Demo: Crawl a PDF and stream the extracted text content.
"""
payload = {
"urls": [PDF_URL],
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
"cache_mode": "BYPASS",
"scraping_strategy": { # <-- Default strategy if not set
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

await stream_request(
client,
"/crawl/stream",
payload,
"Demo PDF: Streaming PDF Crawl"
)


# --- Update Main Runner to include new demo ---
async def main_demo():
@@ -1294,6 +1360,9 @@ async def main_demo():
# await demo_deep_with_llm_extraction(client)
# await demo_deep_with_proxy(client) # Skips if no PROXIES env var
# await demo_deep_with_ssl(client) # Added the new demo

# await demo_pdf_crawl_stream(client)
# await demo_pdf_crawl(client)

# --- Helper endpoints ---
await demo_markdown_endpoint(client)