Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions blueprints/crawl4ai/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,29 @@ version: "3.8"

services:
crawl4ai:
image: unclecode/crawl4ai:latest
image: unclecode/crawl4ai:0.8.0
restart: unless-stopped
shm_size: 1g
stop_grace_period: 30s
expose:
- 11235
environment:
- LLM_PROVIDER=${LLM_PROVIDER}
- LLM_TEMPERATURE=${LLM_TEMPERATURE}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
- OPENAI_TEMPERATURE=${OPENAI_TEMPERATURE}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY}
- GROQ_API_KEY=${GROQ_API_KEY}
- TOGETHER_API_KEY=${TOGETHER_API_KEY}
- MISTRAL_API_KEY=${MISTRAL_API_KEY}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN}
volumes:
- ../files/config.yml:/app/config.yml:ro
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
test: ["CMD-SHELL", "curl -fsS http://127.0.0.1:11235/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
retries: 5
start_period: 30s
112 changes: 110 additions & 2 deletions blueprints/crawl4ai/template.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
[variables]
main_domain = "${domain}"
llm_provider = "openai/gpt-4o-mini"
llm_temperature = "0.1"
security_enabled = "true"
jwt_enabled = "true"
default_rate_limit = "120/minute"
jwt_secret = "${password:48}"
max_pages = "40"

[config]
[[config.domains]]
Expand All @@ -8,11 +15,112 @@ port = 11235
host = "${main_domain}"

[config.env]
LLM_PROVIDER = "openai/gpt-4o-mini"
OPENAI_API_KEY = "${password:48}"
LLM_PROVIDER = "${llm_provider}"
LLM_TEMPERATURE = "${llm_temperature}"
OPENAI_API_KEY = ""
OPENAI_BASE_URL = "https://openrouter.ai/api/v1"
OPENAI_TEMPERATURE = "${llm_temperature}"
SECRET_KEY = "${jwt_secret}"
ANTHROPIC_API_KEY = ""
DEEPSEEK_API_KEY = ""
GROQ_API_KEY = ""
TOGETHER_API_KEY = ""
MISTRAL_API_KEY = ""
GEMINI_API_TOKEN = ""

[[config.mounts]]
filePath = "config.yml"
serviceName = "crawl4ai"
content = """
app:
title: "Crawl4AI API"
version: "0.8.0"
host: "0.0.0.0"
port: 11235
reload: false
workers: 1
timeout_keep_alive: 300

llm:
provider: "${llm_provider}"

redis:
host: "localhost"
port: 6379
db: 0
password: ""
ssl: false
ssl_cert_reqs: null
ssl_ca_certs: null
ssl_certfile: null
ssl_keyfile: null

rate_limiting:
enabled: true
default_limit: "${default_rate_limit}"
trusted_proxies: []
storage_uri: "memory://"

security:
enabled: ${security_enabled}
jwt_enabled: ${jwt_enabled}
https_redirect: false
trusted_hosts:
- "${main_domain}"
- "localhost"
- "127.0.0.1"
- "crawl4ai"
headers:
x_content_type_options: "nosniff"
x_frame_options: "DENY"
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"

crawler:
base_config:
simulate_user: true
memory_threshold_percent: 90.0
rate_limiter:
enabled: true
base_delay: [1.0, 2.0]
timeouts:
stream_init: 30.0
batch_process: 300.0
pool:
max_pages: ${max_pages}
idle_ttl_sec: 300
browser:
kwargs:
headless: true
text_mode: true
extra_args:
- "--no-sandbox"
- "--disable-dev-shm-usage"
- "--disable-gpu"
- "--disable-software-rasterizer"
- "--allow-insecure-localhost"
- "--ignore-certificate-errors"

logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

observability:
prometheus:
enabled: true
endpoint: "/metrics"
health_check:
endpoint: "/health"

webhooks:
enabled: true
default_url: null
data_in_payload: false
retry:
max_attempts: 5
initial_delay_ms: 1000
max_delay_ms: 32000
timeout_ms: 30000
headers:
User-Agent: "Crawl4AI-Webhook/1.0"
"""
129 changes: 70 additions & 59 deletions blueprints/firecrawl/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,121 +1,130 @@
name: firecrawl
version: "3.8"

x-common-service: &common-service
image: ghcr.io/firecrawl/firecrawl:latest
build:
context: "https://github.com/firecrawl/firecrawl.git#v2.8.0:apps/api"
restart: unless-stopped
ulimits:
nofile:
soft: 65535
hard: 65535
extra_hosts:
- "host.docker.internal:host-gateway"

x-common-env: &common-env
REDIS_URL: ${REDIS_URL:-redis://redis:6379}
REDIS_RATE_LIMIT_URL: ${REDIS_RATE_LIMIT_URL:-redis://redis:6379}
PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape}
NUQ_DATABASE_URL: ${NUQ_DATABASE_URL:-postgres://postgres:postgres@nuq-postgres:5432/postgres}
USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-}
POSTGRES_HOST: ${POSTGRES_HOST:-nuq-postgres}
POSTGRES_PORT: ${POSTGRES_PORT:-5432}
POSTGRES_USER: ${POSTGRES_USER:-postgres}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-firecrawl}
POSTGRES_DB: ${POSTGRES_DB:-postgres}
NUQ_RABBITMQ_URL: ${NUQ_RABBITMQ_URL:-amqp://rabbitmq:5672}
USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false}
NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE:-8}
CRAWL_CONCURRENT_REQUESTS: ${CRAWL_CONCURRENT_REQUESTS:-10}
MAX_CONCURRENT_JOBS: ${MAX_CONCURRENT_JOBS:-5}
BROWSER_POOL_SIZE: ${BROWSER_POOL_SIZE:-5}
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
OPENROUTER_API_KEY: ${OPENROUTER_API_KEY:-}
MODEL_NAME: ${MODEL_NAME:-}
MODEL_EMBEDDING_NAME: ${MODEL_EMBEDDING_NAME:-}
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-}
AUTUMN_SECRET_KEY: ${AUTUMN_SECRET_KEY:-}
LLAMAPARSE_API_KEY: ${LLAMAPARSE_API_KEY:-}
SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-}
BULL_AUTH_KEY: ${BULL_AUTH_KEY:-}
TEST_API_KEY: ${TEST_API_KEY:-}
POSTHOG_API_KEY: ${POSTHOG_API_KEY:-}
POSTHOG_HOST: ${POSTHOG_HOST:-}
SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN:-}
SUPABASE_URL: ${SUPABASE_URL:-}
SUPABASE_REPLICA_URL: ${SUPABASE_REPLICA_URL:-}
SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN:-}
INDEX_SUPABASE_URL: ${INDEX_SUPABASE_URL:-}
INDEX_SUPABASE_SERVICE_TOKEN: ${INDEX_SUPABASE_SERVICE_TOKEN:-}
SEARCH_INDEX_SUPABASE_URL: ${SEARCH_INDEX_SUPABASE_URL:-}
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL:-}
SERPER_API_KEY: ${SERPER_API_KEY:-}
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY:-}
LOGGING_LEVEL: ${LOGGING_LEVEL:-INFO}
ALLOW_LOCAL_WEBHOOKS: ${ALLOW_LOCAL_WEBHOOKS:-false}
LOGGING_LEVEL: ${LOGGING_LEVEL:-}
PROXY_SERVER: ${PROXY_SERVER:-}
PROXY_USERNAME: ${PROXY_USERNAME:-}
PROXY_PASSWORD: ${PROXY_PASSWORD:-}
NO_PROXY: ${NO_PROXY:-localhost,127.0.0.1,redis,nuq-postgres,playwright-service,host.docker.internal}
SEARXNG_ENDPOINT: ${SEARXNG_ENDPOINT:-}
SEARXNG_ENGINES: ${SEARXNG_ENGINES:-}
SEARXNG_CATEGORIES: ${SEARXNG_CATEGORIES:-}

MAX_CPU: ${MAX_CPU:-0.8}
MAX_RAM: ${MAX_RAM:-0.8}

services:
playwright-service:
image: ghcr.io/firecrawl/playwright-service:latest
shm_size: "1g"
build:
context: "https://github.com/firecrawl/firecrawl.git#v2.8.0:apps/playwright-service-ts"
restart: unless-stopped
shm_size: "1g"
environment:
PORT: 3000
PROXY_SERVER: ${PROXY_SERVER:-}
PROXY_USERNAME: ${PROXY_USERNAME:-}
PROXY_PASSWORD: ${PROXY_PASSWORD:-}
BLOCK_MEDIA: ${BLOCK_MEDIA:-}
NO_PROXY: ${NO_PROXY:-localhost,127.0.0.1,redis,nuq-postgres,playwright-service,host.docker.internal}

ALLOW_LOCAL_WEBHOOKS: ${ALLOW_LOCAL_WEBHOOKS:-false}
BLOCK_MEDIA: ${BLOCK_MEDIA:-false}
MAX_CONCURRENT_PAGES: ${CRAWL_CONCURRENT_REQUESTS:-10}
expose:
- "3000"

api:
<<: *common-service
restart: unless-stopped
ports:
expose:
- "3002"
environment:
<<: *common-env
HOST: "0.0.0.0"
PORT: 3002
EXTRACT_WORKER_PORT: 3004
WORKER_PORT: 3005
ENV: local
depends_on:
redis:
condition: service_started
playwright-service:
condition: service_started
nuq-postgres:
rabbitmq:
condition: service_healthy
command: node --import ./dist/src/otel.js dist/src/index.js

worker:
<<: *common-service
restart: unless-stopped
environment:
<<: *common-env
HOST: "0.0.0.0"
PORT: 3005
ENV: local
depends_on:
redis:
condition: service_started
nuq-postgres:
condition: service_healthy
command: node --import ./dist/src/otel.js dist/src/services/queue-worker.js

extract-worker:
<<: *common-service
restart: unless-stopped
environment:
<<: *common-env
HOST: "0.0.0.0"
PORT: 3004
ENV: local
depends_on:
redis:
condition: service_started
nuq-postgres:
condition: service_healthy
command: node --import ./dist/src/otel.js dist/src/services/extract-worker.js

command: node dist/src/harness.js --start-docker

redis:
image: redis:alpine
command: redis-server --bind 0.0.0.0

image: redis:7-alpine
restart: unless-stopped
command: redis-server --bind 0.0.0.0 --appendonly yes
volumes:
- redis_data:/data

rabbitmq:
image: rabbitmq:3.13-management
restart: unless-stopped
command: rabbitmq-server
volumes:
- rabbitmq_data:/var/lib/rabbitmq
healthcheck:
test: ["CMD", "rabbitmq-diagnostics", "-q", "check_running"]
interval: 5s
timeout: 5s
retries: 3
start_period: 5s

nuq-postgres:
build:
context: "https://github.com/firecrawl/firecrawl.git#main:apps/nuq-postgres"
dockerfile: Dockerfile
context: "https://github.com/firecrawl/firecrawl.git#v2.8.0:apps/nuq-postgres"
restart: unless-stopped
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
POSTGRES_USER: ${POSTGRES_USER:-postgres}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-firecrawl}
POSTGRES_DB: ${POSTGRES_DB:-postgres}
volumes:
- nuq_pg_data:/var/lib/postgresql/data
healthcheck:
Expand All @@ -124,6 +133,8 @@ services:
interval: 10s
timeout: 5s
retries: 10

volumes:
nuq_pg_data:
nuq_pg_data:
rabbitmq_data:
redis_data:
Loading
Loading