diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start
index 0d4f67362..a3482f44e 100755
--- a/compose/local/django/celery/worker/start
+++ b/compose/local/django/celery/worker/start
@@ -4,11 +4,24 @@ set -o errexit
 set -o pipefail
 set -o nounset
 
+# Local development with auto-reload and optional debugging
+#
+# DEBUGGER=1 - Enable debugpy for remote debugging on port 5679
+#
+# Worker protections (prevent memory leaks from long-running workers):
+# --max-tasks-per-child=100 - Restart worker process after 100 tasks
+# --max-memory-per-child=1048576 - Restart if worker process exceeds 1 GiB (in KB)
+#
+# With prefork pool (default), each CPU gets a worker process.
+# Example: 8 CPUs × 1 GiB = 8 GiB max total worker memory
+
+MAX_TASKS_PER_CHILD=100
+MAX_MEMORY_PER_CHILD=1048576 # 1 GiB in KB
+
 # Launch VS Code debug server if DEBUGGER environment variable is set to 1
 # Note that auto reloading is disabled when debugging, manual restart required for code changes.
 if [ "${DEBUGGER:-0}" = "1" ]; then
-    # exec watchfiles --filter python 'python -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker -l INFO'
-    exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO
+    exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
 else
-    exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO'
+    exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child='$MAX_TASKS_PER_CHILD' --max-memory-per-child='$MAX_MEMORY_PER_CHILD
 fi
diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start
index 6b372e854..4ba716507 100644
--- a/compose/production/django/celery/worker/start
+++ b/compose/production/django/celery/worker/start
@@ -4,4 +4,16 @@ set -o errexit
 set -o pipefail
 set -o nounset
 
-exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO
+# Production Celery worker with memory leak protections
+#
+# Worker protections:
+# --max-tasks-per-child=100 - Restart worker process after 100 tasks
+# --max-memory-per-child=2097152 - Restart if worker process exceeds 2 GiB (in KB)
+#
+# With prefork pool (default), each CPU gets a worker process.
+# Example: 8 CPUs × 2 GiB = 16 GiB max total worker memory
+
+MAX_TASKS_PER_CHILD=100
+MAX_MEMORY_PER_CHILD=2097152 # 2 GiB in KB
+
+exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
diff --git a/config/settings/base.py b/config/settings/base.py
index 68f5d4130..8b70d62ea 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -323,10 +323,10 @@ CELERY_RESULT_COMPRESSION = "gzip"
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#task-time-limit
 # TODO: set to whatever value is adequate in your circumstances
-CELERY_TASK_TIME_LIMIT = 7 * 60 * 24
+CELERY_TASK_TIME_LIMIT = 4 * 60 * 60 * 24  # 4 days
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#task-soft-time-limit
 # TODO: set to whatever value is adequate in your circumstances
-CELERY_TASK_SOFT_TIME_LIMIT = 6 * 60 * 24
+CELERY_TASK_SOFT_TIME_LIMIT = 3 * 60 * 60 * 24  # 3 days
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-scheduler
 CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-send-task-events
@@ -346,16 +346,19 @@ CELERY_WORKER_PREFETCH_MULTIPLIER = 1
 CELERY_WORKER_ENABLE_PREFETCH_COUNT_REDUCTION = True
 
-# Connection settings to match Redis timeout and keepalive
+# RabbitMQ broker connection settings
+# These settings improve reliability for long-running workers with intermittent network issues
 CELERY_BROKER_TRANSPORT_OPTIONS = {
-    "visibility_timeout": 43200,  # 12 hours - default celery value
-    "socket_timeout": 120,  # Matches Redis timeout setting
-    "socket_connect_timeout": 30,  # Max time to establish connection
-    "socket_keepalive": True,  # Enable TCP keepalive
-    "retry_on_timeout": True,  # Retry operations if Redis times out
-    "max_connections": 20,  # Per process connection pool limit
+    "socket_timeout": 120,  # Socket read/write timeout (seconds)
+    "socket_connect_timeout": 30,  # Max time to establish connection (seconds)
+    "socket_keepalive": True,  # Enable TCP keepalive probes
+    "retry_on_timeout": True,  # Retry operations on timeout
+    "max_connections": 20,  # Per-process connection pool limit
+    "heartbeat": 60,  # RabbitMQ heartbeat interval (seconds) - detects broken connections
 }
+
+# Broker connection retry settings
+# Workers will retry forever on connection failures rather than crashing
 CELERY_BROKER_CONNECTION_RETRY = True
 CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = True
 CELERY_BROKER_CONNECTION_MAX_RETRIES = None  # Retry forever
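For reference, the per-child limits passed as CLI flags in the start scripts above can also be expressed as Celery settings alongside the other `CELERY_*` values in `config/settings/base.py`. A minimal sketch, not part of this change, assuming the same `CELERY_` settings namespace used above; if both were set, the command-line flags would normally take precedence:

```python
# Illustrative only - this change configures the limits via CLI flags instead.

# Restart a pool process after it has executed 100 tasks.
CELERY_WORKER_MAX_TASKS_PER_CHILD = 100

# Restart a pool process once its resident memory exceeds this many kilobytes.
CELERY_WORKER_MAX_MEMORY_PER_CHILD = 2097152  # 2 GiB, matching the production flag
```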
diff --git a/docs/WORKER_MONITORING.md b/docs/WORKER_MONITORING.md
new file mode 100644
index 000000000..30cfbde3f
--- /dev/null
+++ b/docs/WORKER_MONITORING.md
@@ -0,0 +1,226 @@
# Celery Worker Monitoring Guide

This guide explains how to monitor Celery worker health and detect issues like memory leaks or hung workers.

## Worker Protections In Place

The workers are configured with automatic protections to prevent long-running issues:

- **`--max-tasks-per-child=100`** - Each worker process restarts after 100 tasks
  - Prevents memory leaks from accumulating over time
  - A clean slate every 100 tasks keeps performance consistent

- **`--max-memory-per-child=2097152`** - Each worker process restarts if it exceeds 2 GiB
  - Measured in KB (2097152 KB = 2 GiB); see the conversion sketch after this section
  - With the prefork pool (default), this is **per worker process**
  - Example: 8 CPUs = 8 worker processes × 2 GiB = 16 GiB max total

These protections cause workers to restart gracefully before problems occur.
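Because `--max-memory-per-child` takes kilobytes, it is easy to drop a factor of 1024 when adjusting the limits above. A small illustrative helper (hypothetical, not part of the codebase) that derives the flag values and the worst-case total from the example:

```python
# Hypothetical helper, not part of the codebase: derive --max-memory-per-child
# values (kilobytes) and the worst-case total memory for a prefork worker.


def max_memory_per_child_kb(gib: float) -> int:
    """Convert a per-process memory budget in GiB to the KB value Celery expects."""
    return int(gib * 1024 * 1024)


def worst_case_total_gib(cpu_count: int, per_child_gib: float) -> float:
    """With the prefork pool, one process per CPU can each grow to the limit."""
    return cpu_count * per_child_gib


assert max_memory_per_child_kb(2) == 2097152  # production value
assert max_memory_per_child_kb(1) == 1048576  # local development value
assert worst_case_total_gib(8, 2) == 16  # the "8 CPUs × 2 GiB = 16 GiB" example
```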
## Monitoring Tools

### 1. Flower (http://localhost:5555)

**Active Tasks View:**
- Shows currently running tasks
- Look for tasks stuck in the "running" state for unusually long periods
- The configured `CELERY_TASK_TIME_LIMIT` is 4 days, so anything approaching that is suspicious

**Worker Page:**
- Shows the last heartbeat timestamp for each worker
- A stale heartbeat (> 60 seconds) indicates a potential issue
- Shows worker status, active tasks, and resource usage

**Task Timeline:**
- Sort tasks by duration to find long-running ones
- Filter by state (SUCCESS, FAILURE, RETRY, etc.)
- See task arguments and return values

**Limitations:**
- Can miss deadlocked workers: heartbeats come from the main worker process, so a stuck pool process may still look healthy
- Shows task status but not detailed memory usage

### 2. New Relic APM

**Transaction Traces:**
- Automatic instrumentation via the `newrelic-admin` wrapper
- See detailed task execution times and database queries
- Identify slow tasks that might cause memory buildup

**Infrastructure Monitoring:**
- Monitor container CPU and memory usage
- Set alerts for memory usage approaching the limits
- Track worker process count and restarts

**Custom Instrumentation:**
- Add custom metrics for task-specific monitoring (see the sketch after this list)
- Track business metrics (images processed, detections created, etc.)
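To make the custom-instrumentation point concrete, here is a hedged sketch using `newrelic.agent.record_custom_metric` from the same `newrelic` package that `newrelic-admin` wraps. The task and metric names are hypothetical:

```python
# Illustrative sketch: record a task-level business metric in New Relic.
# The task and metric names are hypothetical; adapt them to real tasks.
import newrelic.agent
from celery import shared_task


@shared_task
def process_source_images(image_ids: list[int]) -> int:
    processed = 0
    for image_id in image_ids:
        # ... run detection/classification for image_id ...
        processed += 1

    # Custom metric names conventionally start with "Custom/".
    newrelic.agent.record_custom_metric("Custom/Antenna/ImagesProcessed", processed)
    return processed
```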
### 3. Docker Monitoring

**Check worker health:**

```bash
# View running containers and resource usage
docker stats

# Check worker logs for restart messages
docker compose logs celeryworker | grep -i "restart\|warm shutdown\|cold shutdown"

# See recent worker activity
docker compose logs --tail=100 celeryworker
```

**Worker restart patterns:**
When worker processes are recycled or shut down, the logs include messages like:

```
[INFO/MainProcess] Warm shutdown (MainProcess)
[INFO/MainProcess] Celery worker: Restarting pool processes
```

## Detecting Hung Workers

### Signs of a hung worker:

1. **Tasks stuck in the "started" state** (Flower active tasks page)
   - The task shows as running but makes no progress
   - Duration keeps increasing without completion

2. **Stale worker heartbeats** (Flower workers page)
   - Last heartbeat > 60 seconds ago
   - Worker shows as offline or unresponsive

3. **Queue backlog building up** (Flower or RabbitMQ management)
   - Tasks accumulate in the queue but are not being processed
   - Check: http://localhost:15672 (RabbitMQ management UI)
   - Default credentials: `rabbituser` / `rabbitpass`

4. **Container health** (`docker stats`)
   - CPU stuck at 100% with no task progress
   - Memory at the ceiling without task completion

### Manual investigation:

```bash
# Attach to the worker container
docker compose exec celeryworker bash

# Check running Python processes
ps aux | grep celery

# Check memory usage of worker processes
ps aux --sort=-%mem | grep celery | head -10

# See active connections to RabbitMQ
netstat -an | grep 5672
```

## Recommended Monitoring Setup

### Flower Alerts (Manual Checks)

Visit Flower periodically and check:
1. Workers page - are all workers showing recent heartbeats?
2. Tasks page - are any tasks running > 1 hour?
3. Queue length - is it growing unexpectedly?

### New Relic Alerts (Automated)

Set up alerts for:
- Container memory > 80% for > 5 minutes
- Task duration > 1 hour (customize based on your use case)
- Worker process restarts > 10 per hour
- Queue depth > 100 tasks

### RabbitMQ Management UI

Access at: http://localhost:15672
- Monitor queue lengths
- Check consumer counts (roughly one consumer per worker instance; pool processes do not appear individually)
- View connection status and channels

## Troubleshooting Common Issues

### Worker keeps restarting every 100 tasks

**This is normal behavior** with `--max-tasks-per-child=100`.

If you see too many restarts:
- Check task complexity - are tasks heavier than expected?
- Consider increasing the limit: `--max-tasks-per-child=200`
- Check whether specific task types cause issues

### Worker hitting the memory limit frequently

If workers constantly hit the 2 GiB limit:
- Review which tasks use the most memory
- Optimize data loading (use streaming for large datasets)
- Consider increasing the limit: `--max-memory-per-child=3145728` (3 GiB)
- Check for memory leaks in task code

### Tasks timing out

Current limits:
- `CELERY_TASK_TIME_LIMIT = 4 * 60 * 60 * 24` (4 days)
- `CELERY_TASK_SOFT_TIME_LIMIT = 3 * 60 * 60 * 24` (3 days)

These are very generous; the sketch after this list shows how a task can react to the soft limit before the hard limit kills the process. If tasks still time out:
- Check task logs for errors
- Review the data volume being processed
- Consider breaking large jobs into smaller tasks
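When the soft limit fires, Celery raises `SoftTimeLimitExceeded` inside the task, leaving a window before the hard limit terminates the worker process. A minimal sketch of handling it (the task and its workload are hypothetical):

```python
# Minimal sketch: react to the soft time limit before the hard limit kills the process.
# The task and its workload are hypothetical.
import time

from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded


@shared_task
def process_large_job(item_ids: list[int]) -> int:
    done = 0
    try:
        for _item_id in item_ids:
            time.sleep(1)  # stand-in for real per-item work
            done += 1
    except SoftTimeLimitExceeded:
        # Raised when CELERY_TASK_SOFT_TIME_LIMIT (3 days) is reached; the hard
        # limit (4 days) terminates the process, so checkpoint and stop cleanly here.
        pass
    return done
```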
### Queue backlog growing

Possible causes:
1. Workers offline - check `docker compose ps`
2. Tasks failing and retrying - check Flower for failures
3. Task creation rate > processing rate - scale up workers
4. Tasks hung - restart workers: `docker compose restart celeryworker`

## Scaling Workers

### Horizontal scaling (more containers):

```bash
# Local development
docker compose up -d --scale celeryworker=3

# Production (docker-compose.worker.yml)
docker compose -f docker-compose.worker.yml up -d --scale celeryworker=3
```

### Vertical scaling (more processes per container):

Add a concurrency flag to the worker start command:

```bash
celery -A config.celery_app worker --queues=antenna --concurrency=16
```

The default is the number of CPUs. Increase it if CPU is underutilized.

## Memory Tuning Guidelines

Based on this project's ML-orchestration workload:

- **Current setting:** 2 GiB per worker process (production), 1 GiB (local dev)
- **Task requirement:** JSON orchestration with large request/response payloads

This provides adequate headroom for JSON processing and HTTP overhead.

**If tasks consistently use more:**
- Increase the limit: `--max-memory-per-child=3145728` (3 GiB)
- Ensure the container/host has sufficient memory
- Monitor total memory: (number of CPUs) × (limit per child)

**If tasks use much less:**
- Decrease the limit: `--max-memory-per-child=1048576` (1 GiB)
- Allows more workers on the same hardware
- Makes actual memory leaks faster to detect

## Best Practices

1. **Monitor regularly** - Check the Flower dashboard daily during active development
2. **Log memory-intensive tasks** - Add logging for tasks processing large datasets (see the sketch after this list)
3. **Test limits locally** - Verify that worker restarts work correctly in development
4. **Set New Relic alerts** - Automate detection of worker issues
5. **Review worker logs** - Look for patterns in restarts and failures
6. **Tune limits based on reality** - Adjust after observing actual task behavior
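For practice 2, a lightweight option is to log the worker process's peak resident memory when a heavy task finishes, using only the standard library. An illustrative sketch (the task is hypothetical; `ru_maxrss` is reported in KB on Linux):

```python
# Illustrative sketch for "Log memory-intensive tasks": record the worker
# process's peak resident memory after a heavy task. The task is hypothetical.
import logging
import resource

from celery import shared_task

logger = logging.getLogger(__name__)


@shared_task
def build_large_report(report_id: int) -> None:
    # ... hypothetical memory-hungry work ...
    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  # KB on Linux
    logger.info("build_large_report(%s) peak RSS: %s KB", report_id, peak_kb)
```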