19 changes: 16 additions & 3 deletions compose/local/django/celery/worker/start
@@ -4,11 +4,24 @@ set -o errexit
set -o pipefail
set -o nounset

# Local development with auto-reload and optional debugging
#
# DEBUGGER=1 - Enable debugpy for remote debugging on port 5679
#
# Worker protections (prevent memory leaks from long-running workers):
# --max-tasks-per-child=100 - Restart worker process after 100 tasks
# --max-memory-per-child=1048576 - Restart if worker process exceeds 1 GiB (in KB)
#
# With prefork pool (default), each CPU gets a worker process.
# Example: 8 CPUs × 1 GiB = 8 GiB max total worker memory

MAX_TASKS_PER_CHILD=100
MAX_MEMORY_PER_CHILD=1048576 # 1 GiB in KB

# Launch VS Code debug server if DEBUGGER environment variable is set to 1
# Note that auto reloading is disabled when debugging, manual restart required for code changes.
if [ "${DEBUGGER:-0}" = "1" ]; then
# exec watchfiles --filter python 'python -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker -l INFO'
exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO
exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
else
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO'
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child='$MAX_TASKS_PER_CHILD' --max-memory-per-child='$MAX_MEMORY_PER_CHILD
fi
14 changes: 13 additions & 1 deletion compose/production/django/celery/worker/start
@@ -4,4 +4,16 @@ set -o errexit
set -o pipefail
set -o nounset

exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO
# Production Celery worker with memory leak protections
#
# Worker protections:
# --max-tasks-per-child=100 - Restart worker process after 100 tasks
# --max-memory-per-child=2097152 - Restart if worker process exceeds 2 GiB (in KB)
#
# With prefork pool (default), each CPU gets a worker process.
# Example: 8 CPUs × 2 GiB = 16 GiB max total worker memory

MAX_TASKS_PER_CHILD=100
MAX_MEMORY_PER_CHILD=2097152 # 2 GiB in KB

exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
21 changes: 12 additions & 9 deletions config/settings/base.py
@@ -323,10 +323,10 @@
CELERY_RESULT_COMPRESSION = "gzip"
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#task-time-limit
# TODO: set to whatever value is adequate in your circumstances
CELERY_TASK_TIME_LIMIT = 7 * 60 * 24
CELERY_TASK_TIME_LIMIT = 4 * 60 * 60 * 24 # 4 days
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#task-soft-time-limit
# TODO: set to whatever value is adequate in your circumstances
CELERY_TASK_SOFT_TIME_LIMIT = 6 * 60 * 24
CELERY_TASK_SOFT_TIME_LIMIT = 3 * 60 * 60 * 24 # 3 days
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-scheduler
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-send-task-events
@@ -346,16 +346,19 @@
CELERY_WORKER_PREFETCH_MULTIPLIER = 1
CELERY_WORKER_ENABLE_PREFETCH_COUNT_REDUCTION = True

# Connection settings to match Redis timeout and keepalive
# RabbitMQ broker connection settings
# These settings improve reliability for long-running workers with intermittent network issues
CELERY_BROKER_TRANSPORT_OPTIONS = {
"visibility_timeout": 43200, # 12 hours - default celery value
"socket_timeout": 120, # Matches Redis timeout setting
"socket_connect_timeout": 30, # Max time to establish connection
"socket_keepalive": True, # Enable TCP keepalive
"retry_on_timeout": True, # Retry operations if Redis times out
"max_connections": 20, # Per process connection pool limit
"socket_timeout": 120, # Socket read/write timeout (seconds)
"socket_connect_timeout": 30, # Max time to establish connection (seconds)
"socket_keepalive": True, # Enable TCP keepalive probes
"retry_on_timeout": True, # Retry operations on timeout
"max_connections": 20, # Per-process connection pool limit
"heartbeat": 60, # RabbitMQ heartbeat interval (seconds) - detects broken connections
}

# Broker connection retry settings
# Workers will retry forever on connection failures rather than crashing
CELERY_BROKER_CONNECTION_RETRY = True
CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = True
CELERY_BROKER_CONNECTION_MAX_RETRIES = None # Retry forever
226 changes: 226 additions & 0 deletions docs/WORKER_MONITORING.md
@@ -0,0 +1,226 @@
# Celery Worker Monitoring Guide

This guide explains how to monitor Celery worker health and detect issues like memory leaks or hung workers.

## Worker Protections In Place

The workers are configured with automatic protections to prevent long-running issues:

- **`--max-tasks-per-child=100`** - Each worker process restarts after 100 tasks
- Prevents memory leaks from accumulating over time
- Clean slate every 100 tasks ensures consistent performance

- **`--max-memory-per-child=2097152`** - Each worker process restarts if it exceeds 2 GiB
- Measured in KB (2097152 KB = 2 GiB)
- With prefork pool (default), this is **per worker process**
- Example: 8 CPUs = 8 worker processes × 2 GiB = 16 GiB max total

These protections cause workers to gracefully restart before problems occur.
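
To confirm the limits are actually in effect, you can ask the running workers for their pool statistics. A minimal sketch, assuming the `celeryworker` service name from the compose files and that your Celery version reports these fields under `pool` in `inspect stats`:

```bash
# Ask running workers for their pool settings; the reported field names
# (e.g. "max-tasks-per-child") may differ slightly between Celery versions.
docker compose exec celeryworker \
  celery -A config.celery_app inspect stats | grep -iE "max-tasks-per-child|max-memory-per-child"
```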

## Monitoring Tools

### 1. Flower (http://localhost:5555)

**Active Tasks View:**
- Shows currently running tasks
- Look for tasks stuck in "running" state for unusually long periods
- Your `CELERY_TASK_TIME_LIMIT` is 4 days, so anything approaching that is suspicious

**Worker Page:**
- Shows last heartbeat timestamp for each worker
- A stale heartbeat (> 60 seconds) indicates a potential issue
- Shows worker status, active tasks, and resource usage

**Task Timeline:**
- Sort tasks by duration to find long-running ones
- Filter by state (SUCCESS, FAILURE, RETRY, etc.)
- See task arguments and return values

**Limitations:**
- Does not actively alert on hung or deadlocked workers; you have to notice the stale heartbeat yourself
- Shows task status but not detailed memory usage
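
Flower also exposes a small HTTP REST API, which is useful for scripted spot checks. A sketch, assuming the default port from the setup above and no HTTP basic auth (add `-u user:password` to `curl` if your Flower instance requires it):

```bash
# Workers known to Flower and their current status
curl -s http://localhost:5555/api/workers | python -m json.tool

# The 20 most recent tasks, with state, runtime, and arguments
curl -s "http://localhost:5555/api/tasks?limit=20" | python -m json.tool
```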

### 2. New Relic APM

**Transaction Traces:**
- Automatic instrumentation via `newrelic-admin` wrapper
- See detailed task execution times and database queries
- Identify slow tasks that might cause memory buildup

**Infrastructure Monitoring:**
- Monitor container CPU and memory usage
- Set alerts for high memory usage approaching limits
- Track worker process count and restarts

**Custom Instrumentation:**
- Add custom metrics for task-specific monitoring
- Track business metrics (images processed, detections created, etc.)

### 3. Docker Monitoring

**Check worker health:**
```bash
# View running containers and resource usage
docker stats

# Check worker logs for restart messages
docker compose logs celeryworker | grep -i "restart\|warm shutdown\|cold shutdown"

# See recent worker activity
docker compose logs --tail=100 celeryworker
```

**Worker restart patterns:**
When a worker process hits one of these limits, you'll see:
```
[INFO/MainProcess] Warm shutdown (MainProcess)
[INFO/MainProcess] Celery worker: Restarting pool processes
```
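
For a one-shot snapshot scoped to just the worker containers, something like the following works (a sketch; the `celeryworker` service name comes from the compose files above):

```bash
# Compact, non-streaming memory/CPU snapshot of the Celery worker containers
docker stats --no-stream \
  --format "table {{.Name}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.CPUPerc}}" \
  $(docker compose ps -q celeryworker)
```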

## Detecting Hung Workers

### Signs of a hung worker:

1. **Tasks stuck in "started" state** (Flower active tasks page)
- Task shows as running but makes no progress
- Duration keeps increasing without completion

2. **Stale worker heartbeats** (Flower workers page)
- Last heartbeat > 60 seconds ago
- Worker shows as offline or unresponsive

3. **Queue backlog building up** (Flower or RabbitMQ management)
- Tasks accumulating in queue but not being processed
- Check: http://localhost:15672 (RabbitMQ management UI)
- Default credentials: `rabbituser` / `rabbitpass`

4. **Container health** (Docker stats)
- CPU stuck at 100% with no task progress
- Memory at ceiling without task completion

### Manual investigation:

```bash
# Attach to worker container
docker compose exec celeryworker bash

# Check running Python processes
ps aux | grep celery

# Check memory usage of worker processes
ps aux --sort=-%mem | grep celery | head -10

# See active connections to RabbitMQ
netstat -an | grep 5672
```
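
From inside the container (or prefixed with `docker compose exec celeryworker`), Celery's own inspect commands give a quick read on responsiveness. A sketch:

```bash
# Ping all workers; one that never replies is a candidate for a hung main process
celery -A config.celery_app inspect ping

# List the tasks each worker is currently executing, including start times
celery -A config.celery_app inspect active
```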

## Recommended Monitoring Setup

### Flower Alerts (Manual Checks)

Visit Flower periodically and check:
1. Workers page - all workers showing recent heartbeats?
2. Tasks page - any tasks running > 1 hour?
3. Queue length - is it growing unexpectedly?

### New Relic Alerts (Automated)

Set up alerts for:
- Container memory > 80% for > 5 minutes
- Task duration > 1 hour (customize based on your use case)
- Worker process restarts > 10 per hour
- Queue depth > 100 tasks

### RabbitMQ Management UI

Access at: http://localhost:15672
- Monitor queue lengths
- Check consumer counts (each worker registers one consumer per queue, so this should match the number of worker containers)
- View connection status and channels
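
The same data is available from the management HTTP API, which is handy for scripting. A sketch, using the local credentials mentioned above (adjust host and credentials for production):

```bash
# Queue depth and consumer counts per queue, via the RabbitMQ management API
curl -s -u rabbituser:rabbitpass http://localhost:15672/api/queues | \
  python -m json.tool | grep -E '"name"|"messages_ready"|"messages_unacknowledged"|"consumers"'
```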

## Troubleshooting Common Issues

### Worker keeps restarting every 100 tasks

**This is normal behavior** with `--max-tasks-per-child=100`.

If you see too many restarts:
- Check task complexity - are tasks heavier than expected?
- Consider increasing the limit: `--max-tasks-per-child=200`
- Monitor if specific task types cause issues
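
To gauge how often restarts are actually happening, count the restart messages in the recent logs (a rough sketch, assuming log lines similar to the ones shown in the Docker Monitoring section above):

```bash
# Approximate number of worker restart events in the last hour
docker compose logs --since 1h celeryworker 2>&1 | grep -ciE "warm shutdown|cold shutdown|restarting"
```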

### Worker hitting memory limit frequently

If workers constantly hit the 2 GiB limit:
- Review which tasks use the most memory
- Optimize data loading (use streaming for large datasets)
- Consider increasing limit: `--max-memory-per-child=3145728` (3 GiB)
- Check for memory leaks in task code
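
A quick way to see which worker processes are approaching the limit, from inside the worker container (a sketch; `ps` reports `rss` in KB):

```bash
# Resident memory of Celery-related processes in GiB, largest first,
# for comparison against the 2 GiB --max-memory-per-child limit
ps -eo pid,rss,args --sort=-rss | grep '[c]elery' | \
  awk '{printf "PID %-8s %.2f GiB\n", $1, $2 / 1048576}'
```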

### Tasks timing out

Current limits:
- `CELERY_TASK_TIME_LIMIT = 4 * 60 * 60 * 24` (4 days)
- `CELERY_TASK_SOFT_TIME_LIMIT = 3 * 60 * 60 * 24` (3 days)

These are very generous. If tasks still time out:
- Check task logs for errors
- Review data volume being processed
- Consider breaking large jobs into smaller tasks

### Queue backlog growing

Possible causes:
1. Workers offline - check `docker compose ps`
2. Tasks failing and retrying - check Flower failures
3. Task creation rate > processing rate - scale up workers
4. Tasks hung - restart workers: `docker compose restart celeryworker`

## Scaling Workers

### Horizontal scaling (more containers):

```bash
# Local development
docker compose up -d --scale celeryworker=3

# Production (docker-compose.worker.yml)
docker compose -f docker-compose.worker.yml up -d --scale celeryworker=3
```

### Vertical scaling (more processes per container):

Add concurrency flag to worker start command:
```bash
celery -A config.celery_app worker --queues=antenna --concurrency=16
```

Default is number of CPUs. Increase if CPU is underutilized.

## Memory Tuning Guidelines

Based on your ML workload:

- **Current setting:** 2 GiB per worker process (production), 1 GiB (local dev)
- **Task requirement:** JSON orchestration with large request/response payloads

This provides adequate headroom for JSON processing and HTTP overhead.

**If tasks consistently use more:**
- Increase: `--max-memory-per-child=3145728` (3 GiB)
- Ensure container/host has sufficient memory
- Monitor total memory: (num CPUs) × (limit per child)

**If tasks use much less:**
- Decrease: `--max-memory-per-child=1048576` (1 GiB)
- Allows more workers on same hardware
- Faster to detect actual memory leaks
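
A quick sanity check of the worst case on a given host, following the (num CPUs) × (limit per child) rule above (a sketch using the production value):

```bash
# Worst-case worker memory: one prefork child per CPU, each allowed MAX_MEMORY_PER_CHILD KB
MAX_MEMORY_PER_CHILD=2097152  # 2 GiB in KB (production setting)
echo "$(nproc) CPUs x $((MAX_MEMORY_PER_CHILD / 1024 / 1024)) GiB = $(( $(nproc) * MAX_MEMORY_PER_CHILD / 1024 / 1024 )) GiB worst-case worker memory"
```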

## Best Practices

1. **Monitor regularly** - Check Flower dashboard daily during active development
2. **Log memory-intensive tasks** - Add logging for tasks processing large datasets
3. **Test limits locally** - Verify worker restarts work correctly in development
4. **Set New Relic alerts** - Automate detection of worker issues
5. **Review worker logs** - Look for patterns in restarts and failures
6. **Tune limits based on reality** - Adjust after observing actual task behavior