diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile
index 0e778f82b..d0460f246 100644
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
   libpq-dev \
   # Translations dependencies
   gettext \
+  # healthcheck dependencies (procps provides pgrep)
+  procps \
   # cleaning up unused files
   && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
   && rm -rf /var/lib/apt/lists/*
@@ -74,6 +76,11 @@ COPY ./compose/local/django/celery/flower/start /start-flower
 RUN sed -i 's/\r$//g' /start-flower
 RUN chmod +x /start-flower
 
+# Copy celery scripts directory for healthcheck (strip CRs so the shebang works)
+COPY ./compose/local/django/celery /celery
+RUN sed -i 's/\r$//g' /celery/healthcheck.sh
+RUN chmod +x /celery/healthcheck.sh
+
 
 # copy application code to WORKDIR
 COPY . ${APP_HOME}
diff --git a/compose/local/django/celery/healthcheck.sh b/compose/local/django/celery/healthcheck.sh
new file mode 100644
index 000000000..61f502dd7
--- /dev/null
+++ b/compose/local/django/celery/healthcheck.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Celery Worker Healthcheck Script
+#
+# Checks that a Celery worker process exists and that the broker is reachable.
+# It uses two checks:
+#   1. Process check - is a celery worker process running?
+#   2. RabbitMQ broker connectivity - can we connect to the broker?
+#
+# Exits 0 when both checks pass and 1 otherwise, so Docker can mark the
+# container unhealthy (and autoheal, where deployed, can restart it).
+
+set -e
+
+# Check 1: Is the celery worker process running?
+if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
+    echo "ERROR: Celery worker process not found" >&2
+    exit 1
+fi
+
+# Check 2: Can we connect to RabbitMQ (the broker)?
+# Use kombu (a Celery dependency) to test broker connectivity, capped at 5s
+if command -v python > /dev/null 2>&1; then
+    # Read CELERY_BROKER_URL inside Python so the real URL never appears in argv.
+    # NOTE(review): the fallback URL embeds credentials; confirm it is intentional.
+    if ! timeout 5 python -c "
+import sys
+import os
+from kombu import Connection
+try:
+    broker_url = os.environ.get('CELERY_BROKER_URL', 'amqp://rabbituser:rabbitpass@rabbitmq:5672/')
+    conn = Connection(broker_url)
+    conn.ensure_connection(max_retries=1, timeout=3)
+    conn.release()
+    sys.exit(0)
+except Exception as e:
+    print('ERROR: Cannot connect to RabbitMQ broker: {0}'.format(str(e)), file=sys.stderr)
+    sys.exit(1)
+" 2>&1; then
+        echo "ERROR: Cannot connect to RabbitMQ broker" >&2
+        exit 1
+    fi
+fi
+
+# All checks passed
+exit 0
diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start
index 0d4f67362..0b4ab3c7a 100755
--- a/compose/local/django/celery/worker/start
+++ b/compose/local/django/celery/worker/start
@@ -4,11 +4,20 @@ set -o errexit
 set -o pipefail
 set -o nounset
 
+# Local development with auto-reload and optional debugging
+#
+# DEBUGGER=1 - Enable debugpy for remote debugging on port 5679
+#
+# Worker protections (same as production):
+#   --max-tasks-per-child=50       - Replace a child after 50 tasks (limits leak growth)
+#   --max-memory-per-child=4000000 - Replace a child above ~4 GB resident memory (value is in KiB)
+
 # Launch VS Code debug server if DEBUGGER environment variable is set to 1
 # Note that auto reloading is disabled when debugging, manual restart required for code changes.
 if [ "${DEBUGGER:-0}" = "1" ]; then
-    # exec watchfiles --filter python 'python -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker -l INFO'
-    exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO
+    echo "Starting Celery worker with debugpy on port 5679..."
+    exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
 else
-    exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO'
+    echo "Starting Celery worker with watchfiles auto-reload..."
+    exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000'
 fi
diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile
index fd6b80ec1..eeebd3afa 100644
--- a/compose/production/django/Dockerfile
+++ b/compose/production/django/Dockerfile
@@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
   libpq-dev \
   # Translations dependencies
   gettext \
+  # healthcheck dependencies (procps provides pgrep)
+  procps \
   # cleaning up unused files
   && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
   && rm -rf /var/lib/apt/lists/*
@@ -80,6 +82,11 @@ COPY ./compose/production/django/celery/flower/start /start-flower
 RUN sed -i 's/\r$//g' /start-flower
 RUN chmod +x /start-flower
 
+# Copy celery scripts directory for healthcheck (strip CRs so the shebang works)
+COPY --chown=django:django ./compose/production/django/celery /celery
+RUN sed -i 's/\r$//g' /celery/healthcheck.sh
+RUN chmod +x /celery/healthcheck.sh
+
 
 # copy application code to WORKDIR
 COPY --chown=django:django . ${APP_HOME}
diff --git a/compose/production/django/celery/healthcheck.sh b/compose/production/django/celery/healthcheck.sh
new file mode 100644
index 000000000..b899eedfe
--- /dev/null
+++ b/compose/production/django/celery/healthcheck.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Celery Worker Healthcheck Script (Production)
+#
+# Checks that a Celery worker process exists and that the broker is reachable.
+# It uses two checks:
+#   1. Process check - is a celery worker process running?
+#   2. RabbitMQ broker connectivity - can we connect to the broker?
+#
+# Exits 0 when both checks pass and 1 otherwise, so Docker can mark the
+# container unhealthy and the autoheal container can restart it.
+
+set -e
+
+# Check 1: Is the celery worker process running?
+if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
+    echo "ERROR: Celery worker process not found" >&2
+    exit 1
+fi
+
+# Check 2: Can we connect to RabbitMQ (the broker)?
+# Use kombu (a Celery dependency) to test broker connectivity, capped at 5s
+if command -v python > /dev/null 2>&1; then
+    # Read CELERY_BROKER_URL inside Python so the real URL never appears in argv.
+    # NOTE(review): the fallback URL embeds credentials; confirm it is intentional.
+    if ! timeout 5 python -c "
+import sys
+import os
+from kombu import Connection
+try:
+    broker_url = os.environ.get('CELERY_BROKER_URL', 'amqp://rabbituser:rabbitpass@rabbitmq:5672/')
+    conn = Connection(broker_url)
+    conn.ensure_connection(max_retries=1, timeout=3)
+    conn.release()
+    sys.exit(0)
+except Exception as e:
+    print('ERROR: Cannot connect to RabbitMQ broker: {0}'.format(str(e)), file=sys.stderr)
+    sys.exit(1)
+" 2>&1; then
+        echo "ERROR: Cannot connect to RabbitMQ broker" >&2
+        exit 1
+    fi
+fi
+
+# All checks passed
+exit 0
diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start
index 6b372e854..c5dee4f33 100644
--- a/compose/production/django/celery/worker/start
+++ b/compose/production/django/celery/worker/start
@@ -4,4 +4,10 @@ set -o errexit
 set -o pipefail
 set -o nounset
 
-exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO
+# Production Celery worker with protections
+#
+# Worker protections:
+#   --max-tasks-per-child=50       - Replace a child after 50 tasks (limits leak growth)
+#   --max-memory-per-child=4000000 - Replace a child above ~4 GB resident memory (value is in KiB)
+
+exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
diff --git a/docker-compose.production.yml b/docker-compose.production.yml
index 83e2799da..b88c30ce7 100644
--- a/docker-compose.production.yml
+++ b/docker-compose.production.yml
@@ -30,12 +30,28 @@ services:
     ports: []
     command: /start-celeryworker
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s # Check every 30 seconds
+      timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
+      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
+      start_period: 60s # Grace period during container startup
+    labels:
+      - "autoheal=true" # Enable autoheal to restart this container when unhealthy
 
   celerybeat:
     <<: *django
     ports: []
     command: /start-celerybeat
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "pgrep -f '[c]elery.*beat' > /dev/null || exit 1"] # [c] stops pgrep matching this sh -c command line
+      interval: 60s # Beat is less critical, check every minute
+      timeout: 10s
+      retries: 3
+      start_period: 30s
+    labels:
+      - "autoheal=true"
 
   flower:
     <<: *django
@@ -45,6 +61,25 @@ services:
     restart: always
     volumes:
       - ./data/flower/:/data/
+    healthcheck:
+      test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:5555/', timeout=3)\" || exit 1"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 30s
+    labels:
+      - "autoheal=true"
+
+  autoheal:
+    image: willfarrell/autoheal:latest
+    container_name: ami_production_autoheal
+    restart: always
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=autoheal
+      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
+      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
 
   awscli:
     build:
diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml
index b2aef392a..c31dd9191 100644
--- a/docker-compose.worker.yml
+++ b/docker-compose.worker.yml
@@ -26,3 +26,22 @@ services:
     ports: []
     command: /start-celeryworker
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s # Check every 30 seconds
+      timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
+      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
+      start_period: 60s # Grace period during container startup
+    labels:
+      - "autoheal=true" # Enable autoheal to restart this container when unhealthy
+
+  autoheal:
+    image: willfarrell/autoheal:latest
+    container_name: ami_worker_autoheal
+    restart: always
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=autoheal
+      - AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
+      - AUTOHEAL_START_PERIOD=60 # Don't restart containers for 60s after they start
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
diff --git a/docker-compose.yml b/docker-compose.yml
index 58e551a71..0cfba772d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -94,6 +94,12 @@ services:
     command: /start-celeryworker
     depends_on:
       - rabbitmq
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s # Check every 30 seconds
+      timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
+      retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
+      start_period: 60s # Grace period during container startup
 
   celerybeat:
     <<: *django