Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
af5161d
Add HTTP health check to celery workers; default is port 9001 but con…
johnewart Dec 3, 2025
3f84e92
Add celery-healthcheck to the codebase because upgrading the required…
johnewart Dec 4, 2025
8da5830
Sorry mypy, pylint and black
johnewart Dec 4, 2025
765b06f
fix / disable isort
johnewart Dec 4, 2025
bb8edb4
fix black error
johnewart Dec 4, 2025
1706854
Remove uvicorn in favor of plain-old Python HTTP server, add some tes…
johnewart Dec 9, 2025
6ad8c77
Merge branch 'main' into johnewart/ENG-1948
johnewart Dec 9, 2025
6af3d6d
Comment out celery_worker_parameters fixture
johnewart Dec 9, 2025
4153a68
Update tests/conftest.py
johnewart Dec 11, 2025
195868e
Update src/fides/api/tasks/celery_healthcheck/server.py
johnewart Dec 11, 2025
8da1f48
Update tests/task/test_healthcheck_server.py
johnewart Dec 11, 2025
b6a4eee
Update src/fides/api/tasks/celery_healthcheck/server.py
johnewart Dec 11, 2025
073e0b8
Clean up some variable names, add typing / linting fixes
johnewart Dec 11, 2025
b2648c6
Add note for port override
johnewart Dec 11, 2025
14ee36c
Merge branch 'main' into johnewart/ENG-1948
galvana Jan 7, 2026
e61afeb
Merge branch 'main' into johnewart/ENG-1948
johnewart Jan 9, 2026
c7216c5
Move session outside of lock, add a bit of logging on pool creation
johnewart Jan 14, 2026
f578ee3
Merge branch 'main' into johnewart/ENG-1948
johnewart Feb 4, 2026
b195f04
Minor fix for healthcheck on Celery
johnewart Feb 4, 2026
e973c40
Add changelog yaml
johnewart Feb 4, 2026
421eaa7
Merge branch 'main' into johnewart/ENG-1948
johnewart Feb 4, 2026
080146f
Fix formatting after merging main
johnewart Feb 4, 2026
c7e9531
Merge branch 'main' into johnewart/ENG-1948
galvana Feb 5, 2026
8249088
Merge branch 'main' into johnewart/ENG-1948
johnewart Feb 10, 2026
cd21e53
Formatting fixes
johnewart Feb 10, 2026
145d9d2
Merge branch 'main' into johnewart/ENG-1948
johnewart Feb 11, 2026
452d351
Add some tweaks to the http server
johnewart Feb 11, 2026
a97855b
Handle xdist better
johnewart Feb 11, 2026
690be83
Fix unit tests
johnewart Feb 11, 2026
420e084
Fix formatting
johnewart Feb 11, 2026
0ca82eb
Merge branch 'main' into johnewart/ENG-1948
johnewart Feb 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions changelog/7091.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copy this file and rename it (e.g., pr-number.yaml or feature-name.yaml)
# Fill in the required fields and delete this comment block

type: Added # One of: Added, Changed, Developer Experience, Deprecated, Docs, Fixed, Removed, Security
description: Celery workers now expose an HTTP healthcheck endpoint, for environments that cannot run a command inside the worker container to check whether the workers are running.
pr: 7091 # PR number
labels: [] # Optional: ["high-risk", "db-migration"]
53 changes: 11 additions & 42 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ services:
FIDES__CONFIG_PATH: ${FIDES__CONFIG_PATH:-/fides/.fides/fides.toml}
FIDES__LOGGING__COLORIZE: "True"
FIDES__USER__ANALYTICS_OPT_OUT: "True"
# The default HTTP health check port is 9000; it is overridden here to verify
# that the configuration override works as expected.
FIDES__CELERY__HEALTHCHECK_PORT: "9001"
expose:
- 9001
volumes:
- type: bind
source: ./
Expand All @@ -148,52 +153,16 @@ services:
- /fides/src/fides.egg-info

worker-privacy-preferences:
image: ethyca/fides:local${IMAGE_SUFFIX:-}
extends:
service: worker-other
command: fides worker --queues=fides.privacy_preferences,fides.privacy_request_exports,fides.privacy_request_ingestion
depends_on:
redis:
condition: service_started
restart: always
healthcheck:
test: ["CMD", "/opt/fides/bin/python", "-m", "celery", "-A", "fides.api.tasks", "inspect", "ping"]
start_period: 60s
interval: 20s
timeout: 20s
retries: 10
environment:
FIDES__CONFIG_PATH: ${FIDES__CONFIG_PATH:-/fides/.fides/fides.toml}
FIDES__LOGGING__COLORIZE: "True"
FIDES__USER__ANALYTICS_OPT_OUT: "True"
volumes:
- type: bind
source: ./
target: /fides
read_only: False
- /fides/src/fides.egg-info

worker-dsr:
image: ethyca/fides:local${IMAGE_SUFFIX:-}
command: fides worker --queues=fides.dsr
depends_on:
redis:
condition: service_started
restart: always
extends:
service: worker-other
healthcheck:
test: ["CMD", "/opt/fides/bin/python", "-m", "celery", "-A", "fides.api.tasks", "inspect", "ping"]
start_period: 60s
interval: 20s
timeout: 20s
retries: 10
environment:
FIDES__CONFIG_PATH: ${FIDES__CONFIG_PATH:-/fides/.fides/fides.toml}
FIDES__LOGGING__COLORIZE: "True"
FIDES__USER__ANALYTICS_OPT_OUT: "True"
volumes:
- type: bind
source: ./
target: /fides
read_only: False
- /fides/src/fides.egg-info
test: [ "CMD", "curl", "-f", "http://localhost:9001/"]
command: fides worker --queues=fides.dsr

redis:
image: "redis:8.0-alpine"
Expand Down
4 changes: 3 additions & 1 deletion qa/scenarios/manual_task_with_conditional_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,9 @@ def _create_conditional_dependencies(self) -> bool:
dependency_data = create_dependency_data(self.manual_task.id)
dep = ManualTaskConditionalDependency.create(db=db, data=dependency_data)
self.conditional_dependencies.append(dep)
self.info(f"Created conditional dependency with full condition tree: {dep.id}")
self.info(
f"Created conditional dependency with full condition tree: {dep.id}"
)

self.success("Created ManualTaskConditionalDependency with condition tree")
return True
Expand Down
14 changes: 9 additions & 5 deletions qa/scenarios/sql_translator_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,11 +473,15 @@ def _create_manual_task_config(self) -> bool:
"""Create Manual Task configuration."""
try:
# Check if config already exists for this task
existing_config = self.db.query(ManualTaskConfig).filter_by(
task_id=self.manual_task.id,
config_type=ActionType.access,
is_current=True
).first()
existing_config = (
self.db.query(ManualTaskConfig)
.filter_by(
task_id=self.manual_task.id,
config_type=ActionType.access,
is_current=True,
)
.first()
)

if existing_config:
self.info(
Expand Down
2 changes: 1 addition & 1 deletion src/fides/api/db/ctl_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ async def prewarmed_async_readonly_session() -> AsyncGenerator[Any, Any]:
)
ASYNC_READONLY_POOL_WARMED = True

session = readonly_async_session_factory()
session = readonly_async_session_factory()

try:
yield session
Expand Down
4 changes: 4 additions & 0 deletions src/fides/api/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)

from fides.api.db.session import get_db_engine, get_db_session
from fides.api.tasks import celery_healthcheck
from fides.api.util.logger import setup as setup_logging
from fides.config import CONFIG, FidesConfig

Expand Down Expand Up @@ -102,6 +103,7 @@ def _create_celery(config: FidesConfig = CONFIG) -> Celery:
)

app = Celery(__name__)
celery_healthcheck.register(app) # type: ignore

celery_config: Dict[str, Any] = {
# Defaults for the celery config
Expand All @@ -112,6 +114,8 @@ def _create_celery(config: FidesConfig = CONFIG) -> Celery:
# Ops requires this to route emails to separate queues
"task_create_missing_queues": True,
"task_default_queue": "fides",
"healthcheck_port": config.celery.healthcheck_port,
"healthcheck_ping_timeout": config.celery.healthcheck_ping_timeout,
}

celery_config.update(config.celery)
Expand Down
11 changes: 11 additions & 0 deletions src/fides/api/tasks/celery_healthcheck/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# fmt: off
# type: ignore
# pylint: skip-file
# isort:off


from .server import HealthCheckServer


def register(celery_app):
    """Add the ``HealthCheckServer`` step to ``celery_app``'s ``"worker"`` steps."""
    worker_steps = celery_app.steps["worker"]
    worker_steps.add(HealthCheckServer)
148 changes: 148 additions & 0 deletions src/fides/api/tasks/celery_healthcheck/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import json
import socket
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from typing import Any, Optional

from celery import bootsteps
from celery.worker import WorkController
from loguru import logger

# Port the healthcheck HTTP server binds to when the Celery app config has no
# `healthcheck_port` entry.
HEALTHCHECK_DEFAULT_PORT = 9000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should use a consistent value, I think 9001 would be ok. It's 9000 in some places (celery_settings.py , server.py, tests) but 9001 in docker-compose.yml

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is 9001 in the compose file explicitly to ensure that the config would override it and it would work as-expected. But we can just keep it 9000 everywhere / default.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment in the docker-compose.yml file as to why it's 9001 there (since it seems valuable to ensure that override works correctly and it's easy to do there)

# Timeout (seconds) passed to the Celery inspect call when the app config has
# no `healthcheck_ping_timeout` entry.
HEALTHCHECK_DEFAULT_PING_TIMEOUT = 2.0
# How long (seconds) stop() waits for the HTTP server thread to exit when the
# app config has no `shutdown_timeout` entry.
HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT = 2.0


class HealthcheckHandler(SimpleHTTPRequestHandler):
    """Answer HTTP GET requests with the result of pinging the owning Celery worker.

    A GET returns 200 with ``{"status": "ok", "data": <ping result>}`` when the
    ping call completes, 503 with ``{"status": "error", "data": <message>}``
    when it raises, and a bare 500 if the response itself cannot be produced.

    NOTE(review): the ``SimpleHTTPRequestHandler`` base class also implements
    ``do_HEAD`` against the process working directory; consider
    ``BaseHTTPRequestHandler`` if only GET should be served.
    NOTE(review): the ping result is reported as "ok" without inspecting it;
    confirm that an empty/``None`` reply should still yield a 200.
    """

    # Per-connection socket timeout in seconds, applied by
    # StreamRequestHandler.setup(). The server handles one connection at a
    # time, so without it a client that connects and never sends a request
    # would block every later health check.
    timeout = 5.0

    def __init__(
        self, parent: WorkController, healthcheck_ping_timeout: float, *args: Any
    ):
        """
        Store the worker and ping timeout, then let the base class handle the request.

        ``parent`` is the worker to ping, ``healthcheck_ping_timeout`` is the
        timeout (seconds) for the inspect call, and ``*args`` are the
        positional arguments the HTTP server passes to every request handler.
        """
        # The base-class constructor processes the request immediately, so the
        # attributes do_GET reads must exist before calling it.
        self.parent = parent
        self.healthcheck_ping_timeout = healthcheck_ping_timeout
        super().__init__(*args)

    def log_message(self, format: str, *args: Any) -> None:
        """
        Override to suppress default HTTP server logging to stderr.
        The default implementation writes to stderr which can cause
        contention and deadlocks in test environments, especially with
        pytest's output capturing and parallel test execution.
        We use loguru for structured logging instead at the debug level.
        """
        logger.debug(f"Healthcheck: {self.address_string()} - {format % args}")

    def _send_json(self, status: int, body: str) -> None:
        """Write a complete JSON response with the given status code and body."""
        self.send_response(status)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(bytes(body, "utf-8"))

    def do_GET(self) -> None:
        """Ping this worker through Celery's inspect API and report the outcome."""
        try:
            # Build the whole body before writing anything, so a ping or
            # serialization failure can never follow an already-sent 200.
            try:
                worker = self.parent
                inspector = worker.app.control.inspect(
                    destination=[worker.hostname],
                    timeout=self.healthcheck_ping_timeout,
                )
                result = inspector.ping()

                body = json.dumps({"status": "ok", "data": result})
                logger.debug(f"Healthcheck ping result: {body}")
                status = 200
            except Exception as e:
                logger.warning(f"Healthcheck ping exception: {e}")
                body = json.dumps({"status": "error", "data": str(e)})
                status = 503

            self._send_json(status, body)
        except Exception:
            # loguru's logger.exception records the active exception itself.
            logger.exception("HealthcheckHandler exception")
            try:
                self.send_response(500)
                # send_response only buffers the status line; end_headers
                # is what actually flushes it to the client.
                self.end_headers()
            except Exception as send_error:
                # Best effort: the client has most likely gone away.
                logger.debug(f"Could not send 500 response: {send_error}")


class HealthCheckServer(bootsteps.StartStopStep):
    """Start/stop step that serves the HTTP healthcheck from a background thread.

    ``start`` binds an ``HTTPServer`` on the configured port and runs it in a
    daemon thread; ``stop`` shuts the server down, waits for the thread and
    releases the listening socket.
    """

    # ignore kwargs type
    def __init__(self, parent: WorkController, **kwargs):  # type: ignore [arg-type, no-untyped-def]
        """Read the healthcheck settings from ``parent.app.conf``, falling back to module defaults."""
        self.thread: Optional[threading.Thread] = None
        self.http_server: Optional[HTTPServer] = None

        self.parent = parent

        # config
        conf = parent.app.conf
        self.healthcheck_port = int(
            getattr(conf, "healthcheck_port", HEALTHCHECK_DEFAULT_PORT)
        )
        self.healthcheck_ping_timeout = float(
            getattr(
                conf,
                "healthcheck_ping_timeout",
                HEALTHCHECK_DEFAULT_PING_TIMEOUT,
            )
        )
        self.shutdown_timeout = float(
            getattr(
                conf,
                "shutdown_timeout",
                HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT,
            )
        )

        super().__init__(parent, **kwargs)

    # The mypy hints for an HTTP handler are strange, so ignoring them here
    def http_handler(self, *args) -> None:  # type: ignore [arg-type, no-untyped-def]
        """Handler factory given to ``HTTPServer``; constructing the handler serves the request."""
        HealthcheckHandler(self.parent, self.healthcheck_ping_timeout, *args)

    def start(self, parent: WorkController) -> None:
        """Bind the HTTP server on all interfaces and serve it from a daemon thread."""
        # Ignore mypy hints here as the constructed object immediately handles the request
        # (if you look in the source code for SimpleHTTPRequestHandler, specifically the finalize request method)
        self.http_server = HTTPServer(
            ("0.0.0.0", self.healthcheck_port),
            self.http_handler,  # type: ignore [arg-type]
        )

        # HTTPServer already enables address reuse (allow_reuse_address) before
        # it binds in its constructor. This call runs after the bind, so it is
        # only a belt-and-braces setting, not what permits rapid re-binding.
        self.http_server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

        # NOTE: the server-level `timeout` is only consulted by
        # handle_request(); serve_forever() ignores it, so this does not bound
        # how long a single connection may block.
        self.http_server.timeout = 5.0

        self.thread = threading.Thread(
            target=self.http_server.serve_forever, daemon=True
        )
        self.thread.start()
        logger.info(f"Health check server started on port {self.healthcheck_port}")

    def stop(self, parent: WorkController) -> None:
        """Shut the HTTP server down, wait for its thread, and close the listening socket."""
        if self.http_server is None:
            logger.warning(
                "Requested stop of HTTP healthcheck server, but no server was started"
            )
        else:
            logger.info(
                f"Stopping health check server with a timeout of {self.shutdown_timeout} seconds"
            )
            try:
                # Call shutdown - this should be safe from any thread
                # It will cause serve_forever() to return after handling any current request
                self.http_server.shutdown()
            except Exception as e:
                logger.warning(f"Error during HTTP server shutdown: {e}")

        # Wait for the thread to finish with a timeout
        if self.thread is None:
            logger.warning("No thread in HTTP healthcheck server to shutdown...")
        else:
            self.thread.join(self.shutdown_timeout)
            if self.thread.is_alive():
                logger.warning(
                    f"Healthcheck thread still alive after {self.shutdown_timeout}s timeout. "
                    "It will continue running as a daemon thread."
                )
            else:
                logger.info(
                    f"Health check server stopped cleanly on port {self.healthcheck_port}"
                )

        if self.http_server is not None:
            # shutdown() only ends the serve_forever() loop; the listening
            # socket stays open (and the port stays bound) until it is closed.
            try:
                self.http_server.server_close()
            except Exception as e:
                logger.warning(f"Error closing HTTP healthcheck server socket: {e}")
6 changes: 6 additions & 0 deletions src/fides/config/celery_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ class CelerySettings(FidesSettings):
description="If true, tasks are executed locally instead of being sent to the queue. "
"If False, tasks are sent to the queue.",
)
healthcheck_port: int = Field(
default=9000, description="The port to use for the health check endpoint"
)
healthcheck_ping_timeout: float = Field(
default=2.0, description="The timeout in seconds for the health check ping"
)
model_config = SettingsConfigDict(env_prefix=ENV_PREFIX)


Expand Down
Loading
Loading