-
Notifications
You must be signed in to change notification settings - Fork 91
ENG-1948 - Celery healthcheck HTTP endpoint #7091
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 23 commits
af5161d
3f84e92
8da5830
765b06f
bb8edb4
1706854
6ad8c77
6af3d6d
4153a68
195868e
8da1f48
b6a4eee
073e0b8
b2648c6
14ee36c
e61afeb
c7216c5
f578ee3
b195f04
e973c40
421eaa7
080146f
c7e9531
8249088
cd21e53
145d9d2
452d351
a97855b
690be83
420e084
0ca82eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| # Copy this file and rename it (e.g., pr-number.yaml or feature-name.yaml) | ||
| # Fill in the required fields and delete this comment block | ||
|
|
||
| type: Added # One of: Added, Changed, Developer Experience, Deprecated, Docs, Fixed, Removed, Security | ||
| description: Celery workers now have an HTTP healthcheck endpoint that can be used to check if the workers are running for environments that do not support running a command to check if the workers are running. | ||
| pr: 7091 # PR number | ||
| labels: [] # Optional: ["high-risk", "db-migration"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| # fmt: off | ||
| # type: ignore | ||
| # pylint: skip-file | ||
| # isort:off | ||
|
|
||
|
|
||
| from .server import HealthCheckServer | ||
|
|
||
|
|
||
| def register(celery_app): | ||
| celery_app.steps["worker"].add(HealthCheckServer) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| import json | ||
| import threading | ||
| from http.server import HTTPServer, SimpleHTTPRequestHandler | ||
| from typing import Any, Optional | ||
|
|
||
| from celery import bootsteps | ||
| from celery.worker import WorkController | ||
| from loguru import logger | ||
|
|
||
| HEALTHCHECK_DEFAULT_PORT = 9000 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should use a consistent value, I think 9001 would be ok. It's 9000 in some places (
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is 9001 in the compose file explicitly to ensure that the config would override it and it would work as-expected. But we can just keep it 9000 everywhere / default.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a comment in the docker-compose.yml file as to why it's 9001 there (since it seems valuable to ensure that override works correctly and it's easy to do there) |
||
| HEALTHCHECK_DEFAULT_PING_TIMEOUT = 2.0 | ||
| HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT = 2.0 | ||
|
|
||
|
|
||
| class HealthcheckHandler(SimpleHTTPRequestHandler): | ||
| """HTTP request handler with additional properties and functions""" | ||
|
|
||
| def __init__( | ||
| self, parent: WorkController, healthcheck_ping_timeout: float, *args: Any | ||
| ): | ||
| self.parent = parent | ||
| self.healthcheck_ping_timeout = healthcheck_ping_timeout | ||
| super().__init__(*args) | ||
|
|
||
| def do_GET(self) -> None: | ||
| """Handle GET requests""" | ||
| try: | ||
| try: | ||
| parent = self.parent | ||
| insp = parent.app.control.inspect( | ||
| destination=[parent.hostname], timeout=self.healthcheck_ping_timeout | ||
| ) | ||
| result = insp.ping() | ||
|
|
||
| data = json.dumps({"status": "ok", "data": result}) | ||
| logger.debug(f"Healthcheck ping result: {data}") | ||
|
|
||
| self.send_response(200) | ||
| self.send_header("Content-type", "application/json") | ||
| self.end_headers() | ||
| self.wfile.write(bytes(data, "utf-8")) | ||
| except Exception as e: | ||
| logger.warning(f"Healthcheck ping exception: {e}") | ||
| response = {"status": "error", "data": str(e)} | ||
| self.send_response(503) | ||
| self.send_header("Content-type", "application/json") | ||
| self.end_headers() | ||
| self.wfile.write(bytes(json.dumps(response), "utf-8")) | ||
| except Exception as ex: | ||
| logger.exception("HealthcheckHandler exception", exc_info=ex) | ||
| self.send_response(500) | ||
|
|
||
|
|
||
| class HealthCheckServer(bootsteps.StartStopStep): | ||
| # ignore kwargs type | ||
| def __init__(self, parent: WorkController, **kwargs): # type: ignore [arg-type, no-untyped-def] | ||
| self.thread: Optional[threading.Thread] = None | ||
| self.http_server: Optional[HTTPServer] = None | ||
|
|
||
| self.parent = parent | ||
|
|
||
| # config | ||
| self.healthcheck_port = int( | ||
| getattr(parent.app.conf, "healthcheck_port", HEALTHCHECK_DEFAULT_PORT) | ||
| ) | ||
| self.healthcheck_ping_timeout = float( | ||
| getattr( | ||
| parent.app.conf, | ||
| "healthcheck_ping_timeout", | ||
| HEALTHCHECK_DEFAULT_PING_TIMEOUT, | ||
| ) | ||
| ) | ||
| self.shutdown_timeout = float( | ||
| getattr( | ||
| parent.app.conf, | ||
| "shutdown_timeout", | ||
| HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT, | ||
| ) | ||
| ) | ||
|
|
||
| super().__init__(parent, **kwargs) | ||
|
|
||
| # The mypy hints for an HTTP handler are strange, so ignoring them here | ||
| def http_handler(self, *args) -> None: # type: ignore [arg-type, no-untyped-def] | ||
| HealthcheckHandler(self.parent, self.healthcheck_ping_timeout, *args) | ||
|
|
||
| def start(self, parent: WorkController) -> None: | ||
| # Ignore mypy hints here as the constructed object immediately handles the request | ||
| # (if you look in the source code for SimpleHTTPRequestHandler, specifically the finalize request method) | ||
| self.http_server = HTTPServer( | ||
| ("0.0.0.0", self.healthcheck_port), self.http_handler # type: ignore [arg-type] | ||
| ) | ||
|
|
||
| self.thread = threading.Thread( | ||
| target=self.http_server.serve_forever, daemon=True | ||
| ) | ||
| self.thread.start() | ||
|
|
||
| def stop(self, parent: WorkController) -> None: | ||
| if self.http_server is None: | ||
| logger.warning( | ||
| "Requested stop of HTTP healthcheck server, but no server was started" | ||
| ) | ||
| else: | ||
| logger.info( | ||
| f"Stopping health check server with a timeout of {self.shutdown_timeout} seconds" | ||
| ) | ||
| self.http_server.shutdown() | ||
|
|
||
| # Really this should not happen if the HTTP server is None, but just in case, we should check. | ||
| if self.thread is None: | ||
| logger.warning("No thread in HTTP healthcheck server to shutdown...") | ||
| else: | ||
| self.thread.join(self.shutdown_timeout) | ||
|
|
||
| logger.info(f"Health check server stopped on port {self.healthcheck_port}") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -60,7 +60,7 @@ | |
| from fides.api.schemas.messaging.messaging import MessagingServiceType | ||
| from fides.api.schemas.privacy_request import PrivacyRequestStatus | ||
| from fides.api.task.graph_runners import access_runner, consent_runner, erasure_runner | ||
| from fides.api.tasks import celery_app | ||
| from fides.api.tasks import celery_app, celery_healthcheck | ||
| from fides.api.tasks.scheduled.scheduler import async_scheduler, scheduler | ||
| from fides.api.util.cache import get_cache | ||
| from fides.api.util.collection_util import Row | ||
|
|
@@ -767,6 +767,10 @@ def celery_enable_logging(): | |
| """Turns on celery output logs.""" | ||
| return True | ||
|
|
||
| @pytest.fixture(scope="session") | ||
| def celery_session_app(celery_session_app): | ||
| celery_healthcheck.register(celery_session_app) | ||
| return celery_session_app | ||
|
|
||
| # This is here because the test suite occasionally fails to teardown the | ||
| # Celery worker if it takes too long to terminate the worker thread. This | ||
|
|
@@ -792,6 +796,7 @@ def celery_session_worker( | |
| with worker.start_worker( | ||
| celery_session_app, | ||
| pool=celery_worker_pool, | ||
| shutdown_timeout=2.0, | ||
| **celery_worker_parameters, | ||
| ) as w: | ||
| try: | ||
|
|
@@ -802,18 +807,6 @@ def celery_session_worker( | |
| except RuntimeError as re: | ||
| logger.warning("Failed to stop the celery worker: " + str(re)) | ||
|
|
||
|
|
||
| @pytest.fixture(scope="session") | ||
| def celery_worker_parameters(): | ||
| """Configure celery worker parameters for testing. | ||
|
|
||
| Increase shutdown_timeout to avoid flaky test failures when the worker | ||
| takes longer to shut down, especially during parallel test runs with pytest-xdist. | ||
| The CI environment can be slow, so we use a generous timeout. | ||
| """ | ||
| return {"shutdown_timeout": 20.0} | ||
|
|
||
|
|
||
|
Comment on lines
-805
to
-816
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yay! |
||
| @pytest.fixture(autouse=True, scope="session") | ||
| def celery_use_virtual_worker(celery_session_worker): | ||
| """ | ||
|
|
@@ -918,7 +911,8 @@ def access_runner_tester( | |
| connection_configs, | ||
| identity, | ||
| session, | ||
| privacy_request_proceed=False, # This allows the DSR 3.0 Access Runner to be tested in isolation, to just test running the access graph without queuing the privacy request | ||
| privacy_request_proceed=False, | ||
| # This allows the DSR 3.0 Access Runner to be tested in isolation, to just test running the access graph without queuing the privacy request | ||
| ) | ||
| except PrivacyRequestExit: | ||
| # DSR 3.0 intentionally raises a PrivacyRequestExit status while it waits for | ||
|
|
@@ -976,7 +970,8 @@ def consent_runner_tester( | |
| connection_configs, | ||
| identity, | ||
| session, | ||
| privacy_request_proceed=False, # This allows the DSR 3.0 Consent Runner to be tested in isolation, to just test running the consent graph without queuing the privacy request | ||
| privacy_request_proceed=False, | ||
| # This allows the DSR 3.0 Consent Runner to be tested in isolation, to just test running the consent graph without queuing the privacy request | ||
| ) | ||
| except PrivacyRequestExit: | ||
| # DSR 3.0 intentionally raises a PrivacyRequestExit status while it waits for | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| import pytest | ||
| import requests | ||
| from loguru import logger | ||
|
|
||
|
|
||
| class TestCeleryHealthCheckServer: | ||
| def test_responds_to_ping_properly(self, celery_session_app, celery_session_worker): | ||
| try: | ||
| response = requests.get("http://127.0.0.1:9000/") | ||
| assert response.status_code == 200 | ||
| assert response.json()["status"] == "ok" | ||
| except requests.exceptions.ConnectionError: | ||
| pytest.fail("Connection error") | ||
|
|
||
|
|
||
| class TestCeleryHealthCheckWorker: | ||
| @pytest.fixture(autouse=True) | ||
| def setup_teardown(self): | ||
| yield | ||
| with pytest.raises(Exception): | ||
| requests.get("http://127.0.0.1:9000/", timeout=1) | ||
|
|
||
| def test_shutdown_gracefully(self, celery_session_app, celery_session_worker): | ||
| try: | ||
| logger.info("Shutdown gracefully") | ||
| celery_session_worker.stop() | ||
| logger.info("Shutdown gracefully finished") | ||
| except Exception: | ||
| pytest.fail("Failed to stop health check server") | ||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
Uh oh!
There was an error while loading. Please reload this page.