From f457f41de80228fb1a1bd54a14e33d4ecafbf8e8 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Fri, 13 Jun 2025 16:11:26 +0200 Subject: [PATCH 1/4] Add fallback traefik routes Add fallback routes that will take effect once main services (e.g. webserver) are not in `running` state. Once a service is not in `running` state, its configuration is removed from traefik, leading to 404. Related Issue(s): * https://github.com/ITISFoundation/osparc-ops-environments/issues/218 Related PR(s): * https://github.com/ITISFoundation/osparc-ops-environments/pull/950 --- services/docker-compose.yml | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 8d59f487263c..e5df3c10c31f 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -57,6 +57,7 @@ services: - traefik.http.services.${SWARM_STACK_NAME}_api-server.loadbalancer.healthcheck.path=/ - traefik.http.services.${SWARM_STACK_NAME}_api-server.loadbalancer.healthcheck.interval=2000ms - traefik.http.services.${SWARM_STACK_NAME}_api-server.loadbalancer.healthcheck.timeout=1000ms + # NOTE: keep in sync with fallback router (rule and entrypoint) - traefik.http.routers.${SWARM_STACK_NAME}_api-server.rule=(Path(`/`) || Path(`/v0`) || PathPrefix(`/v0/`) || Path(`/api/v0/openapi.json`)) - traefik.http.routers.${SWARM_STACK_NAME}_api-server.entrypoints=simcore_api - traefik.http.routers.${SWARM_STACK_NAME}_api-server.priority=3 @@ -628,6 +629,7 @@ services: - traefik.http.services.${SWARM_STACK_NAME}_static_webserver.loadbalancer.healthcheck.interval=2000ms - traefik.http.services.${SWARM_STACK_NAME}_static_webserver.loadbalancer.healthcheck.timeout=1000ms - traefik.http.middlewares.${SWARM_STACK_NAME}_static_webserver_retry.retry.attempts=2 + # NOTE: keep in sync with fallback router (rule and entrypoint) - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver.rule=(Path(`/osparc`) || Path(`/s4l`) || Path(`/s4llite`) 
|| Path(`/s4lacad`) || Path(`/s4lengine`) || Path(`/s4ldesktop`) || Path(`/s4ldesktopacad`) || Path(`/tis`) || Path(`/tiplite`) || Path(`/transpiled`) || Path(`/resource`) || PathPrefix(`/osparc/`) || PathPrefix(`/s4l/`) || PathPrefix(`/s4llite/`) || PathPrefix(`/s4lacad/`) || PathPrefix(`/s4lengine/`) || PathPrefix(`/s4ldesktop/`) || PathPrefix(`/s4ldesktopacad/`) || PathPrefix(`/tis/`) || PathPrefix(`/tiplite/`) || PathPrefix(`/transpiled/`) || PathPrefix(`/resource/`)) - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver.service=${SWARM_STACK_NAME}_static_webserver - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver.entrypoints=http @@ -876,6 +878,7 @@ services: - traefik.http.services.${SWARM_STACK_NAME}_webserver.loadbalancer.sticky.cookie.secure=true - traefik.http.middlewares.${SWARM_STACK_NAME}_webserver_retry.retry.attempts=2 - traefik.http.routers.${SWARM_STACK_NAME}_webserver.service=${SWARM_STACK_NAME}_webserver + # NOTE: keep in sync with fallback router (rule and entrypoint) - traefik.http.routers.${SWARM_STACK_NAME}_webserver.rule=(Path(`/`) || Path(`/v0`) || Path(`/socket.io/`) || Path(`/static-frontend-data.json`) || PathRegexp(`^/study/(?P<study_uuid>\b[0-9a-f]{8}\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\b[0-9a-f]{12}\b)`) || Path(`/view`) || Path(`/#/view`) || Path(`/#/error`) || PathPrefix(`/v0/`)) - traefik.http.routers.${SWARM_STACK_NAME}_webserver.entrypoints=http - traefik.http.routers.${SWARM_STACK_NAME}_webserver.priority=6 @@ -1452,6 +1455,56 @@ services: - default - interactive_services_subnet # for legacy dynamic services + # used to define fallback routes for simcore services + # if docker healthcheck fails, container's traefik configuration is removed + # leading to 404 https://github.com/traefik/traefik/issues/7842 + # + # use fallback routes to return proper 503 (instead of 404) + # this service must be running at all times + traefik-configuration-placeholder: + image: busybox:1.35.0 + command: sleep infinity + networks: + - default 
+ deploy: + labels: + # route to internal traefik + - traefik.enable=true + - io.simcore.zone=${TRAEFIK_SIMCORE_ZONE} + + ### Fallback for api-server + - traefik.http.routers.${SWARM_STACK_NAME}_api-server_fallback.rule=(Path(`/`) || Path(`/v0`) || PathPrefix(`/v0/`) || Path(`/api/v0/openapi.json`)) + - traefik.http.routers.${SWARM_STACK_NAME}_api-server_fallback.service=${SWARM_STACK_NAME}_api-server_fallback + - traefik.http.routers.${SWARM_STACK_NAME}_api-server_fallback.entrypoints=simcore_api + - traefik.http.routers.${SWARM_STACK_NAME}_api-server_fallback.priority=1 + # always fail and return 503 via unhealthy loadbalancer healthcheck + - traefik.http.services.${SWARM_STACK_NAME}_api-server_fallback.loadbalancer.server.port=0 # port is required (otherwise traefik service is not created) + - traefik.http.services.${SWARM_STACK_NAME}_api-server_fallback.loadbalancer.healthcheck.path=/some/invalid/path/to/generate/a/503 + - traefik.http.services.${SWARM_STACK_NAME}_api-server_fallback.loadbalancer.healthcheck.interval=10s + - traefik.http.services.${SWARM_STACK_NAME}_api-server_fallback.loadbalancer.healthcheck.timeout=1ms + + ### Fallback for webserver + - traefik.http.routers.${SWARM_STACK_NAME}_webserver_fallback.service=${SWARM_STACK_NAME}_webserver_fallback + - traefik.http.routers.${SWARM_STACK_NAME}_webserver_fallback.rule=(Path(`/`) || Path(`/v0`) || Path(`/socket.io/`) || Path(`/static-frontend-data.json`) || PathRegexp(`^/study/(?P<study_uuid>\b[0-9a-f]{8}\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\b[0-9a-f]{12}\b)`) || Path(`/view`) || Path(`/#/view`) || Path(`/#/error`) || PathPrefix(`/v0/`)) + - traefik.http.routers.${SWARM_STACK_NAME}_webserver_fallback.entrypoints=http + - traefik.http.routers.${SWARM_STACK_NAME}_webserver_fallback.priority=1 + # always fail and return 503 via unhealthy loadbalancer healthcheck + - traefik.http.services.${SWARM_STACK_NAME}_webserver_fallback.loadbalancer.server.port=0 + - 
traefik.http.services.${SWARM_STACK_NAME}_webserver_fallback.loadbalancer.healthcheck.path=/v0/ + - traefik.http.services.${SWARM_STACK_NAME}_webserver_fallback.loadbalancer.healthcheck.interval=10s + - traefik.http.services.${SWARM_STACK_NAME}_webserver_fallback.loadbalancer.healthcheck.timeout=1ms + + ### Fallback for static-webserver + - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver_fallback.rule=(Path(`/osparc`) || Path(`/s4l`) || Path(`/s4llite`) || Path(`/s4lacad`) || Path(`/s4lengine`) || Path(`/s4ldesktop`) || Path(`/s4ldesktopacad`) || Path(`/tis`) || Path(`/tiplite`) || Path(`/transpiled`) || Path(`/resource`) || PathPrefix(`/osparc/`) || PathPrefix(`/s4l/`) || PathPrefix(`/s4llite/`) || PathPrefix(`/s4lacad/`) || PathPrefix(`/s4lengine/`) || PathPrefix(`/s4ldesktop/`) || PathPrefix(`/s4ldesktopacad/`) || PathPrefix(`/tis/`) || PathPrefix(`/tiplite/`) || PathPrefix(`/transpiled/`) || PathPrefix(`/resource/`)) + - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver_fallback.service=${SWARM_STACK_NAME}_static_webserver_fallback + - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver_fallback.entrypoints=http + - traefik.http.routers.${SWARM_STACK_NAME}_static_webserver_fallback.priority=1 + # always fail and return 503 via unhealthy loadbalancer healthcheck + - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.server.port=0 + - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.path=/some/invalid/path/to/generate/a/503 + - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.interval=10s + - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.timeout=1ms + volumes: postgres_data: name: ${SWARM_STACK_NAME}_postgres_data From 5e22332f20c9b6dd8ac182e94b885bd75569a47c Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Mon, 16 Jun 2025 09:43:15 +0200 Subject: [PATCH 2/4] traefik config 
placeholder: add docker healthcheck --- services/docker-compose.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 0408502b8a83..453957ddbc45 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1504,6 +1504,12 @@ services: - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.path=/some/invalid/path/to/generate/a/503 - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.interval=10s - traefik.http.services.${SWARM_STACK_NAME}_static_webserver_fallback.loadbalancer.healthcheck.timeout=1ms + healthcheck: + test: command -v sleep + interval: 10s + timeout: 1s + start_period: 1s + retries: 1 volumes: postgres_data: From 1d99b1a8e70f54f61e105d7502d1e299cb2921a7 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Mon, 16 Jun 2025 09:45:05 +0200 Subject: [PATCH 3/4] Healthcheck retries 3 --- services/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 453957ddbc45..4da7f06ae861 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1509,7 +1509,7 @@ services: interval: 10s timeout: 1s start_period: 1s - retries: 1 + retries: 3 volumes: postgres_data: From eb017e449edbddf6844c25e59471d203f3b568a3 Mon Sep 17 00:00:00 2001 From: YuryHrytsuk Date: Wed, 25 Jun 2025 10:04:28 +0200 Subject: [PATCH 4/4] Fix [sys] public api test Test expects services to have exposed ports (either in Dockerfile or in docker compose spec). 
"traefik-configuration-placeholder" uses busybox image which has no exposed ports in Dockerfile and we don't expose ports in docker compose as we don't need them (this service is simply a placeholder for traefik rules that we want to configure on simcore traefik) --- packages/pytest-simcore/src/pytest_simcore/simcore_services.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_services.py b/packages/pytest-simcore/src/pytest_simcore/simcore_services.py index 2a4f6d2ff4dc..d0c62e33010d 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_services.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_services.py @@ -40,6 +40,7 @@ "whoami", "sto-worker", "sto-worker-cpu-bound", + "traefik-configuration-placeholder", } # TODO: unify healthcheck policies see https://github.com/ITISFoundation/osparc-simcore/pull/2281 DEFAULT_SERVICE_HEALTHCHECK_ENTRYPOINT: Final[str] = "/v0/"