Skip to content

Commit 4b88d15

Browse files
Follow up to osparc.io incident (ITISFoundation#8491)
1 parent e6e1581 commit 4b88d15

File tree

6 files changed

+80
-7
lines changed

6 files changed

+80
-7
lines changed

packages/service-library/src/servicelib/aiohttp/monitoring.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from ..prometheus_metrics import (
2121
PrometheusMetrics,
2222
get_prometheus_metrics,
23+
record_asyncio_event_looop_metrics,
2324
record_request_metrics,
2425
record_response_metrics,
2526
)
@@ -33,15 +34,15 @@
3334

3435
def get_collector_registry(app: web.Application) -> CollectorRegistry:
3536
metrics = app[PROMETHEUS_METRICS_APPKEY]
36-
assert isinstance(metrics, PrometheusMetrics) # nosec
3737
return metrics.registry
3838

3939

4040
async def metrics_handler(request: web.Request):
41-
registry = get_collector_registry(request.app)
41+
metrics = request.app[PROMETHEUS_METRICS_APPKEY]
42+
await record_asyncio_event_looop_metrics(metrics)
4243

4344
# NOTE: Cannot use ProcessPoolExecutor because registry is not pickable
44-
result = await request.loop.run_in_executor(None, generate_latest, registry)
45+
result = await request.loop.run_in_executor(None, generate_latest, metrics.registry)
4546
response = web.Response(body=result)
4647
response.content_type = CONTENT_TYPE_LATEST
4748
return response

packages/service-library/src/servicelib/fastapi/monitoring.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from ..prometheus_metrics import (
2424
PrometheusMetrics,
2525
get_prometheus_metrics,
26+
record_asyncio_event_looop_metrics,
2627
record_request_metrics,
2728
record_response_metrics,
2829
)
@@ -91,6 +92,7 @@ def _startup(app: FastAPI) -> None:
9192
async def metrics_endpoint(request: Request) -> Response:
9293
prometheus_metrics = request.app.state.prometheus_metrics
9394
assert isinstance(prometheus_metrics, PrometheusMetrics) # nosec
95+
await record_asyncio_event_looop_metrics(prometheus_metrics)
9496

9597
content = await asyncio.get_event_loop().run_in_executor(
9698
None, generate_latest, prometheus_metrics.registry

packages/service-library/src/servicelib/prometheus_metrics.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import asyncio
2+
import time
13
from collections.abc import Iterator
24
from contextlib import contextmanager
35
from dataclasses import dataclass
@@ -40,6 +42,8 @@ class PrometheusMetrics:
4042
request_count: Counter
4143
in_flight_requests: Gauge
4244
response_latency_with_labels: Histogram
45+
event_loop_tasks: Gauge
46+
event_loop_lag: Gauge
4347

4448

4549
def _get_exemplar() -> dict[str, str] | None:
@@ -89,6 +93,20 @@ def get_prometheus_metrics() -> PrometheusMetrics:
8993
buckets=(0.1, 1, 5, 10),
9094
)
9195

96+
event_loop_tasks = Gauge(
97+
name="asyncio_event_loop_tasks",
98+
documentation="Total number of tasks in the asyncio event loop",
99+
labelnames=[],
100+
registry=registry,
101+
)
102+
103+
event_loop_lag = Gauge(
104+
name="asyncio_event_loop_lag_seconds",
105+
documentation="Time between scheduling and execution of event loop callbacks. >10ms consistently indicates event loop saturation",
106+
labelnames=[],
107+
registry=registry,
108+
)
109+
92110
return PrometheusMetrics(
93111
registry=registry,
94112
process_collector=process_collector,
@@ -97,6 +115,8 @@ def get_prometheus_metrics() -> PrometheusMetrics:
97115
request_count=request_count,
98116
in_flight_requests=in_flight_requests,
99117
response_latency_with_labels=response_latency_with_labels,
118+
event_loop_tasks=event_loop_tasks,
119+
event_loop_lag=event_loop_lag,
100120
)
101121

102122

@@ -141,3 +161,13 @@ def record_response_metrics(
141161
metrics.response_latency_with_labels.labels(method, endpoint, user_agent).observe(
142162
amount=response_latency_seconds, exemplar=exemplar
143163
)
164+
165+
166+
async def record_asyncio_event_looop_metrics(metrics: PrometheusMetrics) -> None:
167+
168+
metrics.event_loop_tasks.set(len(asyncio.all_tasks()))
169+
170+
start_time = time.perf_counter()
171+
await asyncio.sleep(0) # Yield control to event loop
172+
lag = time.perf_counter() - start_time
173+
metrics.event_loop_lag.set(lag)

packages/service-library/tests/aiohttp/test_monitoring.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ def _assert_metrics_contain_entry(
6262
)
6363
assert len(filtered_samples) == 1
6464
sample = filtered_samples[0]
65-
assert sample.value == value
65+
if value is not None:
66+
assert sample.value == value
6667
print(f"Found {metric_name=} with expected {value=}")
6768
return
6869

@@ -135,3 +136,24 @@ async def test_request_with_simcore_user_agent(client: TestClient, faker: Faker)
135136
},
136137
value=1,
137138
)
139+
140+
141+
async def test_asyncio_event_loop_tasks(client: TestClient):
142+
response = await client.get("/monitored_request")
143+
assert response.status == status.HTTP_200_OK
144+
data = await response.json()
145+
assert data
146+
assert "data" in data
147+
assert data["data"] == "OK"
148+
149+
response = await client.get("/metrics")
150+
assert response.status == status.HTTP_200_OK
151+
metrics_as_text = await response.text()
152+
153+
_assert_metrics_contain_entry(
154+
metrics_as_text,
155+
metric_name="asyncio_event_loop_tasks",
156+
sample_name="asyncio_event_loop_tasks",
157+
labels={},
158+
value=None,
159+
)

packages/service-library/tests/fastapi/test_prometheus_middleware.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66
from asgi_lifespan import LifespanManager
77
from fastapi import FastAPI
8+
from fastapi.responses import PlainTextResponse
89
from httpx import AsyncClient
910
from prometheus_client.openmetrics.exposition import CONTENT_TYPE_LATEST
1011
from servicelib.fastapi.monitoring import setup_prometheus_instrumentation
@@ -16,6 +17,11 @@ async def app(app: FastAPI) -> AsyncIterable[FastAPI]:
1617
Fixture that sets up the Prometheus middleware in the FastAPI app.
1718
"""
1819
setup_prometheus_instrumentation(app)
20+
21+
@app.get("/dummy-endpoint")
22+
async def dummy_endpoint() -> PlainTextResponse:
23+
return PlainTextResponse("OK", media_type="text/plain")
24+
1925
async with LifespanManager(app):
2026
yield app
2127

@@ -29,3 +35,16 @@ async def test_metrics_endpoint(client: AsyncClient, app: FastAPI):
2935
assert response.headers["Content-Type"] == CONTENT_TYPE_LATEST
3036
assert "# HELP" in response.text
3137
assert "# TYPE" in response.text
38+
39+
40+
async def test_asyncio_event_loop_tasks(client: AsyncClient, app: FastAPI):
41+
"""
42+
Test that the /metrics endpoint is available and returns Prometheus metrics.
43+
"""
44+
response = await client.get("/dummy-endpoint")
45+
assert response.status_code == 200
46+
47+
response = await client.get("/metrics")
48+
assert response.status_code == 200
49+
assert response.headers["Content-Type"] == CONTENT_TYPE_LATEST
50+
assert "asyncio_event_loop_tasks" in response.text

services/api-server/src/simcore_service_api_server/services_http/webserver.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,10 @@
5050
from servicelib.long_running_tasks.models import TaskStatus
5151
from servicelib.rest_constants import X_PRODUCT_NAME_HEADER
5252
from settings_library.tracing import TracingSettings
53-
from tenacity import TryAgain, retry_if_exception_type
53+
from tenacity import TryAgain, retry_if_exception_type, wait_random_exponential
5454
from tenacity.asyncio import AsyncRetrying
5555
from tenacity.before_sleep import before_sleep_log
5656
from tenacity.stop import stop_after_delay
57-
from tenacity.wait import wait_fixed
5857

5958
from ..core.settings import WebServerSettings
6059
from ..exceptions.backend_errors import (
@@ -253,7 +252,7 @@ async def _wait_for_long_running_task_results(self, lrt_response: httpx.Response
253252

254253
# GET task status now until done
255254
async for attempt in AsyncRetrying(
256-
wait=wait_fixed(0.5),
255+
wait=wait_random_exponential(multiplier=0.5, min=0.5, max=30),
257256
stop=stop_after_delay(_POLL_TIMEOUT),
258257
reraise=True,
259258
retry=retry_if_exception_type(TryAgain),

0 commit comments

Comments
 (0)