Skip to content

Commit ff5ca9c

Browse files
🚨✨ Implement tracing sampling strategy (🚧 devops 🚧) (ITISFoundation#8421)
Co-authored-by: Dustin Kaiser <[email protected]> force merge after tests are green to use beneficial time window
1 parent 98bd68a commit ff5ca9c

File tree

87 files changed

+1056
-420
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+1056
-420
lines changed

β€Ž.env-develβ€Ž

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ TRACING_OPENTELEMETRY_COLLECTOR_BATCH_SIZE=2
377377
TRACING_OPENTELEMETRY_COLLECTOR_ENDPOINT=http://opentelemetry-collector
378378
TRACING_OPENTELEMETRY_COLLECTOR_EXPORTER_ENDPOINT=http://jaeger:4318
379379
TRACING_OPENTELEMETRY_COLLECTOR_PORT=4318
380-
TRACING_OPENTELEMETRY_COLLECTOR_SAMPLING_PERCENTAGE=100
380+
TRACING_OPENTELEMETRY_SAMPLING_PROBABILITY=1.0
381381
TRAEFIK_SIMCORE_ZONE=internal_simcore_stack
382382
TRASH_RETENTION_DAYS=7
383383
TWILIO_ACCOUNT_SID=DUMMY

β€Žpackages/service-library/src/servicelib/aiohttp/tracing.pyβ€Ž

Lines changed: 85 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Adds aiohttp middleware for tracing using opentelemetry instrumentation."""
22

33
import logging
4+
import time
45
from collections.abc import AsyncIterator, Callable
56
from typing import Final
67

@@ -13,18 +14,28 @@
1314
AioHttpClientInstrumentor,
1415
)
1516
from opentelemetry.instrumentation.aiohttp_server import (
16-
middleware as aiohttp_server_opentelemetry_middleware, # pylint:disable=no-name-in-module
17+
_parse_active_request_count_attrs,
18+
_parse_duration_attrs,
19+
collect_request_attributes,
20+
get_default_span_details,
21+
getter,
22+
meter,
23+
set_status_code,
1724
)
18-
from opentelemetry.sdk.resources import Resource
25+
from opentelemetry.propagate import extract
1926
from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
2027
from opentelemetry.sdk.trace.export import BatchSpanProcessor
28+
from opentelemetry.semconv.metrics import MetricInstruments
2129
from settings_library.tracing import TracingSettings
2230
from yarl import URL
2331

2432
from ..logging_utils import log_context
25-
from ..tracing import get_trace_id_header
33+
from ..tracing import TracingConfig, get_trace_id_header
2634

2735
_logger = logging.getLogger(__name__)
36+
37+
TRACING_CONFIG_KEY: Final[str] = "tracing_config"
38+
2839
try:
2940
from opentelemetry.instrumentation.botocore import ( # type: ignore[import-not-found]
3041
BotocoreInstrumentor,
@@ -65,6 +76,57 @@
6576
)
6677

6778

79+
@web.middleware
80+
async def aiohttp_server_opentelemetry_middleware(request: web.Request, handler):
81+
"""This middleware is extracted from https://github.com/open-telemetry/opentelemetry-python-contrib/blob/main/instrumentation/opentelemetry-instrumentation-aiohttp-server/src/opentelemetry/instrumentation/aiohttp_server/__init__.py
82+
and adapted to allow passing the tracer provider via the app instead of using the global object. The original code for the function is licensed under https://github.com/open-telemetry/opentelemetry-python-contrib/blob/main/LICENSE.
83+
FIXME: I have recorded this limitation in the official source here: https://github.com/open-telemetry/opentelemetry-python-contrib/issues/3801 and plan on providing a fix soon.
84+
"""
85+
86+
span_name, additional_attributes = get_default_span_details(request)
87+
88+
req_attrs = collect_request_attributes(request)
89+
duration_attrs = _parse_duration_attrs(req_attrs)
90+
active_requests_count_attrs = _parse_active_request_count_attrs(req_attrs)
91+
92+
duration_histogram = meter.create_histogram(
93+
name=MetricInstruments.HTTP_SERVER_DURATION,
94+
unit="ms",
95+
description="Measures the duration of inbound HTTP requests.",
96+
)
97+
98+
active_requests_counter = meter.create_up_down_counter(
99+
name=MetricInstruments.HTTP_SERVER_ACTIVE_REQUESTS,
100+
unit="requests",
101+
description="measures the number of concurrent HTTP requests those are currently in flight",
102+
)
103+
tracing_config = request.app[TRACING_CONFIG_KEY]
104+
assert isinstance(tracing_config, TracingConfig) # nosec
105+
assert tracing_config.tracer_provider # nosec
106+
tracer = tracing_config.tracer_provider.get_tracer(__name__)
107+
with tracer.start_as_current_span(
108+
span_name,
109+
context=extract(request, getter=getter),
110+
kind=trace.SpanKind.SERVER,
111+
) as span:
112+
attributes = collect_request_attributes(request)
113+
attributes.update(additional_attributes)
114+
span.set_attributes(attributes)
115+
start = time.perf_counter()
116+
active_requests_counter.add(1, active_requests_count_attrs)
117+
try:
118+
resp = await handler(request)
119+
set_status_code(span, resp.status)
120+
except web.HTTPException as ex:
121+
set_status_code(span, ex.status_code)
122+
raise
123+
finally:
124+
duration = max((time.perf_counter() - start) * 1000, 0)
125+
duration_histogram.record(duration, duration_attrs)
126+
active_requests_counter.add(-1, active_requests_count_attrs)
127+
return resp
128+
129+
68130
def _create_span_processor(tracing_destination: str) -> SpanProcessor:
69131
otlp_exporter = OTLPSpanExporterHTTP(
70132
endpoint=tracing_destination,
@@ -77,12 +139,12 @@ def _startup(
77139
app: web.Application,
78140
tracing_settings: TracingSettings,
79141
service_name: str,
142+
tracer_provider: TracerProvider,
80143
add_response_trace_id_header: bool = False,
81144
) -> None:
82145
"""
83146
Sets up this service for a distributed tracing system (opentelemetry)
84147
"""
85-
_ = app
86148
opentelemetry_collector_endpoint = (
87149
f"{tracing_settings.TRACING_OPENTELEMETRY_COLLECTOR_ENDPOINT}"
88150
)
@@ -99,9 +161,6 @@ def _startup(
99161
"unset. Provide both or remove both."
100162
)
101163
raise RuntimeError(msg)
102-
resource = Resource(attributes={"service.name": service_name})
103-
trace.set_tracer_provider(TracerProvider(resource=resource))
104-
tracer_provider: trace.TracerProvider = trace.get_tracer_provider()
105164

106165
tracing_destination: str = (
107166
f"{URL(opentelemetry_collector_endpoint).with_port(opentelemetry_collector_port).with_path('/v1/traces')}"
@@ -114,7 +173,7 @@ def _startup(
114173
)
115174

116175
# Add the span processor to the tracer provider
117-
tracer_provider.add_span_processor(_create_span_processor(tracing_destination)) # type: ignore[attr-defined] # https://github.com/open-telemetry/opentelemetry-python/issues/3713
176+
tracer_provider.add_span_processor(_create_span_processor(tracing_destination))
118177
# Instrument aiohttp server
119178
# Explanation for custom middleware call DK 10/2024:
120179
# OpenTelemetry Aiohttp autoinstrumentation is meant to be used by only calling `AioHttpServerInstrumentor().instrument()`
@@ -135,43 +194,43 @@ def _startup(
135194
# - opentelemetry-instrumentation==0.48b0
136195

137196
# Instrument aiohttp client
138-
AioHttpClientInstrumentor().instrument()
197+
AioHttpClientInstrumentor().instrument(tracer_provider=tracer_provider)
139198
if HAS_AIOPG:
140199
with log_context(
141200
_logger,
142201
logging.INFO,
143202
msg="Attempting to add aio-pg opentelemetry autoinstrumentation...",
144203
):
145-
AiopgInstrumentor().instrument()
204+
AiopgInstrumentor().instrument(tracer_provider=tracer_provider)
146205
if HAS_ASYNCPG:
147206
with log_context(
148207
_logger,
149208
logging.INFO,
150209
msg="Attempting to add asyncpg opentelemetry autoinstrumentation...",
151210
):
152-
AsyncPGInstrumentor().instrument()
211+
AsyncPGInstrumentor().instrument(tracer_provider=tracer_provider)
153212
if HAS_BOTOCORE:
154213
with log_context(
155214
_logger,
156215
logging.INFO,
157216
msg="Attempting to add botocore opentelemetry autoinstrumentation...",
158217
):
159-
BotocoreInstrumentor().instrument()
218+
BotocoreInstrumentor().instrument(tracer_provider=tracer_provider)
160219
if HAS_REQUESTS:
161220
with log_context(
162221
_logger,
163222
logging.INFO,
164223
msg="Attempting to add requests opentelemetry autoinstrumentation...",
165224
):
166-
RequestsInstrumentor().instrument()
225+
RequestsInstrumentor().instrument(tracer_provider=tracer_provider)
167226

168227
if HAS_AIO_PIKA:
169228
with log_context(
170229
_logger,
171230
logging.INFO,
172231
msg="Attempting to add aio_pika opentelemetry autoinstrumentation...",
173232
):
174-
AioPikaInstrumentor().instrument()
233+
AioPikaInstrumentor().instrument(tracer_provider=tracer_provider)
175234

176235

177236
@web.middleware
@@ -222,17 +281,24 @@ def _shutdown() -> None:
222281
_logger.exception("Failed to uninstrument AioPikaInstrumentor")
223282

224283

225-
def get_tracing_lifespan(
284+
def setup_tracing(
226285
*,
227286
app: web.Application,
228-
tracing_settings: TracingSettings,
229-
service_name: str,
287+
tracing_config: TracingConfig,
230288
add_response_trace_id_header: bool = False,
231289
) -> Callable[[web.Application], AsyncIterator]:
290+
291+
if tracing_config.tracing_enabled is False:
292+
msg = "Tracing is not enabled"
293+
raise ValueError(msg)
294+
assert tracing_config.tracer_provider # nosec
295+
assert tracing_config.tracing_settings # nosec
296+
232297
_startup(
233298
app=app,
234-
tracing_settings=tracing_settings,
235-
service_name=service_name,
299+
tracing_settings=tracing_config.tracing_settings,
300+
tracer_provider=tracing_config.tracer_provider,
301+
service_name=tracing_config.service_name,
236302
add_response_trace_id_header=add_response_trace_id_header,
237303
)
238304

β€Žpackages/service-library/src/servicelib/fastapi/client_session.pyβ€Ž

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import httpx
44
from fastapi import FastAPI
5-
from settings_library.tracing import TracingSettings
5+
from servicelib.tracing import TracingConfig
66

77
from .tracing import setup_httpx_client_tracing
88

@@ -12,16 +12,16 @@ def setup_client_session(
1212
*,
1313
default_timeout: datetime.timedelta = datetime.timedelta(seconds=20),
1414
max_keepalive_connections: int = 20,
15-
tracing_settings: TracingSettings | None,
15+
tracing_config: TracingConfig | None
1616
) -> None:
1717
async def on_startup() -> None:
1818
session = httpx.AsyncClient(
1919
transport=httpx.AsyncHTTPTransport(http2=True),
2020
limits=httpx.Limits(max_keepalive_connections=max_keepalive_connections),
2121
timeout=default_timeout.total_seconds(),
2222
)
23-
if tracing_settings:
24-
setup_httpx_client_tracing(session)
23+
if tracing_config:
24+
setup_httpx_client_tracing(session, tracing_config=tracing_config)
2525
app.state.aiohttp_client_session = session
2626

2727
async def on_shutdown() -> None:

β€Žpackages/service-library/src/servicelib/fastapi/http_client_thin.pyβ€Ž

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from common_library.errors_classes import OsparcErrorMixin
99
from httpx import AsyncClient, ConnectError, HTTPError, PoolTimeout, Response
1010
from httpx._types import TimeoutTypes, URLTypes
11-
from settings_library.tracing import TracingSettings
11+
from servicelib.tracing import TracingConfig
1212
from tenacity import RetryCallState
1313
from tenacity.asyncio import AsyncRetrying
1414
from tenacity.before_sleep import before_sleep_log
@@ -200,7 +200,7 @@ def __init__(
200200
self,
201201
*,
202202
total_retry_interval: float,
203-
tracing_settings: TracingSettings | None,
203+
tracing_config: TracingConfig,
204204
base_url: URLTypes | None = None,
205205
default_http_client_timeout: TimeoutTypes | None = None,
206206
extra_allowed_method_names: set[str] | None = None,
@@ -224,8 +224,8 @@ def __init__(
224224
client_args["timeout"] = default_http_client_timeout
225225

226226
client = AsyncClient(**client_args)
227-
if tracing_settings:
228-
setup_httpx_client_tracing(client)
227+
if tracing_config.tracing_enabled:
228+
setup_httpx_client_tracing(client, tracing_config=tracing_config)
229229
super().__init__(client=client)
230230

231231
async def __aenter__(self):

β€Žpackages/service-library/src/servicelib/fastapi/logging_lifespan.pyβ€Ž

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from common_library.logging.logging_utils_filtering import LoggerName, MessageSubstring
66
from fastapi import FastAPI
7-
from settings_library.tracing import TracingSettings
7+
from servicelib.tracing import TracingConfig
88

99
from ..logging_utils import (
1010
LogLevelInt,
@@ -20,7 +20,7 @@ def create_logging_lifespan(
2020
*,
2121
log_format_local_dev_enabled: bool,
2222
logger_filter_mapping: dict[LoggerName, list[MessageSubstring]],
23-
tracing_settings: TracingSettings | None,
23+
tracing_config: TracingConfig,
2424
log_base_level: LogLevelInt,
2525
noisy_loggers: tuple[str, ...] | None,
2626
) -> Lifespan:
@@ -32,7 +32,7 @@ def create_logging_lifespan(
3232
noisy_loggers=noisy_loggers,
3333
log_format_local_dev_enabled=log_format_local_dev_enabled,
3434
logger_filter_mapping=logger_filter_mapping,
35-
tracing_settings=tracing_settings,
35+
tracing_config=tracing_config,
3636
)
3737
)
3838

@@ -49,7 +49,7 @@ def create_logging_shutdown_event(
4949
*,
5050
log_format_local_dev_enabled: bool,
5151
logger_filter_mapping: dict[LoggerName, list[MessageSubstring]],
52-
tracing_settings: TracingSettings | None,
52+
tracing_config: TracingConfig,
5353
log_base_level: LogLevelInt,
5454
noisy_loggers: tuple[str, ...] | None,
5555
) -> Callable[[], Awaitable[None]]:
@@ -67,7 +67,7 @@ def create_logging_shutdown_event(
6767
noisy_loggers=noisy_loggers,
6868
log_format_local_dev_enabled=log_format_local_dev_enabled,
6969
logger_filter_mapping=logger_filter_mapping,
70-
tracing_settings=tracing_settings,
70+
tracing_config=tracing_config,
7171
)
7272
)
7373

0 commit comments

Comments
Β (0)