From 478d1f3a03813e340d00831b006f39503fceabb6 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 30 Apr 2025 17:41:59 +0000 Subject: [PATCH 01/38] Initial commit to add timeout as a parm to export, make retries encompass timeout --- .../otlp/proto/grpc/_log_exporter/__init__.py | 12 +- .../exporter/otlp/proto/grpc/exporter.py | 137 ++++++--------- .../proto/grpc/metric_exporter/__init__.py | 22 ++- .../proto/grpc/trace_exporter/__init__.py | 14 +- .../test-requirements.txt | 1 + .../tests/test_otlp_exporter_mixin.py | 166 ++++++++++-------- .../tests/test_otlp_metrics_exporter.py | 6 +- .../tests/test_otlp_trace_exporter.py | 10 +- .../otlp/proto/http/_log_exporter/__init__.py | 37 ++-- .../proto/http/metric_exporter/__init__.py | 35 ++-- .../proto/http/trace_exporter/__init__.py | 60 +++---- .../metrics/test_otlp_metrics_exporter.py | 70 +++++--- .../tests/test_proto_log_exporter.py | 68 ++++--- .../tests/test_proto_span_exporter.py | 90 ++++++---- .../sdk/_logs/_internal/export/__init__.py | 17 +- .../export/in_memory_log_exporter.py | 1 + .../sdk/trace/export/__init__.py | 12 +- 17 files changed, 431 insertions(+), 327 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index 8f629899d77..d13f662ccab 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -58,7 +58,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -79,7 +79,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_LOGS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -107,8 +107,12 @@ def _translate_data( ) -> ExportLogsServiceRequest: return encode_logs(data) - def export(self, batch: Sequence[LogData]) -> LogExportResult: - return self._export(batch) + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[int] = None + ) -> LogExportResult: + return self._export( + batch, timeout_millis / 1e3 if timeout_millis else None + ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 79270b99a0c..d169d1e5a80 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -14,12 +14,12 @@ """OTLP Exporter""" +import json import threading from abc import ABC, abstractmethod from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ -from time import sleep from typing import ( # noqa: F401 Any, Callable, @@ -35,7 +35,6 @@ from urllib.parse import urlparse from deprecated import deprecated -from google.rpc.error_details_pb2 import RetryInfo from grpc import ( ChannelCredentials, @@ -47,7 +46,6 @@ ssl_channel_credentials, ) from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.grpc import ( @@ -74,6 +72,29 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers +json_config = json.dumps( + { + "methodConfig": [ + { + "name": [dict()], + "retryPolicy": { + "maxAttempts": 5, + "initialBackoff": "1s", + "maxBackoff": "64s", + "backoffMultiplier": 2, + "retryableStatusCodes": [ + "UNAVAILABLE", + "CANCELLED", + "RESOURCE_EXHAUSTED", + "ABORTED", + "OUT_OF_RANGE", + "DATA_LOSS", + ], + }, + } + ] + } +) logger = getLogger(__name__) SDKDataT = TypeVar("SDKDataT") ResourceDataT = TypeVar("ResourceDataT") @@ -195,7 +216,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): super().__init__() @@ -232,7 +253,7 @@ def __init__( else: self._headers = tuple(self._headers) + tuple(_OTLP_GRPC_HEADERS) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, 10) ) self._collector_kwargs = None @@ -245,7 +266,11 @@ def __init__( if insecure: self._channel = insecure_channel( - self._endpoint, compression=compression + self._endpoint, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) else: credentials = _get_credentials( @@ -255,7 +280,12 @@ def __init__( OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE, ) self._channel = secure_channel( - self._endpoint, credentials, compression=compression + self._endpoint, + credentials, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) self._client = self._stub(self._channel) @@ -269,10 +299,10 @@ def _translate_data( pass def _export( - self, data: Union[TypingSequence[ReadableSpan], MetricsData] + self, + data: Union[TypingSequence[ReadableSpan], MetricsData], + timeout_sec: Optional[float] = None, ) -> ExportResultT: - # After the call to shutdown, subsequent calls to Export are - # not allowed and should return a Failure result. if self._shutdown: logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE @@ -280,79 +310,24 @@ def _export( # FIXME remove this check if the export type for traces # gets updated to a class that represents the proto # TracesData and use the code below instead. - # logger.warning( - # "Transient error %s encountered while exporting %s, retrying in %ss.", - # error.code(), - # data.__class__.__name__, - # delay, - # ) - max_value = 64 - # expo returns a generator that yields delay values which grow - # exponentially. Once delay is greater than max_value, the yielded - # value will remain constant. - for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: + with self._export_lock: + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=(timeout_sec or self._timeout), + ) + return self._result.SUCCESS + except RpcError as error: + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, + ) return self._result.FAILURE - with self._export_lock: - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, - ) - - return self._result.SUCCESS - - except RpcError as error: - if error.code() in [ - StatusCode.CANCELLED, - StatusCode.DEADLINE_EXCEEDED, - StatusCode.RESOURCE_EXHAUSTED, - StatusCode.ABORTED, - StatusCode.OUT_OF_RANGE, - StatusCode.UNAVAILABLE, - StatusCode.DATA_LOSS, - ]: - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" - ) - if retry_info_bin is not None: - retry_info = RetryInfo() - retry_info.ParseFromString(retry_info_bin) - delay = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - - logger.warning( - ( - "Transient error %s encountered while exporting " - "%s to %s, retrying in %ss." - ), - error.code(), - self._exporting, - self._endpoint, - delay, - ) - sleep(delay) - continue - else: - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - - if error.code() == StatusCode.OK: - return self._result.SUCCESS - - return self._result.FAILURE - - return self._result.FAILURE - def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index 8580dbb7386..64c10ae9056 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -13,10 +13,11 @@ from __future__ import annotations +import time from dataclasses import replace from logging import getLogger from os import environ -from typing import Iterable, List, Tuple, Union +from typing import Iterable, List, Optional, Tuple, Union from typing import Sequence as TypingSequence from grpc import ChannelCredentials, Compression @@ -99,7 +100,7 @@ def __init__( credentials: ChannelCredentials | None = None, headers: Union[TypingSequence[Tuple[str, str]], dict[str, str], str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, preferred_temporality: dict[type, AggregationTemporality] | None = None, @@ -124,7 +125,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_METRICS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -158,17 +159,22 @@ def _translate_data( def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[int] = None, **kwargs, ) -> MetricExportResult: - # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC + timeout_sec = ( + timeout_millis / 1e3 if timeout_millis else self._timeout # pylint: disable=protected-access + ) if self._max_export_batch_size is None: - return self._export(data=metrics_data) + return self._export(metrics_data, timeout_sec) export_result = MetricExportResult.SUCCESS - + deadline_sec = time.time() + timeout_sec for split_metrics_data in self._split_metrics_data(metrics_data): - split_export_result = self._export(data=split_metrics_data) + time_remaining_sec = deadline_sec - time.time() + split_export_result = self._export( + split_metrics_data, time_remaining_sec + ) if split_export_result is MetricExportResult.FAILURE: export_result = MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index c78c1b81bb6..b092b5b3b9e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -91,7 +91,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -112,7 +112,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -139,8 +139,14 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._export(spans) + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[int] = None, + ) -> SpanExportResult: + return self._export( + spans, timeout_millis / 1e3 if timeout_millis else None + ) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt index 28d778461a9..01c9f1ddadd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt @@ -2,6 +2,7 @@ asgiref==3.7.2 Deprecated==1.2.14 googleapis-common-protos==1.63.2 grpcio==1.66.2 +grpcio-status==1.66.0 importlib-metadata==6.11.0 iniconfig==2.0.0 packaging==24.0 diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 656d9a6cb79..5a75595f693 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -15,18 +15,14 @@ import threading import time from concurrent.futures import ThreadPoolExecutor -from logging import WARNING +from logging import WARNING, getLogger from typing import Any, Optional, Sequence from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import ANY, Mock, patch -from google.protobuf.duration_pb2 import ( # pylint: disable=no-name-in-module - Duration, -) -from google.rpc.error_details_pb2 import ( # pylint: disable=no-name-in-module - RetryInfo, -) -from grpc import Compression, StatusCode, server +from google.rpc import code_pb2, status_pb2 +from grpc import Compression, server +from grpc_status import rpc_status from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, @@ -55,6 +51,8 @@ SpanExportResult, ) +logger = getLogger(__name__) + # The below tests use this test SpanExporter and Spans, but are testing the # underlying behavior in the mixin. A MetricExporter or LogExporter could @@ -73,8 +71,14 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._export(spans) + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + return self._export( + spans, timeout_millis / 1e3 if timeout_millis else None + ) @property def _exporting(self): @@ -87,40 +91,25 @@ def shutdown(self, timeout_millis=30_000): class TraceServiceServicerWithExportParams(TraceServiceServicer): def __init__( self, - export_result: StatusCode, + export_result: code_pb2, optional_export_sleep: Optional[float] = None, - optional_export_retry_millis: Optional[float] = None, ): self.export_result = export_result self.optional_export_sleep = optional_export_sleep - self.optional_export_retry_millis = optional_export_retry_millis # pylint: disable=invalid-name,unused-argument def Export(self, request, context): + logger.warning("Export Request Recieved") if self.optional_export_sleep: time.sleep(self.optional_export_sleep) - if self.optional_export_retry_millis: - context.send_initial_metadata( - ( - ( - "google.rpc.retryinfo-bin", - RetryInfo().SerializeToString(), - ), - ) - ) - context.set_trailing_metadata( - ( - ( - "google.rpc.retryinfo-bin", - RetryInfo( - retry_delay=Duration( - nanos=int(self.optional_export_retry_millis) - ) - ).SerializeToString(), - ), + if self.export_result != code_pb2.OK: + context.abort_with_status( + rpc_status.to_status( + status_pb2.Status( + code=self.export_result, + ) ) ) - context.set_code(self.export_result) return ExportTraceServiceResponse() @@ -268,7 +257,9 @@ def test_otlp_exporter_otlp_compression_unspecified( """No env or kwarg should be NoCompression""" OTLPSpanExporterForTesting(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) # pylint: disable=no-self-use, disable=unused-argument @@ -292,12 +283,12 @@ def test_otlp_exporter_otlp_compression_envvar( """Just OTEL_EXPORTER_OTLP_COMPRESSION should work""" OTLPSpanExporterForTesting(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.Gzip + "localhost:4317", compression=Compression.Gzip, options=ANY ) def test_shutdown(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.OK), + TraceServiceServicerWithExportParams(code_pb2.OK), self.server, ) self.assertEqual( @@ -316,7 +307,7 @@ def test_shutdown(self): def test_shutdown_wait_last_export(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - StatusCode.OK, optional_export_sleep=1 + code_pb2.OK, optional_export_sleep=1 ), self.server, ) @@ -337,7 +328,7 @@ def test_shutdown_wait_last_export(self): def test_shutdown_doesnot_wait_last_export(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - StatusCode.OK, optional_export_sleep=3 + code_pb2.OK, optional_export_sleep=3 ), self.server, ) @@ -360,7 +351,7 @@ def test_export_over_closed_grpc_channel(self): # pylint: disable=protected-access add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.OK), + TraceServiceServicerWithExportParams(code_pb2.OK), self.server, ) self.exporter.export([self.span]) @@ -372,52 +363,79 @@ def test_export_over_closed_grpc_channel(self): str(err.exception), "Cannot invoke RPC on closed channel!" ) - @patch( - "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator" - ) - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") - def test_unavailable(self, mock_sleep, mock_expo): - mock_expo.configure_mock(**{"return_value": [0.01]}) - + def test_retry_timeout(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.UNAVAILABLE), + TraceServiceServicerWithExportParams(code_pb2.UNAVAILABLE), self.server, ) - result = self.exporter.export([self.span]) - self.assertEqual(result, SpanExportResult.FAILURE) - mock_sleep.assert_called_with(0.01) - - @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep") - def test_unavailable_delay(self, mock_sleep): + with self.assertLogs(level=WARNING) as warning: + # Set timeout to 1.5 seconds + self.assertEqual( + self.exporter.export([self.span], 1500), + SpanExportResult.FAILURE, + ) + # Our GRPC retry policy starts with a 1 second backoff then doubles. + # So we expect just two calls: one at time 0, one at time 1. + # The final log is from when export fails. + self.assertEqual(len(warning.records), 3) + for idx, log in enumerate(warning.records): + if idx != 2: + self.assertEqual( + "Export Request Recieved", + log.message, + ) + else: + self.assertEqual( + "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", + log.message, + ) + with self.assertLogs(level=WARNING) as warning: + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=3.5) + # This time don't pass in a timeout to export, so it should fallback to the timeout + # passed to the exporter class. + # pylint: disable=protected-access + self.assertEqual(exporter._timeout, 3.5) + self.assertEqual( + exporter.export([self.span]), + SpanExportResult.FAILURE, + ) + # We expect 3 calls: time 0, time 1, time 3, but not time 7. + # The final log is from when export fails. + self.assertEqual(len(warning.records), 4) + for idx, log in enumerate(warning.records): + if idx != 3: + self.assertEqual( + "Export Request Recieved", + log.message, + ) + else: + self.assertEqual( + "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", + log.message, + ) + + def test_timeout_set_correctly(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - StatusCode.UNAVAILABLE, - optional_export_sleep=None, - optional_export_retry_millis=1e7, + code_pb2.OK, optional_export_sleep=0.5 ), self.server, ) + # Should timeout. Deadline should be set to now + timeout. + # That is 400 millis from now, and export sleeps for 500 millis. with self.assertLogs(level=WARNING) as warning: self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.FAILURE + self.exporter.export([self.span], 400), + SpanExportResult.FAILURE, ) - mock_sleep.assert_called_with(0.01) - self.assertEqual( - warning.records[0].message, - ( - "Transient error StatusCode.UNAVAILABLE encountered " - "while exporting traces to localhost:4317, retrying in 0.01s." - ), + "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", + warning.records[-1].message, ) - def test_success(self): - add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.OK), - self.server, - ) self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.SUCCESS + self.exporter.export([self.span], 600), + SpanExportResult.SUCCESS, ) def test_otlp_headers_from_env(self): @@ -431,15 +449,13 @@ def test_otlp_headers_from_env(self): def test_permanent_failure(self): with self.assertLogs(level=WARNING) as warning: add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - StatusCode.ALREADY_EXISTS - ), + TraceServiceServicerWithExportParams(code_pb2.ALREADY_EXISTS), self.server, ) self.assertEqual( self.exporter.export([self.span]), SpanExportResult.FAILURE ) self.assertEqual( - warning.records[0].message, + warning.records[-1].message, "Failed to export traces to localhost:4317, error code: StatusCode.ALREADY_EXISTS", ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 2ea12f660fb..ceda6e72a8e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -18,7 +18,7 @@ from os.path import dirname from typing import List from unittest import TestCase -from unittest.mock import patch +from unittest.mock import ANY, patch from grpc import ChannelCredentials, Compression @@ -297,7 +297,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): insecure=True, compression=Compression.NoCompression ) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) def test_split_metrics_data_many_data_points(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index 73d8d6c7a20..5238dc91224 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -16,7 +16,7 @@ import os from unittest import TestCase -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import ANY, Mock, PropertyMock, patch from grpc import ChannelCredentials, Compression @@ -333,7 +333,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): """Specifying kwarg should take precedence over env""" OTLPSpanExporter(insecure=True, compression=Compression.NoCompression) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) # pylint: disable=no-self-use @@ -350,7 +352,9 @@ def test_otlp_exporter_otlp_compression_precendence( """ OTLPSpanExporter(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.Gzip + "localhost:4317", + compression=Compression.Gzip, + options=ANY, ) def test_translate_spans(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index f86f0113833..1a46cab057c 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -17,15 +17,12 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import Dict, Optional, Sequence import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, @@ -64,8 +61,6 @@ class OTLPLogExporter(LogExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -73,7 +68,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -108,7 +103,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -124,7 +119,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -143,7 +138,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec cert=self._client_cert, ) except ConnectionError: @@ -151,7 +146,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec cert=self._client_cert, ) return resp @@ -164,7 +159,9 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export(self, batch: Sequence[LogData]) -> LogExportResult: + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[float] = None + ) -> LogExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown: @@ -172,18 +169,20 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return LogExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return LogExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return LogExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting logs batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 4feea8d4302..6c8b930fbc7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -17,13 +17,14 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import ( # noqa: F401 Any, Callable, Dict, List, Mapping, + Optional, Sequence, ) @@ -32,7 +33,6 @@ from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( @@ -101,8 +101,6 @@ class OTLPMetricExporter(MetricExporter, OTLPMetricExporterMixin): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: str | None = None, @@ -110,7 +108,7 @@ def __init__( client_key_file: str | None = None, client_certificate_file: str | None = None, headers: dict[str, str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, session: requests.Session | None = None, preferred_temporality: dict[type, AggregationTemporality] @@ -147,7 +145,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_METRICS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -166,7 +164,7 @@ def __init__( preferred_temporality, preferred_aggregation ) - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -185,7 +183,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -193,7 +191,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) return resp @@ -209,21 +207,26 @@ def _retryable(resp: requests.Response) -> bool: def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: serialized_data = encode_metrics(metrics_data) - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return MetricExportResult.FAILURE - - resp = self._export(serialized_data.SerializeToString()) + resp = self._export( + serialized_data.SerializeToString(), remaining_time_sec + ) # pylint: disable=no-else-return if resp.ok: return MetricExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return MetricExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting metric batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 1841e5210a4..6fa0f1c1bdd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -17,15 +17,12 @@ import zlib from io import BytesIO from os import environ -from time import sleep -from typing import Dict, Optional +from time import sleep, time +from typing import Dict, Optional, Sequence import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, ) @@ -49,6 +46,7 @@ OTEL_EXPORTER_OTLP_TRACES_HEADERS, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, ) +from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from opentelemetry.util.re import parse_env_headers @@ -62,8 +60,6 @@ class OTLPSpanExporter(SpanExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -71,7 +67,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -105,7 +101,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -121,7 +117,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -140,7 +136,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -148,7 +144,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) return resp @@ -161,21 +157,32 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def _serialize_spans(self, spans): - return encode_spans(spans).SerializePartialToString() + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + # After the call to Shutdown subsequent calls to Export are + # not allowed and should return a Failure result. + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return SpanExportResult.FAILURE - def _export_serialized_spans(self, serialized_data): - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + serialized_data = encode_spans(spans).SerializePartialToString() + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return SpanExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return SpanExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return SpanExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting span batch, retrying in %ss.", resp.reason, @@ -192,17 +199,6 @@ def _export_serialized_spans(self, serialized_data): return SpanExportResult.FAILURE return SpanExportResult.FAILURE - def export(self, spans) -> SpanExportResult: - # After the call to Shutdown subsequent calls to Export are - # not allowed and should return a Failure result. - if self._shutdown: - _logger.warning("Exporter already shutdown, ignoring batch") - return SpanExportResult.FAILURE - - serialized_data = self._serialize_spans(spans) - - return self._export_serialized_spans(serialized_data) - def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 16bb3e54286..df7c0c17ea3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -15,11 +15,10 @@ from logging import WARNING from os import environ from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import ANY, MagicMock, Mock, patch from requests import Session from requests.models import Response -from responses import POST, activate, add from opentelemetry.exporter.otlp.proto.common.metrics_encoder import ( encode_metrics, @@ -327,31 +326,10 @@ def test_serialization(self, mock_post): url=exporter._endpoint, data=serialized_data.SerializeToString(), verify=exporter._certificate_file, - timeout=exporter._timeout, + timeout=ANY, # Timeout is a float based on real time, can't put an exact value here. cert=exporter._client_cert, ) - @activate - @patch("opentelemetry.exporter.otlp.proto.http.metric_exporter.sleep") - def test_exponential_backoff(self, mock_sleep): - # return a retryable error - add( - POST, - "http://metrics.example.com/export", - json={"error": "something exploded"}, - status=500, - ) - - exporter = OTLPMetricExporter( - endpoint="http://metrics.example.com/export" - ) - metrics_data = self.metrics["sum_int"] - - exporter.export(metrics_data) - mock_sleep.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) - def test_aggregation_temporality(self): otlp_metric_exporter = OTLPMetricExporter() @@ -523,3 +501,47 @@ def test_preferred_aggregation_override(self): self.assertEqual( exporter._preferred_aggregation[Histogram], histogram_aggregation ) + + @patch.object(Session, "post") + def test_retry_timeout(self, mock_post): + exporter = OTLPMetricExporter(timeout=3.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + with self.assertLogs(level=WARNING) as warning: + # Set timeout to 1.5 seconds + self.assertEqual( + exporter.export(self.metrics["sum_int"], 1500), + MetricExportResult.FAILURE, + ) + # Code should return failure before the final retry which would exceed timeout. + # Code should return failure after retrying once. + self.assertEqual(len(warning.records), 1) + self.assertEqual( + "Transient error UNAVAILABLE encountered while exporting metric batch, retrying in 1s.", + warning.records[0].message, + ) + with self.assertLogs(level=WARNING) as warning: + # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. + self.assertEqual( + exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + # 2 retrys (after 1s, 3s). + self.assertEqual(len(warning.records), 2) + + @patch.object(Session, "post") + def test_timeout_set_correctly(self, mock_post): + resp = Response() + resp.status_code = 200 + + def export_side_effect(*args, **kwargs): + # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed. + self.assertTrue(0.4 - kwargs["timeout"] < 0.0005) + return resp + + mock_post.side_effect = export_side_effect + exporter = OTLPMetricExporter() + exporter.export(self.metrics["sum_int"], 400) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 66b0f890d76..00a00ae3aa9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -15,12 +15,14 @@ # pylint: disable=protected-access import unittest +from logging import WARNING from typing import List -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import MagicMock, Mock, patch import requests -import responses from google.protobuf.json_format import MessageToDict +from requests import Session +from requests.models import Response from opentelemetry._logs import SeverityNumber from opentelemetry.exporter.otlp.proto.http import Compression @@ -267,25 +269,6 @@ def test_exported_log_without_span_id(self): else: self.fail("No log records found") - @responses.activate - @patch("opentelemetry.exporter.otlp.proto.http._log_exporter.sleep") - def test_exponential_backoff(self, mock_sleep): - # return a retryable error - responses.add( - responses.POST, - "http://logs.example.com/export", - json={"error": "something exploded"}, - status=500, - ) - - exporter = OTLPLogExporter(endpoint="http://logs.example.com/export") - logs = self._get_sdk_log_data() - - exporter.export(logs) - mock_sleep.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) - @staticmethod def _get_sdk_log_data() -> List[LogData]: log1 = LogData( @@ -365,3 +348,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter): self.assertEqual( OTLPLogExporter().export(MagicMock()), LogExportResult.SUCCESS ) + + @patch.object(Session, "post") + def test_retry_timeout(self, mock_post): + exporter = OTLPLogExporter(timeout=3.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + with self.assertLogs(level=WARNING) as warning: + # Set timeout to 1.5 seconds + self.assertEqual( + exporter.export(self._get_sdk_log_data(), 1500), + LogExportResult.FAILURE, + ) + # Code should return failure after retrying once. + self.assertEqual(len(warning.records), 1) + self.assertEqual( + "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in 1s.", + warning.records[0].message, + ) + with self.assertLogs(level=WARNING) as warning: + # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. + self.assertEqual( + exporter.export(self._get_sdk_log_data()), + LogExportResult.FAILURE, + ) + # 2 retrys (after 1s, 3s). + self.assertEqual(len(warning.records), 2) + + @patch.object(Session, "post") + def test_timeout_set_correctly(self, mock_post): + resp = Response() + resp.status_code = 200 + + def export_side_effect(*args, **kwargs): + # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed. + self.assertTrue(0.4 - kwargs["timeout"] < 0.0005) + return resp + + mock_post.side_effect = export_side_effect + exporter = OTLPLogExporter() + exporter.export(self._get_sdk_log_data(), 400) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 8d8ff6037aa..b7e357bbe4c 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -13,10 +13,12 @@ # limitations under the License. import unittest -from unittest.mock import MagicMock, Mock, call, patch +from logging import WARNING +from unittest.mock import MagicMock, Mock, patch import requests -import responses +from requests import Session +from requests.models import Response from opentelemetry.exporter.otlp.proto.http import Compression from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( @@ -52,6 +54,16 @@ OS_ENV_CLIENT_KEY = "os/env/client-key.pem" OS_ENV_HEADERS = "envHeader1=val1,envHeader2=val2" OS_ENV_TIMEOUT = "30" +BASIC_SPAN = _Span( + "abc", + context=Mock( + **{ + "trace_state": {"a": "b", "c": "d"}, + "span_id": 10217189687419569865, + "trace_id": 67545097771067222548457157018666467027, + } + ), +) # pylint: disable=protected-access @@ -227,37 +239,6 @@ def test_headers_parse_from_env(self): ), ) - # pylint: disable=no-self-use - @responses.activate - @patch("opentelemetry.exporter.otlp.proto.http.trace_exporter.sleep") - def test_exponential_backoff(self, mock_sleep): - # return a retryable error - responses.add( - responses.POST, - "http://traces.example.com/export", - json={"error": "something exploded"}, - status=500, - ) - - exporter = OTLPSpanExporter( - endpoint="http://traces.example.com/export" - ) - span = _Span( - "abc", - context=Mock( - **{ - "trace_state": {"a": "b", "c": "d"}, - "span_id": 10217189687419569865, - "trace_id": 67545097771067222548457157018666467027, - } - ), - ) - - exporter.export([span]) - mock_sleep.assert_has_calls( - [call(1), call(2), call(4), call(8), call(16), call(32)] - ) - @patch.object(OTLPSpanExporter, "_export", return_value=Mock(ok=True)) def test_2xx_status_code(self, mock_otlp_metric_exporter): """ @@ -267,3 +248,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter): self.assertEqual( OTLPSpanExporter().export(MagicMock()), SpanExportResult.SUCCESS ) + + @patch.object(Session, "post") + def test_retry_timeout(self, mock_post): + exporter = OTLPSpanExporter(timeout=3.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + with self.assertLogs(level=WARNING) as warning: + # Set timeout to 1.5 seconds + self.assertEqual( + exporter.export([BASIC_SPAN], 1500), + SpanExportResult.FAILURE, + ) + # Code should return failure after retrying once. + self.assertEqual(len(warning.records), 1) + self.assertEqual( + "Transient error UNAVAILABLE encountered while exporting span batch, retrying in 1s.", + warning.records[0].message, + ) + with self.assertLogs(level=WARNING) as warning: + # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. + self.assertEqual( + exporter.export([BASIC_SPAN]), + SpanExportResult.FAILURE, + ) + # 2 retrys (after 1s, 3s). + self.assertEqual(len(warning.records), 2) + + @patch.object(Session, "post") + def test_timeout_set_correctly(self, mock_post): + resp = Response() + resp.status_code = 200 + + def export_side_effect(*args, **kwargs): + # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed. + self.assertTrue(0.4 - kwargs["timeout"] < 0.0005) + return resp + + mock_post.side_effect = export_side_effect + exporter = OTLPSpanExporter() + exporter.export([BASIC_SPAN], 400) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 254c5f6b96d..36382ea9b29 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -72,11 +72,14 @@ class LogExporter(abc.ABC): """ @abc.abstractmethod - def export(self, batch: Sequence[LogData]): + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[int] = None + ): """Exports a batch of logs. Args: - batch: The list of `LogData` objects to be exported + batch: The list of `LogData` objects to be exported. + timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeded. Returns: The result of the export @@ -89,6 +92,13 @@ def shutdown(self): Called when the SDK is shut down. """ + @abc.abstractmethod + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Hint to ensure that the export of any spans the exporter has received + prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably + before returning from this method. + """ + class ConsoleLogExporter(LogExporter): """Implementation of :class:`LogExporter` that prints log records to the @@ -107,6 +117,7 @@ def __init__( self.out = out self.formatter = formatter + # pylint: disable=arguments-differ def export(self, batch: Sequence[LogData]): for data in batch: self.out.write(self.formatter(data.log_record)) @@ -310,7 +321,7 @@ def shutdown(self): self._worker_thread.join() self._exporter.shutdown() - def force_flush(self, timeout_millis: Optional[int] = None) -> bool: + def force_flush(self, timeout_millis: Optional[int] = None): if self._shutdown: return # Blocking call to export. diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py index 68cb6b7389a..910e2cb17c2 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py @@ -40,6 +40,7 @@ def get_finished_logs(self) -> typing.Tuple[LogData, ...]: with self._lock: return tuple(self._logs) + # pylint: disable=arguments-differ def export(self, batch: typing.Sequence[LogData]) -> LogExportResult: if self._stopped: return LogExportResult.FAILURE diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py index 9e60d6cff9b..5fccdf6fc94 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import abc import collections import logging import os @@ -56,7 +57,8 @@ class SpanExportResult(Enum): FAILURE = 1 -class SpanExporter: + +class SpanExporter(abc.ABC): """Interface for exporting spans. Interface to be implemented by services that want to export spans recorded @@ -66,24 +68,30 @@ class SpanExporter: `SimpleSpanProcessor` or a `BatchSpanProcessor`. """ + @abc.abstractmethod def export( - self, spans: typing.Sequence[ReadableSpan] + self, + spans: typing.Sequence[ReadableSpan], + timeout_millis: typing.Optional[int] = None, ) -> "SpanExportResult": """Exports a batch of telemetry data. Args: spans: The list of `opentelemetry.trace.Span` objects to be exported + timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeded. Returns: The result of the export """ + @abc.abstractmethod def shutdown(self) -> None: """Shuts down the exporter. Called when the SDK is shut down. """ + @abc.abstractmethod def force_flush(self, timeout_millis: int = 30000) -> bool: """Hint to ensure that the export of any spans the exporter has received prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably From ccdd2246bbb4328bd0247472236da1c8fbc7b160 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 30 Apr 2025 18:41:38 +0000 Subject: [PATCH 02/38] Fix lint issues --- .../exporter/otlp/proto/http/_log_exporter/__init__.py | 4 ++-- .../opentelemetry/sdk/_logs/_internal/export/__init__.py | 8 -------- .../src/opentelemetry/sdk/trace/export/__init__.py | 8 +++++--- .../sdk/trace/export/in_memory_span_exporter.py | 1 + opentelemetry-sdk/tests/trace/export/test_export.py | 4 ++++ 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 1a46cab057c..c8530972ade 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -138,7 +138,7 @@ def _export(self, serialized_data: bytes, timeout_sec: float): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=timeout_sec + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -146,7 +146,7 @@ def _export(self, serialized_data: bytes, timeout_sec: float): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=timeout_sec + timeout=timeout_sec, cert=self._client_cert, ) return resp diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 36382ea9b29..e42cc1532fe 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -92,13 +92,6 @@ def shutdown(self): Called when the SDK is shut down. """ - @abc.abstractmethod - def force_flush(self, timeout_millis: int = 30000) -> bool: - """Hint to ensure that the export of any spans the exporter has received - prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably - before returning from this method. - """ - class ConsoleLogExporter(LogExporter): """Implementation of :class:`LogExporter` that prints log records to the @@ -127,7 +120,6 @@ def export(self, batch: Sequence[LogData]): def shutdown(self): pass - class SimpleLogRecordProcessor(LogRecordProcessor): """This is an implementation of LogRecordProcessor which passes received logs in the export-friendly LogData representation to the diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py index 5fccdf6fc94..035f67d9f3f 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py @@ -57,7 +57,6 @@ class SpanExportResult(Enum): FAILURE = 1 - class SpanExporter(abc.ABC): """Interface for exporting spans. @@ -91,7 +90,6 @@ def shutdown(self) -> None: Called when the SDK is shut down. """ - @abc.abstractmethod def force_flush(self, timeout_millis: int = 30000) -> bool: """Hint to ensure that the export of any spans the exporter has received prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably @@ -517,11 +515,15 @@ def __init__( self.formatter = formatter self.service_name = service_name + # pylint: disable=arguments-differ def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: for span in spans: self.out.write(self.formatter(span)) self.out.flush() return SpanExportResult.SUCCESS - def force_flush(self, timeout_millis: int = 30000) -> bool: + def force_flush(self, timeout_millis: int = 30000): return True + + def shutdown(self): + pass diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py index c28ecfd214f..eadef019514 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py @@ -42,6 +42,7 @@ def get_finished_spans(self) -> typing.Tuple[ReadableSpan, ...]: with self._lock: return tuple(self._finished_spans) + # pylint: disable=arguments-differ def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: """Stores a list of spans in memory.""" if self._stopped: diff --git a/opentelemetry-sdk/tests/trace/export/test_export.py b/opentelemetry-sdk/tests/trace/export/test_export.py index a6d9c36875b..770b3ff123c 100644 --- a/opentelemetry-sdk/tests/trace/export/test_export.py +++ b/opentelemetry-sdk/tests/trace/export/test_export.py @@ -60,6 +60,7 @@ def __init__( self.export_timeout = export_timeout_millis / 1e3 self.export_event = export_event + # pylint: disable=arguments-differ def export(self, spans: trace.Span) -> export.SpanExportResult: if ( self.max_export_batch_size is not None @@ -75,6 +76,9 @@ def export(self, spans: trace.Span) -> export.SpanExportResult: def shutdown(self): self.is_shutdown = True + def force_flush(self, timeout_millis: int = 30000) -> bool: + return True + class TestSimpleSpanProcessor(unittest.TestCase): def test_simple_span_processor(self): From 5bc8894521323424c42daa5acfe5c043c863d43a Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 1 May 2025 19:04:05 +0000 Subject: [PATCH 03/38] Fix a bunch of failing style/lint/spellcheck checks --- .../opencensus/trace_exporter/__init__.py | 1 + .../exporter/otlp/proto/grpc/exporter.py | 6 +-- .../tests/test_otlp_exporter_mixin.py | 37 ++++++++++--------- .../exporter/zipkin/json/__init__.py | 1 + .../sdk/_logs/_internal/export/__init__.py | 3 +- .../sdk/trace/export/__init__.py | 8 +--- .../trace/export/in_memory_span_exporter.py | 3 -- .../tests/trace/export/test_export.py | 3 -- 8 files changed, 28 insertions(+), 34 deletions(-) diff --git a/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py index 0b79bbb2073..5dd1ae2df09 100644 --- a/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py @@ -69,6 +69,7 @@ def __init__( self.host_name = host_name self.node = utils.get_node(service_name, host_name) + # pylint: disable=arguments-differ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: # Populate service_name from first span # We restrict any SpanProcessor to be only associated with a single diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index d169d1e5a80..e4022e9bdd4 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -72,7 +72,7 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers -json_config = json.dumps( +_JSON_CONFIG = json.dumps( { "methodConfig": [ { @@ -269,7 +269,7 @@ def __init__( self._endpoint, compression=compression, options=[ - ("grpc.service_config", json_config), + ("grpc.service_config", _JSON_CONFIG), ], ) else: @@ -284,7 +284,7 @@ def __init__( credentials, compression=compression, options=[ - ("grpc.service_config", json_config), + ("grpc.service_config", _JSON_CONFIG), ], ) self._client = self._stub(self._channel) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 5a75595f693..674e8589645 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -20,7 +20,12 @@ from unittest import TestCase from unittest.mock import ANY, Mock, patch -from google.rpc import code_pb2, status_pb2 +from google.rpc.code_pb2 import ( # pylint: disable=no-name-in-module + ALREADY_EXISTS, + OK, + UNAVAILABLE, +) +from google.rpc.status_pb2 import Status # pylint: disable=no-name-in-module from grpc import Compression, server from grpc_status import rpc_status @@ -91,7 +96,7 @@ def shutdown(self, timeout_millis=30_000): class TraceServiceServicerWithExportParams(TraceServiceServicer): def __init__( self, - export_result: code_pb2, + export_result: int, optional_export_sleep: Optional[float] = None, ): self.export_result = export_result @@ -99,13 +104,13 @@ def __init__( # pylint: disable=invalid-name,unused-argument def Export(self, request, context): - logger.warning("Export Request Recieved") + logger.warning("Export Request Received") if self.optional_export_sleep: time.sleep(self.optional_export_sleep) - if self.export_result != code_pb2.OK: + if self.export_result != OK: context.abort_with_status( rpc_status.to_status( - status_pb2.Status( + Status( code=self.export_result, ) ) @@ -288,7 +293,7 @@ def test_otlp_exporter_otlp_compression_envvar( def test_shutdown(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(code_pb2.OK), + TraceServiceServicerWithExportParams(OK), self.server, ) self.assertEqual( @@ -306,9 +311,7 @@ def test_shutdown(self): def test_shutdown_wait_last_export(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - code_pb2.OK, optional_export_sleep=1 - ), + TraceServiceServicerWithExportParams(OK, optional_export_sleep=1), self.server, ) @@ -327,9 +330,7 @@ def test_shutdown_wait_last_export(self): def test_shutdown_doesnot_wait_last_export(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - code_pb2.OK, optional_export_sleep=3 - ), + TraceServiceServicerWithExportParams(OK, optional_export_sleep=3), self.server, ) @@ -351,7 +352,7 @@ def test_export_over_closed_grpc_channel(self): # pylint: disable=protected-access add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(code_pb2.OK), + TraceServiceServicerWithExportParams(OK), self.server, ) self.exporter.export([self.span]) @@ -365,7 +366,7 @@ def test_export_over_closed_grpc_channel(self): def test_retry_timeout(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(code_pb2.UNAVAILABLE), + TraceServiceServicerWithExportParams(UNAVAILABLE), self.server, ) with self.assertLogs(level=WARNING) as warning: @@ -381,7 +382,7 @@ def test_retry_timeout(self): for idx, log in enumerate(warning.records): if idx != 2: self.assertEqual( - "Export Request Recieved", + "Export Request Received", log.message, ) else: @@ -405,7 +406,7 @@ def test_retry_timeout(self): for idx, log in enumerate(warning.records): if idx != 3: self.assertEqual( - "Export Request Recieved", + "Export Request Received", log.message, ) else: @@ -417,7 +418,7 @@ def test_retry_timeout(self): def test_timeout_set_correctly(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - code_pb2.OK, optional_export_sleep=0.5 + OK, optional_export_sleep=0.5 ), self.server, ) @@ -449,7 +450,7 @@ def test_otlp_headers_from_env(self): def test_permanent_failure(self): with self.assertLogs(level=WARNING) as warning: add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(code_pb2.ALREADY_EXISTS), + TraceServiceServicerWithExportParams(ALREADY_EXISTS), self.server, ) self.assertEqual( diff --git a/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py b/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py index ba313db942a..d0d559981cf 100644 --- a/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py +++ b/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py @@ -149,6 +149,7 @@ def __init__( environ.get(OTEL_EXPORTER_ZIPKIN_TIMEOUT, 10) ) + # pylint: disable=arguments-differ def export(self, spans: Sequence[Span]) -> SpanExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index e42cc1532fe..ffcdc6f2343 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -79,7 +79,7 @@ def export( Args: batch: The list of `LogData` objects to be exported. - timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeded. + timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeeded. Returns: The result of the export @@ -120,6 +120,7 @@ def export(self, batch: Sequence[LogData]): def shutdown(self): pass + class SimpleLogRecordProcessor(LogRecordProcessor): """This is an implementation of LogRecordProcessor which passes received logs in the export-friendly LogData representation to the diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py index 035f67d9f3f..a4a1ba19d72 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py @@ -77,13 +77,12 @@ def export( Args: spans: The list of `opentelemetry.trace.Span` objects to be exported - timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeded. + timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeeded. Returns: The result of the export """ - @abc.abstractmethod def shutdown(self) -> None: """Shuts down the exporter. @@ -522,8 +521,5 @@ def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: self.out.flush() return SpanExportResult.SUCCESS - def force_flush(self, timeout_millis: int = 30000): + def force_flush(self, timeout_millis: int = 30000) -> bool: return True - - def shutdown(self): - pass diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py index eadef019514..f13b9c512ef 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py @@ -57,6 +57,3 @@ def shutdown(self) -> None: Calls to export after the exporter has been shut down will fail. """ self._stopped = True - - def force_flush(self, timeout_millis: int = 30000) -> bool: - return True diff --git a/opentelemetry-sdk/tests/trace/export/test_export.py b/opentelemetry-sdk/tests/trace/export/test_export.py index 770b3ff123c..f20656ecb5a 100644 --- a/opentelemetry-sdk/tests/trace/export/test_export.py +++ b/opentelemetry-sdk/tests/trace/export/test_export.py @@ -76,9 +76,6 @@ def export(self, spans: trace.Span) -> export.SpanExportResult: def shutdown(self): self.is_shutdown = True - def force_flush(self, timeout_millis: int = 30000) -> bool: - return True - class TestSimpleSpanProcessor(unittest.TestCase): def test_simple_span_processor(self): From ba92c5aa7537965436d8120ff42bb3b59b9e7b70 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 2 May 2025 21:05:01 +0000 Subject: [PATCH 04/38] Remove timeout param from the export calls. --- .../otlp/proto/grpc/_log_exporter/__init__.py | 8 ++----- .../exporter/otlp/proto/grpc/exporter.py | 3 +-- .../proto/grpc/metric_exporter/__init__.py | 19 +++++---------- .../proto/grpc/trace_exporter/__init__.py | 10 ++------ .../tests/test_otlp_exporter_mixin.py | 24 +++++++------------ .../otlp/proto/http/_log_exporter/__init__.py | 8 ++----- .../proto/http/trace_exporter/__init__.py | 10 ++------ .../tests/test_proto_log_exporter.py | 11 ++++----- .../tests/test_proto_span_exporter.py | 11 ++++----- .../exporter/zipkin/json/__init__.py | 1 - .../sdk/_logs/_internal/export/__init__.py | 8 ++----- .../export/in_memory_log_exporter.py | 1 - .../sdk/trace/export/__init__.py | 10 ++------ .../trace/export/in_memory_span_exporter.py | 4 +++- .../tests/trace/export/test_export.py | 1 - 15 files changed, 41 insertions(+), 88 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index d13f662ccab..e66f9dbcca7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -107,12 +107,8 @@ def _translate_data( ) -> ExportLogsServiceRequest: return encode_logs(data) - def export( - self, batch: Sequence[LogData], timeout_millis: Optional[int] = None - ) -> LogExportResult: - return self._export( - batch, timeout_millis / 1e3 if timeout_millis else None - ) + def export(self, batch: Sequence[LogData]) -> LogExportResult: + return self._export(batch) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index e4022e9bdd4..0f280dc9986 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -301,7 +301,6 @@ def _translate_data( def _export( self, data: Union[TypingSequence[ReadableSpan], MetricsData], - timeout_sec: Optional[float] = None, ) -> ExportResultT: if self._shutdown: logger.warning("Exporter already shutdown, ignoring batch") @@ -315,7 +314,7 @@ def _export( self._client.Export( request=self._translate_data(data), metadata=self._headers, - timeout=(timeout_sec or self._timeout), + timeout=self._timeout, ) return self._result.SUCCESS except RpcError as error: diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index 64c10ae9056..dbb2a8e1dee 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -13,11 +13,10 @@ from __future__ import annotations -import time from dataclasses import replace from logging import getLogger from os import environ -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Tuple, Union from typing import Sequence as TypingSequence from grpc import ChannelCredentials, Compression @@ -159,26 +158,20 @@ def _translate_data( def export( self, metrics_data: MetricsData, - timeout_millis: Optional[int] = None, + timeout_millis: float = 10_000, **kwargs, ) -> MetricExportResult: - timeout_sec = ( - timeout_millis / 1e3 if timeout_millis else self._timeout # pylint: disable=protected-access - ) + # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC if self._max_export_batch_size is None: - return self._export(metrics_data, timeout_sec) + return self._export(data=metrics_data) export_result = MetricExportResult.SUCCESS - deadline_sec = time.time() + timeout_sec + for split_metrics_data in self._split_metrics_data(metrics_data): - time_remaining_sec = deadline_sec - time.time() - split_export_result = self._export( - split_metrics_data, time_remaining_sec - ) + split_export_result = self._export(data=split_metrics_data) if split_export_result is MetricExportResult.FAILURE: export_result = MetricExportResult.FAILURE - return export_result def _split_metrics_data( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index b092b5b3b9e..7aef65a2ca9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -139,14 +139,8 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export( - self, - spans: Sequence[ReadableSpan], - timeout_millis: Optional[int] = None, - ) -> SpanExportResult: - return self._export( - spans, timeout_millis / 1e3 if timeout_millis else None - ) + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + return self._export(spans) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 674e8589645..2e116f4218b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -76,14 +76,8 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export( - self, - spans: Sequence[ReadableSpan], - timeout_millis: Optional[float] = None, - ) -> SpanExportResult: - return self._export( - spans, timeout_millis / 1e3 if timeout_millis else None - ) + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + return self._export(spans) @property def _exporting(self): @@ -369,10 +363,11 @@ def test_retry_timeout(self): TraceServiceServicerWithExportParams(UNAVAILABLE), self.server, ) + # Set timeout to 1.5 seconds. + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=1.5) with self.assertLogs(level=WARNING) as warning: - # Set timeout to 1.5 seconds self.assertEqual( - self.exporter.export([self.span], 1500), + exporter.export([self.span]), SpanExportResult.FAILURE, ) # Our GRPC retry policy starts with a 1 second backoff then doubles. @@ -392,8 +387,6 @@ def test_retry_timeout(self): ) with self.assertLogs(level=WARNING) as warning: exporter = OTLPSpanExporterForTesting(insecure=True, timeout=3.5) - # This time don't pass in a timeout to export, so it should fallback to the timeout - # passed to the exporter class. # pylint: disable=protected-access self.assertEqual(exporter._timeout, 3.5) self.assertEqual( @@ -422,20 +415,21 @@ def test_timeout_set_correctly(self): ), self.server, ) + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.4) # Should timeout. Deadline should be set to now + timeout. # That is 400 millis from now, and export sleeps for 500 millis. with self.assertLogs(level=WARNING) as warning: self.assertEqual( - self.exporter.export([self.span], 400), + exporter.export([self.span]), SpanExportResult.FAILURE, ) self.assertEqual( "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", warning.records[-1].message, ) - + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.6) self.assertEqual( - self.exporter.export([self.span], 600), + exporter.export([self.span]), SpanExportResult.SUCCESS, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index c8530972ade..6a6e8821feb 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -159,9 +159,7 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export( - self, batch: Sequence[LogData], timeout_millis: Optional[float] = None - ) -> LogExportResult: + def export(self, batch: Sequence[LogData]) -> LogExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown: @@ -169,9 +167,7 @@ def export( return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - deadline_sec = time() + ( - timeout_millis / 1e3 if timeout_millis else self._timeout - ) + deadline_sec = time() + self._timeout for delay in [1, 2, 4, 8, 16, 32]: remaining_time_sec = deadline_sec - time() if remaining_time_sec < 1e-09: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 6fa0f1c1bdd..3feb8731c5e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -157,11 +157,7 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export( - self, - spans: Sequence[ReadableSpan], - timeout_millis: Optional[float] = None, - ) -> SpanExportResult: + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. if self._shutdown: @@ -169,9 +165,7 @@ def export( return SpanExportResult.FAILURE serialized_data = encode_spans(spans).SerializePartialToString() - deadline_sec = time() + ( - timeout_millis / 1e3 if timeout_millis else self._timeout - ) + deadline_sec = time() + self._timeout for delay in [1, 2, 4, 8, 16, 32]: remaining_time_sec = deadline_sec - time() if remaining_time_sec < 1e-09: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 00a00ae3aa9..1ba1db06567 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -351,16 +351,15 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter): @patch.object(Session, "post") def test_retry_timeout(self, mock_post): - exporter = OTLPLogExporter(timeout=3.5) + exporter = OTLPLogExporter(timeout=1.5) resp = Response() resp.status_code = 503 resp.reason = "UNAVAILABLE" mock_post.return_value = resp with self.assertLogs(level=WARNING) as warning: - # Set timeout to 1.5 seconds self.assertEqual( - exporter.export(self._get_sdk_log_data(), 1500), + exporter.export(self._get_sdk_log_data()), LogExportResult.FAILURE, ) # Code should return failure after retrying once. @@ -369,8 +368,8 @@ def test_retry_timeout(self, mock_post): "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in 1s.", warning.records[0].message, ) + exporter = OTLPLogExporter(timeout=3.5) with self.assertLogs(level=WARNING) as warning: - # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. self.assertEqual( exporter.export(self._get_sdk_log_data()), LogExportResult.FAILURE, @@ -389,5 +388,5 @@ def export_side_effect(*args, **kwargs): return resp mock_post.side_effect = export_side_effect - exporter = OTLPLogExporter() - exporter.export(self._get_sdk_log_data(), 400) + exporter = OTLPLogExporter(timeout=0.4) + exporter.export(self._get_sdk_log_data()) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index b7e357bbe4c..2096c727462 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -251,16 +251,15 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter): @patch.object(Session, "post") def test_retry_timeout(self, mock_post): - exporter = OTLPSpanExporter(timeout=3.5) + exporter = OTLPSpanExporter(timeout=1.5) resp = Response() resp.status_code = 503 resp.reason = "UNAVAILABLE" mock_post.return_value = resp with self.assertLogs(level=WARNING) as warning: - # Set timeout to 1.5 seconds self.assertEqual( - exporter.export([BASIC_SPAN], 1500), + exporter.export([BASIC_SPAN]), SpanExportResult.FAILURE, ) # Code should return failure after retrying once. @@ -269,8 +268,8 @@ def test_retry_timeout(self, mock_post): "Transient error UNAVAILABLE encountered while exporting span batch, retrying in 1s.", warning.records[0].message, ) + exporter = OTLPSpanExporter(timeout=3.5) with self.assertLogs(level=WARNING) as warning: - # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. self.assertEqual( exporter.export([BASIC_SPAN]), SpanExportResult.FAILURE, @@ -289,5 +288,5 @@ def export_side_effect(*args, **kwargs): return resp mock_post.side_effect = export_side_effect - exporter = OTLPSpanExporter() - exporter.export([BASIC_SPAN], 400) + exporter = OTLPSpanExporter(timeout=0.4) + exporter.export([BASIC_SPAN]) diff --git a/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py b/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py index d0d559981cf..ba313db942a 100644 --- a/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py +++ b/exporter/opentelemetry-exporter-zipkin-json/src/opentelemetry/exporter/zipkin/json/__init__.py @@ -149,7 +149,6 @@ def __init__( environ.get(OTEL_EXPORTER_ZIPKIN_TIMEOUT, 10) ) - # pylint: disable=arguments-differ def export(self, spans: Sequence[Span]) -> SpanExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index ffcdc6f2343..c331fa76c6b 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -72,14 +72,11 @@ class LogExporter(abc.ABC): """ @abc.abstractmethod - def export( - self, batch: Sequence[LogData], timeout_millis: Optional[int] = None - ): + def export(self, batch: Sequence[LogData]): """Exports a batch of logs. Args: batch: The list of `LogData` objects to be exported. - timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeeded. Returns: The result of the export @@ -110,7 +107,6 @@ def __init__( self.out = out self.formatter = formatter - # pylint: disable=arguments-differ def export(self, batch: Sequence[LogData]): for data in batch: self.out.write(self.formatter(data.log_record)) @@ -314,7 +310,7 @@ def shutdown(self): self._worker_thread.join() self._exporter.shutdown() - def force_flush(self, timeout_millis: Optional[int] = None): + def force_flush(self, timeout_millis: Optional[int] = None) -> bool: if self._shutdown: return # Blocking call to export. diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py index 910e2cb17c2..68cb6b7389a 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py @@ -40,7 +40,6 @@ def get_finished_logs(self) -> typing.Tuple[LogData, ...]: with self._lock: return tuple(self._logs) - # pylint: disable=arguments-differ def export(self, batch: typing.Sequence[LogData]) -> LogExportResult: if self._stopped: return LogExportResult.FAILURE diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py index a4a1ba19d72..9e60d6cff9b 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import annotations -import abc import collections import logging import os @@ -57,7 +56,7 @@ class SpanExportResult(Enum): FAILURE = 1 -class SpanExporter(abc.ABC): +class SpanExporter: """Interface for exporting spans. Interface to be implemented by services that want to export spans recorded @@ -67,17 +66,13 @@ class SpanExporter(abc.ABC): `SimpleSpanProcessor` or a `BatchSpanProcessor`. """ - @abc.abstractmethod def export( - self, - spans: typing.Sequence[ReadableSpan], - timeout_millis: typing.Optional[int] = None, + self, spans: typing.Sequence[ReadableSpan] ) -> "SpanExportResult": """Exports a batch of telemetry data. Args: spans: The list of `opentelemetry.trace.Span` objects to be exported - timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeeded. Returns: The result of the export @@ -514,7 +509,6 @@ def __init__( self.formatter = formatter self.service_name = service_name - # pylint: disable=arguments-differ def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: for span in spans: self.out.write(self.formatter(span)) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py index f13b9c512ef..c28ecfd214f 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/in_memory_span_exporter.py @@ -42,7 +42,6 @@ def get_finished_spans(self) -> typing.Tuple[ReadableSpan, ...]: with self._lock: return tuple(self._finished_spans) - # pylint: disable=arguments-differ def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: """Stores a list of spans in memory.""" if self._stopped: @@ -57,3 +56,6 @@ def shutdown(self) -> None: Calls to export after the exporter has been shut down will fail. """ self._stopped = True + + def force_flush(self, timeout_millis: int = 30000) -> bool: + return True diff --git a/opentelemetry-sdk/tests/trace/export/test_export.py b/opentelemetry-sdk/tests/trace/export/test_export.py index f20656ecb5a..a6d9c36875b 100644 --- a/opentelemetry-sdk/tests/trace/export/test_export.py +++ b/opentelemetry-sdk/tests/trace/export/test_export.py @@ -60,7 +60,6 @@ def __init__( self.export_timeout = export_timeout_millis / 1e3 self.export_event = export_event - # pylint: disable=arguments-differ def export(self, spans: trace.Span) -> export.SpanExportResult: if ( self.max_export_batch_size is not None From 29144a1dfb6e982cea8c5fcaee32dfe1e5f6d325 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 2 May 2025 21:13:42 +0000 Subject: [PATCH 05/38] Fix flaky windows test ? --- .../tests/test_otlp_exporter_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 2e116f4218b..c1b2fbb6997 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -427,7 +427,7 @@ def test_timeout_set_correctly(self): "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", warning.records[-1].message, ) - exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.6) + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.8) self.assertEqual( exporter.export([self.span]), SpanExportResult.SUCCESS, From 95ccfea3f9af5d10c6220b9788e9825d4958eec4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 9 May 2025 17:43:25 +0000 Subject: [PATCH 06/38] Respond to review comments.. --- .../exporter/opencensus/trace_exporter/__init__.py | 1 - .../src/opentelemetry/exporter/otlp/proto/grpc/exporter.py | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py index 5dd1ae2df09..0b79bbb2073 100644 --- a/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-opencensus/src/opentelemetry/exporter/opencensus/trace_exporter/__init__.py @@ -69,7 +69,6 @@ def __init__( self.host_name = host_name self.node = utils.get_node(service_name, host_name) - # pylint: disable=arguments-differ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: # Populate service_name from first span # We restrict any SpanProcessor to be only associated with a single diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 0f280dc9986..b373df1c8df 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -78,9 +78,14 @@ { "name": [dict()], "retryPolicy": { + # 5 is the maximum allowable attempts allowed by grpc retry policy. + # This policy results in backoffs of 1s, 2s, 4s, and then 8s after the initial failed attempt. + # Timeout set on the RPC call encompasses the retry backoffs AND time spent waiting + # for a response. DEADLINE_EXCEEDED is returned if all the attempts cannot complete within the + # timeout. See https://grpc.io/docs/guides/retry/ for more details. "maxAttempts": 5, "initialBackoff": "1s", - "maxBackoff": "64s", + "maxBackoff": "9s", "backoffMultiplier": 2, "retryableStatusCodes": [ "UNAVAILABLE", From 8770e1518c38f145c52f6835d52f2324de06a63c Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 13 May 2025 18:08:47 +0000 Subject: [PATCH 07/38] Delete exponential backoff code that is now unused --- .../otlp/proto/common/_internal/__init__.py | 35 -------------- .../tests/test_backoff.py | 46 ------------------- .../exporter/otlp/proto/grpc/exporter.py | 2 +- 3 files changed, 1 insertion(+), 82 deletions(-) delete mode 100644 exporter/opentelemetry-exporter-otlp-proto-common/tests/test_backoff.py diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py index 2f49502cf1d..4f83db24eeb 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py @@ -177,38 +177,3 @@ def _get_resource_data( ) ) return resource_data - - -def _create_exp_backoff_generator(max_value: int = 0) -> Iterator[int]: - """ - Generates an infinite sequence of exponential backoff values. The sequence starts - from 1 (2^0) and doubles each time (2^1, 2^2, 2^3, ...). If a max_value is specified - and non-zero, the generated values will not exceed this maximum, capping at max_value - instead of growing indefinitely. - - Parameters: - - max_value (int, optional): The maximum value to yield. If 0 or not provided, the - sequence grows without bound. - - Returns: - Iterator[int]: An iterator that yields the exponential backoff values, either uncapped or - capped at max_value. - - Example: - ``` - gen = _create_exp_backoff_generator(max_value=10) - for _ in range(5): - print(next(gen)) - ``` - This will print: - 1 - 2 - 4 - 8 - 10 - - Note: this functionality used to be handled by the 'backoff' package. - """ - for i in count(0): - out = 2**i - yield min(out, max_value) if max_value else out diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_backoff.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_backoff.py deleted file mode 100644 index 789a184ad04..00000000000 --- a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_backoff.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright The OpenTelemetry Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import TestCase - -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) - - -class TestBackoffGenerator(TestCase): - def test_exp_backoff_generator(self): - generator = _create_exp_backoff_generator() - self.assertEqual(next(generator), 1) - self.assertEqual(next(generator), 2) - self.assertEqual(next(generator), 4) - self.assertEqual(next(generator), 8) - self.assertEqual(next(generator), 16) - - def test_exp_backoff_generator_with_max(self): - generator = _create_exp_backoff_generator(max_value=4) - self.assertEqual(next(generator), 1) - self.assertEqual(next(generator), 2) - self.assertEqual(next(generator), 4) - self.assertEqual(next(generator), 4) - self.assertEqual(next(generator), 4) - - def test_exp_backoff_generator_with_odd_max(self): - # use a max_value that's not in the set - generator = _create_exp_backoff_generator(max_value=11) - self.assertEqual(next(generator), 1) - self.assertEqual(next(generator), 2) - self.assertEqual(next(generator), 4) - self.assertEqual(next(generator), 8) - self.assertEqual(next(generator), 11) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index df2d64a8d0e..e998147da31 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -82,7 +82,7 @@ # This policy results in backoffs of 1s, 2s, 4s, and then 8s after the initial failed attempt. # Timeout set on the RPC call encompasses the retry backoffs AND time spent waiting # for a response. DEADLINE_EXCEEDED is returned if all the attempts cannot complete within the - # timeout. See https://grpc.io/docs/guides/retry/ for more details. + # timeout, and all fail. See https://grpc.io/docs/guides/retry/ for more details. "maxAttempts": 5, "initialBackoff": "1s", "maxBackoff": "9s", From f373caa572fb30410c3d9ca93a44ff781a9798be Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 13 May 2025 18:59:03 +0000 Subject: [PATCH 08/38] Add changelog and remove some unused imports.. --- CHANGELOG.md | 3 +++ .../exporter/otlp/proto/common/_internal/__init__.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85ab5a84852..3fa22b0b45b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Refactor `BatchLogRecordProcessor` to simplify code and make the control flow more clear ([#4562](https://github.com/open-telemetry/opentelemetry-python/pull/4562/) and [#4535](https://github.com/open-telemetry/opentelemetry-python/pull/4535)). +- Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs, + and an unnecessary 32 second sleep that occured after all retries had completed/failed was removed + ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). ## Version 1.33.0/0.54b0 (2025-05-09) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py index 4f83db24eeb..200644368df 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py @@ -17,12 +17,10 @@ import logging from collections.abc import Sequence -from itertools import count from typing import ( Any, Callable, Dict, - Iterator, List, Mapping, Optional, From d1e04e102a81238e3937722885cbe95794077905 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 13 May 2025 19:05:49 +0000 Subject: [PATCH 09/38] fix typo and unit test flaking on windows --- CHANGELOG.md | 2 +- .../tests/test_otlp_exporter_mixin.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fa22b0b45b..7b9053270e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 clear ([#4562](https://github.com/open-telemetry/opentelemetry-python/pull/4562/) and [#4535](https://github.com/open-telemetry/opentelemetry-python/pull/4535)). - Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs, - and an unnecessary 32 second sleep that occured after all retries had completed/failed was removed + and an unnecessary 32 second sleep that occurred after all retries had completed/failed was removed ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index c1b2fbb6997..aa620ed390a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -386,9 +386,9 @@ def test_retry_timeout(self): log.message, ) with self.assertLogs(level=WARNING) as warning: - exporter = OTLPSpanExporterForTesting(insecure=True, timeout=3.5) + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=5) # pylint: disable=protected-access - self.assertEqual(exporter._timeout, 3.5) + self.assertEqual(exporter._timeout, 5) self.assertEqual( exporter.export([self.span]), SpanExportResult.FAILURE, From f42ecd31bea8dab7a52f0a0def30bbbc190b38bc Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 22 May 2025 14:13:41 +0000 Subject: [PATCH 10/38] Refactor tests, HTTP exporters a bit --- .../pyproject.toml | 1 + .../otlp/proto/common/_internal/__init__.py | 10 ++ .../test-requirements.txt | 1 + .../exporter/otlp/proto/grpc/exporter.py | 19 ++- .../tests/test_otlp_exporter_mixin.py | 140 +++++++++--------- .../otlp/proto/http/_log_exporter/__init__.py | 48 +++--- .../proto/http/metric_exporter/__init__.py | 62 ++++---- .../proto/http/trace_exporter/__init__.py | 50 +++---- .../metrics/test_otlp_metrics_exporter.py | 42 ++++-- .../tests/test_proto_log_exporter.py | 22 ++- .../tests/test_proto_span_exporter.py | 22 ++- uv.lock | 6 +- 12 files changed, 221 insertions(+), 202 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml b/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml index 40b4950b99a..77ff64ea5f8 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml +++ b/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml @@ -29,6 +29,7 @@ classifiers = [ ] dependencies = [ "opentelemetry-proto == 1.34.0.dev", + "requests ~= 2.7", ] [project.urls] diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py index 200644368df..c239a4ed18a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py @@ -27,6 +27,8 @@ TypeVar, ) +import requests + from opentelemetry.proto.common.v1.common_pb2 import AnyValue as PB2AnyValue from opentelemetry.proto.common.v1.common_pb2 import ( ArrayValue as PB2ArrayValue, @@ -108,6 +110,14 @@ def _encode_key_value( ) +def _is_retryable(resp: requests.Response) -> bool: + if resp.status_code == 408: + return True + if resp.status_code >= 500 and resp.status_code <= 599: + return True + return False + + def _encode_array( array: Sequence[Any], allow_null: bool = False ) -> Sequence[PB2AnyValue]: diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt index 6ab1dbf75b7..49ffef98ac2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt @@ -2,6 +2,7 @@ asgiref==3.7.2 Deprecated==1.2.14 importlib-metadata==6.11.0 iniconfig==2.0.0 +requests == 2.7.0 packaging==24.0 pluggy==1.5.0 protobuf==5.26.1 diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index e998147da31..baccd38d3d2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -72,17 +72,18 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers -_JSON_CONFIG = json.dumps( +# 5 is the maximum allowable attempts allowed by grpc retry policy. +# This policy results in backoffs of 1s, 2s, 4s, and then 8s after the initial failed attempt, +# plus or minus a 20% jitter. Timeout set on the RPC call encompasses the retry backoffs AND time spent waiting +# for a response. DEADLINE_EXCEEDED is returned if all the attempts cannot complete within the +# timeout, and all fail. A header `grpc-retry-pushback-ms` when set by the server will override +# and take precedence over this backoff. See https://grpc.io/docs/guides/retry/ for more details. +_GRPC_RETRY_POLICY = json.dumps( { "methodConfig": [ { "name": [dict()], "retryPolicy": { - # 5 is the maximum allowable attempts allowed by grpc retry policy. - # This policy results in backoffs of 1s, 2s, 4s, and then 8s after the initial failed attempt. - # Timeout set on the RPC call encompasses the retry backoffs AND time spent waiting - # for a response. DEADLINE_EXCEEDED is returned if all the attempts cannot complete within the - # timeout, and all fail. See https://grpc.io/docs/guides/retry/ for more details. "maxAttempts": 5, "initialBackoff": "1s", "maxBackoff": "9s", @@ -213,8 +214,6 @@ class OTLPExporterMixin( compression: gRPC compression method to use """ - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -276,7 +275,7 @@ def __init__( self._endpoint, compression=compression, options=[ - ("grpc.service_config", _JSON_CONFIG), + ("grpc.service_config", _GRPC_RETRY_POLICY), ], ) else: @@ -291,7 +290,7 @@ def __init__( credentials, compression=compression, options=[ - ("grpc.service_config", _JSON_CONFIG), + ("grpc.service_config", _GRPC_RETRY_POLICY), ], ) self._client = self._stub(self._channel) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index aa620ed390a..3c752d70e50 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -20,14 +20,7 @@ from unittest import TestCase from unittest.mock import ANY, Mock, patch -from google.rpc.code_pb2 import ( # pylint: disable=no-name-in-module - ALREADY_EXISTS, - OK, - UNAVAILABLE, -) -from google.rpc.status_pb2 import Status # pylint: disable=no-name-in-module -from grpc import Compression, server -from grpc_status import rpc_status +from grpc import Compression, StatusCode, server from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, @@ -90,25 +83,36 @@ def shutdown(self, timeout_millis=30_000): class TraceServiceServicerWithExportParams(TraceServiceServicer): def __init__( self, - export_result: int, + export_result: StatusCode, optional_export_sleep: Optional[float] = None, + optional_first_time_retry_millis: Optional[int] = None, ): self.export_result = export_result self.optional_export_sleep = optional_export_sleep + self.optional_first_time_retry_millis = ( + optional_first_time_retry_millis + ) + self.first_attempt = True + self.num_requests = 0 + self.now = time.time() # pylint: disable=invalid-name,unused-argument def Export(self, request, context): - logger.warning("Export Request Received") + self.num_requests += 1 if self.optional_export_sleep: time.sleep(self.optional_export_sleep) - if self.export_result != OK: - context.abort_with_status( - rpc_status.to_status( - Status( - code=self.export_result, + if self.export_result != StatusCode.OK: + if self.optional_first_time_retry_millis and self.first_attempt: + self.first_attempt = False + context.set_trailing_metadata( + ( + ( + "grpc-retry-pushback-ms", + str(self.optional_first_time_retry_millis), + ), ) ) - ) + context.abort(self.export_result, "") return ExportTraceServiceResponse() @@ -287,7 +291,7 @@ def test_otlp_exporter_otlp_compression_envvar( def test_shutdown(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(OK), + TraceServiceServicerWithExportParams(StatusCode.OK), self.server, ) self.assertEqual( @@ -305,7 +309,9 @@ def test_shutdown(self): def test_shutdown_wait_last_export(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(OK, optional_export_sleep=1), + TraceServiceServicerWithExportParams( + StatusCode.OK, optional_export_sleep=1 + ), self.server, ) @@ -324,7 +330,9 @@ def test_shutdown_wait_last_export(self): def test_shutdown_doesnot_wait_last_export(self): add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(OK, optional_export_sleep=3), + TraceServiceServicerWithExportParams( + StatusCode.OK, optional_export_sleep=3 + ), self.server, ) @@ -346,7 +354,7 @@ def test_export_over_closed_grpc_channel(self): # pylint: disable=protected-access add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(OK), + TraceServiceServicerWithExportParams(StatusCode.OK), self.server, ) self.exporter.export([self.span]) @@ -358,61 +366,56 @@ def test_export_over_closed_grpc_channel(self): str(err.exception), "Cannot invoke RPC on closed channel!" ) + def test_retry_with_server_pushback(self): + mock_trace_service = TraceServiceServicerWithExportParams( + StatusCode.UNAVAILABLE, optional_first_time_retry_millis=200 + ) + add_TraceServiceServicer_to_server( + mock_trace_service, + self.server, + ) + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=10) + before = time.time() + self.assertEqual( + exporter.export([self.span]), + SpanExportResult.FAILURE, + ) + after = time.time() + # We set the `grpc-retry-pushback-ms` header to 200 millis on the first server response. + # So we expect the first request at time 0, second at time 0.2, + # third at 1.2 (start of backoff policy), fourth at time 3.2, last at time 7.2. + self.assertEqual(mock_trace_service.num_requests, 5) + # The backoffs have a jitter +- 20%, so we have to put a higher bound than 7.2. + self.assertTrue(after - before < 8.8) + def test_retry_timeout(self): + mock_trace_service = TraceServiceServicerWithExportParams( + StatusCode.UNAVAILABLE + ) add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(UNAVAILABLE), + mock_trace_service, self.server, ) # Set timeout to 1.5 seconds. exporter = OTLPSpanExporterForTesting(insecure=True, timeout=1.5) - with self.assertLogs(level=WARNING) as warning: - self.assertEqual( - exporter.export([self.span]), - SpanExportResult.FAILURE, - ) - # Our GRPC retry policy starts with a 1 second backoff then doubles. - # So we expect just two calls: one at time 0, one at time 1. - # The final log is from when export fails. - self.assertEqual(len(warning.records), 3) - for idx, log in enumerate(warning.records): - if idx != 2: - self.assertEqual( - "Export Request Received", - log.message, - ) - else: - self.assertEqual( - "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", - log.message, - ) - with self.assertLogs(level=WARNING) as warning: - exporter = OTLPSpanExporterForTesting(insecure=True, timeout=5) - # pylint: disable=protected-access - self.assertEqual(exporter._timeout, 5) - self.assertEqual( - exporter.export([self.span]), - SpanExportResult.FAILURE, - ) - # We expect 3 calls: time 0, time 1, time 3, but not time 7. - # The final log is from when export fails. - self.assertEqual(len(warning.records), 4) - for idx, log in enumerate(warning.records): - if idx != 3: - self.assertEqual( - "Export Request Received", - log.message, - ) - else: - self.assertEqual( - "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", - log.message, - ) + before = time.time() + self.assertEqual( + exporter.export([self.span]), + SpanExportResult.FAILURE, + ) + after = time.time() + # Our retry starts with a 1 second backoff then doubles. + # So we expect just two calls: one at time 0, one at time 1. + self.assertEqual(mock_trace_service.num_requests, 2) + # gRPC retry config waits for the timeout (1.5) before cancelling the request. + self.assertTrue(after - before < 1.6) def test_timeout_set_correctly(self): + mock_trace_service = TraceServiceServicerWithExportParams( + StatusCode.OK, optional_export_sleep=0.5 + ) add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - OK, optional_export_sleep=0.5 - ), + mock_trace_service, self.server, ) exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.4) @@ -427,6 +430,7 @@ def test_timeout_set_correctly(self): "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", warning.records[-1].message, ) + self.assertEqual(mock_trace_service.num_requests, 1) exporter = OTLPSpanExporterForTesting(insecure=True, timeout=0.8) self.assertEqual( exporter.export([self.span]), @@ -444,7 +448,9 @@ def test_otlp_headers_from_env(self): def test_permanent_failure(self): with self.assertLogs(level=WARNING) as warning: add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(ALREADY_EXISTS), + TraceServiceServicerWithExportParams( + StatusCode.ALREADY_EXISTS + ), self.server, ) self.assertEqual( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 6a6e8821feb..9dda3cca5cd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -14,6 +14,7 @@ import gzip import logging +import random import zlib from io import BytesIO from os import environ @@ -23,6 +24,9 @@ import requests from requests.exceptions import ConnectionError +from opentelemetry.exporter.otlp.proto.common._internal import ( + _is_retryable, +) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, @@ -58,6 +62,7 @@ DEFAULT_ENDPOINT = "http://localhost:4318/" DEFAULT_LOGS_EXPORT_PATH = "v1/logs" DEFAULT_TIMEOUT = 10 # in seconds +_MAX_RETRYS = 6 class OTLPLogExporter(LogExporter): @@ -151,48 +156,37 @@ def _export(self, serialized_data: bytes, timeout_sec: float): ) return resp - @staticmethod - def _retryable(resp: requests.Response) -> bool: - if resp.status_code == 408: - return True - if resp.status_code >= 500 and resp.status_code <= 599: - return True - return False - def export(self, batch: Sequence[LogData]) -> LogExportResult: - # After the call to Shutdown subsequent calls to Export are - # not allowed and should return a Failure result. if self._shutdown: _logger.warning("Exporter already shutdown, ignoring batch") return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() deadline_sec = time() + self._timeout - for delay in [1, 2, 4, 8, 16, 32]: - remaining_time_sec = deadline_sec - time() - if remaining_time_sec < 1e-09: - return LogExportResult.FAILURE - resp = self._export(serialized_data, remaining_time_sec) - # pylint: disable=no-else-return + backoff_seconds = 1 * random.uniform(0.8, 1.2) + for retry_num in range(1, _MAX_RETRYS + 1): + resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return LogExportResult.SUCCESS - elif self._retryable(resp): - if delay > (deadline_sec - time()): - return LogExportResult.FAILURE - _logger.warning( - "Transient error %s encountered while exporting logs batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: + if ( + not _is_retryable(resp) + or retry_num == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + ): _logger.error( "Failed to export logs batch code: %s, reason: %s", resp.status_code, resp.text, ) return LogExportResult.FAILURE + _logger.warning( + "Transient error %s encountered while exporting logs batch, retrying in %.2fs.", + resp.reason, + backoff_seconds, + ) + sleep(backoff_seconds) + backoff_seconds *= 2 * random.uniform(0.8, 1.2) + # Not possible to reach here but the linter is complaining. return LogExportResult.FAILURE def force_flush(self, timeout_millis: float = 10_000) -> bool: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 6c8b930fbc7..bed515b0bc5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -14,6 +14,7 @@ import gzip import logging +import random import zlib from io import BytesIO from os import environ @@ -34,6 +35,7 @@ from opentelemetry.exporter.otlp.proto.common._internal import ( _get_resource_data, + _is_retryable, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( OTLPMetricExporterMixin, @@ -48,7 +50,7 @@ from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2 import ( # noqa: F401 ExportMetricsServiceRequest, ) -from opentelemetry.proto.common.v1.common_pb2 import ( # noqa: F401 # noqa: F401 +from opentelemetry.proto.common.v1.common_pb2 import ( # noqa: F401 AnyValue, ArrayValue, InstrumentationScope, @@ -98,6 +100,7 @@ DEFAULT_ENDPOINT = "http://localhost:4318/" DEFAULT_METRICS_EXPORT_PATH = "v1/metrics" DEFAULT_TIMEOUT = 10 # in seconds +_MAX_RETRYS = 6 class OTLPMetricExporter(MetricExporter, OTLPMetricExporterMixin): @@ -163,6 +166,7 @@ def __init__( self._common_configuration( preferred_temporality, preferred_aggregation ) + self._shutdown = False def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data @@ -196,55 +200,51 @@ def _export(self, serialized_data: bytes, timeout_sec: float): ) return resp - @staticmethod - def _retryable(resp: requests.Response) -> bool: - if resp.status_code == 408: - return True - if resp.status_code >= 500 and resp.status_code <= 599: - return True - return False - def export( self, metrics_data: MetricsData, timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: - serialized_data = encode_metrics(metrics_data) + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return MetricExportResult.FAILURE + serialized_data = encode_metrics(metrics_data).SerializeToString() deadline_sec = time() + ( timeout_millis / 1e3 if timeout_millis else self._timeout ) - for delay in [1, 2, 4, 8, 16, 32]: - remaining_time_sec = deadline_sec - time() - if remaining_time_sec < 1e-09: - return MetricExportResult.FAILURE - resp = self._export( - serialized_data.SerializeToString(), remaining_time_sec - ) - # pylint: disable=no-else-return + backoff_seconds = 1 * random.uniform(0.8, 1.2) + for retry_num in range(1, _MAX_RETRYS + 1): + resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return MetricExportResult.SUCCESS - elif self._retryable(resp): - if delay > (deadline_sec - time()): - return MetricExportResult.FAILURE - _logger.warning( - "Transient error %s encountered while exporting metric batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: + if ( + not _is_retryable(resp) + or retry_num == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + ): _logger.error( - "Failed to export batch code: %s, reason: %s", + "Failed to export metrics batch code: %s, reason: %s", resp.status_code, resp.text, ) return MetricExportResult.FAILURE + _logger.warning( + "Transient error %s encountered while exporting metrics batch, retrying in %.2fs.", + resp.reason, + backoff_seconds, + ) + sleep(backoff_seconds) + backoff_seconds *= 2 * random.uniform(0.8, 1.2) + # Not possible to reach here but the linter is complaining. return MetricExportResult.FAILURE def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: - pass + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring call") + return + self._session.close() + self._shutdown = True @property def _exporting(self) -> str: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 3feb8731c5e..7bce9494648 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -14,6 +14,7 @@ import gzip import logging +import random import zlib from io import BytesIO from os import environ @@ -23,6 +24,9 @@ import requests from requests.exceptions import ConnectionError +from opentelemetry.exporter.otlp.proto.common._internal import ( + _is_retryable, +) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, ) @@ -57,6 +61,7 @@ DEFAULT_ENDPOINT = "http://localhost:4318/" DEFAULT_TRACES_EXPORT_PATH = "v1/traces" DEFAULT_TIMEOUT = 10 # in seconds +_MAX_RETRYS = 6 class OTLPSpanExporter(SpanExporter): @@ -149,48 +154,37 @@ def _export(self, serialized_data: bytes, timeout_sec: float): ) return resp - @staticmethod - def _retryable(resp: requests.Response) -> bool: - if resp.status_code == 408: - return True - if resp.status_code >= 500 and resp.status_code <= 599: - return True - return False - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - # After the call to Shutdown subsequent calls to Export are - # not allowed and should return a Failure result. if self._shutdown: _logger.warning("Exporter already shutdown, ignoring batch") return SpanExportResult.FAILURE serialized_data = encode_spans(spans).SerializePartialToString() deadline_sec = time() + self._timeout - for delay in [1, 2, 4, 8, 16, 32]: - remaining_time_sec = deadline_sec - time() - if remaining_time_sec < 1e-09: - return SpanExportResult.FAILURE - resp = self._export(serialized_data, remaining_time_sec) - # pylint: disable=no-else-return + backoff_seconds = 1 * random.uniform(0.8, 1.2) + for retry_num in range(1, _MAX_RETRYS + 1): + resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return SpanExportResult.SUCCESS - elif self._retryable(resp): - if delay > (deadline_sec - time()): - return SpanExportResult.FAILURE - _logger.warning( - "Transient error %s encountered while exporting span batch, retrying in %ss.", - resp.reason, - delay, - ) - sleep(delay) - continue - else: + if ( + not _is_retryable(resp) + or retry_num == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + ): _logger.error( - "Failed to export batch code: %s, reason: %s", + "Failed to export span batch code: %s, reason: %s", resp.status_code, resp.text, ) return SpanExportResult.FAILURE + _logger.warning( + "Transient error %s encountered while exporting span batch, retrying in %.2fs.", + resp.reason, + backoff_seconds, + ) + sleep(backoff_seconds) + backoff_seconds *= 2 * random.uniform(0.8, 1.2) + # Not possible to reach here but the linter is complaining. return SpanExportResult.FAILURE def shutdown(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index df7c0c17ea3..85422e2b281 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time from logging import WARNING from os import environ from unittest import TestCase @@ -511,26 +512,39 @@ def test_retry_timeout(self, mock_post): resp.reason = "UNAVAILABLE" mock_post.return_value = resp with self.assertLogs(level=WARNING) as warning: - # Set timeout to 1.5 seconds + before = time.time() + # Set timeout to 1.5 seconds, takes precedence over the 3.5 second class timeout. self.assertEqual( exporter.export(self.metrics["sum_int"], 1500), MetricExportResult.FAILURE, ) - # Code should return failure before the final retry which would exceed timeout. - # Code should return failure after retrying once. - self.assertEqual(len(warning.records), 1) - self.assertEqual( - "Transient error UNAVAILABLE encountered while exporting metric batch, retrying in 1s.", + after = time.time() + + # First call at time 0, second at time 1, then an early return before the second backoff sleep b/c it would exceed timeout. + self.assertEqual(mock_post.call_count, 2) + # There's a +/-20% jitter on each backoff. + self.assertTrue(after - before < 1.3) + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting metrics batch, retrying in", warning.records[0].message, ) - with self.assertLogs(level=WARNING) as warning: - # This time don't pass in a timeout, so it will fallback to 3.5 second set on class. - self.assertEqual( - exporter.export(self.metrics["sum_int"]), - MetricExportResult.FAILURE, - ) - # 2 retrys (after 1s, 3s). - self.assertEqual(len(warning.records), 2) + mock_post.reset_mock() + before = time.time() + # This time the class level 3.5s timeout should be used. + self.assertEqual( + exporter.export(self.metrics["sum_int"]), + MetricExportResult.FAILURE, + ) + after = time.time() + + # First call at time 0, second at time 1, third at time 3. + self.assertEqual(mock_post.call_count, 3) + # There's a +/-20% jitter on each backoff. + self.assertTrue(after - before < 3.7) + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting metrics batch, retrying in", + warning.records[0].message, + ) @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 1ba1db06567..d646968fa74 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -14,6 +14,7 @@ # pylint: disable=protected-access +import time import unittest from logging import WARNING from typing import List @@ -358,24 +359,21 @@ def test_retry_timeout(self, mock_post): resp.reason = "UNAVAILABLE" mock_post.return_value = resp with self.assertLogs(level=WARNING) as warning: + before = time.time() + # Set timeout to 1.5 seconds self.assertEqual( exporter.export(self._get_sdk_log_data()), LogExportResult.FAILURE, ) - # Code should return failure after retrying once. - self.assertEqual(len(warning.records), 1) - self.assertEqual( - "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in 1s.", + after = time.time() + # First call at time 0, second at time 1, then an early return before the second backoff sleep b/c it would exceed timeout. + self.assertEqual(mock_post.call_count, 2) + # There's a +/-20% jitter on each backoff. + self.assertTrue(after - before < 1.3) + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in", warning.records[0].message, ) - exporter = OTLPLogExporter(timeout=3.5) - with self.assertLogs(level=WARNING) as warning: - self.assertEqual( - exporter.export(self._get_sdk_log_data()), - LogExportResult.FAILURE, - ) - # 2 retrys (after 1s, 3s). - self.assertEqual(len(warning.records), 2) @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 2096c727462..0707f7ef73f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import unittest from logging import WARNING from unittest.mock import MagicMock, Mock, patch @@ -258,24 +259,21 @@ def test_retry_timeout(self, mock_post): resp.reason = "UNAVAILABLE" mock_post.return_value = resp with self.assertLogs(level=WARNING) as warning: + before = time.time() + # Set timeout to 1.5 seconds self.assertEqual( exporter.export([BASIC_SPAN]), SpanExportResult.FAILURE, ) - # Code should return failure after retrying once. - self.assertEqual(len(warning.records), 1) - self.assertEqual( - "Transient error UNAVAILABLE encountered while exporting span batch, retrying in 1s.", + after = time.time() + # First call at time 0, second at time 1, then an early return before the second backoff sleep b/c it would exceed timeout. + self.assertEqual(mock_post.call_count, 2) + # There's a +/-20% jitter on each backoff. + self.assertTrue(after - before < 1.3) + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting span batch, retrying in", warning.records[0].message, ) - exporter = OTLPSpanExporter(timeout=3.5) - with self.assertLogs(level=WARNING) as warning: - self.assertEqual( - exporter.export([BASIC_SPAN]), - SpanExportResult.FAILURE, - ) - # 2 retrys (after 1s, 3s). - self.assertEqual(len(warning.records), 2) @patch.object(Session, "post") def test_timeout_set_correctly(self, mock_post): diff --git a/uv.lock b/uv.lock index adb4be16d17..4ba10c0dd5f 100644 --- a/uv.lock +++ b/uv.lock @@ -293,10 +293,14 @@ name = "opentelemetry-exporter-otlp-proto-common" source = { editable = "exporter/opentelemetry-exporter-otlp-proto-common" } dependencies = [ { name = "opentelemetry-proto" }, + { name = "requests" }, ] [package.metadata] -requires-dist = [{ name = "opentelemetry-proto", editable = "opentelemetry-proto" }] +requires-dist = [ + { name = "opentelemetry-proto", editable = "opentelemetry-proto" }, + { name = "requests", specifier = "~=2.7" }, +] [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" From 46e15f16b26ff94ff6b0b48215cae23ec7687dff Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 22 May 2025 15:38:35 +0000 Subject: [PATCH 11/38] Remove unneeded test reqs --- .../test-requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt index b90eb642adb..67a94245243 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt @@ -1,7 +1,5 @@ asgiref==3.7.2 -googleapis-common-protos==1.63.2 grpcio==1.66.2 -grpcio-status==1.66.0 importlib-metadata==6.11.0 iniconfig==2.0.0 packaging==24.0 From dcba91a2cd91d4469d037d3a12ee4b99bfc04cf9 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 17:39:44 +0000 Subject: [PATCH 12/38] Remove gRPC retry config --- .../exporter/otlp/proto/grpc/exporter.py | 108 +++++++++--------- .../tests/test_otlp_exporter_mixin.py | 57 +++++---- .../tests/test_otlp_metrics_exporter.py | 3 +- .../tests/test_otlp_trace_exporter.py | 4 +- 4 files changed, 92 insertions(+), 80 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index dfa9597a2f1..b22553db308 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -14,12 +14,13 @@ """OTLP Exporter""" -import json +import random import threading from abc import ABC, abstractmethod from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ +from time import sleep, time from typing import ( # noqa: F401 Any, Callable, @@ -34,6 +35,7 @@ from typing import Sequence as TypingSequence from urllib.parse import urlparse +from google.rpc.error_details_pb2 import RetryInfo from typing_extensions import deprecated from grpc import ( @@ -72,35 +74,18 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers -# 5 is the maximum allowable attempts allowed by grpc retry policy. -# This policy results in backoffs of 1s, 2s, 4s, and then 8s after the initial failed attempt, -# plus or minus a 20% jitter. Timeout set on the RPC call encompasses the retry backoffs AND time spent waiting -# for a response. DEADLINE_EXCEEDED is returned if all the attempts cannot complete within the -# timeout, and all fail. A header `grpc-retry-pushback-ms` when set by the server will override -# and take precedence over this backoff. See https://grpc.io/docs/guides/retry/ for more details. -_GRPC_RETRY_POLICY = json.dumps( - { - "methodConfig": [ - { - "name": [dict()], - "retryPolicy": { - "maxAttempts": 5, - "initialBackoff": "1s", - "maxBackoff": "9s", - "backoffMultiplier": 2, - "retryableStatusCodes": [ - "UNAVAILABLE", - "CANCELLED", - "RESOURCE_EXHAUSTED", - "ABORTED", - "OUT_OF_RANGE", - "DATA_LOSS", - ], - }, - } - ] - } +_RETRYABLE_ERROR_CODES = frozenset( + [ + StatusCode.CANCELLED, + StatusCode.DEADLINE_EXCEEDED, + StatusCode.RESOURCE_EXHAUSTED, + StatusCode.ABORTED, + StatusCode.OUT_OF_RANGE, + StatusCode.UNAVAILABLE, + StatusCode.DATA_LOSS, + ] ) +_MAX_RETRYS = 6 logger = getLogger(__name__) SDKDataT = TypeVar("SDKDataT") ResourceDataT = TypeVar("ResourceDataT") @@ -273,9 +258,6 @@ def __init__( self._channel = insecure_channel( self._endpoint, compression=compression, - options=[ - ("grpc.service_config", _GRPC_RETRY_POLICY), - ], ) else: credentials = _get_credentials( @@ -288,9 +270,6 @@ def __init__( self._endpoint, credentials, compression=compression, - options=[ - ("grpc.service_config", _GRPC_RETRY_POLICY), - ], ) self._client = self._stub(self._channel) @@ -314,23 +293,50 @@ def _export( # FIXME remove this check if the export type for traces # gets updated to a class that represents the proto # TracesData and use the code below instead. + retry_info = RetryInfo() with self._export_lock: - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, - ) - return self._result.SUCCESS - except RpcError as error: - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - return self._result.FAILURE + deadline_sec = time() + self._timeout + backoff_seconds = 1 * random.uniform(0.8, 1.2) + for retry_num in range(1, _MAX_RETRYS + 1): + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=self._timeout, + ) + return self._result.SUCCESS + except RpcError as error: + retry_info_bin = dict(error.trailing_metadata()).get( + "google.rpc.retryinfo-bin" + ) + if retry_info_bin is not None: + retry_info.ParseFromString(retry_info_bin) + backoff_seconds = ( + retry_info.retry_delay.seconds + + retry_info.retry_delay.nanos / 1.0e9 + ) + if ( + error.code() not in _RETRYABLE_ERROR_CODES + or retry_num == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + ): + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, + ) + return self._result.FAILURE + logger.warning( + "Transient error %s encountered while exporting logs batch, retrying in %.2fs.", + error.code(), + backoff_seconds, + ) + sleep(backoff_seconds) + backoff_seconds *= 2 * random.uniform(0.8, 1.2) + # Not possible to reach here but the linter is complaining. + return self._result.FAILURE def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 3c752d70e50..dc13724c0a5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -18,8 +18,14 @@ from logging import WARNING, getLogger from typing import Any, Optional, Sequence from unittest import TestCase -from unittest.mock import ANY, Mock, patch +from unittest.mock import Mock, patch +from google.protobuf.duration_pb2 import ( # pylint: disable=no-name-in-module + Duration, +) +from google.rpc.error_details_pb2 import ( # pylint: disable=no-name-in-module + RetryInfo, +) from grpc import Compression, StatusCode, server from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( @@ -84,17 +90,15 @@ class TraceServiceServicerWithExportParams(TraceServiceServicer): def __init__( self, export_result: StatusCode, - optional_export_sleep: Optional[float] = None, optional_first_time_retry_millis: Optional[int] = None, + optional_export_sleep: Optional[float] = None, ): self.export_result = export_result self.optional_export_sleep = optional_export_sleep self.optional_first_time_retry_millis = ( optional_first_time_retry_millis ) - self.first_attempt = True self.num_requests = 0 - self.now = time.time() # pylint: disable=invalid-name,unused-argument def Export(self, request, context): @@ -102,17 +106,23 @@ def Export(self, request, context): if self.optional_export_sleep: time.sleep(self.optional_export_sleep) if self.export_result != StatusCode.OK: - if self.optional_first_time_retry_millis and self.first_attempt: - self.first_attempt = False + if ( + self.optional_first_time_retry_millis + and self.num_requests == 1 + ): context.set_trailing_metadata( ( ( - "grpc-retry-pushback-ms", - str(self.optional_first_time_retry_millis), + "google.rpc.retryinfo-bin", + RetryInfo( + retry_delay=Duration( + nanos=self.optional_first_time_retry_millis + ) + ).SerializeToString(), ), ) ) - context.abort(self.export_result, "") + context.set_code(self.export_result) return ExportTraceServiceResponse() @@ -262,7 +272,6 @@ def test_otlp_exporter_otlp_compression_unspecified( mock_insecure_channel.assert_called_once_with( "localhost:4317", compression=Compression.NoCompression, - options=ANY, ) # pylint: disable=no-self-use, disable=unused-argument @@ -286,7 +295,7 @@ def test_otlp_exporter_otlp_compression_envvar( """Just OTEL_EXPORTER_OTLP_COMPRESSION should work""" OTLPSpanExporterForTesting(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.Gzip, options=ANY + "localhost:4317", compression=Compression.Gzip ) def test_shutdown(self): @@ -366,7 +375,7 @@ def test_export_over_closed_grpc_channel(self): str(err.exception), "Cannot invoke RPC on closed channel!" ) - def test_retry_with_server_pushback(self): + def test_retry_info_is_respected(self): mock_trace_service = TraceServiceServicerWithExportParams( StatusCode.UNAVAILABLE, optional_first_time_retry_millis=200 ) @@ -381,14 +390,15 @@ def test_retry_with_server_pushback(self): SpanExportResult.FAILURE, ) after = time.time() - # We set the `grpc-retry-pushback-ms` header to 200 millis on the first server response. + # We set the `grpc-retry-pushback-ms` header to 200 millis on the first server response only, + # and then we do exponential backoff using that first value. # So we expect the first request at time 0, second at time 0.2, - # third at 1.2 (start of backoff policy), fourth at time 3.2, last at time 7.2. - self.assertEqual(mock_trace_service.num_requests, 5) - # The backoffs have a jitter +- 20%, so we have to put a higher bound than 7.2. - self.assertTrue(after - before < 8.8) + # third at .6, fourth at 1.4, fifth at 3, and final one at 6.2. + self.assertEqual(mock_trace_service.num_requests, 6) + # The backoffs have a jitter +- 20%, so we have to put a higher bound than 6.2. + self.assertTrue(after - before < 7.5) - def test_retry_timeout(self): + def test_retry_not_made_if_would_exceed_timeout(self): mock_trace_service = TraceServiceServicerWithExportParams( StatusCode.UNAVAILABLE ) @@ -396,8 +406,7 @@ def test_retry_timeout(self): mock_trace_service, self.server, ) - # Set timeout to 1.5 seconds. - exporter = OTLPSpanExporterForTesting(insecure=True, timeout=1.5) + exporter = OTLPSpanExporterForTesting(insecure=True, timeout=4) before = time.time() self.assertEqual( exporter.export([self.span]), @@ -405,10 +414,10 @@ def test_retry_timeout(self): ) after = time.time() # Our retry starts with a 1 second backoff then doubles. - # So we expect just two calls: one at time 0, one at time 1. - self.assertEqual(mock_trace_service.num_requests, 2) - # gRPC retry config waits for the timeout (1.5) before cancelling the request. - self.assertTrue(after - before < 1.6) + # First call at time 0, second at time 1, third at time 3, fourth would exceed timeout. + self.assertEqual(mock_trace_service.num_requests, 3) + # There's a +/-20% jitter on each backoff. + self.assertTrue(after - before < 3.7) def test_timeout_set_correctly(self): mock_trace_service = TraceServiceServicerWithExportParams( diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index ceda6e72a8e..c7dc83ef32b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -18,7 +18,7 @@ from os.path import dirname from typing import List from unittest import TestCase -from unittest.mock import ANY, patch +from unittest.mock import patch from grpc import ChannelCredentials, Compression @@ -299,7 +299,6 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): mock_insecure_channel.assert_called_once_with( "localhost:4317", compression=Compression.NoCompression, - options=ANY, ) def test_split_metrics_data_many_data_points(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index 5238dc91224..7a609c34867 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -16,7 +16,7 @@ import os from unittest import TestCase -from unittest.mock import ANY, Mock, PropertyMock, patch +from unittest.mock import Mock, PropertyMock, patch from grpc import ChannelCredentials, Compression @@ -335,7 +335,6 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): mock_insecure_channel.assert_called_once_with( "localhost:4317", compression=Compression.NoCompression, - options=ANY, ) # pylint: disable=no-self-use @@ -354,7 +353,6 @@ def test_otlp_exporter_otlp_compression_precendence( mock_insecure_channel.assert_called_once_with( "localhost:4317", compression=Compression.Gzip, - options=ANY, ) def test_translate_spans(self): From 71b77e10193358035953054f07588eadaf69582a Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:02:18 +0000 Subject: [PATCH 13/38] Tweak backoff calculation --- .../exporter/otlp/proto/grpc/exporter.py | 2 +- .../tests/test_otlp_exporter_mixin.py | 23 ++++++------------- .../otlp/proto/http/_log_exporter/__init__.py | 2 +- .../proto/http/metric_exporter/__init__.py | 2 +- .../proto/http/trace_exporter/__init__.py | 2 +- 5 files changed, 11 insertions(+), 20 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index b22553db308..a011a0e04db 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -334,7 +334,7 @@ def _export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds *= 2 * random.uniform(0.8, 1.2) + backoff_seconds = 2 ** retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index dc13724c0a5..55f260345c2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -90,14 +90,12 @@ class TraceServiceServicerWithExportParams(TraceServiceServicer): def __init__( self, export_result: StatusCode, - optional_first_time_retry_millis: Optional[int] = None, + optional_retry_millis: Optional[int] = None, optional_export_sleep: Optional[float] = None, ): self.export_result = export_result self.optional_export_sleep = optional_export_sleep - self.optional_first_time_retry_millis = ( - optional_first_time_retry_millis - ) + self.optional_retry_millis = optional_retry_millis self.num_requests = 0 # pylint: disable=invalid-name,unused-argument @@ -106,17 +104,14 @@ def Export(self, request, context): if self.optional_export_sleep: time.sleep(self.optional_export_sleep) if self.export_result != StatusCode.OK: - if ( - self.optional_first_time_retry_millis - and self.num_requests == 1 - ): + if self.optional_retry_millis and self.num_requests == 1: context.set_trailing_metadata( ( ( "google.rpc.retryinfo-bin", RetryInfo( retry_delay=Duration( - nanos=self.optional_first_time_retry_millis + nanos=self.optional_retry_millis ) ).SerializeToString(), ), @@ -377,7 +372,7 @@ def test_export_over_closed_grpc_channel(self): def test_retry_info_is_respected(self): mock_trace_service = TraceServiceServicerWithExportParams( - StatusCode.UNAVAILABLE, optional_first_time_retry_millis=200 + StatusCode.UNAVAILABLE, optional_retry_millis=200 ) add_TraceServiceServicer_to_server( mock_trace_service, @@ -390,13 +385,9 @@ def test_retry_info_is_respected(self): SpanExportResult.FAILURE, ) after = time.time() - # We set the `grpc-retry-pushback-ms` header to 200 millis on the first server response only, - # and then we do exponential backoff using that first value. - # So we expect the first request at time 0, second at time 0.2, - # third at .6, fourth at 1.4, fifth at 3, and final one at 6.2. self.assertEqual(mock_trace_service.num_requests, 6) - # The backoffs have a jitter +- 20%, so we have to put a higher bound than 6.2. - self.assertTrue(after - before < 7.5) + # 200 * 5 = 1 second, plus some wiggle room so the test passes consistently. + self.assertTrue(after - before < 1.1) def test_retry_not_made_if_would_exceed_timeout(self): mock_trace_service = TraceServiceServicerWithExportParams( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 9dda3cca5cd..42b96facfdd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -185,7 +185,7 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds *= 2 * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return LogExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 31710804577..592ed0cfe8f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -235,7 +235,7 @@ def export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds *= 2 * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 7bce9494648..0ddb2796880 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -183,7 +183,7 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds *= 2 * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return SpanExportResult.FAILURE From 2ae79bb17b640154d45b9040b9d2627d1037aadb Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:04:35 +0000 Subject: [PATCH 14/38] Lint and precommit --- .../src/opentelemetry/exporter/otlp/proto/grpc/exporter.py | 2 +- .../tests/test_otlp_exporter_mixin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index a011a0e04db..f888fb433e1 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -334,7 +334,7 @@ def _export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2 ** retry_num * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 55f260345c2..fbca0f495b2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -104,7 +104,7 @@ def Export(self, request, context): if self.optional_export_sleep: time.sleep(self.optional_export_sleep) if self.export_result != StatusCode.OK: - if self.optional_retry_millis and self.num_requests == 1: + if self.optional_retry_millis: context.set_trailing_metadata( ( ( From 553ea3ed367bdadaff6c768ad53ee4fcef50c1c4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:40:18 +0000 Subject: [PATCH 15/38] Empty commit --- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 592ed0cfe8f..8b57d23dadf 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -235,7 +235,7 @@ def export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return MetricExportResult.FAILURE From 28b93998c61ef0232b9f4801ccfefba7f0879635 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:40:39 +0000 Subject: [PATCH 16/38] Another empty commit --- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 8b57d23dadf..592ed0cfe8f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -235,7 +235,7 @@ def export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return MetricExportResult.FAILURE From b4df54a9a64c48e972743ad364d08638644a25ac Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:48:19 +0000 Subject: [PATCH 17/38] Calculate backoff in 1 place instead of 2 --- .../src/opentelemetry/exporter/otlp/proto/grpc/exporter.py | 5 +++-- .../exporter/otlp/proto/http/_log_exporter/__init__.py | 3 +-- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 3 +-- .../exporter/otlp/proto/http/trace_exporter/__init__.py | 3 +-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index f888fb433e1..3001764dacd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -296,8 +296,10 @@ def _export( retry_info = RetryInfo() with self._export_lock: deadline_sec = time() + self._timeout - backoff_seconds = 1 * random.uniform(0.8, 1.2) for retry_num in range(1, _MAX_RETRYS + 1): + backoff_seconds = 2 ** (retry_num - 1) * random.uniform( + 0.8, 1.2 + ) try: self._client.Export( request=self._translate_data(data), @@ -334,7 +336,6 @@ def _export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 42b96facfdd..39062cae822 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -163,8 +163,8 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: serialized_data = encode_logs(batch).SerializeToString() deadline_sec = time() + self._timeout - backoff_seconds = 1 * random.uniform(0.8, 1.2) for retry_num in range(1, _MAX_RETRYS + 1): + backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return LogExportResult.SUCCESS @@ -185,7 +185,6 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return LogExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 592ed0cfe8f..0b2db633fe2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -213,8 +213,8 @@ def export( deadline_sec = time() + ( timeout_millis / 1e3 if timeout_millis else self._timeout ) - backoff_seconds = 1 * random.uniform(0.8, 1.2) for retry_num in range(1, _MAX_RETRYS + 1): + backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return MetricExportResult.SUCCESS @@ -235,7 +235,6 @@ def export( backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 0ddb2796880..536a5053ad5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -161,8 +161,8 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: serialized_data = encode_spans(spans).SerializePartialToString() deadline_sec = time() + self._timeout - backoff_seconds = 1 * random.uniform(0.8, 1.2) for retry_num in range(1, _MAX_RETRYS + 1): + backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return SpanExportResult.SUCCESS @@ -183,7 +183,6 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: backoff_seconds, ) sleep(backoff_seconds) - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) # Not possible to reach here but the linter is complaining. return SpanExportResult.FAILURE From 9e1ba287a42e5ea0ca180f42cef69a34a6fb2f5a Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:53:43 +0000 Subject: [PATCH 18/38] Update changelog --- CHANGELOG.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37907252e60..483b9ad84c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,10 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - typecheck: add sdk/resources and drop mypy ([#4578](https://github.com/open-telemetry/opentelemetry-python/pull/4578)) -- Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs, - and an unnecessary 32 second sleep that occurred after all retries had completed/failed was removed. - Update gRPC OTLP Exporters to use official gRPC retry policy config. The `RetryInfo` proto in the error - response will now be ignored, and server's should now use the gRPC supported header `grpc-retry-pushback-ms`. +- Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs. + A +/-20% jitter was added to all backoffs. A pointless 32 second sleep that occurred after all retries + had completed/failed was removed. ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). - Use PEP702 for marking deprecations ([#4522](https://github.com/open-telemetry/opentelemetry-python/pull/4522)) From 0b540904bac49ed1fbce9061bddd5fca93979511 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 18:54:12 +0000 Subject: [PATCH 19/38] Update changelog --- CHANGELOG.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 483b9ad84c7..7b8801e057b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,14 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased -## Version 1.34.0/0.55b0 (2025-06-04) - -- typecheck: add sdk/resources and drop mypy - ([#4578](https://github.com/open-telemetry/opentelemetry-python/pull/4578)) - Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs. A +/-20% jitter was added to all backoffs. A pointless 32 second sleep that occurred after all retries had completed/failed was removed. ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). + +## Version 1.34.0/0.55b0 (2025-06-04) + +- typecheck: add sdk/resources and drop mypy + ([#4578](https://github.com/open-telemetry/opentelemetry-python/pull/4578)) - Use PEP702 for marking deprecations ([#4522](https://github.com/open-telemetry/opentelemetry-python/pull/4522)) - Refactor `BatchLogRecordProcessor` and `BatchSpanProcessor` to simplify code From bc3110a50714d2fc87dabb5b19622c6f8e612ff4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 19:38:17 +0000 Subject: [PATCH 20/38] Make new _common directory in the http exporter for shared code --- .../pyproject.toml | 1 - .../otlp/proto/common/_internal/__init__.py | 10 -- .../test-requirements.txt | 1 - .../otlp/proto/http/_common/__init__.py | 22 +++++ .../otlp/proto/http/_log_exporter/__init__.py | 6 +- .../proto/http/metric_exporter/__init__.py | 4 +- .../proto/http/trace_exporter/__init__.py | 6 +- uv.lock | 95 ++----------------- 8 files changed, 38 insertions(+), 107 deletions(-) create mode 100644 exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml b/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml index 0df799746d4..e3192172be9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml +++ b/exporter/opentelemetry-exporter-otlp-proto-common/pyproject.toml @@ -28,7 +28,6 @@ classifiers = [ ] dependencies = [ "opentelemetry-proto == 1.35.0.dev", - "requests ~= 2.7", ] [project.urls] diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py index c239a4ed18a..200644368df 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py @@ -27,8 +27,6 @@ TypeVar, ) -import requests - from opentelemetry.proto.common.v1.common_pb2 import AnyValue as PB2AnyValue from opentelemetry.proto.common.v1.common_pb2 import ( ArrayValue as PB2ArrayValue, @@ -110,14 +108,6 @@ def _encode_key_value( ) -def _is_retryable(resp: requests.Response) -> bool: - if resp.status_code == 408: - return True - if resp.status_code >= 500 and resp.status_code <= 599: - return True - return False - - def _encode_array( array: Sequence[Any], allow_null: bool = False ) -> Sequence[PB2AnyValue]: diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt index 46ce953c4c3..1c295c81ca5 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-common/test-requirements.txt @@ -1,7 +1,6 @@ asgiref==3.7.2 importlib-metadata==6.11.0 iniconfig==2.0.0 -requests == 2.7.0 packaging==24.0 pluggy==1.5.0 protobuf==5.26.1 diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py new file mode 100644 index 00000000000..d748c78eb56 --- /dev/null +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py @@ -0,0 +1,22 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests + +def _is_retryable(resp: requests.Response) -> bool: + if resp.status_code == 408: + return True + if resp.status_code >= 500 and resp.status_code <= 599: + return True + return False \ No newline at end of file diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 39062cae822..057072c7842 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -24,14 +24,14 @@ import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _is_retryable, -) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, Compression, ) +from opentelemetry.exporter.otlp.proto.http._common import ( + _is_retryable, +) from opentelemetry.sdk._logs import LogData from opentelemetry.sdk._logs.export import ( LogExporter, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 0b2db633fe2..38a3451af93 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -35,7 +35,6 @@ from opentelemetry.exporter.otlp.proto.common._internal import ( _get_resource_data, - _is_retryable, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( OTLPMetricExporterMixin, @@ -47,6 +46,9 @@ _OTLP_HTTP_HEADERS, Compression, ) +from opentelemetry.exporter.otlp.proto.http._common import ( + _is_retryable, +) from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2 import ( # noqa: F401 ExportMetricsServiceRequest, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 536a5053ad5..1f14c3163c6 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -24,9 +24,6 @@ import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _is_retryable, -) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, ) @@ -34,6 +31,9 @@ _OTLP_HTTP_HEADERS, Compression, ) +from opentelemetry.exporter.otlp.proto.http._common import ( + _is_retryable, +) from opentelemetry.sdk.environment_variables import ( OTEL_EXPORTER_OTLP_CERTIFICATE, OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE, diff --git a/uv.lock b/uv.lock index 827b727de29..26c44026f7a 100644 --- a/uv.lock +++ b/uv.lock @@ -3,8 +3,8 @@ revision = 1 requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.13'", - "python_full_version >= '3.9' and python_full_version < '3.13'", - "python_full_version < '3.9'", + "python_full_version < '3.13'", + "python_version < '0'", ] [manifest] @@ -104,19 +104,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, - { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653 }, - { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701 }, - { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495 }, - { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946 }, - { url = "https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737 }, - { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471 }, - { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801 }, - { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312 }, - { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347 }, - { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888 }, - { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", size = 145169 }, - { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094 }, - { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139 }, { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, @@ -187,15 +174,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/6a/5df64b6df405a1ed1482cb6c10044b06ec47fd28e87c2232dbcf435ecb33/grpcio-1.70.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0a5c78d5198a1f0aa60006cd6eb1c912b4a1520b6a3968e677dbcba215fabb40", size = 6190982 }, { url = "https://files.pythonhosted.org/packages/42/aa/aeaac87737e6d25d1048c53b8ec408c056d3ed0c922e7c5efad65384250c/grpcio-1.70.0-cp313-cp313-win32.whl", hash = "sha256:fe9dbd916df3b60e865258a8c72ac98f3ac9e2a9542dcb72b7a34d236242a5ce", size = 3598359 }, { url = "https://files.pythonhosted.org/packages/1f/79/8edd2442d2de1431b4a3de84ef91c37002f12de0f9b577fb07b452989dbc/grpcio-1.70.0-cp313-cp313-win_amd64.whl", hash = "sha256:4119fed8abb7ff6c32e3d2255301e59c316c22d31ab812b3fbcbaf3d0d87cc68", size = 4293938 }, - { url = "https://files.pythonhosted.org/packages/38/5f/d7fe323c18a2ec98a2a9b38fb985f5e843f76990298d7c4ce095f44b46a7/grpcio-1.70.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:8058667a755f97407fca257c844018b80004ae8035565ebc2812cc550110718d", size = 5232027 }, - { url = "https://files.pythonhosted.org/packages/d4/4b/3d3b5548575b635f51883212a482cd237e8525535d4591b9dc7e5b2c2ddc/grpcio-1.70.0-cp38-cp38-macosx_10_14_universal2.whl", hash = "sha256:879a61bf52ff8ccacbedf534665bb5478ec8e86ad483e76fe4f729aaef867cab", size = 11448811 }, - { url = "https://files.pythonhosted.org/packages/8a/d7/9a0922fc12d339271c7e4e6691470172b7c13715fed7bd934274803f1527/grpcio-1.70.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:0ba0a173f4feacf90ee618fbc1a27956bfd21260cd31ced9bc707ef551ff7dc7", size = 5711890 }, - { url = "https://files.pythonhosted.org/packages/1e/ae/d4dbf8bff0f1d270f118d08558bc8dc0489e026d6620a4e3ee2d79d79041/grpcio-1.70.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:558c386ecb0148f4f99b1a65160f9d4b790ed3163e8610d11db47838d452512d", size = 6331933 }, - { url = "https://files.pythonhosted.org/packages/2c/64/66a74c02b00e00b919c245ca9da8e5c44e8692bf3fe7f27efbc97572566c/grpcio-1.70.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:412faabcc787bbc826f51be261ae5fa996b21263de5368a55dc2cf824dc5090e", size = 5950685 }, - { url = "https://files.pythonhosted.org/packages/b0/64/e992ac693118c37164e085676216d258804d7a5bbf3581d3f989c843a9a5/grpcio-1.70.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3b0f01f6ed9994d7a0b27eeddea43ceac1b7e6f3f9d86aeec0f0064b8cf50fdb", size = 6640974 }, - { url = "https://files.pythonhosted.org/packages/57/17/34d0a6af4477fd48b8b41d13782fb1e35b8841b17d6ac7a3eb24d2f3b17e/grpcio-1.70.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7385b1cb064734005204bc8994eed7dcb801ed6c2eda283f613ad8c6c75cf873", size = 6204792 }, - { url = "https://files.pythonhosted.org/packages/d3/e5/e45d8eb81929c0becd5bda413b60262f79d862e19cff632d496909aa3bd0/grpcio-1.70.0-cp38-cp38-win32.whl", hash = "sha256:07269ff4940f6fb6710951116a04cd70284da86d0a4368fd5a3b552744511f5a", size = 3620015 }, - { url = "https://files.pythonhosted.org/packages/87/7d/36009c38093e62969c708f20b86ab6761c2ba974b12ff10def6f397f24fa/grpcio-1.70.0-cp38-cp38-win_amd64.whl", hash = "sha256:aba19419aef9b254e15011b230a180e26e0f6864c90406fdbc255f01d83bc83c", size = 4307043 }, { url = "https://files.pythonhosted.org/packages/9d/0e/64061c9746a2dd6e07cb0a0f3829f0a431344add77ec36397cc452541ff6/grpcio-1.70.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:4f1937f47c77392ccd555728f564a49128b6a197a05a5cd527b796d36f3387d0", size = 5231123 }, { url = "https://files.pythonhosted.org/packages/72/9f/c93501d5f361aecee0146ab19300d5acb1c2747b00217c641f06fffbcd62/grpcio-1.70.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:0cd430b9215a15c10b0e7d78f51e8a39d6cf2ea819fd635a7214fae600b1da27", size = 11467217 }, { url = "https://files.pythonhosted.org/packages/0a/1a/980d115b701023450a304881bf3f6309f6fb15787f9b78d2728074f3bf86/grpcio-1.70.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:e27585831aa6b57b9250abaf147003e126cd3a6c6ca0c531a01996f31709bed1", size = 5710913 }, @@ -216,31 +194,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] -[[package]] -name = "importlib-metadata" -version = "8.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "zipp", version = "3.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, -] - [[package]] name = "importlib-metadata" version = "8.6.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version >= '3.9' and python_full_version < '3.13'", -] dependencies = [ - { name = "zipp", version = "3.21.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "zipp" }, ] sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767 } wheels = [ @@ -251,15 +210,14 @@ wheels = [ name = "opentelemetry-api" source = { editable = "opentelemetry-api" } dependencies = [ - { name = "importlib-metadata", version = "8.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "importlib-metadata" }, { name = "typing-extensions" }, ] [package.metadata] requires-dist = [ - { name = "typing-extensions", specifier = ">=4.5.0" }, { name = "importlib-metadata", specifier = ">=6.0,<8.8.0" }, + { name = "typing-extensions", specifier = ">=4.5.0" }, ] [[package]] @@ -281,14 +239,10 @@ name = "opentelemetry-exporter-otlp-proto-common" source = { editable = "exporter/opentelemetry-exporter-otlp-proto-common" } dependencies = [ { name = "opentelemetry-proto" }, - { name = "requests" }, ] [package.metadata] -requires-dist = [ - { name = "opentelemetry-proto", editable = "opentelemetry-proto" }, - { name = "requests", specifier = "~=2.7" }, -] +requires-dist = [{ name = "opentelemetry-proto", editable = "opentelemetry-proto" }] [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" @@ -506,8 +460,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dd/04/3eaedc2ba17a088961d0e3bd396eac764450f431621b58a04ce898acd126/protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e", size = 417825 }, { url = "https://files.pythonhosted.org/packages/4f/06/7c467744d23c3979ce250397e26d8ad8eeb2bea7b18ca12ad58313c1b8d5/protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84", size = 319573 }, { url = "https://files.pythonhosted.org/packages/a8/45/2ebbde52ad2be18d3675b6bee50e68cd73c9e0654de77d595540b5129df8/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f", size = 319672 }, - { url = "https://files.pythonhosted.org/packages/ce/06/18efd22aaefbc444a96a68390fd66aacd40d6791637e86dd6fea3164975d/protobuf-5.29.3-cp38-cp38-win32.whl", hash = "sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252", size = 422593 }, - { url = "https://files.pythonhosted.org/packages/c6/36/37425a115a95e35a1d8dff686ac2488718a40f07d498edfd89eb40ee3c5d/protobuf-5.29.3-cp38-cp38-win_amd64.whl", hash = "sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107", size = 434517 }, { url = "https://files.pythonhosted.org/packages/85/a6/bf65a38f8be5ab8c3b575822acfd338702fdf7ac9abd8c81630cc7c9f4bd/protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7", size = 422676 }, { url = "https://files.pythonhosted.org/packages/ac/e2/48d46adc86369ff092eaece3e537f76b3baaab45ca3dde257838cde831d2/protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = "sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da", size = 434593 }, { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, @@ -521,8 +473,7 @@ dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } wheels = [ @@ -538,51 +489,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, ] -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, -] - [[package]] name = "urllib3" version = "2.3.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version >= '3.9' and python_full_version < '3.13'", -] sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } wheels = [ { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, ] -[[package]] -name = "zipp" -version = "3.20.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, -] - [[package]] name = "zipp" version = "3.21.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version >= '3.9' and python_full_version < '3.13'", -] sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, From 4bbecf81c34770291d0e2f5377a2db888b7035f5 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 5 Jun 2025 19:41:28 +0000 Subject: [PATCH 21/38] precommit --- .../opentelemetry/exporter/otlp/proto/http/_common/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py index d748c78eb56..b1ed46d28b7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_common/__init__.py @@ -14,9 +14,10 @@ import requests + def _is_retryable(resp: requests.Response) -> bool: if resp.status_code == 408: return True if resp.status_code >= 500 and resp.status_code <= 599: return True - return False \ No newline at end of file + return False From 3076c0fe2e42376e78bdd4754e83affd82234ec4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 9 Jun 2025 13:25:19 +0000 Subject: [PATCH 22/38] Make many changes --- .../exporter/otlp/proto/grpc/exporter.py | 86 +++++++++---------- .../tests/test_otlp_exporter_mixin.py | 56 +++++------- .../otlp/proto/http/_log_exporter/__init__.py | 12 ++- .../proto/http/metric_exporter/__init__.py | 10 ++- .../proto/http/trace_exporter/__init__.py | 12 ++- .../sdk/_shared_internal/__init__.py | 35 ++++++-- .../shared_internal/test_batch_processor.py | 64 ++++++++++++++ 7 files changed, 184 insertions(+), 91 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 3001764dacd..80f9f94c486 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -273,7 +273,7 @@ def __init__( ) self._client = self._stub(self._channel) - self._export_lock = threading.Lock() + self._shutdown_is_occuring = threading.Event() self._shutdown = False @abstractmethod @@ -294,60 +294,60 @@ def _export( # gets updated to a class that represents the proto # TracesData and use the code below instead. retry_info = RetryInfo() - with self._export_lock: - deadline_sec = time() + self._timeout - for retry_num in range(1, _MAX_RETRYS + 1): - backoff_seconds = 2 ** (retry_num - 1) * random.uniform( - 0.8, 1.2 + deadline_sec = time() + self._timeout + for retry_num in range(1, _MAX_RETRYS + 1): + backoff_seconds = 2 ** (retry_num - 1) * random.uniform( + 0.8, 1.2 + ) + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=self._timeout, ) - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, - ) - return self._result.SUCCESS - except RpcError as error: - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" + return self._result.SUCCESS + except RpcError as error: + retry_info_bin = dict(error.trailing_metadata()).get( + "google.rpc.retryinfo-bin" + ) + if retry_info_bin is not None: + retry_info.ParseFromString(retry_info_bin) + backoff_seconds = ( + retry_info.retry_delay.seconds + + retry_info.retry_delay.nanos / 1.0e9 ) - if retry_info_bin is not None: - retry_info.ParseFromString(retry_info_bin) - backoff_seconds = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - if ( - error.code() not in _RETRYABLE_ERROR_CODES - or retry_num == _MAX_RETRYS - or backoff_seconds > (deadline_sec - time()) - ): - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - return self._result.FAILURE - logger.warning( - "Transient error %s encountered while exporting logs batch, retrying in %.2fs.", + if ( + error.code() not in _RETRYABLE_ERROR_CODES + or retry_num == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + or self._shutdown + ): + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, error.code(), - backoff_seconds, + exc_info=error.code() == StatusCode.UNKNOWN, ) - sleep(backoff_seconds) - # Not possible to reach here but the linter is complaining. + return self._result.FAILURE + logger.warning( + "Transient error %s encountered while exporting logs batch, retrying in %.2fs.", + error.code(), + backoff_seconds, + ) + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + logger.warning("Shutdown in progress, aborting retry.") + break return self._result.FAILURE def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") return - # wait for the last export if any - self._export_lock.acquire(timeout=timeout_millis / 1e3) + self._shutdown_is_occuring.set() self._shutdown = True self._channel.close() - self._export_lock.release() @property @abstractmethod diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index fbca0f495b2..8cf5e83a319 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -311,10 +311,11 @@ def test_shutdown(self): "Exporter already shutdown, ignoring batch", ) - def test_shutdown_wait_last_export(self): + def test_shutdown_interrupts_export_sleep(self): + # Returns unavailable and asks for a 20 second sleep before retry. add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - StatusCode.OK, optional_export_sleep=1 + StatusCode.UNAVAILABLE ), self.server, ) @@ -322,37 +323,26 @@ def test_shutdown_wait_last_export(self): export_thread = ThreadWithReturnValue( target=self.exporter.export, args=([self.span],) ) - export_thread.start() - # Wait a bit for exporter to get lock and make export call. - time.sleep(0.25) - # pylint: disable=protected-access - self.assertTrue(self.exporter._export_lock.locked()) - self.exporter.shutdown(timeout_millis=3000) - # pylint: disable=protected-access - self.assertTrue(self.exporter._shutdown) - self.assertEqual(export_thread.join(), SpanExportResult.SUCCESS) - - def test_shutdown_doesnot_wait_last_export(self): - add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - StatusCode.OK, optional_export_sleep=3 - ), - self.server, - ) - - export_thread = ThreadWithReturnValue( - target=self.exporter.export, args=([self.span],) - ) - export_thread.start() - # Wait for exporter to get lock and make export call. - time.sleep(0.25) - # pylint: disable=protected-access - self.assertTrue(self.exporter._export_lock.locked()) - # Set to 1 seconds, so the 3 second server-side delay will not be reached. - self.exporter.shutdown(timeout_millis=1000) - # pylint: disable=protected-access - self.assertTrue(self.exporter._shutdown) - self.assertEqual(export_thread.join(), None) + with self.assertLogs(level=WARNING) as warning: + export_thread.start() + # Wait a bit for export to fail and the backoff sleep to start + time.sleep(.1) + # The code should now be in a sleep that's between .8 and 1.2 seconds. + begin_wait = time.time_ns() + # pylint: disable=protected-access + self.assertFalse(self.exporter._shutdown_is_occuring.is_set()) + self.exporter.shutdown() + self.assertTrue(self.exporter._shutdown_is_occuring.is_set()) + export_result = export_thread.join() + end_wait = time.time_ns() + self.assertEqual(export_result, SpanExportResult.FAILURE) + # Shutdown should have interrupted the sleep. + self.assertTrue((end_wait - begin_wait) / 1e9 < .1) + print(warning.records) + self.assertEqual( + warning.records[1].message, + "Shutdown in progress, aborting retry." + ) def test_export_over_closed_grpc_channel(self): # pylint: disable=protected-access diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 057072c7842..f836fdc4755 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -14,6 +14,7 @@ import gzip import logging +import threading import random import zlib from io import BytesIO @@ -77,6 +78,7 @@ def __init__( compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): + self._shutdown_is_occuring = threading.Event() self._endpoint = endpoint or environ.get( OTEL_EXPORTER_OTLP_LOGS_ENDPOINT, _append_logs_path( @@ -164,14 +166,15 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: serialized_data = encode_logs(batch).SerializeToString() deadline_sec = time() + self._timeout for retry_num in range(1, _MAX_RETRYS + 1): - backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return LogExportResult.SUCCESS + backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) if ( not _is_retryable(resp) or retry_num == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) + or self._shutdown ): _logger.error( "Failed to export logs batch code: %s, reason: %s", @@ -184,8 +187,10 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: resp.reason, backoff_seconds, ) - sleep(backoff_seconds) - # Not possible to reach here but the linter is complaining. + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + _logger.warning("Shutdown in progress, aborting retry.") + break return LogExportResult.FAILURE def force_flush(self, timeout_millis: float = 10_000) -> bool: @@ -196,6 +201,7 @@ def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown_is_occuring.set() self._session.close() self._shutdown = True diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 38a3451af93..67ef08a5ad6 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -14,6 +14,7 @@ import gzip import logging +import threading import random import zlib from io import BytesIO @@ -120,6 +121,7 @@ def __init__( | None = None, preferred_aggregation: dict[type, Aggregation] | None = None, ): + self._shutdown_is_occuring = threading.Event() self._endpoint = endpoint or environ.get( OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, _append_metrics_path( @@ -224,6 +226,7 @@ def export( not _is_retryable(resp) or retry_num == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) + or self._shutdown ): _logger.error( "Failed to export metrics batch code: %s, reason: %s", @@ -236,14 +239,17 @@ def export( resp.reason, backoff_seconds, ) - sleep(backoff_seconds) - # Not possible to reach here but the linter is complaining. + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + _logger.warning("Shutdown in progress, aborting retry.") + break return MetricExportResult.FAILURE def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown_is_occuring.set() self._session.close() self._shutdown = True diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 1f14c3163c6..4a99f3a5389 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -16,6 +16,7 @@ import logging import random import zlib +import threading from io import BytesIO from os import environ from time import sleep, time @@ -76,6 +77,7 @@ def __init__( compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): + self._shutdown_is_occuring = threading.Event() self._endpoint = endpoint or environ.get( OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, _append_trace_path( @@ -162,14 +164,15 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: serialized_data = encode_spans(spans).SerializePartialToString() deadline_sec = time() + self._timeout for retry_num in range(1, _MAX_RETRYS + 1): - backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) resp = self._export(serialized_data, deadline_sec - time()) if resp.ok: return SpanExportResult.SUCCESS + backoff_seconds = 2 ** (retry_num - 1) * random.uniform(0.8, 1.2) if ( not _is_retryable(resp) or retry_num == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) + or self._shutdown ): _logger.error( "Failed to export span batch code: %s, reason: %s", @@ -182,14 +185,17 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: resp.reason, backoff_seconds, ) - sleep(backoff_seconds) - # Not possible to reach here but the linter is complaining. + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + _logger.warning("Shutdown in progress, aborting retry.") + break return SpanExportResult.FAILURE def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown_is_occuring.set() self._session.close() self._shutdown = True diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 97a00980cba..1771b3e0fe7 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -16,9 +16,11 @@ import collections import enum +import inspect import logging import os import threading +import time import weakref from abc import abstractmethod from typing import ( @@ -90,6 +92,7 @@ def __init__( self._exporting = exporting self._shutdown = False + self._shutdown_timeout_exceeded = False self._export_lock = threading.Lock() self._worker_awaken = threading.Event() self._worker_thread.start() @@ -101,7 +104,7 @@ def __init__( def _should_export_batch( self, batch_strategy: BatchExportStrategy, num_iterations: int ) -> bool: - if not self._queue: + if not self._queue or self._shutdown_timeout_exceeded: return False # Always continue to export while queue length exceeds max batch size. if len(self._queue) >= self._max_export_batch_size: @@ -180,16 +183,34 @@ def emit(self, data: Telemetry) -> None: if len(self._queue) >= self._max_export_batch_size: self._worker_awaken.set() - def shutdown(self): + def shutdown(self, timeout_millis: int = 30000): if self._shutdown: return - # Prevents emit and force_flush from further calling export. + # Causes emit to reject telemetry and makes force_flush a no-op. self._shutdown = True - # Interrupts sleep in the worker, if it's sleeping. + # Interrupts sleep in the worker if it's sleeping. self._worker_awaken.set() - # Main worker loop should exit after one final export call with flush all strategy. + # Wait a tiny bit for the worker thread to wake and call export for a final time. + time.sleep(0.1) + num_sleeps = 10 + for _ in range(num_sleeps): + # If export is not being called, we can shutdown. + if not self._export_lock.locked(): + break + time.sleep(timeout_millis / 1000 / num_sleeps) + # Stops worker thread from calling export again if queue is still not empty. + self._shutdown_timeout_exceeded = True + # We want to shutdown immediately because we already waited `timeout_millis`. + # Some exporter's shutdown support a timeout param. + if ( + "timeout_millis" + in inspect.getfullargspec(self._exporter.shutdown).args + ): + self._exporter.shutdown(timeout_millis=0) # type: ignore + else: + self._exporter.shutdown() + # Worker thread should be finished at this point and return instantly. self._worker_thread.join() - self._exporter.shutdown() # TODO: Fix force flush so the timeout is used https://github.com/open-telemetry/opentelemetry-python/issues/4568. def force_flush(self, timeout_millis: Optional[int] = None) -> bool: @@ -197,4 +218,4 @@ def force_flush(self, timeout_millis: Optional[int] = None) -> bool: return False # Blocking call to export. self._export(BatchExportStrategy.EXPORT_ALL) - return True + return True \ No newline at end of file diff --git a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py index 4888d81779d..a7cbaf33657 100644 --- a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py +++ b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py @@ -18,8 +18,10 @@ import os import time import unittest +import threading import weakref from platform import system +from typing import Any from unittest.mock import Mock import pytest @@ -49,6 +51,29 @@ multiprocessing.set_start_method("fork") +class MockExporterForTesting(): + def __init__(self, export_sleep: int): + self.num_export_calls = 0 + self.export_sleep = export_sleep + self._shutdown = False + self.export_sleep_event = threading.Event() + + def export(self, _: list[Any]): + self.num_export_calls += 1 + if self._shutdown: + raise Exception("Cannot export, already shutdown") + + sleep_interrupted = self.export_sleep_event.wait(self.export_sleep) + if sleep_interrupted: + raise Exception("Did not get to finish !") + return + + def shutdown(self): + # Force export to finish sleeping. + self._shutdown = True + self.export_sleep_event.set() + + # BatchLogRecodProcessor/BatchSpanProcessor initialize and use BatchProcessor. # Important: make sure to call .shutdown() before the end of the test, # otherwise the worker thread will continue to run after the end of the test. @@ -193,3 +218,42 @@ def test_record_processor_is_garbage_collected( # Then the reference to the processor should no longer exist assert weak_ref() is None + + def test_shutdown_cancels_longrunning_export(self, batch_processor_class, telemetry, caplog): + # This exporter throws an exception if it's export sleep cannot finish. + exporter = MockExporterForTesting(export_sleep=3) + processor = batch_processor_class( + exporter, + max_queue_size=200, + max_export_batch_size=10, + schedule_delay_millis=30000, + ) + processor._batch_processor.emit(telemetry) + before = time.time() + # Shutdown should cancel export after 2 seconds + processor._batch_processor.shutdown(timeout_millis=2000) + after = time.time() + assert after - before < 2.2 + assert "Exception while exporting" in caplog.text + + + def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemetry, caplog): + # This exporter throws an exception if it's export sleep cannot finish. + exporter = MockExporterForTesting(export_sleep=2) + processor = batch_processor_class( + exporter, + max_queue_size=200, + max_export_batch_size=1, + schedule_delay_millis=30000, + ) + # Max export batch size is 1, so 3 emit calls requires 3 separate calls to Export to clear the queue. + processor._batch_processor.emit(telemetry) + processor._batch_processor.emit(telemetry) + processor._batch_processor.emit(telemetry) + before = time.time() + processor._batch_processor.shutdown(timeout_millis=3000) + after = time.time() + assert after - before < 3.2 + # Expect the second call to be interrupted by shutdown, and the third call to never be made. + assert "Exception while exporting" in caplog.text + assert 2 == exporter.num_export_calls \ No newline at end of file From f2583e017def9c4156783cac3801a1e8ed718881 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 10 Jun 2025 20:44:17 +0000 Subject: [PATCH 23/38] Reorder shutdown stuff --- .../src/opentelemetry/exporter/otlp/proto/grpc/exporter.py | 2 +- .../exporter/otlp/proto/http/_log_exporter/__init__.py | 3 +-- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 2 +- .../exporter/otlp/proto/http/trace_exporter/__init__.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 80f9f94c486..d1fe17447d8 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -345,8 +345,8 @@ def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") return - self._shutdown_is_occuring.set() self._shutdown = True + self._shutdown_is_occuring.set() self._channel.close() @property diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index f836fdc4755..d146ea4c6e2 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -201,10 +201,9 @@ def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown = True self._shutdown_is_occuring.set() self._session.close() - self._shutdown = True - def _compression_from_env() -> Compression: compression = ( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 67ef08a5ad6..643aee85e33 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -249,9 +249,9 @@ def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown = True self._shutdown_is_occuring.set() self._session.close() - self._shutdown = True @property def _exporting(self) -> str: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 4a99f3a5389..10db27ab842 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -195,9 +195,9 @@ def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") return + self._shutdown = True self._shutdown_is_occuring.set() self._session.close() - self._shutdown = True def force_flush(self, timeout_millis: int = 30000) -> bool: """Nothing is buffered in this exporter, so this method does nothing.""" From 5fa8c2307382b7484c4e676b9050fde56aa5e7af Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 13 Jun 2025 19:48:04 +0000 Subject: [PATCH 24/38] Fix merging --- .../src/opentelemetry/sdk/_shared_internal/__init__.py | 8 +++++--- .../tests/shared_internal/test_batch_processor.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 1771b3e0fe7..4f7ac2b4ae1 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -182,7 +182,7 @@ def emit(self, data: Telemetry) -> None: self._queue.appendleft(data) if len(self._queue) >= self._max_export_batch_size: self._worker_awaken.set() - + def shutdown(self, timeout_millis: int = 30000): if self._shutdown: return @@ -209,8 +209,10 @@ def shutdown(self, timeout_millis: int = 30000): self._exporter.shutdown(timeout_millis=0) # type: ignore else: self._exporter.shutdown() - # Worker thread should be finished at this point and return instantly. - self._worker_thread.join() + # Worker thread **should** be finished at this point, because we called shutdown on the exporter, + # and set shutdown_is_occuring to prevent further export calls. It's possible that a single export + # call is ongoing and the thread isn't finished. In this case we will return instead of waiting on + # the thread to finish. # TODO: Fix force flush so the timeout is used https://github.com/open-telemetry/opentelemetry-python/issues/4568. def force_flush(self, timeout_millis: Optional[int] = None) -> bool: diff --git a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py index a7cbaf33657..7cd3bcc4858 100644 --- a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py +++ b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py @@ -239,6 +239,7 @@ def test_shutdown_cancels_longrunning_export(self, batch_processor_class, teleme def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemetry, caplog): # This exporter throws an exception if it's export sleep cannot finish. + dir(caplog) exporter = MockExporterForTesting(export_sleep=2) processor = batch_processor_class( exporter, @@ -246,7 +247,7 @@ def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemet max_export_batch_size=1, schedule_delay_millis=30000, ) - # Max export batch size is 1, so 3 emit calls requires 3 separate calls to Export to clear the queue. + # Max export batch size is 1, so 3 emit calls requires 3 separate calls (each block for 2 seconds) to Export to clear the queue. processor._batch_processor.emit(telemetry) processor._batch_processor.emit(telemetry) processor._batch_processor.emit(telemetry) From 62d369924148fa37446b738d7217a41692707514 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 16 Jun 2025 16:07:36 +0000 Subject: [PATCH 25/38] Don't join the thread in case we are stuck in an individual export call --- .../shared_internal/test_batch_processor.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py index 7cd3bcc4858..96ff1ec79d5 100644 --- a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py +++ b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py @@ -14,6 +14,7 @@ # pylint: disable=protected-access import gc +import logging import multiprocessing import os import time @@ -219,27 +220,9 @@ def test_record_processor_is_garbage_collected( # Then the reference to the processor should no longer exist assert weak_ref() is None - def test_shutdown_cancels_longrunning_export(self, batch_processor_class, telemetry, caplog): - # This exporter throws an exception if it's export sleep cannot finish. - exporter = MockExporterForTesting(export_sleep=3) - processor = batch_processor_class( - exporter, - max_queue_size=200, - max_export_batch_size=10, - schedule_delay_millis=30000, - ) - processor._batch_processor.emit(telemetry) - before = time.time() - # Shutdown should cancel export after 2 seconds - processor._batch_processor.shutdown(timeout_millis=2000) - after = time.time() - assert after - before < 2.2 - assert "Exception while exporting" in caplog.text - def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemetry, caplog): # This exporter throws an exception if it's export sleep cannot finish. - dir(caplog) exporter = MockExporterForTesting(export_sleep=2) processor = batch_processor_class( exporter, @@ -253,8 +236,14 @@ def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemet processor._batch_processor.emit(telemetry) before = time.time() processor._batch_processor.shutdown(timeout_millis=3000) + # Shutdown does not kill the thread. + assert processor._batch_processor._worker_thread.is_alive() is True + after = time.time() assert after - before < 3.2 + # Thread will naturally finish after a little bit. + time.sleep(.1) + assert processor._batch_processor._worker_thread.is_alive() is False # Expect the second call to be interrupted by shutdown, and the third call to never be made. assert "Exception while exporting" in caplog.text assert 2 == exporter.num_export_calls \ No newline at end of file From eb59db3505f4b9125202b4b862b488beb623d9e0 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 16 Jun 2025 16:57:33 +0000 Subject: [PATCH 26/38] Add tests, changelog entry --- CHANGELOG.md | 9 +- .../exporter/otlp/proto/grpc/exporter.py | 89 +++++++++---------- .../tests/test_otlp_exporter_mixin.py | 18 ++-- .../otlp/proto/http/_log_exporter/__init__.py | 6 +- .../proto/http/metric_exporter/__init__.py | 3 +- .../proto/http/trace_exporter/__init__.py | 7 +- .../metrics/test_otlp_metrics_exporter.py | 32 +++++++ .../tests/test_proto_log_exporter.py | 32 +++++++ .../tests/test_proto_span_exporter.py | 30 +++++++ .../sdk/_shared_internal/__init__.py | 4 +- .../shared_internal/test_batch_processor.py | 18 ++-- 11 files changed, 170 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d902c4f63a..a857ee2c7b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased -- Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs. - A +/-20% jitter was added to all backoffs. A pointless 32 second sleep that occurred after all retries - had completed/failed was removed. - ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). +- Update OTLP gRPC/HTTP exporters: calling shutdown will now interrupt exporters that are sleeping + before a retry attempt, and cause them to return failure immediately. + Update BatchSpan/LogRecodProcessors: shutdown will now complete after 30 seconds of trying to finish + exporting any buffered telemetry, instead of continuing to export until all telemetry was exported. + ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). - Update OTLP gRPC/HTTP exporters: the export timeout is now inclusive of all retries and backoffs. A +/-20% jitter was added to all backoffs. A pointless 32 second sleep that occurred after all retries had completed/failed was removed. diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 3d2235b1c00..70269083c84 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -20,7 +20,7 @@ from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ -from time import sleep, time +from time import time from typing import ( # noqa: F401 Any, Callable, @@ -293,54 +293,53 @@ def _export( # FIXME remove this check if the export type for traces # gets updated to a class that represents the proto # TracesData and use the code below instead. - with self._export_lock: - deadline_sec = time() + self._timeout - for retry_num in range(_MAX_RETRYS): - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=deadline_sec - time(), + deadline_sec = time() + self._timeout + for retry_num in range(_MAX_RETRYS): + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=deadline_sec - time(), + ) + return self._result.SUCCESS + except RpcError as error: + retry_info_bin = dict(error.trailing_metadata()).get( + "google.rpc.retryinfo-bin" + ) + # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. + backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) + if retry_info_bin is not None: + retry_info = RetryInfo() + retry_info.ParseFromString(retry_info_bin) + backoff_seconds = ( + retry_info.retry_delay.seconds + + retry_info.retry_delay.nanos / 1.0e9 ) - return self._result.SUCCESS - except RpcError as error: - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" - ) - # multiplying by a random number between .8 and 1.2 introduces a +/20% jitter to each backoff. - backoff_seconds = 2**retry_num * random.uniform(0.8, 1.2) - if retry_info_bin is not None: - retry_info = RetryInfo() - retry_info.ParseFromString(retry_info_bin) - backoff_seconds = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - if ( - error.code() not in _RETRYABLE_ERROR_CODES - or retry_num + 1 == _MAX_RETRYS - or backoff_seconds > (deadline_sec - time()) - or self._shutdown - ): - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - return self._result.FAILURE - logger.warning( - "Transient error %s encountered while exporting %s to %s, retrying in %.2fs.", - error.code(), + if ( + error.code() not in _RETRYABLE_ERROR_CODES + or retry_num + 1 == _MAX_RETRYS + or backoff_seconds > (deadline_sec - time()) + or self._shutdown + ): + logger.error( + "Failed to export %s to %s, error code: %s", self._exporting, self._endpoint, - backoff_seconds, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, ) - shutdown = self._shutdown_is_occuring.wait(backoff_seconds) - if shutdown: - logger.warning("Shutdown in progress, aborting retry.") - break + return self._result.FAILURE + logger.warning( + "Transient error %s encountered while exporting %s to %s, retrying in %.2fs.", + error.code(), + self._exporting, + self._endpoint, + backoff_seconds, + ) + shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + if shutdown: + logger.warning("Shutdown in progress, aborting retry.") + break # Not possible to reach here but the linter is complaining. return self._result.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index c93dfa0caca..d795233702d 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -312,11 +312,10 @@ def test_shutdown(self): "Exporter already shutdown, ignoring batch", ) - def test_shutdown_interrupts_export_sleep(self): - # Returns unavailable and asks for a 20 second sleep before retry. + def test_shutdown_interrupts_export_retry_backoff(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( - StatusCode.UNAVAILABLE + StatusCode.UNAVAILABLE, ), self.server, ) @@ -325,24 +324,23 @@ def test_shutdown_interrupts_export_sleep(self): target=self.exporter.export, args=([self.span],) ) with self.assertLogs(level=WARNING) as warning: + begin_wait = time.time() export_thread.start() # Wait a bit for export to fail and the backoff sleep to start - time.sleep(.1) - # The code should now be in a sleep that's between .8 and 1.2 seconds. - begin_wait = time.time_ns() + time.sleep(0.05) + # The code should now be in a 1 second backoff. # pylint: disable=protected-access self.assertFalse(self.exporter._shutdown_is_occuring.is_set()) self.exporter.shutdown() self.assertTrue(self.exporter._shutdown_is_occuring.is_set()) export_result = export_thread.join() - end_wait = time.time_ns() + end_wait = time.time() self.assertEqual(export_result, SpanExportResult.FAILURE) # Shutdown should have interrupted the sleep. - self.assertTrue((end_wait - begin_wait) / 1e9 < .1) - print(warning.records) + self.assertTrue(end_wait - begin_wait < 0.2) self.assertEqual( warning.records[1].message, - "Shutdown in progress, aborting retry." + "Shutdown in progress, aborting retry.", ) def test_export_over_closed_grpc_channel(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index f46b65548b1..4c8b4de6004 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -15,10 +15,11 @@ import gzip import logging import random +import threading import zlib from io import BytesIO from os import environ -from time import sleep, time +from time import time from typing import Dict, Optional, Sequence import requests @@ -174,7 +175,7 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: not _is_retryable(resp) or retry_num + 1 == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()) - or self._shutdown + or self._shutdown ): _logger.error( "Failed to export logs batch code: %s, reason: %s", @@ -205,6 +206,7 @@ def shutdown(self): self._shutdown_is_occuring.set() self._session.close() + def _compression_from_env() -> Compression: compression = ( environ.get( diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index c17649e17fc..c45202f240d 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -15,10 +15,11 @@ import gzip import logging import random +import threading import zlib from io import BytesIO from os import environ -from time import sleep, time +from time import time from typing import ( # noqa: F401 Any, Callable, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 55e7bc0bf05..599d95b3adc 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -15,11 +15,11 @@ import gzip import logging import random -import zlib import threading +import zlib from io import BytesIO from os import environ -from time import sleep, time +from time import time from typing import Dict, Optional, Sequence import requests @@ -35,9 +35,6 @@ from opentelemetry.exporter.otlp.proto.http._common import ( _is_retryable, ) -from opentelemetry.exporter.otlp.proto.http._common import ( - _is_retryable, -) from opentelemetry.sdk.environment_variables import ( OTEL_EXPORTER_OTLP_CERTIFICATE, OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 1b5e9cc5f92..815761397ea 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import threading import time from logging import WARNING from os import environ @@ -541,3 +542,34 @@ def export_side_effect(*args, **kwargs): mock_post.side_effect = export_side_effect exporter = OTLPMetricExporter(timeout=0.4) exporter.export(self.metrics["sum_int"]) + + @patch.object(Session, "post") + def test_shutdown_interrupts_retry_backoff(self, mock_post): + exporter = OTLPMetricExporter(timeout=1.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + thread = threading.Thread( + target=exporter.export, args=(self.metrics["sum_int"],) + ) + with self.assertLogs(level=WARNING) as warning: + before = time.time() + thread.start() + # Wait for the first attempt to fail, then enter a 1 second backoff. + time.sleep(0.05) + # Should cause export to wake up and return. + exporter.shutdown() + thread.join() + after = time.time() + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting metrics batch, retrying in", + warning.records[0].message, + ) + self.assertIn( + "Shutdown in progress, aborting retry.", + warning.records[1].message, + ) + + assert after - before < 0.2 diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py index 04c37743860..b3ee63854d4 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py @@ -14,6 +14,7 @@ # pylint: disable=protected-access +import threading import time import unittest from logging import WARNING @@ -388,3 +389,34 @@ def export_side_effect(*args, **kwargs): mock_post.side_effect = export_side_effect exporter = OTLPLogExporter(timeout=0.4) exporter.export(self._get_sdk_log_data()) + + @patch.object(Session, "post") + def test_shutdown_interrupts_retry_backoff(self, mock_post): + exporter = OTLPLogExporter(timeout=1.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + thread = threading.Thread( + target=exporter.export, args=(self._get_sdk_log_data(),) + ) + with self.assertLogs(level=WARNING) as warning: + before = time.time() + thread.start() + # Wait for the first attempt to fail, then enter a 1 second backoff. + time.sleep(0.05) + # Should cause export to wake up and return. + exporter.shutdown() + thread.join() + after = time.time() + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in", + warning.records[0].message, + ) + self.assertIn( + "Shutdown in progress, aborting retry.", + warning.records[1].message, + ) + + assert after - before < 0.2 diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py index 16d40e3f3fd..224227a7f59 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import threading import time import unittest from logging import WARNING @@ -288,3 +289,32 @@ def export_side_effect(*args, **kwargs): mock_post.side_effect = export_side_effect exporter = OTLPSpanExporter(timeout=0.4) exporter.export([BASIC_SPAN]) + + @patch.object(Session, "post") + def test_shutdown_interrupts_retry_backoff(self, mock_post): + exporter = OTLPSpanExporter(timeout=1.5) + + resp = Response() + resp.status_code = 503 + resp.reason = "UNAVAILABLE" + mock_post.return_value = resp + thread = threading.Thread(target=exporter.export, args=([BASIC_SPAN],)) + with self.assertLogs(level=WARNING) as warning: + before = time.time() + thread.start() + # Wait for the first attempt to fail, then enter a 1 second backoff. + time.sleep(0.05) + # Should cause export to wake up and return. + exporter.shutdown() + thread.join() + after = time.time() + self.assertIn( + "Transient error UNAVAILABLE encountered while exporting span batch, retrying in", + warning.records[0].message, + ) + self.assertIn( + "Shutdown in progress, aborting retry.", + warning.records[1].message, + ) + + assert after - before < 0.2 diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 4f7ac2b4ae1..f25e31d3d2c 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -182,7 +182,7 @@ def emit(self, data: Telemetry) -> None: self._queue.appendleft(data) if len(self._queue) >= self._max_export_batch_size: self._worker_awaken.set() - + def shutdown(self, timeout_millis: int = 30000): if self._shutdown: return @@ -220,4 +220,4 @@ def force_flush(self, timeout_millis: Optional[int] = None) -> bool: return False # Blocking call to export. self._export(BatchExportStrategy.EXPORT_ALL) - return True \ No newline at end of file + return True diff --git a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py index 96ff1ec79d5..17600ea6b45 100644 --- a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py +++ b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py @@ -14,12 +14,11 @@ # pylint: disable=protected-access import gc -import logging import multiprocessing import os +import threading import time import unittest -import threading import weakref from platform import system from typing import Any @@ -52,13 +51,13 @@ multiprocessing.set_start_method("fork") -class MockExporterForTesting(): +class MockExporterForTesting: def __init__(self, export_sleep: int): self.num_export_calls = 0 self.export_sleep = export_sleep self._shutdown = False self.export_sleep_event = threading.Event() - + def export(self, _: list[Any]): self.num_export_calls += 1 if self._shutdown: @@ -73,7 +72,7 @@ def shutdown(self): # Force export to finish sleeping. self._shutdown = True self.export_sleep_event.set() - + # BatchLogRecodProcessor/BatchSpanProcessor initialize and use BatchProcessor. # Important: make sure to call .shutdown() before the end of the test, @@ -220,8 +219,9 @@ def test_record_processor_is_garbage_collected( # Then the reference to the processor should no longer exist assert weak_ref() is None - - def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemetry, caplog): + def test_shutdown_allows_1_export_to_finish( + self, batch_processor_class, telemetry, caplog + ): # This exporter throws an exception if it's export sleep cannot finish. exporter = MockExporterForTesting(export_sleep=2) processor = batch_processor_class( @@ -242,8 +242,8 @@ def test_shutdown_allows_1_export_to_finish(self, batch_processor_class, telemet after = time.time() assert after - before < 3.2 # Thread will naturally finish after a little bit. - time.sleep(.1) + time.sleep(0.1) assert processor._batch_processor._worker_thread.is_alive() is False # Expect the second call to be interrupted by shutdown, and the third call to never be made. assert "Exception while exporting" in caplog.text - assert 2 == exporter.num_export_calls \ No newline at end of file + assert 2 == exporter.num_export_calls From 61c28da22622d758e280520784ea04b8976d37b0 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 16 Jun 2025 20:05:33 +0000 Subject: [PATCH 27/38] Update time assertions to satisfy windows.. Fix lint issues --- .../tests/test_otlp_exporter_mixin.py | 4 ++-- .../tests/shared_internal/test_batch_processor.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index d795233702d..92f1473f4ea 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -327,7 +327,7 @@ def test_shutdown_interrupts_export_retry_backoff(self): begin_wait = time.time() export_thread.start() # Wait a bit for export to fail and the backoff sleep to start - time.sleep(0.05) + time.sleep(0.1) # The code should now be in a 1 second backoff. # pylint: disable=protected-access self.assertFalse(self.exporter._shutdown_is_occuring.is_set()) @@ -337,7 +337,7 @@ def test_shutdown_interrupts_export_retry_backoff(self): end_wait = time.time() self.assertEqual(export_result, SpanExportResult.FAILURE) # Shutdown should have interrupted the sleep. - self.assertTrue(end_wait - begin_wait < 0.2) + self.assertTrue(end_wait - begin_wait < 0.3) self.assertEqual( warning.records[1].message, "Shutdown in progress, aborting retry.", diff --git a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py index 17600ea6b45..541d27c880a 100644 --- a/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py +++ b/opentelemetry-sdk/tests/shared_internal/test_batch_processor.py @@ -61,12 +61,11 @@ def __init__(self, export_sleep: int): def export(self, _: list[Any]): self.num_export_calls += 1 if self._shutdown: - raise Exception("Cannot export, already shutdown") + raise ValueError("Cannot export, already shutdown") sleep_interrupted = self.export_sleep_event.wait(self.export_sleep) if sleep_interrupted: - raise Exception("Did not get to finish !") - return + raise ValueError("Did not get to finish !") def shutdown(self): # Force export to finish sleeping. @@ -240,7 +239,7 @@ def test_shutdown_allows_1_export_to_finish( assert processor._batch_processor._worker_thread.is_alive() is True after = time.time() - assert after - before < 3.2 + assert after - before < 3.3 # Thread will naturally finish after a little bit. time.sleep(0.1) assert processor._batch_processor._worker_thread.is_alive() is False From 23cd24ac55627bfe48f5664bfbcc04c8010d3d8c Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 16 Jun 2025 20:11:16 +0000 Subject: [PATCH 28/38] Skip test on windows --- .../tests/test_otlp_exporter_mixin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 92f1473f4ea..dd78e2b06c3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -312,6 +312,10 @@ def test_shutdown(self): "Exporter already shutdown, ignoring batch", ) + @unittest.skipIf( + system() == "Windows", + "For gRPC + windows there's some added delay in the RPCs which breaks the assertion over amount of time passed.", + ) def test_shutdown_interrupts_export_retry_backoff(self): add_TraceServiceServicer_to_server( TraceServiceServicerWithExportParams( @@ -327,7 +331,7 @@ def test_shutdown_interrupts_export_retry_backoff(self): begin_wait = time.time() export_thread.start() # Wait a bit for export to fail and the backoff sleep to start - time.sleep(0.1) + time.sleep(0.05) # The code should now be in a 1 second backoff. # pylint: disable=protected-access self.assertFalse(self.exporter._shutdown_is_occuring.is_set()) @@ -337,7 +341,7 @@ def test_shutdown_interrupts_export_retry_backoff(self): end_wait = time.time() self.assertEqual(export_result, SpanExportResult.FAILURE) # Shutdown should have interrupted the sleep. - self.assertTrue(end_wait - begin_wait < 0.3) + self.assertTrue(end_wait - begin_wait < 0.2) self.assertEqual( warning.records[1].message, "Shutdown in progress, aborting retry.", From 33a860f257814c37b2fc555b21b1c1aa7c756581 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 27 Jun 2025 14:33:19 +0000 Subject: [PATCH 29/38] Use threading Event instead of sleep loop. --- .../sdk/_shared_internal/__init__.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index f25e31d3d2c..94606dbe0ba 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -20,7 +20,6 @@ import logging import os import threading -import time import weakref from abc import abstractmethod from typing import ( @@ -94,6 +93,7 @@ def __init__( self._shutdown = False self._shutdown_timeout_exceeded = False self._export_lock = threading.Lock() + self._final_export_finished = threading.Event() self._worker_awaken = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): @@ -118,6 +118,7 @@ def _should_export_batch( def _at_fork_reinit(self): self._export_lock = threading.Lock() self._worker_awaken = threading.Event() + self._export_event = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name=f"OtelBatch{self._exporting}RecordProcessor", @@ -169,6 +170,9 @@ def _export(self, batch_strategy: BatchExportStrategy) -> None: "Exception while exporting %s.", self._exporting ) detach(token) + # This is the final export. Set the signal to shutdown that export is done. + if batch_strategy == BatchExportStrategy.EXPORT_ALL and self.shutdown: + self._final_export_finished.set() # Do not add any logging.log statements to this function, they can be being routed back to this `emit` function, # resulting in endless recursive calls that crash the program. @@ -190,14 +194,7 @@ def shutdown(self, timeout_millis: int = 30000): self._shutdown = True # Interrupts sleep in the worker if it's sleeping. self._worker_awaken.set() - # Wait a tiny bit for the worker thread to wake and call export for a final time. - time.sleep(0.1) - num_sleeps = 10 - for _ in range(num_sleeps): - # If export is not being called, we can shutdown. - if not self._export_lock.locked(): - break - time.sleep(timeout_millis / 1000 / num_sleeps) + self._final_export_finished.wait(timeout_millis / 1000) # Stops worker thread from calling export again if queue is still not empty. self._shutdown_timeout_exceeded = True # We want to shutdown immediately because we already waited `timeout_millis`. From fb7f3202dcc38c973ba7322ef025b0ea7c66b40e Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 14 Jul 2025 17:46:16 +0000 Subject: [PATCH 30/38] Respond to review comments.. --- .../src/opentelemetry/sdk/_shared_internal/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 94606dbe0ba..110e795724d 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -93,7 +93,6 @@ def __init__( self._shutdown = False self._shutdown_timeout_exceeded = False self._export_lock = threading.Lock() - self._final_export_finished = threading.Event() self._worker_awaken = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): @@ -118,7 +117,6 @@ def _should_export_batch( def _at_fork_reinit(self): self._export_lock = threading.Lock() self._worker_awaken = threading.Event() - self._export_event = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name=f"OtelBatch{self._exporting}RecordProcessor", @@ -170,9 +168,6 @@ def _export(self, batch_strategy: BatchExportStrategy) -> None: "Exception while exporting %s.", self._exporting ) detach(token) - # This is the final export. Set the signal to shutdown that export is done. - if batch_strategy == BatchExportStrategy.EXPORT_ALL and self.shutdown: - self._final_export_finished.set() # Do not add any logging.log statements to this function, they can be being routed back to this `emit` function, # resulting in endless recursive calls that crash the program. @@ -194,7 +189,7 @@ def shutdown(self, timeout_millis: int = 30000): self._shutdown = True # Interrupts sleep in the worker if it's sleeping. self._worker_awaken.set() - self._final_export_finished.wait(timeout_millis / 1000) + self._worker_thread.join(timeout_millis / 1000) # Stops worker thread from calling export again if queue is still not empty. self._shutdown_timeout_exceeded = True # We want to shutdown immediately because we already waited `timeout_millis`. From 79946d8454dc0c4f8f6c86d43e791afd0298a834 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 16 Jul 2025 20:41:24 +0000 Subject: [PATCH 31/38] Pass remaining timeout to shutdown --- .../opentelemetry/sdk/_shared_internal/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 110e795724d..b008e568c51 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations import collections +import time import enum import inspect import logging @@ -185,6 +186,7 @@ def emit(self, data: Telemetry) -> None: def shutdown(self, timeout_millis: int = 30000): if self._shutdown: return + shutdown_should_end = time.time() + timeout_millis / 1000 # Causes emit to reject telemetry and makes force_flush a no-op. self._shutdown = True # Interrupts sleep in the worker if it's sleeping. @@ -192,13 +194,18 @@ def shutdown(self, timeout_millis: int = 30000): self._worker_thread.join(timeout_millis / 1000) # Stops worker thread from calling export again if queue is still not empty. self._shutdown_timeout_exceeded = True - # We want to shutdown immediately because we already waited `timeout_millis`. + # We want to shutdown immediately only if we already waited `timeout_millis`. + # Otherwise we pass the remaining timeout to the exporter. # Some exporter's shutdown support a timeout param. if ( "timeout_millis" in inspect.getfullargspec(self._exporter.shutdown).args ): - self._exporter.shutdown(timeout_millis=0) # type: ignore + remaining_time = shutdown_should_end - time.time() + if remaining_time < 0: + self._exporter.shutdown(timeout_millis=0) # type: ignore + else: + self._exporter.shutdown(timeout_millis=remaining_time * 1000) # type: ignore else: self._exporter.shutdown() # Worker thread **should** be finished at this point, because we called shutdown on the exporter, From adbe4b046932f0142b025dc1c0be31e8e59ab96d Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 16 Jul 2025 20:41:51 +0000 Subject: [PATCH 32/38] Run precommit --- .../src/opentelemetry/sdk/_shared_internal/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index b008e568c51..b04ddff709c 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -15,12 +15,12 @@ from __future__ import annotations import collections -import time import enum import inspect import logging import os import threading +import time import weakref from abc import abstractmethod from typing import ( @@ -205,7 +205,7 @@ def shutdown(self, timeout_millis: int = 30000): if remaining_time < 0: self._exporter.shutdown(timeout_millis=0) # type: ignore else: - self._exporter.shutdown(timeout_millis=remaining_time * 1000) # type: ignore + self._exporter.shutdown(timeout_millis=remaining_time * 1000) # type: ignore else: self._exporter.shutdown() # Worker thread **should** be finished at this point, because we called shutdown on the exporter, From f47c24d8def1d776e162bf3f82bf1c583d2e97a4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 17 Jul 2025 14:09:09 +0000 Subject: [PATCH 33/38] Change variable names --- .../sdk/_shared_internal/__init__.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index b04ddff709c..904e22f4a83 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -183,29 +183,26 @@ def emit(self, data: Telemetry) -> None: if len(self._queue) >= self._max_export_batch_size: self._worker_awaken.set() - def shutdown(self, timeout_millis: int = 30000): + def shutdown(self, timeout_secs: int = 30): if self._shutdown: return - shutdown_should_end = time.time() + timeout_millis / 1000 + shutdown_should_end = time.time() + timeout_secs # Causes emit to reject telemetry and makes force_flush a no-op. self._shutdown = True # Interrupts sleep in the worker if it's sleeping. self._worker_awaken.set() - self._worker_thread.join(timeout_millis / 1000) + self._worker_thread.join(timeout_secs) # Stops worker thread from calling export again if queue is still not empty. self._shutdown_timeout_exceeded = True - # We want to shutdown immediately only if we already waited `timeout_millis`. + # We want to shutdown immediately only if we already waited `timeout_secs`. # Otherwise we pass the remaining timeout to the exporter. # Some exporter's shutdown support a timeout param. if ( "timeout_millis" in inspect.getfullargspec(self._exporter.shutdown).args ): - remaining_time = shutdown_should_end - time.time() - if remaining_time < 0: - self._exporter.shutdown(timeout_millis=0) # type: ignore - else: - self._exporter.shutdown(timeout_millis=remaining_time * 1000) # type: ignore + remaining_millis = (shutdown_should_end - time.time()) * 1000 + self._exporter.shutdown(timeout_millis=max(0, remaining_millis)) # type: ignore else: self._exporter.shutdown() # Worker thread **should** be finished at this point, because we called shutdown on the exporter, From 48309b6c1944303a1d0ac172e040b8a0fc71def2 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Thu, 17 Jul 2025 18:46:37 +0000 Subject: [PATCH 34/38] Switch timeout back to timeout_millis --- .../src/opentelemetry/sdk/_shared_internal/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py index 904e22f4a83..aec04e80ea0 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py @@ -183,15 +183,15 @@ def emit(self, data: Telemetry) -> None: if len(self._queue) >= self._max_export_batch_size: self._worker_awaken.set() - def shutdown(self, timeout_secs: int = 30): + def shutdown(self, timeout_millis: int = 30000): if self._shutdown: return - shutdown_should_end = time.time() + timeout_secs + shutdown_should_end = time.time() + (timeout_millis / 1000) # Causes emit to reject telemetry and makes force_flush a no-op. self._shutdown = True # Interrupts sleep in the worker if it's sleeping. self._worker_awaken.set() - self._worker_thread.join(timeout_secs) + self._worker_thread.join(timeout_millis / 1000) # Stops worker thread from calling export again if queue is still not empty. self._shutdown_timeout_exceeded = True # We want to shutdown immediately only if we already waited `timeout_secs`. From 023f259d01e15be2b0b260c090242aae2fb56f1d Mon Sep 17 00:00:00 2001 From: DylanRussell Date: Fri, 18 Jul 2025 19:47:07 +0000 Subject: [PATCH 35/38] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 682e66fbacd..e39fef391e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ can cause a deadlock to occur over `logging._lock` in some cases ([#4636](https: - Update OTLP gRPC/HTTP exporters: calling shutdown will now interrupt exporters that are sleeping before a retry attempt, and cause them to return failure immediately. - Update BatchSpan/LogRecodProcessors: shutdown will now complete after 30 seconds of trying to finish + Update BatchSpan/LogRecordProcessors: shutdown will now complete after 30 seconds of trying to finish exporting any buffered telemetry, instead of continuing to export until all telemetry was exported. ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). From 768435b159daa34cbc41d6cdccf0dcf046201279 Mon Sep 17 00:00:00 2001 From: DylanRussell Date: Fri, 18 Jul 2025 19:47:25 +0000 Subject: [PATCH 36/38] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e39fef391e7..d4ab27560c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ can cause a deadlock to occur over `logging._lock` in some cases ([#4636](https: before a retry attempt, and cause them to return failure immediately. Update BatchSpan/LogRecordProcessors: shutdown will now complete after 30 seconds of trying to finish exporting any buffered telemetry, instead of continuing to export until all telemetry was exported. - ([#4564](https://github.com/open-telemetry/opentelemetry-python/pull/4564)). + ([#4638](https://github.com/open-telemetry/opentelemetry-python/pull/4638)). ## Version 1.35.0/0.56b0 (2025-07-11) From 950f57ae001c58884c1fb4f15f9090e60173efb4 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 18 Jul 2025 19:50:28 +0000 Subject: [PATCH 37/38] Rename variable --- .../src/opentelemetry/exporter/otlp/proto/grpc/exporter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index cb2cfc7896b..6791062d5dc 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -289,7 +289,7 @@ def __init__( ) self._client = self._stub(self._channel) - self._shutdown_is_occuring = threading.Event() + self._shutdown_in_progress = threading.Event() self._shutdown = False @abstractmethod @@ -352,7 +352,7 @@ def _export( self._endpoint, backoff_seconds, ) - shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + shutdown = self._shutdown_in_progress.wait(backoff_seconds) if shutdown: logger.warning("Shutdown in progress, aborting retry.") break @@ -364,7 +364,7 @@ def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: logger.warning("Exporter already shutdown, ignoring call") return self._shutdown = True - self._shutdown_is_occuring.set() + self._shutdown_in_progress.set() self._channel.close() @property From 235d22d7f7215f796a2eb7ab793b48e763cd3fc2 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Fri, 18 Jul 2025 19:52:38 +0000 Subject: [PATCH 38/38] Fix variable name --- .../tests/test_otlp_exporter_mixin.py | 4 ++-- .../exporter/otlp/proto/http/metric_exporter/__init__.py | 6 +++--- .../exporter/otlp/proto/http/trace_exporter/__init__.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 7663cd1ef12..aef52fbc4a7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -347,9 +347,9 @@ def test_shutdown_interrupts_export_retry_backoff(self): time.sleep(0.05) # The code should now be in a 1 second backoff. # pylint: disable=protected-access - self.assertFalse(self.exporter._shutdown_is_occuring.is_set()) + self.assertFalse(self.exporter._shutdown_in_progress.is_set()) self.exporter.shutdown() - self.assertTrue(self.exporter._shutdown_is_occuring.is_set()) + self.assertTrue(self.exporter._shutdown_in_progress.is_set()) export_result = export_thread.join() end_wait = time.time() self.assertEqual(export_result, SpanExportResult.FAILURE) diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index c45202f240d..486344753a0 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -121,7 +121,7 @@ def __init__( | None = None, preferred_aggregation: dict[type, Aggregation] | None = None, ): - self._shutdown_is_occuring = threading.Event() + self._shutdown_in_progress = threading.Event() self._endpoint = endpoint or environ.get( OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, _append_metrics_path( @@ -238,7 +238,7 @@ def export( resp.reason, backoff_seconds, ) - shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + shutdown = self._shutdown_in_progress.wait(backoff_seconds) if shutdown: _logger.warning("Shutdown in progress, aborting retry.") break @@ -249,7 +249,7 @@ def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: _logger.warning("Exporter already shutdown, ignoring call") return self._shutdown = True - self._shutdown_is_occuring.set() + self._shutdown_in_progress.set() self._session.close() @property diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 599d95b3adc..8b97a651d4f 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -77,7 +77,7 @@ def __init__( compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): - self._shutdown_is_occuring = threading.Event() + self._shutdown_in_progress = threading.Event() self._endpoint = endpoint or environ.get( OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, _append_trace_path( @@ -186,7 +186,7 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: resp.reason, backoff_seconds, ) - shutdown = self._shutdown_is_occuring.wait(backoff_seconds) + shutdown = self._shutdown_in_progress.wait(backoff_seconds) if shutdown: _logger.warning("Shutdown in progress, aborting retry.") break @@ -197,7 +197,7 @@ def shutdown(self): _logger.warning("Exporter already shutdown, ignoring call") return self._shutdown = True - self._shutdown_is_occuring.set() + self._shutdown_in_progress.set() self._session.close() def force_flush(self, timeout_millis: int = 30000) -> bool: