-
Notifications
You must be signed in to change notification settings - Fork 770
Fix: Reinitialize gRPC channel on UNAVAILABLE error (Fixes #4517) #4825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
436ecc9
2c848f4
8b397a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,14 @@ | |
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """OTLP Exporter""" | ||
| """OTLP Exporter | ||
|
|
||
| This module provides a mixin class for OTLP exporters that send telemetry data | ||
| to an OTLP-compatible receiver via gRPC. It includes configurable reconnection | ||
| logic to handle transient collector outages. | ||
|
|
||
|
|
||
| """ | ||
|
|
||
| import random | ||
| import threading | ||
|
|
@@ -251,17 +258,24 @@ def _get_credentials( | |
| if certificate_file: | ||
| client_key_file = environ.get(client_key_file_env_key) | ||
| client_certificate_file = environ.get(client_certificate_file_env_key) | ||
| return _load_credentials( | ||
| credentials = _load_credentials( | ||
| certificate_file, client_key_file, client_certificate_file | ||
| ) | ||
| if credentials is not None: | ||
| return credentials | ||
| return ssl_channel_credentials() | ||
|
|
||
|
|
||
| # pylint: disable=no-member | ||
| class OTLPExporterMixin( | ||
| ABC, Generic[SDKDataT, ExportServiceRequestT, ExportResultT, ExportStubT] | ||
| ): | ||
| """OTLP span exporter | ||
| """OTLP gRPC exporter mixin. | ||
|
|
||
| This class provides the base functionality for OTLP exporters that send | ||
| telemetry data (spans or metrics) to an OTLP-compatible receiver via gRPC. | ||
| It includes a configurable reconnection mechanism to handle transient | ||
| collector outages. | ||
|
|
||
| Args: | ||
| endpoint: OpenTelemetry Collector receiver endpoint | ||
|
|
@@ -308,6 +322,8 @@ def __init__( | |
| if parsed_url.netloc: | ||
| self._endpoint = parsed_url.netloc | ||
|
|
||
| self._insecure = insecure | ||
| self._credentials = credentials | ||
| self._headers = headers or environ.get(OTEL_EXPORTER_OTLP_HEADERS) | ||
| if isinstance(self._headers, str): | ||
| temp_headers = parse_env_headers(self._headers, liberal=True) | ||
|
|
@@ -336,21 +352,58 @@ def __init__( | |
| ) | ||
| self._collector_kwargs = None | ||
|
|
||
| compression = ( | ||
| environ_to_compression(OTEL_EXPORTER_OTLP_COMPRESSION) | ||
| if compression is None | ||
| else compression | ||
| ) or Compression.NoCompression | ||
| self._compression = ( | ||
| compression | ||
| or environ_to_compression( | ||
| environ.get(OTEL_EXPORTER_OTLP_COMPRESSION) | ||
| ) | ||
| or Compression.NoCompression | ||
| ) | ||
| self._channel = None | ||
| self._client = None | ||
| self._channel_reconnection_enabled = False | ||
| self._initialize_channel_and_stub() | ||
|
|
||
| def _initialize_channel_and_stub(self): | ||
| """ | ||
| Create a new gRPC channel and stub. | ||
|
|
||
| if insecure: | ||
| This method is used during initialization and by the reconnection | ||
| mechanism to reinitialize the channel on transient errors. | ||
| """ | ||
| # Add channel options for better reconnection behavior | ||
| # Only add these if we're dealing with reconnection scenarios | ||
| channel_options = [] | ||
| if self._channel_reconnection_enabled: | ||
|
||
| channel_options = [ | ||
| ("grpc.keepalive_time_ms", 30000), | ||
| ("grpc.keepalive_timeout_ms", 15000), | ||
| ("grpc.keepalive_permit_without_calls", 1), | ||
| ("grpc.initial_reconnect_backoff_ms", 5000), | ||
| ("grpc.min_reconnect_backoff_ms", 5000), | ||
| ("grpc.max_reconnect_backoff_ms", 30000), | ||
| ] | ||
|
|
||
| # Merge reconnection options with existing channel options | ||
|
||
| current_options = list(self._channel_options) | ||
|
||
| # Filter out options that we are about to override | ||
| reconnection_keys = {key for key, _ in channel_options} | ||
| current_options = [ | ||
| (key, value) | ||
| for key, value in current_options | ||
| if key not in reconnection_keys | ||
| ] | ||
| final_options = tuple(current_options + channel_options) | ||
|
|
||
| if self._insecure: | ||
| self._channel = insecure_channel( | ||
| self._endpoint, | ||
| compression=compression, | ||
| options=self._channel_options, | ||
| compression=self._compression, | ||
| options=final_options, | ||
| ) | ||
| else: | ||
| self._credentials = _get_credentials( | ||
| credentials, | ||
| self._credentials, | ||
|
||
| _OTEL_PYTHON_EXPORTER_OTLP_GRPC_CREDENTIAL_PROVIDER, | ||
| OTEL_EXPORTER_OTLP_CERTIFICATE, | ||
| OTEL_EXPORTER_OTLP_CLIENT_KEY, | ||
|
|
@@ -359,13 +412,14 @@ def __init__( | |
| self._channel = secure_channel( | ||
| self._endpoint, | ||
| self._credentials, | ||
| compression=compression, | ||
| options=self._channel_options, | ||
| compression=self._compression, | ||
| options=final_options, | ||
| ) | ||
| self._client = self._stub(self._channel) # type: ignore [reportCallIssue] | ||
|
|
||
| self._shutdown_in_progress = threading.Event() | ||
| self._shutdown = False | ||
| if not hasattr(self, "_shutdown_in_progress"): | ||
| self._shutdown_in_progress = threading.Event() | ||
| self._shutdown = False | ||
|
||
|
|
||
| @abstractmethod | ||
| def _translate_data( | ||
|
|
@@ -407,6 +461,26 @@ def _export( | |
| retry_info.retry_delay.seconds | ||
| + retry_info.retry_delay.nanos / 1.0e9 | ||
| ) | ||
|
|
||
| # For UNAVAILABLE errors, reinitialize the channel to force reconnection | ||
| if error.code() == StatusCode.UNAVAILABLE and retry_num == 0: # type: ignore | ||
| logger.debug( | ||
| "Reinitializing gRPC channel for %s exporter due to UNAVAILABLE error", | ||
| self._exporting, | ||
| ) | ||
| try: | ||
| self._channel.close() | ||
| except Exception as e: | ||
| logger.debug( | ||
| "Error closing channel for %s exporter to %s: %s", | ||
| self._exporting, | ||
| self._endpoint, | ||
| str(e), | ||
| ) | ||
| # Enable channel reconnection for subsequent calls | ||
| self._channel_reconnection_enabled = True | ||
| self._initialize_channel_and_stub() | ||
|
|
||
| if ( | ||
| error.code() not in _RETRYABLE_ERROR_CODES # type: ignore [reportAttributeAccessIssue] | ||
| or retry_num + 1 == _MAX_RETRYS | ||
|
|
@@ -436,6 +510,12 @@ def _export( | |
| return self._result.FAILURE # type: ignore [reportReturnType] | ||
|
|
||
| def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: | ||
| """ | ||
| Shut down the exporter. | ||
|
|
||
| Args: | ||
| timeout_millis: Timeout in milliseconds for shutting down the exporter. | ||
| """ | ||
| if self._shutdown: | ||
| logger.warning("Exporter already shutdown, ignoring call") | ||
| return | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: can we say the same thing as above here (OTLP-compatible receiver)?