Skip to content

Commit ff2fb5d

Browse files
committed
Refactor OTLP AWS log exporter; add comments to AWS batch log processor
1 parent 651f283 commit ff2fb5d

File tree

4 files changed

+141
-67
lines changed

4 files changed

+141
-67
lines changed

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/logs/aws_batch_log_record_processor.py

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
3+
# Modifications Copyright The OpenTelemetry Authors. Licensed under the Apache License, Version 2.0.
34

45
import logging
56
from typing import Mapping, Optional, Sequence, cast
@@ -19,7 +20,7 @@ class AwsCloudWatchOtlpBatchLogRecordProcessor(BatchLogRecordProcessor):
1920
Custom implementation of BatchLogRecordProcessor that manages log record batching
2021
with size-based constraints to prevent exceeding AWS CloudWatch Logs OTLP endpoint request size limits.
2122
22-
This processor still exports all logs up to _max_export_batch_size but rather than doing exactly
23+
This processor still exports all logs up to _MAX_LOG_REQUEST_BYTE_SIZE but rather than doing exactly
2324
one export, we will estimate log sizes and do multiple batch exports
2425
where each exported batch will have an additional constraint:
2526
@@ -29,9 +30,41 @@ class AwsCloudWatchOtlpBatchLogRecordProcessor(BatchLogRecordProcessor):
2930
A unique case is if the sub-batch is of data size > 1 MB, then the sub-batch will have exactly 1 log in it.
3031
"""
3132

32-
_BASE_LOG_BUFFER_BYTE_SIZE = (
33-
1000 # Buffer size in bytes to account for log metadata not included in the body or attribute size calculation
34-
)
33+
# OTel log events include fixed metadata attributes, so the metadata size
34+
# can be estimated on a best-effort basis as follows:
35+
# service.name (255 chars) + cloud.resource_id (max ARN length) + telemetry.xxx (~20 chars) +
36+
# common attributes (255 chars) +
37+
# scope + flags + traceId + spanId + numeric/timestamp fields + ...
38+
# Example log structure:
39+
# {
40+
# "resource": {
41+
# "attributes": {
42+
# "aws.local.service": "example-service123",
43+
# "telemetry.sdk.language": "python",
44+
# "service.name": "my-application",
45+
# "cloud.resource_id": "example-resource",
46+
# "aws.log.group.names": "example-log-group",
47+
# "aws.ai.agent.type": "default",
48+
# "telemetry.sdk.version": "1.x.x",
49+
# "telemetry.auto.version": "0.x.x",
50+
# "telemetry.sdk.name": "opentelemetry"
51+
# }
52+
# },
53+
# "scope": {"name": "example.instrumentation.library"},
54+
# "timeUnixNano": 1234567890123456789,
55+
# "observedTimeUnixNano": 1234567890987654321,
56+
# "severityNumber": 9,
57+
# "body": {...},
58+
# "attributes": {...},
59+
# "flags": 1,
60+
# "traceId": "abcd1234efgh5678ijkl9012mnop3456",
61+
# "spanId": "1234abcd5678efgh"
62+
# }
63+
# 2000 might be a bit of an overestimate but it's better to overestimate the size of the log
64+
# and suffer a small performance impact with batching than it is to underestimate and risk
65+
# a large log being dropped when sent to the AWS otlp endpoint.
66+
_BASE_LOG_BUFFER_BYTE_SIZE = 2000
67+
3568
_MAX_LOG_REQUEST_BYTE_SIZE = (
3669
1048576 # Maximum uncompressed/unserialized bytes / request -
3770
# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-OTLPEndpoint.html
@@ -63,10 +96,11 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None:
6396
https://github.com/open-telemetry/opentelemetry-python/blob/bb21ebd46d070c359eee286c97bdf53bfd06759d/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py#L143
6497
6598
Preserves existing batching behavior but will export intermediate small log batches if
66-
the size of the data in the batch is at or above AWS CloudWatch's maximum request size limit of 1 MB.
99+
the size of the data in the batch is estimated to be at or above AWS CloudWatch's
100+
maximum request size limit of 1 MB.
67101
68-
- Data size of exported batches will ALWAYS be <= 1 MB except for the case below:
69-
- If the data size of an exported batch is ever > 1 MB then the batch size is guaranteed to be 1
102+
- Estimated data size of exported batches will typically be <= 1 MB except for the case below:
103+
- If the estimated data size of an exported batch is ever > 1 MB then the batch size is guaranteed to be 1
70104
"""
71105
with self._export_lock:
72106
iteration = 0
@@ -141,19 +175,17 @@ def _estimate_log_size(self, log: LogData, depth: int = 3) -> int: # pylint: di
141175
if next_val is None:
142176
continue
143177

144-
if isinstance(next_val, bool):
145-
size += 4 if next_val else 5
146-
continue
147-
148178
if isinstance(next_val, (str, bytes)):
149179
size += len(next_val)
150180
continue
151181

152-
if isinstance(next_val, (float, int)):
182+
if isinstance(next_val, (float, int, bool)):
153183
size += len(str(next_val))
154184
continue
155185

156-
# next_val must be Sequence["AnyValue"] or Mapping[str, "AnyValue"],
186+
# next_val must be Sequence["AnyValue"] or Mapping[str, "AnyValue"]
187+
# See: https://github.com/open-telemetry/opentelemetry-python/blob/\
188+
# 9426d6da834cfb4df7daedd4426bba0aa83165b5/opentelemetry-api/src/opentelemetry/util/types.py#L20
157189
if current_depth <= depth:
158190
obj_id = id(
159191
next_val

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/logs/otlp_aws_logs_exporter.py

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,37 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
3+
# Modifications Copyright The OpenTelemetry Authors. Licensed under the Apache License, Version 2.0.
34

45
import gzip
56
import logging
7+
import random
68
from io import BytesIO
7-
from time import sleep
9+
from time import sleep, time
810
from typing import Dict, Optional, Sequence
911

1012
from requests import Response
1113
from requests.exceptions import ConnectionError as RequestsConnectionError
1214
from requests.structures import CaseInsensitiveDict
1315

1416
from amazon.opentelemetry.distro.exporter.otlp.aws.common.aws_auth_session import AwsAuthSession
15-
from opentelemetry.exporter.otlp.proto.common._internal import _create_exp_backoff_generator
1617
from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs
1718
from opentelemetry.exporter.otlp.proto.http import Compression
1819
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
1920
from opentelemetry.sdk._logs import LogData
2021
from opentelemetry.sdk._logs.export import LogExportResult
2122

2223
_logger = logging.getLogger(__name__)
24+
_MAX_RETRYS = 6
2325

2426

2527
class OTLPAwsLogExporter(OTLPLogExporter):
28+
"""
29+
This exporter extends the functionality of the OTLPLogExporter to allow logs to be exported
30+
to the CloudWatch Logs OTLP endpoint https://logs.[AWSRegion].amazonaws.com/v1/logs. Utilizes the aws-sdk
31+
library to sign and directly inject SigV4 Authentication to the exported request's headers.
32+
33+
See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-OTLPEndpoint.html
34+
"""
2635

2736
_RETRY_AFTER_HEADER = "Retry-After" # See: https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
2837

@@ -56,12 +65,13 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult:
5665
"""
5766
Exports log batch with AWS-specific enhancements over the base OTLPLogExporter.
5867
59-
Based on upstream implementation which does not retry based on Retry-After header:
60-
https://github.com/open-telemetry/opentelemetry-python/blob/acae2c232b101d3e447a82a7161355d66aa06fa2/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py#L167
68+
Key differences from upstream OTLPLogExporter:
69+
1. Respects Retry-After header from server responses for proper throttling
70+
2. Treats HTTP 429 (Too Many Requests) as a retryable exception
71+
3. Always compresses data with gzip before sending
6172
62-
Key behaviors:
63-
1. Always compresses data with gzip before sending
64-
2. Implements Retry-After header support for throttling responses
73+
Upstream implementation does not support Retry-After header:
74+
https://github.com/open-telemetry/opentelemetry-python/blob/acae2c232b101d3e447a82a7161355d66aa06fa2/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py#L167
6575
"""
6676

6777
if self._shutdown:
@@ -74,52 +84,50 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult:
7484
gzip_stream.write(serialized_data)
7585
data = gzip_data.getvalue()
7686

77-
backoff = _create_exp_backoff_generator(max_value=self._MAX_RETRY_TIMEOUT)
87+
deadline_sec = time() + self._timeout
88+
retry_num = 0
7889

7990
# This loop will eventually terminate because:
8091
# 1) The export request will eventually either succeed or fail permanently
81-
# 2) The exponential backoff generator has a max value of _MAX_RETRY_TIMEOUT (64s)
82-
# 3) After enough retries, delay will equal _MAX_RETRY_TIMEOUT, forcing exit
92+
# 2) Maximum retries (_MAX_RETRYS = 6) will be reached
93+
# 3) Deadline timeout will be exceeded
8394
# 4) Non-retryable errors (4xx except 429) immediately exit the loop
8495
while True:
85-
resp = self._send(data)
96+
resp = self._send(data, deadline_sec - time())
8697

8798
if resp.ok:
8899
return LogExportResult.SUCCESS
89100

90-
delay = self._get_retry_delay_sec(resp.headers, backoff)
101+
backoff_seconds = self._get_retry_delay_sec(resp.headers, retry_num)
91102
is_retryable = self._retryable(resp)
92103

93-
if not is_retryable or delay == self._MAX_RETRY_TIMEOUT:
94-
if is_retryable:
95-
_logger.error(
96-
"Failed to export logs due to retries exhausted "
97-
"after transient error %s encountered while exporting logs batch",
98-
resp.reason,
99-
)
100-
else:
101-
_logger.error(
102-
"Failed to export logs batch code: %s, reason: %s",
103-
resp.status_code,
104-
resp.text,
105-
)
104+
if not is_retryable or retry_num + 1 == _MAX_RETRYS or backoff_seconds > (deadline_sec - time()):
105+
_logger.error(
106+
"Failed to export logs batch code: %s, reason: %s",
107+
resp.status_code,
108+
resp.text,
109+
)
106110
return LogExportResult.FAILURE
107111

108112
_logger.warning(
109-
"Transient error %s encountered while exporting logs batch, retrying in %ss.",
113+
"Transient error %s encountered while exporting logs batch, retrying in %.2fs.",
110114
resp.reason,
111-
delay,
115+
backoff_seconds,
112116
)
113117

114-
sleep(delay)
118+
# Make sleep interruptible by checking shutdown status
119+
if self._shutdown:
120+
return LogExportResult.FAILURE
121+
sleep(backoff_seconds)
122+
retry_num += 1
115123

116-
def _send(self, serialized_data: bytes):
124+
def _send(self, serialized_data: bytes, timeout_sec: float):
117125
try:
118126
response = self._session.post(
119127
url=self._endpoint,
120128
data=serialized_data,
121129
verify=self._certificate_file,
122-
timeout=self._timeout,
130+
timeout=timeout_sec,
123131
cert=self._client_cert,
124132
)
125133
return response
@@ -128,37 +136,31 @@ def _send(self, serialized_data: bytes):
128136
url=self._endpoint,
129137
data=serialized_data,
130138
verify=self._certificate_file,
131-
timeout=self._timeout,
139+
timeout=timeout_sec,
132140
cert=self._client_cert,
133141
)
134142
return response
135143

136144
@staticmethod
137145
def _retryable(resp: Response) -> bool:
138146
"""
139-
Is it a retryable response?
147+
Logic based on https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
140148
"""
141149
# See: https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
142150

143151
return resp.status_code in (429, 503) or OTLPLogExporter._retryable(resp)
144152

145-
def _get_retry_delay_sec(self, headers: CaseInsensitiveDict, backoff) -> float:
153+
def _get_retry_delay_sec(self, headers: CaseInsensitiveDict, retry_num: int) -> float:
146154
"""
147155
Get retry delay in seconds from headers or backoff strategy.
148156
"""
149-
# See: https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
150-
maybe_retry_after = headers.get(self._RETRY_AFTER_HEADER, None)
151-
152-
# Set the next retry delay to the value of the Retry-After response in the headers.
153-
# If Retry-After is not present in the headers, default to the next iteration of the
154-
# exponential backoff strategy.
155-
156-
delay = self._parse_retryable_header(maybe_retry_after)
157-
158-
if delay == -1:
159-
delay = next(backoff, self._MAX_RETRY_TIMEOUT)
160-
161-
return delay
157+
# Check for Retry-After header first, then use exponential backoff with jitter
158+
retry_after_delay = self._parse_retryable_header(headers.get(self._RETRY_AFTER_HEADER))
159+
if retry_after_delay > -1:
160+
return retry_after_delay
161+
else:
162+
# multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
163+
return 2**retry_num * random.uniform(0.8, 1.2)
162164

163165
@staticmethod
164166
def _parse_retryable_header(retry_header: Optional[str]) -> float:

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/traces/otlp_aws_span_exporter.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818

1919

2020
class OTLPAwsSpanExporter(OTLPSpanExporter):
21+
"""
22+
This exporter extends the functionality of the OTLPSpanExporter to allow spans to be exported
23+
to the XRay OTLP endpoint https://xray.[AWSRegion].amazonaws.com/v1/traces. Utilizes the
24+
AwsAuthSession to sign and directly inject SigV4 Authentication to the exported request's headers.
25+
26+
See: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-OTLPEndpoint.html
27+
"""
28+
2129
def __init__(
2230
self,
2331
endpoint: Optional[str] = None,

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/logs/test_otlp_aws_logs_exporter.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import requests
88
from requests.structures import CaseInsensitiveDict
99

10-
from amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter import OTLPAwsLogExporter
10+
from amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter import _MAX_RETRYS, OTLPAwsLogExporter
1111
from opentelemetry._logs.severity import SeverityNumber
1212
from opentelemetry.sdk._logs import LogData, LogRecord
1313
from opentelemetry.sdk._logs.export import LogExportResult
@@ -80,18 +80,21 @@ def test_should_not_export_again_if_not_retryable(self, mock_request):
8080
def test_should_export_again_with_backoff_if_retryable_and_no_retry_after_header(self, mock_request, mock_sleep):
8181
"""Tests that multiple export requests are made with exponential delay if the response status code is retryable.
8282
But there is no Retry-After header."""
83+
self.exporter._timeout = 10000 # Large timeout to avoid early exit
8384
result = self.exporter.export(self.logs)
8485

85-
# 1, 2, 4, 8, 16, 32 delays
86-
self.assertEqual(mock_sleep.call_count, 6)
86+
self.assertEqual(mock_sleep.call_count, _MAX_RETRYS - 1)
8787

8888
delays = mock_sleep.call_args_list
8989

9090
for index, delay in enumerate(delays):
91-
self.assertEqual(delay[0][0], 2**index)
91+
expected_base = 2**index
92+
actual_delay = delay[0][0]
93+
# Assert delay is within jitter range: base * [0.8, 1.2]
94+
self.assertGreaterEqual(actual_delay, expected_base * 0.8)
95+
self.assertLessEqual(actual_delay, expected_base * 1.2)
9296

93-
# Number of calls: 1 + len(1, 2, 4, 8, 16, 32 delays)
94-
self.assertEqual(mock_request.call_count, 7)
97+
self.assertEqual(mock_request.call_count, _MAX_RETRYS)
9598
self.assertEqual(result, LogExportResult.FAILURE)
9699

97100
@patch(
@@ -104,6 +107,7 @@ def test_should_export_again_with_backoff_if_retryable_and_no_retry_after_header
104107
def test_should_export_again_with_server_delay_if_retryable_and_retry_after_header(self, mock_request, mock_sleep):
105108
"""Tests that multiple export requests are made with the server's suggested
106109
delay if the response status code is retryable and there is a Retry-After header."""
110+
self.exporter._timeout = 10000 # Large timeout to avoid early exit
107111
result = self.exporter.export(self.logs)
108112
delays = mock_sleep.call_args_list
109113

@@ -130,12 +134,17 @@ def test_should_export_again_with_backoff_delay_if_retryable_and_bad_retry_after
130134
self, mock_request, mock_sleep
131135
):
132136
"""Tests that multiple export requests are made with exponential delay if the response status code is retryable.
133-
but the Retry-After header ins invalid or malformed."""
137+
but the Retry-After header is invalid or malformed."""
138+
self.exporter._timeout = 10000 # Large timeout to avoid early exit
134139
result = self.exporter.export(self.logs)
135140
delays = mock_sleep.call_args_list
136141

137142
for index, delay in enumerate(delays):
138-
self.assertEqual(delay[0][0], 2**index)
143+
expected_base = 2**index
144+
actual_delay = delay[0][0]
145+
# Assert delay is within jitter range: base * [0.8, 1.2]
146+
self.assertGreaterEqual(actual_delay, expected_base * 0.8)
147+
self.assertLessEqual(actual_delay, expected_base * 1.2)
139148

140149
self.assertEqual(mock_sleep.call_count, 3)
141150
self.assertEqual(mock_request.call_count, 4)
@@ -149,6 +158,29 @@ def test_export_connection_error_retry(self, mock_request):
149158
self.assertEqual(mock_request.call_count, 2)
150159
self.assertEqual(result, LogExportResult.SUCCESS)
151160

161+
@patch(
162+
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.sleep", side_effect=lambda x: None
163+
)
164+
@patch("requests.Session.post", return_value=retryable_response_no_header)
165+
def test_should_stop_retrying_when_deadline_exceeded(self, mock_request, mock_sleep):
166+
"""Tests that the exporter stops retrying when the deadline is exceeded."""
167+
self.exporter._timeout = 5 # Short timeout to trigger deadline check
168+
169+
# Mock time to simulate time passing
170+
with patch("amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.time") as mock_time:
171+
# First call returns start time, subsequent calls simulate time passing
172+
mock_time.side_effect = [0, 0, 1, 2, 4, 8] # Exponential backoff would be 1, 2, 4 seconds
173+
174+
result = self.exporter.export(self.logs)
175+
176+
# Should stop before max retries due to deadline
177+
self.assertLess(mock_sleep.call_count, _MAX_RETRYS)
178+
self.assertLess(mock_request.call_count, _MAX_RETRYS + 1)
179+
self.assertEqual(result, LogExportResult.FAILURE)
180+
181+
# Verify total time passed is at the timeout limit
182+
self.assertGreaterEqual(5, self.exporter._timeout)
183+
152184
@staticmethod
153185
def generate_test_log_data(count=5):
154186
logs = []

0 commit comments

Comments
 (0)