Skip to content

Commit c34f3b9

Browse files
committed
add logs pipeline
1 parent 779e89b commit c34f3b9

File tree

6 files changed

+810
-5
lines changed

6 files changed

+810
-5
lines changed

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/aws_opentelemetry_configurator.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55
import re
66
from logging import NOTSET, Logger, getLogger
7-
from typing import ClassVar, Dict, List, Type, Union
7+
from typing import ClassVar, Dict, List, Optional, Type, Union
88

99
from importlib_metadata import version
1010
from typing_extensions import override
@@ -22,6 +22,7 @@
2222
AwsMetricAttributesSpanExporterBuilder,
2323
)
2424
from amazon.opentelemetry.distro.aws_span_metrics_processor_builder import AwsSpanMetricsProcessorBuilder
25+
from amazon.opentelemetry.distro.exporter.otlp.aws.logs.aws_batch_log_record_processor import AwsBatchLogRecordProcessor
2526
from amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter import OTLPAwsLogExporter
2627
from amazon.opentelemetry.distro.exporter.otlp.aws.traces.otlp_aws_span_exporter import OTLPAwsSpanExporter
2728
from amazon.opentelemetry.distro.otlp_udp_exporter import OTLPUdpSpanExporter
@@ -181,7 +182,9 @@ def _init_logging(
181182

182183
# Provides a default OTLP log exporter when none is specified.
183184
# This is the behavior for the logs exporters for other languages.
184-
if not exporters:
185+
logs_exporter = os.environ.get("OTEL_LOGS_EXPORTER")
186+
187+
if not exporters and logs_exporter and (logs_exporter.lower() != "none"):
185188
exporters = {"otlp": OTLPLogExporter}
186189

187190
provider = LoggerProvider(resource=resource)
@@ -190,7 +193,11 @@ def _init_logging(
190193
for _, exporter_class in exporters.items():
191194
exporter_args: Dict[str, any] = {}
192195
log_exporter = _customize_logs_exporter(exporter_class(**exporter_args), resource)
193-
provider.add_log_record_processor(BatchLogRecordProcessor(exporter=log_exporter))
196+
197+
if isinstance(log_exporter, OTLPAwsLogExporter) and is_agent_observability_enabled():
198+
provider.add_log_record_processor(AwsBatchLogRecordProcessor(exporter=log_exporter))
199+
else:
200+
provider.add_log_record_processor(BatchLogRecordProcessor(exporter=log_exporter))
194201

195202
handler = LoggingHandler(level=NOTSET, logger_provider=provider)
196203

@@ -532,7 +539,7 @@ def _is_lambda_environment():
532539
return AWS_LAMBDA_FUNCTION_NAME_CONFIG in os.environ
533540

534541

535-
def _is_aws_otlp_endpoint(otlp_endpoint: str = None, service: str = "xray") -> bool:
542+
def _is_aws_otlp_endpoint(otlp_endpoint: Optional[str] = None, service: str = "xray") -> bool:
536543
"""Is the given endpoint an AWS OTLP endpoint?"""
537544

538545
pattern = AWS_TRACES_OTLP_ENDPOINT_PATTERN if service == "xray" else AWS_LOGS_OTLP_ENDPOINT_PATTERN
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import logging
2+
from typing import Mapping, Optional, Sequence, cast
3+
4+
from amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter import OTLPAwsLogExporter
5+
from opentelemetry.context import (
6+
_SUPPRESS_INSTRUMENTATION_KEY,
7+
attach,
8+
detach,
9+
set_value,
10+
)
11+
from opentelemetry.sdk._logs import LogData
12+
from opentelemetry.sdk._logs._internal.export import BatchLogExportStrategy
13+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
14+
from opentelemetry.util.types import AnyValue
15+
16+
_logger = logging.getLogger(__name__)
17+
18+
19+
class AwsBatchLogRecordProcessor(BatchLogRecordProcessor):
    """Batch log processor aware of AWS CloudWatch Logs' OTLP request size limit.

    Preserves the standard ``BatchLogRecordProcessor`` batching behavior, but
    intermediately exports smaller sub-batches whenever the estimated payload
    size would reach or exceed CloudWatch's 1 MB maximum request size.
    """

    # Buffer size in bytes to account for log metadata not included in the body size calculation.
    _BASE_LOG_BUFFER_BYTE_SIZE = 2000
    # https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-OTLPEndpoint.html
    _MAX_LOG_REQUEST_BYTE_SIZE = 1048576

    def __init__(
        self,
        exporter: OTLPAwsLogExporter,
        schedule_delay_millis: Optional[float] = None,
        max_export_batch_size: Optional[int] = None,
        export_timeout_millis: Optional[float] = None,
        max_queue_size: Optional[int] = None,
    ):
        super().__init__(
            exporter=exporter,
            schedule_delay_millis=schedule_delay_millis,
            max_export_batch_size=max_export_batch_size,
            export_timeout_millis=export_timeout_millis,
            max_queue_size=max_queue_size,
        )

        # Keep a typed reference so we can call the AWS-specific set_gen_ai_log_flag().
        self._exporter = exporter

    # https://github.com/open-telemetry/opentelemetry-python/blob/main/opentelemetry-sdk/src/opentelemetry/sdk/_shared_internal/__init__.py#L143
    def _export(self, batch_strategy: BatchLogExportStrategy) -> None:
        """
        Preserves existing batching behavior but will intermediately export small log batches if
        the size of the data in the batch is at or above AWS CloudWatch's maximum request size limit of 1 MB.

        - Data size of exported batches will ALWAYS be <= 1 MB except for the case below:
        - If the data size of an exported batch is ever > 1 MB then the batch size is guaranteed to be 1
        """
        with self._export_lock:
            iteration = 0
            # We could see concurrent export calls from worker and force_flush. We call _should_export_batch
            # once the lock is obtained to see if we still need to make the requested export.
            while self._should_export_batch(batch_strategy, iteration):
                iteration += 1
                token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
                try:
                    batch_length = min(self._max_export_batch_size, len(self._queue))
                    batch_data_size = 0
                    batch = []

                    for _ in range(batch_length):
                        log_data: LogData = self._queue.pop()
                        log_size = self._BASE_LOG_BUFFER_BYTE_SIZE + self._get_any_value_size(
                            log_data.log_record.body
                        )

                        # Flush the accumulated batch before this record would push it past the limit.
                        if batch and (batch_data_size + log_size > self._MAX_LOG_REQUEST_BYTE_SIZE):
                            # if batch_data_size > MAX_LOG_REQUEST_BYTE_SIZE then len(batch) == 1
                            if batch_data_size > self._MAX_LOG_REQUEST_BYTE_SIZE:
                                if self._is_gen_ai_log(batch[0]):
                                    self._exporter.set_gen_ai_log_flag()

                            self._exporter.export(batch)
                            batch_data_size = 0
                            batch = []

                        batch_data_size += log_size
                        batch.append(log_data)

                    if batch:
                        # if batch_data_size > MAX_LOG_REQUEST_BYTE_SIZE then len(batch) == 1
                        if batch_data_size > self._MAX_LOG_REQUEST_BYTE_SIZE:
                            if self._is_gen_ai_log(batch[0]):
                                self._exporter.set_gen_ai_log_flag()

                        self._exporter.export(batch)
                except Exception:  # pylint: disable=broad-exception-caught
                    # logger.exception() already records the traceback and message;
                    # no need to concatenate str(e) into the format string.
                    _logger.exception("Exception while exporting logs")
                finally:
                    # Always restore the context, even if logging the failure raises.
                    detach(token)

    def _get_any_value_size(self, val: AnyValue, depth: int = 3) -> int:
        """
        Only used to indicate whether we should export a batch log size of 1 or not.
        Calculates the size in bytes of an AnyValue object.
        Will process complex AnyValue structures up to the specified depth limit.
        If the depth limit of the AnyValue structure is exceeded, returns 0.

        Args:
            val: The AnyValue object to calculate size for
            depth: Maximum depth to traverse in nested structures (default: 3)

        Returns:
            int: Total size of the AnyValue object in bytes
        """
        # Use a stack to prevent excessive recursive calls.
        stack = [(val, 0)]
        size: int = 0

        while stack:
            # Small optimization: we can stop calculating the size once it reaches the 1 MB limit.
            if size >= self._MAX_LOG_REQUEST_BYTE_SIZE:
                return size

            next_val, current_depth = stack.pop()

            if isinstance(next_val, (str, bytes)):
                size += len(next_val)
                continue

            # bool must be tested before int (bool is a subclass of int);
            # 4/5 are the lengths of the JSON literals "true"/"false".
            if isinstance(next_val, bool):
                size += 4 if next_val else 5
                continue

            if isinstance(next_val, (float, int)):
                size += len(str(next_val))
                continue

            if current_depth <= depth:
                if isinstance(next_val, Sequence):
                    for content in next_val:
                        stack.append((cast(AnyValue, content), current_depth + 1))

                if isinstance(next_val, Mapping):
                    for key, content in next_val.items():
                        size += len(key)
                        stack.append((content, current_depth + 1))
            else:
                _logger.debug("Max log depth exceeded. Log data size will not be accurately calculated.")
                return 0

        return size

    @staticmethod
    def _is_gen_ai_log(log_data: LogData) -> bool:
        """
        Is the log a Gen AI log event?
        """
        gen_ai_instrumentations = {
            "openinference.instrumentation.langchain",
            "openinference.instrumentation.crewai",
            "opentelemetry.instrumentation.langchain",
            "crewai.telemetry",
            "openlit.otel.tracing",
        }

        return log_data.instrumentation_scope.name in gen_ai_instrumentations

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/logs/otlp_aws_logs_exporter.py

Lines changed: 160 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,41 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
from typing import Dict, Optional
4+
import gzip
5+
import logging
6+
from io import BytesIO
7+
from time import sleep
8+
from typing import Dict, Optional, Sequence
9+
10+
import requests
511

612
from amazon.opentelemetry.distro.exporter.otlp.aws.common.aws_auth_session import AwsAuthSession
13+
from opentelemetry.exporter.otlp.proto.common._internal import (
14+
_create_exp_backoff_generator,
15+
)
16+
from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs
717
from opentelemetry.exporter.otlp.proto.http import Compression
818
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
19+
from opentelemetry.sdk._logs import (
20+
LogData,
21+
)
22+
from opentelemetry.sdk._logs.export import (
23+
LogExportResult,
24+
)
25+
26+
_logger = logging.getLogger(__name__)
927

1028

1129
class OTLPAwsLogExporter(OTLPLogExporter):
30+
_LARGE_LOG_HEADER = "x-aws-truncatable-fields"
31+
_LARGE_GEN_AI_LOG_PATH_HEADER = (
32+
"\\$['resourceLogs'][0]['scopeLogs'][0]['logRecords'][0]['body']"
33+
"['kvlistValue']['values'][*]['value']['kvlistValue']['values'][*]"
34+
"['value']['arrayValue']['values'][*]['kvlistValue']['values'][*]"
35+
"['value']['stringValue']"
36+
)
37+
_RETRY_AFTER_HEADER = "Retry-After" # https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
38+
1239
def __init__(
1340
self,
1441
endpoint: Optional[str] = None,
@@ -18,6 +45,7 @@ def __init__(
1845
headers: Optional[Dict[str, str]] = None,
1946
timeout: Optional[int] = None,
2047
):
48+
self._gen_ai_log_flag = False
2149
self._aws_region = None
2250

2351
if endpoint:
@@ -34,3 +62,134 @@ def __init__(
3462
compression=Compression.Gzip,
3563
session=AwsAuthSession(aws_region=self._aws_region, service="logs"),
3664
)
65+
66+
# https://github.com/open-telemetry/opentelemetry-python/blob/main/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py#L167
def export(self, batch: Sequence[LogData]) -> LogExportResult:
    """
    Exports the given batch of OTLP log data.
    Behaviors of how this export will work -

    1. Always compresses the serialized data into gzip before sending.

    2. If self._gen_ai_log_flag is enabled, the log data is > 1 MB
    and the assumption is that the log is a normalized gen.ai LogEvent.
        - inject the {LARGE_LOG_HEADER} into the header.

    3. Retry behavior is now the following:
        - if the response contains a status code that is retryable and the response contains Retry-After in its
        headers, the serialized data will be exported after that set delay

        - if the response does not contain that Retry-After header, default back to the current iteration of the
        exponential backoff delay
    """

    if self._shutdown:
        _logger.warning("Exporter already shutdown, ignoring batch")
        return LogExportResult.FAILURE

    serialized_data = encode_logs(batch).SerializeToString()

    gzip_data = BytesIO()
    with gzip.GzipFile(fileobj=gzip_data, mode="w") as gzip_stream:
        gzip_stream.write(serialized_data)

    data = gzip_data.getvalue()

    backoff = _create_exp_backoff_generator(max_value=self._MAX_RETRY_TIMEOUT)

    while True:
        resp = self._send(data)

        if resp.ok:
            # The gen-AI flag applies only to the batch just delivered; clear it so
            # subsequent normal-sized batches do not inherit the truncation header
            # (it was previously only cleared on the failure paths).
            self._gen_ai_log_flag = False
            return LogExportResult.SUCCESS

        if not self._retryable(resp):
            _logger.error(
                "Failed to export logs batch code: %s, reason: %s",
                resp.status_code,
                resp.text,
            )
            self._gen_ai_log_flag = False
            return LogExportResult.FAILURE

        # https://opentelemetry.io/docs/specs/otlp/#otlphttp-throttling
        maybe_retry_after = resp.headers.get(self._RETRY_AFTER_HEADER, None)

        # Set the next retry delay to the value of the Retry-After response in the headers.
        # If Retry-After is not present in the headers, default to the next iteration of the
        # exponential backoff strategy.
        delay = self._parse_retryable_header(maybe_retry_after)

        if delay == -1:
            delay = next(backoff, self._MAX_RETRY_TIMEOUT)

        # The backoff generator yields its max value when exhausted; treat that as "give up".
        if delay == self._MAX_RETRY_TIMEOUT:
            _logger.error(
                "Transient error %s encountered while exporting logs batch. "
                "No Retry-After header found and all backoff retries exhausted. "
                "Logs will not be exported.",
                resp.reason,
            )
            self._gen_ai_log_flag = False
            return LogExportResult.FAILURE

        _logger.warning(
            "Transient error %s encountered while exporting logs batch, retrying in %ss.",
            resp.reason,
            delay,
        )

        sleep(delay)
def set_gen_ai_log_flag(self):
    """Mark the pending batch as containing an oversized Gen AI log record.

    Records that the current batch holds a generative-AI log record larger
    than the CloudWatch Logs request limit (1 MB); _send() then attaches the
    truncatable-fields header to the outgoing request.
    """
    self._gen_ai_log_flag = True
def _send(self, serialized_data: bytes):
    """POST the serialized (gzip-compressed) payload to the OTLP logs endpoint.

    Retries exactly once on a connection error. When the gen-AI flag is set,
    attaches the truncatable-fields header so CloudWatch may truncate the
    oversized record instead of rejecting the request.

    Args:
        serialized_data: The compressed, serialized OTLP logs payload.

    Returns:
        The ``requests.Response`` from the (possibly retried) POST.
    """
    # Compute the header mapping once; both the first attempt and the single
    # retry must send identical requests.
    headers = {self._LARGE_LOG_HEADER: self._LARGE_GEN_AI_LOG_PATH_HEADER} if self._gen_ai_log_flag else None

    def _post():
        # One-line helper so the retry path cannot drift from the first attempt
        # (previously the six-argument call was duplicated verbatim).
        return self._session.post(
            url=self._endpoint,
            headers=headers,
            data=serialized_data,
            verify=self._certificate_file,
            timeout=self._timeout,
            cert=self._client_cert,
        )

    try:
        return _post()
    except ConnectionError:
        # NOTE(review): requests raises requests.exceptions.ConnectionError, which is
        # NOT a subclass of the builtin ConnectionError caught here — confirm whether
        # this except clause ever fires, or whether requests.exceptions.ConnectionError
        # was intended.
        return _post()
@staticmethod
def _retryable(resp: requests.Response) -> bool:
    """Return True when the response status warrants a retry.

    HTTP 429 (throttled) and 503 (unavailable) are retryable per the OTLP/HTTP
    throttling spec, in addition to whatever the base OTLP exporter already
    treats as retryable.
    """
    if resp.status_code in (429, 503):
        return True
    return OTLPLogExporter._retryable(resp)
182+
@staticmethod
183+
def _parse_retryable_header(retry_header: Optional[str]) -> float:
184+
"""
185+
Converts the given retryable header into a delay in seconds, returns -1 if there's no header
186+
or error with the parsing
187+
"""
188+
if not retry_header:
189+
return -1
190+
191+
try:
192+
val = float(retry_header)
193+
return val if val >= 0 else -1
194+
except ValueError:
195+
return -1

0 commit comments

Comments
 (0)