2 changes: 1 addition & 1 deletion opentelemetry-otlp/src/exporter/http/mod.rs
@@ -322,7 +322,7 @@
logs: LogBatch<'_>,
) -> opentelemetry_sdk::logs::LogResult<(Vec<u8>, &'static str)> {
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
let resource_logs = group_logs_by_resource_and_scope(logs, &self.resource);
let resource_logs = group_logs_by_resource_and_scope(&logs, &self.resource);

let req = ExportLogsServiceRequest { resource_logs };

match self.protocol {
66 changes: 43 additions & 23 deletions opentelemetry-otlp/src/exporter/tonic/logs.rs
@@ -1,3 +1,4 @@
use std::sync::Arc;
use core::fmt;
use opentelemetry::otel_debug;
use opentelemetry_proto::tonic::collector::logs::v1::{
@@ -12,6 +13,8 @@
use super::BoxInterceptor;
use tokio::sync::Mutex;

use crate::retry::{retry_with_exponential_backoff, RetryPolicy};

pub(crate) struct TonicLogsClient {
inner: Option<ClientInner>,
#[allow(dead_code)]
@@ -57,33 +60,50 @@

impl LogExporter for TonicLogsClient {
async fn export(&self, batch: LogBatch<'_>) -> OTelSdkResult {
let (mut client, metadata, extensions) = match &self.inner {
Some(inner) => {
let (m, e, _) = inner
.interceptor
.lock()
.await // tokio::sync::Mutex doesn't return a poisoned error, so we can safely use the interceptor here
.call(Request::new(()))
.map_err(|e| OTelSdkError::InternalFailure(format!("error: {:?}", e)))?
.into_parts();
(inner.client.clone(), m, e)
}
None => return Err(OTelSdkError::AlreadyShutdown),
let policy = RetryPolicy {
max_retries: 3,
initial_delay_ms: 100,
max_delay_ms: 1600,
jitter_ms: 100,

};

let resource_logs = group_logs_by_resource_and_scope(batch, &self.resource);
let batch = Arc::new(batch); // Wrap the batch in an Arc so the retry closure can clone and reuse it across attempts

retry_with_exponential_backoff(policy, "TonicLogsClient.Export", {
let batch = Arc::clone(&batch);
move || {
let batch = Arc::clone(&batch); // Clone the Arc inside the closure
Box::pin(async move {
let (mut client, metadata, extensions) = match &self.inner {
Some(inner) => {
let (m, e, _) = inner
.interceptor
.lock()
.await // tokio::sync::Mutex doesn't return a poisoned error, so we can safely use the interceptor here
.call(Request::new(()))
.map_err(|e| OTelSdkError::InternalFailure(format!("error: {:?}", e)))?
.into_parts();
(inner.client.clone(), m, e)

}
None => return Err(OTelSdkError::AlreadyShutdown),

};

otel_debug!(name: "TonicsLogsClient.CallingExport");
let resource_logs = group_logs_by_resource_and_scope(&*batch, &self.resource);

client
.export(Request::from_parts(
metadata,
extensions,
ExportLogsServiceRequest { resource_logs },
))
.await
.map_err(|e| OTelSdkError::InternalFailure(format!("export error: {:?}", e)))?;
Ok(())
otel_debug!(name: "TonicsLogsClient.CallingExport");

client
.export(Request::from_parts(
metadata,
extensions,
ExportLogsServiceRequest { resource_logs },
))
.await
.map(|_| ()) // Map the successful result to Ok(())
.map_err(|e| OTelSdkError::InternalFailure(format!("export error: {:?}", e)))
})
}
}).await

}

fn shutdown(&mut self) -> OTelSdkResult {
3 changes: 0 additions & 3 deletions opentelemetry-otlp/src/exporter/tonic/mod.rs
@@ -498,9 +498,6 @@ mod tests {
#[test]
#[cfg(feature = "gzip-tonic")]
fn test_with_gzip_compression() {
// metadata should merge with the current one with priority instead of just replacing it
let mut metadata = MetadataMap::new();
metadata.insert("foo", "bar".parse().unwrap());
let builder = TonicExporterBuilder::default().with_compression(Compression::Gzip);
assert_eq!(builder.tonic_config.compression.unwrap(), Compression::Gzip);
}
1 change: 1 addition & 0 deletions opentelemetry-otlp/src/lib.rs
@@ -220,6 +220,7 @@ mod metric;
#[cfg(feature = "trace")]
#[cfg(any(feature = "http-proto", feature = "http-json", feature = "grpc-tonic"))]
mod span;
mod retry;

pub use crate::exporter::Compression;
pub use crate::exporter::ExportConfig;
147 changes: 147 additions & 0 deletions opentelemetry-otlp/src/retry.rs
@@ -0,0 +1,147 @@
use std::future::Future;
use std::time::{Duration, SystemTime};
use opentelemetry::otel_warn;
use tokio::time::sleep;
@lalitb (Member) commented on Feb 28, 2025:

The tokio runtime won't be available here. Should we move this code under exporter/tonic, or, if we want to keep it generic, one option could be to make the delay function configurable as an argument to retry_with_exponential_backoff, so that the retry function is callable from both async and blocking code.

The PR author (Contributor) replied:

The idea is for the retry logic to be generic and shared across all the OTLP exporters. Going to try that approach first and avoid the direct tokio dependency.
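To make the reviewer's suggestion concrete, here is a minimal editorial sketch (not code from this PR) of a runtime-agnostic variant in which the caller injects the sleep behaviour; the name retry_with_backoff, its signature, and the omission of jitter are all assumptions made for illustration:

use std::future::Future;
use std::time::Duration;

// Hypothetical sketch: the caller supplies the sleep future (e.g. tokio::time::sleep
// from async code), so this helper carries no direct tokio dependency.
// Jitter is left out to keep the sketch short.
pub async fn retry_with_backoff<Op, OpFut, Sleep, SleepFut, T, E>(
    max_retries: usize,
    initial_delay_ms: u64,
    max_delay_ms: u64,
    mut sleep: Sleep,
    mut operation: Op,
) -> Result<T, E>
where
    Op: FnMut() -> OpFut,
    OpFut: Future<Output = Result<T, E>>,
    Sleep: FnMut(Duration) -> SleepFut,
    SleepFut: Future<Output = ()>,
{
    let mut delay = initial_delay_ms;
    for _ in 0..max_retries {
        match operation().await {
            Ok(value) => return Ok(value),
            Err(_) => {
                // Wait before the next attempt, then double the delay up to the cap.
                sleep(Duration::from_millis(delay)).await;
                delay = (delay * 2).min(max_delay_ms);
            }
        }
    }
    // Final attempt; its result (success or error) goes back to the caller unchanged.
    operation().await
}

A tokio-based exporter could call this as retry_with_backoff(3, 100, 1600, |d| tokio::time::sleep(d), || async { do_export().await }).await, where do_export stands in for the actual export call, while a blocking exporter could pass a sleep closure backed by std::thread::sleep and drive the returned future with whatever minimal executor it already uses.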


pub(crate) struct RetryPolicy {
pub max_retries: usize,
pub initial_delay_ms: u64,
pub max_delay_ms: u64,
pub jitter_ms: u64,
}

// Generates a random jitter value up to max_jitter
fn generate_jitter(max_jitter: u64) -> u64 {
let now = SystemTime::now();
let nanos = now.duration_since(SystemTime::UNIX_EPOCH).unwrap().subsec_nanos();
nanos as u64 % (max_jitter + 1)
}

// Retries the given operation with exponential backoff and jitter
pub(crate) async fn retry_with_exponential_backoff<F, Fut, T, E>(
policy: RetryPolicy,
operation_name: &str,
mut operation: F,
) -> Result<T, E>
where
F: FnMut() -> Fut,
E: std::fmt::Debug,
Fut: Future<Output = Result<T, E>>,
{
let mut attempt = 0;
let mut delay = policy.initial_delay_ms;

loop {
match operation().await {
Ok(result) => return Ok(result), // Return the result if the operation succeeds
Err(err) if attempt < policy.max_retries => {
attempt += 1;
// Log the error and retry after a delay with jitter
otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err));
Reviewer suggestion (Member): log the retry at debug rather than warn level.

Suggested change
otel_warn!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err));
otel_debug!(name: "OtlpRetry", message = format!("Retrying operation {:?} due to error: {:?}", operation_name, err));

let jitter = generate_jitter(policy.jitter_ms);
let delay_with_jitter = std::cmp::min(delay + jitter, policy.max_delay_ms);
sleep(Duration::from_millis(delay_with_jitter)).await;
delay = std::cmp::min(delay * 2, policy.max_delay_ms); // Exponential backoff
}
Err(err) => return Err(err), // Return the error if max retries are reached
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use tokio::time::timeout;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;

// Test to ensure generate_jitter returns a value within the expected range
#[tokio::test]
async fn test_generate_jitter() {
let max_jitter = 100;
let jitter = generate_jitter(max_jitter);
assert!(jitter <= max_jitter);
}

// Test to ensure retry_with_exponential_backoff succeeds on the first attempt
#[tokio::test]
async fn test_retry_with_exponential_backoff_success() {
let policy = RetryPolicy {
max_retries: 3,
initial_delay_ms: 100,
max_delay_ms: 1600,
jitter_ms: 100,
};

let result = retry_with_exponential_backoff(policy, "test_operation", || {
Box::pin(async { Ok::<_, ()>("success") })
}).await;

assert_eq!(result, Ok("success"));
}

// Test to ensure retry_with_exponential_backoff retries the operation and eventually succeeds
#[tokio::test]
async fn test_retry_with_exponential_backoff_retries() {
let policy = RetryPolicy {
max_retries: 3,
initial_delay_ms: 100,
max_delay_ms: 1600,
jitter_ms: 100,
};

let attempts = AtomicUsize::new(0);

let result = retry_with_exponential_backoff(policy, "test_operation", || {
let attempt = attempts.fetch_add(1, Ordering::SeqCst);
Box::pin(async move {
if attempt < 2 {
Err::<&str, &str>("error") // Fail the first two attempts
} else {
Ok::<&str, &str>("success") // Succeed on the third attempt
}
})
}).await;

assert_eq!(result, Ok("success"));
assert_eq!(attempts.load(Ordering::SeqCst), 3); // Ensure there were 3 attempts
}

// Test to ensure retry_with_exponential_backoff fails after max retries
#[tokio::test]
async fn test_retry_with_exponential_backoff_failure() {
let policy = RetryPolicy {
max_retries: 3,
initial_delay_ms: 100,
max_delay_ms: 1600,
jitter_ms: 100,
};

let attempts = AtomicUsize::new(0);

let result = retry_with_exponential_backoff(policy, "test_operation", || {
attempts.fetch_add(1, Ordering::SeqCst);
Box::pin(async { Err::<(), _>("error") }) // Always fail
}).await;

assert_eq!(result, Err("error"));
assert_eq!(attempts.load(Ordering::SeqCst), 4); // Ensure there were 4 attempts (initial + 3 retries)
}

// Test to ensure retry_with_exponential_backoff respects the timeout
#[tokio::test]
async fn test_retry_with_exponential_backoff_timeout() {
let policy = RetryPolicy {
max_retries: 12, // Increase the number of retries
initial_delay_ms: 100,
max_delay_ms: 1600,
jitter_ms: 100,
};

let result = timeout(Duration::from_secs(1), retry_with_exponential_backoff(policy, "test_operation", || {
Box::pin(async { Err::<(), _>("error") }) // Always fail
})).await;

assert!(result.is_err()); // Ensure the operation times out
}
}
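As a quick editorial illustration (not part of the diff), the policy used above in TonicLogsClient::export (max_retries: 3, initial_delay_ms: 100, max_delay_ms: 1600, jitter_ms: 100) produces a doubling wait between attempts, each wait extended by up to 100 ms of jitter and capped at 1600 ms:

fn main() {
    let (initial_ms, max_ms, max_retries) = (100u64, 1600u64, 3);
    let mut delay = initial_ms;
    for retry in 1..=max_retries {
        // Each retry sleeps `delay` ms plus up to 100 ms of jitter before re-exporting.
        println!("retry {retry}: ~{delay} ms (+ jitter)");
        delay = (delay * 2).min(max_ms);
    }
    // Prints: retry 1: ~100 ms, retry 2: ~200 ms, retry 3: ~400 ms.
    // After the third failed retry (fourth attempt overall) the last error is returned.
}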
@@ -13,7 +13,7 @@
//! Only a single test suite can run at once, as each container has statically mapped ports, but
//! this works nicely with the way cargo executes the suite.
//!
//! To skip integration tests with cargo, you can run `cargo test --mod`, which will run unit tests
//! To skip integration tests with cargo, you can run `cargo test --lib`, which will run unit tests
//! only.
//!
#![cfg(unix)]
8 changes: 4 additions & 4 deletions opentelemetry-proto/src/transform/logs.rs
@@ -164,8 +164,8 @@ pub mod tonic {
}
}

pub fn group_logs_by_resource_and_scope(
logs: LogBatch<'_>,
pub fn group_logs_by_resource_and_scope<'a>(
logs: &'a LogBatch<'a>,
resource: &ResourceAttributesWithSchema,
) -> Vec<ResourceLogs> {
// Group logs by target or instrumentation name
@@ -273,7 +273,7 @@ mod tests {
let resource: ResourceAttributesWithSchema = (&resource).into(); // Convert Resource to ResourceAttributesWithSchema

let grouped_logs =
crate::transform::logs::tonic::group_logs_by_resource_and_scope(log_batch, &resource);
crate::transform::logs::tonic::group_logs_by_resource_and_scope(&log_batch, &resource);

assert_eq!(grouped_logs.len(), 1);
let resource_logs = &grouped_logs[0];
@@ -293,7 +293,7 @@
let log_batch = LogBatch::new(&logs);
let resource: ResourceAttributesWithSchema = (&resource).into(); // Convert Resource to ResourceAttributesWithSchema
let grouped_logs =
crate::transform::logs::tonic::group_logs_by_resource_and_scope(log_batch, &resource);
crate::transform::logs::tonic::group_logs_by_resource_and_scope(&log_batch, &resource);

assert_eq!(grouped_logs.len(), 1);
let resource_logs = &grouped_logs[0];