diff --git a/changelog.d/aws_s3_source_exponential_backoff.enhancement.md b/changelog.d/aws_s3_source_exponential_backoff.enhancement.md
new file mode 100644
index 0000000000000..fa2b139dde6ba
--- /dev/null
+++ b/changelog.d/aws_s3_source_exponential_backoff.enhancement.md
@@ -0,0 +1,5 @@
+The `aws_s3` source now uses exponential backoff when retrying failed SQS `receive_message` operations. Previously, the source used a fixed 500ms delay between retries.
+
+The new behavior starts at 500ms and doubles with each consecutive failure, capping at 30 seconds. This prevents excessive API calls during prolonged AWS SQS outages, invalid IAM permissions, or throttling scenarios, while still being responsive when the service recovers.
+
+authors: medzin pront
diff --git a/src/common/backoff.rs b/src/common/backoff.rs
index cbf9e275cbe25..94ed2ff7e4936 100644
--- a/src/common/backoff.rs
+++ b/src/common/backoff.rs
@@ -79,3 +79,44 @@ impl Iterator for ExponentialBackoff {
         Some(duration)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_exponential_backoff_sequence() {
+        let mut backoff = ExponentialBackoff::from_millis(2)
+            .factor(250)
+            .max_delay(Duration::from_secs(30));
+
+        let expected_delays = [
+            Duration::from_millis(500), // 2 * 250
+            Duration::from_secs(1),     // 4 * 250
+            Duration::from_secs(2),     // 8 * 250
+            Duration::from_secs(4),     // 16 * 250
+            Duration::from_secs(8),     // 32 * 250
+            Duration::from_secs(16),    // 64 * 250
+            Duration::from_secs(30),    // 128 * 250 = 32s, capped at 30s
+            Duration::from_secs(30),    // Should stay capped
+        ];
+
+        for expected in expected_delays.iter() {
+            let actual = backoff.next().unwrap();
+            assert_eq!(actual, *expected);
+        }
+    }
+
+    #[test]
+    fn test_backoff_reset() {
+        let mut backoff = ExponentialBackoff::from_millis(2)
+            .factor(250)
+            .max_delay(Duration::from_secs(30));
+        for _ in 0..2 {
+            backoff.next();
+        }
+        assert_eq!(backoff.next().unwrap(), Duration::from_secs(2));
+        backoff.reset();
+        assert_eq!(backoff.next().unwrap(), Duration::from_millis(500));
+    }
+}
diff --git a/src/sources/aws_s3/sqs.rs b/src/sources/aws_s3/sqs.rs
index 1274631298f59..81ef9e178c5b2 100644
--- a/src/sources/aws_s3/sqs.rs
+++ b/src/sources/aws_s3/sqs.rs
@@ -4,6 +4,7 @@ use std::{
     num::NonZeroUsize,
     panic,
     sync::{Arc, LazyLock},
+    time::Duration,
 };
 
 use aws_sdk_s3::{Client as S3Client, operation::get_object::GetObjectError};
@@ -43,6 +44,7 @@ use crate::{
     SourceSender,
     aws::AwsTimeout,
     codecs::Decoder,
+    common::backoff::ExponentialBackoff,
     config::{SourceAcknowledgementsConfig, SourceContext},
     event::{BatchNotifier, BatchStatus, EstimatedJsonEncodedSizeOf, Event, LogEvent},
     internal_events::{
@@ -381,6 +383,7 @@ pub struct IngestorProcess {
     log_namespace: LogNamespace,
     bytes_received: Registered<BytesReceived>,
     events_received: Registered<EventsReceived>,
+    backoff: ExponentialBackoff,
 }
 
 impl IngestorProcess {
@@ -399,6 +402,9 @@ impl IngestorProcess {
             log_namespace,
             bytes_received: register!(BytesReceived::from(Protocol::HTTP)),
             events_received: register!(EventsReceived),
+            backoff: ExponentialBackoff::from_millis(2)
+                .factor(250)
+                .max_delay(Duration::from_secs(30)),
         }
     }
 
@@ -409,23 +415,39 @@ impl IngestorProcess {
         loop {
             select! {
                 _ = &mut shutdown => break,
-                _ = self.run_once() => {},
+                result = self.run_once() => {
+                    match result {
+                        Ok(()) => {
+                            // Reset backoff on successful receive
+                            self.backoff.reset();
+                        }
+                        Err(_) => {
+                            let delay = self.backoff.next().expect("backoff never ends");
+                            trace!(
+                                delay_ms = delay.as_millis(),
+                                "`run_once` failed, will retry after delay.",
+                            );
+                            tokio::time::sleep(delay).await;
+                        }
+                    }
+                },
             }
         }
     }
 
-    async fn run_once(&mut self) {
-        let messages = self.receive_messages().await;
-        let messages = messages
-            .inspect(|messages| {
+    async fn run_once(&mut self) -> Result<(), ()> {
+        let messages = match self.receive_messages().await {
+            Ok(messages) => {
                 emit!(SqsMessageReceiveSucceeded {
                     count: messages.len(),
                 });
-            })
-            .inspect_err(|err| {
-                emit!(SqsMessageReceiveError { error: err });
-            })
-            .unwrap_or_default();
+                messages
+            }
+            Err(err) => {
+                emit!(SqsMessageReceiveError { error: &err });
+                return Err(());
+            }
+        };
 
         let mut delete_entries = Vec::new();
         let mut deferred_entries = Vec::new();
@@ -521,7 +543,7 @@ impl IngestorProcess {
                     message = "Deferred queue not configured, but received deferred entries.",
                     internal_log_rate_limit = true
                 );
-                return;
+                return Ok(());
             };
             let cloned_entries = deferred_entries.clone();
             match self
@@ -576,6 +598,7 @@ impl IngestorProcess {
                 }
             }
         }
+        Ok(())
     }
 
     async fn handle_sqs_message(&mut self, message: Message) -> Result<(), ProcessingError> {
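For quick reference, here is a minimal, self-contained sketch (not part of the patch above) of the delay schedule the changelog entry describes: start at 500ms, double on each consecutive failure, cap at 30 seconds. The `retry_delay` helper and the `main` driver are illustrative assumptions only; the actual source relies on the crate's `ExponentialBackoff` iterator configured as `from_millis(2).factor(250).max_delay(Duration::from_secs(30))`, as shown in the diff.

```rust
use std::time::Duration;

// Illustration only: reproduces the retry schedule described in the changelog
// (500ms start, doubling per consecutive failure, capped at 30s). The real
// implementation is the `ExponentialBackoff` iterator in `src/common/backoff.rs`.
fn retry_delay(consecutive_failures: u32) -> Duration {
    let base = Duration::from_millis(500);
    let cap = Duration::from_secs(30);
    // 500ms * 2^(failures - 1); the shift is clamped so large counts don't overflow.
    let exponent = consecutive_failures.saturating_sub(1).min(10);
    let delay = base.saturating_mul(1u32 << exponent);
    delay.min(cap)
}

fn main() {
    // Prints 500ms, 1s, 2s, 4s, 8s, 16s, 30s, 30s -- the same sequence that
    // `test_exponential_backoff_sequence` asserts for the configured backoff.
    for n in 1..=8 {
        println!("failure {n}: {:?}", retry_delay(n));
    }
}
```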