Commit 5a04899

Implement Collect-Then-Send pattern to eliminate race conditions between file Create and Remove events that could cause duplicate FileTrackers or reading from already-removed files.
1 parent 77d9848 commit 5a04899
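The Collect-Then-Send idea named in the commit message can be sketched as follows. This is a minimal illustration with hypothetical types (a plain `Mutex<HashMap>` standing in for the real FileTracker registry in `src/watcher.rs`): while the lock is held, tracker state is mutated and the events to emit are only *collected*; they are sent after the lock is released. Because the state change and the decision of what to send happen atomically under one lock, a racing Remove can never interleave half-way through a Create (no duplicate trackers, no reads from already-removed files).

```rust
use std::collections::HashMap;
use std::sync::Mutex;

// Hypothetical event type for illustration only.
#[derive(Debug, PartialEq)]
enum FileEvent {
    Create(String),
    Remove(String),
}

struct Watcher {
    // Stand-in for the real FileTracker registry, keyed by path.
    trackers: Mutex<HashMap<String, ()>>,
}

impl Watcher {
    // Collect-Then-Send: decide what to emit while holding the lock,
    // return the collected events, and let the caller send them after
    // the lock guard has been dropped.
    fn handle_create_event(&self, path: &str) -> Vec<FileEvent> {
        let mut to_send = Vec::new();
        {
            let mut trackers = self.trackers.lock().unwrap();
            // Ignore a duplicate Create for an already-tracked file,
            // so no second FileTracker can be registered.
            if !trackers.contains_key(path) {
                trackers.insert(path.to_string(), ());
                to_send.push(FileEvent::Create(path.to_string()));
            }
        } // lock released here, before any sending happens
        to_send
    }

    fn handle_remove_event(&self, path: &str) -> Vec<FileEvent> {
        let mut to_send = Vec::new();
        {
            let mut trackers = self.trackers.lock().unwrap();
            // Only emit Remove if the file was actually tracked;
            // the tracker is gone before anyone can read from it.
            if trackers.remove(path).is_some() {
                to_send.push(FileEvent::Remove(path.to_string()));
            }
        }
        to_send
    }
}
```

Sending outside the lock matters: if the send were done while the lock was held, a slow or blocking channel would serialize all event handling behind it; collecting first keeps the critical section short while still making Create/Remove ordering atomic.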

File tree

6 files changed: +577 additions, −93 deletions

Makefile

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ docker_down dd:
 	docker-compose down
 
 # Build and push Docker image with version from VERSION file
-docker-build db: increment-version
+docker-build db:
 	@NEW_VERSION=$$(cat VERSION); \
 	echo "Building with version: $$NEW_VERSION"; \
 	docker build . -t soulgarden/logfowd2:$$NEW_VERSION -t soulgarden/logfowd2:latest --platform linux/amd64; \

README.md

Lines changed: 8 additions & 7 deletions
@@ -1,8 +1,8 @@
 # logfowd2
 
 ![Tests and linters](https://github.com/soulgarden/logfowd2/actions/workflows/main.yml/badge.svg)
-[![Version](https://img.shields.io/badge/version-0.1.0-blue.svg)](https://github.com/soulgarden/logfowd2)
-[![Tests](https://img.shields.io/badge/tests-269%20passing-success.svg)](https://github.com/soulgarden/logfowd2)
+[![Version](https://img.shields.io/badge/version-0.4.0-blue.svg)](https://github.com/soulgarden/logfowd2)
+[![Tests](https://img.shields.io/badge/tests-293%20passing-success.svg)](https://github.com/soulgarden/logfowd2)
 [![Code Quality](https://img.shields.io/badge/linter-zero%20warnings-success.svg)](https://github.com/soulgarden/logfowd2)
 
 **High-performance Kubernetes log forwarding tool built with Rust**
@@ -28,7 +28,7 @@ Logfowd2 is a memory-efficient log forwarding daemon designed for Kubernetes env
 ### Advanced System Optimization
 - **MetadataCache System** - High-performance file metadata caching with TTL-based eviction (100ms TTL, LRU)
 - **Intelligent Retry Management** - Universal exponential backoff retry mechanism for all async operations
-- **Lock Optimization** - Drop/reacquire pattern minimizes lock contention and improves concurrency
+- **Atomic Event Handling** - Collect-Then-Send pattern eliminates race conditions in file event processing
 - **Event-Driven File Monitoring** - Uses filesystem events for instant rotation detection
 - **Historical Log Recovery** - Reads existing log content on startup (no data loss)
 - **Symlink Support** - Full support for Kubernetes symlinked log files
@@ -40,7 +40,7 @@ Logfowd2 is a memory-efficient log forwarding daemon designed for Kubernetes env
 
 logfowd2 is built with Domain-Driven Design (DDD) principles:
 - **Modular design** - Clear separation between domain, infrastructure, and transport layers
-- **Comprehensive testing** - 269 tests covering all critical paths
+- **Comprehensive testing** - 293 tests covering all critical paths
 - **Type safety** - Leverages Rust's type system for compile-time guarantees
 - **Extensible architecture** - Ready for parallel file processing and custom extensions
 
@@ -81,6 +81,7 @@ Buffer Management State Persist RetryMana
 #### Watcher (`src/watcher.rs`)
 - **Purpose**: Monitors `/var/log/pods` recursively using filesystem events
 - **NotifyBridge Integration**: Uses NotifyBridge to prevent filesystem notify callback blocking
+- **Atomic Event Handling**: Collect-Then-Send pattern via `handle_create_event`/`handle_remove_event` eliminates race conditions
 - **File Tracking**: Advanced FileTracker with symlink and rapid rotation support, leveraging MetadataCache
 - **Metadata Parsing**: Extracts Kubernetes metadata (namespace, pod, container) from log paths
 - **Initial Sync**: Processes existing files on startup with position restoration
@@ -140,7 +141,7 @@ Buffer Management State Persist RetryMana
 - **Parallel ES Workers**: Concurrent bulk operations with configurable pool sizing
 - **Adaptive Batching**: Size and time-based flushing with backpressure awareness
 - **Memory Streaming**: Bounded buffer architecture prevents memory growth
-- **Advanced Lock Optimization**: Drop/reacquire pattern minimizes lock contention during I/O operations
+- **Atomic Event Handling**: Collect-Then-Send pattern eliminates race conditions while maintaining high throughput
 
 ### Resource Efficiency
 - **Ultra-Low Memory Baseline**: 30-50Mi baseline memory usage
@@ -164,7 +165,7 @@ Buffer Management State Persist RetryMana
 ## 🧪 Code Quality & Testing
 
 ### Test Coverage
-- **263 Comprehensive Tests**: Unit, integration, and edge case coverage
+- **293 Comprehensive Tests**: Unit, integration, and edge case coverage
 - **Domain Testing**: File rotation, symlinks, corrupted files, permission issues
 - **Network Testing**: Circuit breaker, retry logic, timeout behavior
 - **Memory Testing**: Backpressure, channel overflow, cache eviction
@@ -181,7 +182,7 @@ Buffer Management State Persist RetryMana
 
 ### Prerequisites
 - **Platform**: Linux/Unix only (uses `std::os::unix` APIs and Unix signals)
-- **Rust Toolchain**: 1.85+ (required for Rust 2024 edition support)
+- **Rust Toolchain**: 1.91+ (required for Rust 2024 edition support)
 - **Kubernetes**: 1.14+ with `/var/log/pods` access
 - **Elasticsearch**: 7.x+ or ZincSearch compatible target

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.3.0
+0.4.0

src/infrastructure/elasticsearch/dead_letter_queue.rs

Lines changed: 15 additions & 4 deletions
@@ -172,7 +172,9 @@ impl DeadLetterQueue {
         if returned_count > 0 || permanently_failed_count > 0 {
             debug!(
                 "Returned {} events to DLQ, {} permanently failed (queue size: {})",
-                returned_count, permanently_failed_count, queue.len()
+                returned_count,
+                permanently_failed_count,
+                queue.len()
             );
         }
     }
@@ -1144,7 +1146,10 @@ mod tests {
 
         // Stats should be updated
         let stats = dlq.stats.read().await;
-        assert_eq!(stats.events_in_queue, 2, "Stats should reflect remaining events");
+        assert_eq!(
+            stats.events_in_queue, 2,
+            "Stats should reflect remaining events"
+        );
         assert_eq!(stats.events_retried, 3, "Should track retried events");
     }
 
@@ -1239,7 +1244,10 @@ mod tests {
 
         // Stats should be updated
        let stats = dlq.stats.read().await;
-        assert_eq!(stats.events_in_queue, 3, "Stats should reflect returned events");
+        assert_eq!(
+            stats.events_in_queue, 3,
+            "Stats should reflect returned events"
+        );
     }
 
     #[tokio::test]
@@ -1304,7 +1312,10 @@ mod tests {
     async fn test_max_retry_config_default() {
         // TDD: Test that default config has max_retry_count
         let config = DeadLetterQueueConfig::default();
-        assert_eq!(config.max_retry_count, 5, "Default max_retry_count should be 5");
+        assert_eq!(
+            config.max_retry_count, 5,
+            "Default max_retry_count should be 5"
+        );
     }
 
     #[tokio::test]

src/sender.rs

Lines changed: 27 additions & 10 deletions
@@ -172,7 +172,10 @@ impl Sender {
             }
             Err(crate::transport::channels::SendError::Closed(returned_events)) => {
                 // Channel closed - return events for later retry
-                warn!("ES channel closed, {} events will be retried", returned_events.len());
+                warn!(
+                    "ES channel closed, {} events will be retried",
+                    returned_events.len()
+                );
                 if self.metrics_enabled {
                     metrics()
                         .errors_total
@@ -1203,11 +1206,18 @@ mod tests {
             .await
             .expect("Should receive second batch - events must NOT be lost!")
             .unwrap();
-        assert_eq!(batch2.len(), 5, "Second batch should have 5 events - none lost!");
+        assert_eq!(
+            batch2.len(),
+            5,
+            "Second batch should have 5 events - none lost!"
+        );
 
         // Verify total events
         let total_events = batch1.len() + batch2.len();
-        assert_eq!(total_events, 10, "All 10 events must be delivered, none lost!");
+        assert_eq!(
+            total_events, 10,
+            "All 10 events must be delivered, none lost!"
+        );
 
         shutdown.notify_one();
         let _ = timeout(Duration::from_millis(500), sender_handle).await;
@@ -1227,17 +1237,18 @@ mod tests {
         es_sender.send(batch1).await.unwrap();
 
         // Now channel is full - try_send should return the events, not lose them
-        let batch2 = vec![
-            create_test_event("second_1"),
-            create_test_event("second_2"),
-        ];
+        let batch2 = vec![create_test_event("second_1"), create_test_event("second_2")];
 
         // This should fail with Full error and return our events
         let result = es_sender.try_send(batch2);
 
         match result {
             Err(SendError::Full(returned_events)) => {
-                assert_eq!(returned_events.len(), 2, "Events must be returned, not lost!");
+                assert_eq!(
+                    returned_events.len(),
+                    2,
+                    "Events must be returned, not lost!"
+                );
                 assert_eq!(returned_events[0].message, "second_1");
                 assert_eq!(returned_events[1].message, "second_2");
             }
@@ -1298,7 +1309,10 @@ mod tests {
         })
         .await;
 
-        assert!(sender_result.is_ok(), "Sender should complete within timeout");
+        assert!(
+            sender_result.is_ok(),
+            "Sender should complete within timeout"
+        );
 
         // Wait for sender to finish
         let _ = timeout(Duration::from_secs(1), sender_handle).await;
@@ -1426,7 +1440,10 @@ mod tests {
 
         // Sender should complete (not hang forever) - within retry timeout
         let result = timeout(Duration::from_secs(6), sender_handle).await;
-        assert!(result.is_ok(), "Sender should not hang when ES channel is closed");
+        assert!(
+            result.is_ok(),
+            "Sender should not hang when ES channel is closed"
+        );
     }
 
     #[tokio::test]

0 commit comments
