[ AGNTLOG-462 ] Fix auditor Flush() race condition during transport restart (#46882)

ddrthall · web-flow · commit 62bf5e55c252 · 2026-02-24T21:52:25.000Z
## What does this PR do?

In response to flaky CI test failures in `TestRestartTestSuite`, this PR fixes a race condition in the auditor's `Flush()` method that caused stale offsets to be written to disk during transport restarts.

Previously, `Flush()` wrote the in-memory registry directly, missing payloads that destinations had already sent to the auditor channel but the `run()` goroutine hadn't consumed yet. This caused stale offsets on disk after `partialStop()`, leading to duplicate log processing after a TCP-to-HTTP transport restart.

`Flush()` now sends a synchronous request through the auditor's `run()` goroutine event loop. The goroutine drains buffered `inputChan` payloads (bounded by a `len()` snapshot), updates the in-memory registry, then writes to disk. When the auditor is stopped, it falls back to a direct `flushRegistry()` call.

## Motivation

Resolves AGNTLOG-462. `TestRestartTestSuite` sub-tests (`TestPartialStop_FlushesRegistryToDisk`, `TestRestart_FlushesAuditor`) were failing intermittently on macOS ARM64 and IoT Linux x64 CI runners. Investigation revealed the test failures exposed a real production race condition: the `LogsSent` metric is incremented by destinations before payloads reach the auditor's `inputChan`, so `partialStop()` calling `Flush()` immediately after stopping destinations could write a registry missing the latest offsets.

## Describe how you validated your changes

This change was validated using the existing automated tests.

## Additional Notes

- `Flush()` is now blocking (waits for the `run()` goroutine to complete drain + write). All existing callers already treated it as synchronous.
- Must not be called concurrently with `Stop()` (not a new constraint -- this was already the case).

Co-authored-by: ryan.hall <ryan.hall@datadoghq.com>
diff --git a/comp/logs/auditor/impl/auditor.go b/comp/logs/auditor/impl/auditor.go
@@ -54,6 +54,7 @@ type registryAuditor struct {
 	kubeHealthRegistrar kubehealthdef.Component
 	chansMutex          sync.Mutex
 	inputChan           chan *message.Payload
+	flushRequestChan    chan chan struct{}
 	registry            map[string]*RegistryEntry
 	tailedSources       map[string]bool
 	registryPath        string
@@ -136,19 +137,34 @@ func (a *registryAuditor) Stop() {
 	}
 }
 
-// Flush immediately writes the current registry to disk.
-// This is useful to ensure all file positions are committed before a restart,
-// preventing duplicate log processing.
+// Flush drains all pending payloads from the input channel, updates the
+// in-memory registry, then writes it to disk. It blocks until complete.
+// When the auditor is stopped (run loop not active), it falls back to
+// writing the current in-memory registry directly.
+//
+// Must not be called concurrently with Stop.
 func (a *registryAuditor) Flush() {
-	if err := a.flushRegistry(); err != nil {
-		a.log.Warnf("Failed to flush auditor registry: %v", err)
+	a.chansMutex.Lock()
+	reqChan := a.flushRequestChan
+	a.chansMutex.Unlock()
+
+	if reqChan == nil {
+		if err := a.flushRegistry(); err != nil {
+			a.log.Warnf("Failed to flush auditor registry: %v", err)
+		}
+		return
 	}
+
+	done := make(chan struct{})
+	reqChan <- done
+	<-done
 }
 
 func (a *registryAuditor) createChannels() {
 	a.chansMutex.Lock()
 	defer a.chansMutex.Unlock()
 	a.inputChan = make(chan *message.Payload, a.messageChannelSize)
+	a.flushRequestChan = make(chan chan struct{})
 	a.done = make(chan struct{})
 }
 
@@ -164,6 +180,7 @@ func (a *registryAuditor) closeChannels() {
 		a.done = nil
 	}
 	a.inputChan = nil
+	a.flushRequestChan = nil
 }
 
 // GetFingerprint returns the fingerprint for a given identifier,
@@ -293,6 +310,25 @@ func (a *registryAuditor) run() {
 					a.log.Warn(err)
 				}
 			}
+		case responseChan := <-a.flushRequestChan:
+			n := len(a.inputChan)
+			for i := 0; i < n; i++ {
+				select {
+				case payload := <-a.inputChan:
+					for _, msg := range payload.MessageMetas {
+						var fingerprint types.Fingerprint
+						if msg.Origin.Fingerprint != nil {
+							fingerprint = *msg.Origin.Fingerprint
+						}
+						a.updateRegistry(msg.Origin.Identifier, msg.Origin.Offset, msg.Origin.LogSource.Config.TailingMode, msg.IngestionTimestamp, fingerprint)
+					}
+				default:
+				}
+			}
+			if err := a.flushRegistry(); err != nil {
+				a.log.Warnf("Flush: failed to flush registry: %v", err)
+			}
+			close(responseChan)
 		}
 	}
 }
diff --git a/releasenotes/notes/auditor-flush-drain-race-b407c76bffdc4c2a.yaml b/releasenotes/notes/auditor-flush-drain-race-b407c76bffdc4c2a.yaml
@@ -0,0 +1,15 @@
+# Each section from every release note is combined when the
+# CHANGELOG.rst is rendered. So the text needs to be worded so that
+# it does not depend on any information only available in another
+# section. This may mean repeating some details, but each section
+# must be readable independently of the other.
+#
+# Each section note must be formatted as reStructuredText.
+---
+fixes:
+  - |
+    Fixed a race condition in the logs auditor where ``Flush()`` could write a
+    stale registry to disk during a transport restart. The auditor now drains
+    all pending payloads from its input channel before flushing, ensuring file
+    offsets are up to date and reducing duplicate log processing after a
+    TCP-to-HTTP transport switch.