Skip to content

Commit 3272b27

Browse files
authored
Merge pull request #3 from vectordotdev/master
Merge upstream
2 parents 7bb0714 + 3eae931 commit 3272b27

File tree

24 files changed

+628
-87
lines changed

24 files changed

+628
-87
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ jobs:
163163
- uses: ./.github/actions/setup
164164
with:
165165
rust: true
166-
- run: cd rust-doc && make docs
166+
- run: cd rust-doc && make ci-docs-build
167167

168168
test-vrl:
169169
name: VRL - Linux

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fixed a bug in the `file` source, which could silently corrupt data when using multi-character delimiters.
2+
3+
authors: lfrancke
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Added the following metrics to record the utilization level of the buffer that
2+
all sources send into:
3+
4+
- `source_buffer_max_byte_size`
5+
- `source_buffer_max_event_size`
6+
- `source_buffer_utilization`
7+
- `source_buffer_utilization_level`
8+
9+
authors: bruceg

lib/dnstap-parser/src/vrl_functions/parse_dnstap.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ impl Function for ParseDnstap {
2929
}
3030

3131
fn examples(&self) -> &'static [Example] {
32-
&[Example {
32+
&[example!(
3333
title: "Parse dnstap query message",
3434
source: r#"parse_dnstap!("ChVqYW1lcy1WaXJ0dWFsLU1hY2hpbmUSC0JJTkQgOS4xNi4zGgBy5wEIAxACGAEiEAAAAAAAAAAAAAAAAAAAAAAqECABBQJwlAAAAAAAAAAAADAw8+0CODVA7+zq9wVNMU3WNlI2kwIAAAABAAAAAAABCWZhY2Vib29rMQNjb20AAAEAAQAAKQIAAACAAAAMAAoACOxjCAG9zVgzWgUDY29tAGAAbQAAAAByZLM4AAAAAQAAAAAAAQJoNQdleGFtcGxlA2NvbQAABgABAAApBNABAUAAADkADwA1AAlubyBTRVAgbWF0Y2hpbmcgdGhlIERTIGZvdW5kIGZvciBkbnNzZWMtZmFpbGVkLm9yZy54AQ==")"#,
3535
result: Ok(indoc!(
@@ -135,7 +135,7 @@ impl Function for ParseDnstap {
135135
"timestamp": "2020-06-30T03:50:07.920014129Z"
136136
}"#
137137
)),
138-
}]
138+
)]
139139
}
140140

141141
fn compile(

lib/enrichment/src/find_enrichment_table_records.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,15 @@ impl Function for FindEnrichmentTableRecords {
8282
}
8383

8484
fn examples(&self) -> &'static [Example] {
85-
&[Example {
85+
&[example!(
8686
title: "find records",
8787
source: r#"find_enrichment_table_records!("test", {"surname": "Smith"})"#,
8888
result: Ok(
8989
indoc! { r#"[{"id": 1, "firstname": "Bob", "surname": "Smith"},
9090
{"id": 2, "firstname": "Fred", "surname": "Smith"}]"#,
9191
},
9292
),
93-
}]
93+
)]
9494
}
9595

9696
fn compile(

lib/enrichment/src/get_enrichment_table_record.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,11 @@ impl Function for GetEnrichmentTableRecord {
7979
}
8080

8181
fn examples(&self) -> &'static [Example] {
82-
&[Example {
82+
&[example!(
8383
title: "find records",
8484
source: r#"get_enrichment_table_record!("test", {"id": 1})"#,
8585
result: Ok(r#"{"id": 1, "firstname": "Bob", "surname": "Smith"}"#),
86-
}]
86+
)]
8787
}
8888

8989
fn compile(

lib/file-source-common/src/buffer.rs

Lines changed: 222 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,56 @@ pub async fn read_until_with_max_size<'a, R: AsyncBufRead + ?Sized + Unpin>(
4646
let delim_len = delim.len();
4747
let mut discarded_for_size_and_truncated = Vec::new();
4848
let mut reader = Box::new(reader);
49+
50+
// Used to track partial delimiter matches across buffer boundaries.
51+
// Data is read in chunks from the reader (see `fill_buf` below).
52+
// A multi-byte delimiter may be split across the "old" and "new" buffers.
53+
// Any potential partial delimiter that was found in the "old" buffer is stored in this variable.
54+
let mut partial_delim: BytesMut = BytesMut::with_capacity(delim_len);
55+
4956
loop {
57+
// Read the next chunk of data
5058
let available: &[u8] = match reader.fill_buf().await {
5159
Ok(n) => n,
5260
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
5361
Err(e) => return Err(e),
5462
};
5563

64+
// First, check if we have a partial delimiter from the previous iteration/buffer
65+
if !partial_delim.is_empty() {
66+
let expected_suffix = &delim[partial_delim.len()..];
67+
let expected_suffix_len = expected_suffix.len();
68+
69+
// We already know that we have a partial delimiter match from the previous buffer.
70+
// Here we check what part of the delimiter is missing and whether the new buffer
71+
// contains the remaining part.
72+
if available.len() >= expected_suffix_len
73+
&& &available[..expected_suffix_len] == expected_suffix
74+
{
75+
// Complete delimiter found! Consume the remainder of the delimiter so we can start
76+
// processing data after the delimiter.
77+
reader.consume(expected_suffix_len);
78+
*position += expected_suffix_len as u64;
79+
total_read += expected_suffix_len;
80+
partial_delim.clear();
81+
82+
// Found a complete delimiter, return the current buffer so we can proceed with the
83+
// next record after this delimiter in the next call.
84+
return Ok(ReadResult {
85+
successfully_read: Some(total_read),
86+
discarded_for_size_and_truncated,
87+
});
88+
} else {
89+
// Not a complete delimiter after all.
90+
// Add partial_delim to output buffer as it is actual data.
91+
if !discarding {
92+
buf.extend_from_slice(&partial_delim);
93+
}
94+
partial_delim.clear();
95+
// Continue processing current available buffer
96+
}
97+
}
98+
5699
let (done, used) = {
57100
match delim_finder.find(available) {
58101
Some(i) => {
@@ -62,13 +105,75 @@ pub async fn read_until_with_max_size<'a, R: AsyncBufRead + ?Sized + Unpin>(
62105
(true, i + delim_len)
63106
}
64107
None => {
65-
if !discarding {
66-
buf.extend_from_slice(available);
108+
// No delimiter found in current buffer. But there could be a partial delimiter
109+
// at the end of this buffer. For multi-byte delimiters like \r\n, we need
110+
// to handle the case where the delimiter is split across buffer boundaries
111+
// (e.g. \r in the "old" buffer, then we read new data and find \n in the new
112+
// buffer).
113+
let mut partial_match_len = 0;
114+
115+
// We only need to check if we're not already at the end of the buffer and if we
116+
// have a delimiter that has more than one byte.
117+
if !available.is_empty() && delim_len > 1 {
118+
// Check if the end of the current buffer matches a prefix of the delimiter
119+
// by testing from longest to shortest possible prefix.
120+
//
121+
// This loop runs at most (delim_len - 1) iterations:
122+
// - 2-byte delimiter (\r\n): 1 iteration max
123+
// - 5-byte delimiter: 4 iterations max
124+
//
125+
// This part of the code is only called if all of these are true:
126+
//
127+
// - We have a new buffer (e.g. every 8kB, i.e. only called once per buffer)
128+
// - We have a multi-byte delimiter
129+
// - This delimiter could not be found in the current buffer
130+
//
131+
// Even for longer delimiters the performance impact is negligible.
132+
//
133+
// Example 1:
134+
// Delimiter: \r\n
135+
// Iteration 1: It checks if the current buffer ends with "\r",
136+
// if it does we have a potential partial delimiter.
137+
// The next chunk will confirm whether this is truly part of a delimiter.
138+
139+
// Example 2:
140+
// Delimiter: ABCDE
141+
// Iteration 1: It checks if the current buffer ends with "ABCD" (we don't
142+
// need to check "ABCDE" because that would have been caught by
143+
// `delim_finder.find` earlier)
144+
// Iteration 2: It checks if the current buffer ends with "ABC"
145+
// Iterations 3-4: Same for "AB" and "A"
146+
for prefix_len in (1..delim_len).rev() {
147+
if available.len() >= prefix_len
148+
&& available.ends_with(&delim[..prefix_len])
149+
{
150+
partial_match_len = prefix_len;
151+
break;
152+
}
153+
}
154+
}
155+
156+
let bytes_to_copy = available.len() - partial_match_len;
157+
158+
if !discarding && bytes_to_copy > 0 {
159+
buf.extend_from_slice(&available[..bytes_to_copy]);
67160
}
161+
162+
// If we found a potential partial delimiter, save it for the next iteration
163+
if partial_match_len > 0 {
164+
partial_delim.clear();
165+
partial_delim.extend_from_slice(&available[bytes_to_copy..]);
166+
}
167+
68168
(false, available.len())
69169
}
70170
}
71171
};
172+
173+
// Check if we're at EOF before we start processing
174+
// (for borrow checker, has to come before `consume`)
175+
let at_eof = available.is_empty();
176+
72177
reader.consume(used);
73178
*position += used as u64; // do this at exactly same time
74179
total_read += used;
@@ -92,11 +197,12 @@ pub async fn read_until_with_max_size<'a, R: AsyncBufRead + ?Sized + Unpin>(
92197
discarding = false;
93198
buf.clear();
94199
}
95-
} else if used == 0 {
96-
// We've hit EOF but not yet seen a newline. This can happen when unlucky timing causes
97-
// us to observe an incomplete write. We return None here and let the loop continue
98-
// next time the method is called. This is safe because the buffer is specific to this
99-
// FileWatcher.
200+
} else if used == 0 && at_eof {
201+
// We've hit EOF but haven't seen a delimiter. This can happen when:
202+
// 1. The file ends without a trailing delimiter
203+
// 2. We're observing an incomplete write
204+
//
205+
// Return None to signal the caller to retry later.
100206
return Ok(ReadResult {
101207
successfully_read: None,
102208
discarded_for_size_and_truncated,
@@ -262,4 +368,113 @@ mod test {
262368
.await
263369
.unwrap()
264370
}
371+
372+
/// Generic test helper that tests delimiter splits across buffer boundaries
373+
/// for any delimiter length. This function:
374+
/// 1. Creates test data with delimiters positioned to split at buffer boundaries
375+
/// 2. Tests multiple iterations to ensure state tracking works correctly
376+
/// 3. Verifies all lines are correctly separated without merging
377+
async fn test_delimiter_boundary_split_helper(delimiter: &[u8], num_lines: usize) {
378+
let delimiter_len = delimiter.len();
379+
380+
// Use a buffer capacity that will force splits
381+
// We'll position delimiters to split at this boundary
382+
let buffer_capacity = 10;
383+
384+
// Build test data where each delimiter is positioned to split across buffer boundary
385+
// Strategy: For each line, calculate position so delimiter starts at boundary - (delimiter_len - 1)
386+
let mut data = Vec::new();
387+
let mut expected_lines = Vec::new();
388+
389+
for i in 0..num_lines {
390+
// Create line content that positions the delimiter to split at buffer boundary
391+
// We want the delimiter to straddle a buffer_capacity boundary
392+
393+
// Calculate how many bytes until the next buffer boundary
394+
let current_pos = data.len();
395+
let bytes_until_boundary = buffer_capacity - (current_pos % buffer_capacity);
396+
397+
// Create line content that will position delimiter to split
398+
// We want (delimiter_len - 1) bytes before boundary, then 1 byte after
399+
let line_content = if bytes_until_boundary > delimiter_len {
400+
let content_len = bytes_until_boundary - (delimiter_len - 1);
401+
format!("line{:0width$}", i, width = content_len.saturating_sub(4)).into_bytes()
402+
} else {
403+
// Not enough room in this buffer, pad to next boundary
404+
let padding = bytes_until_boundary;
405+
let extra_content = buffer_capacity - (delimiter_len - 1);
406+
let mut content = vec![b'X'; padding];
407+
content.extend_from_slice(
408+
format!("L{:0width$}", i, width = extra_content.saturating_sub(1)).as_bytes(),
409+
);
410+
content
411+
};
412+
413+
expected_lines.push(line_content.clone());
414+
data.extend_from_slice(&line_content);
415+
data.extend_from_slice(delimiter);
416+
}
417+
418+
// Now test reading this data
419+
let cursor = Cursor::new(data);
420+
let mut reader = BufReader::with_capacity(buffer_capacity, cursor);
421+
let mut position = 0;
422+
let max_size = 1024;
423+
424+
// Read each line and verify it matches expected
425+
for (i, expected_line) in expected_lines.iter().enumerate() {
426+
let mut buffer = BytesMut::new();
427+
let result = read_until_with_max_size(
428+
Box::pin(&mut reader),
429+
&mut position,
430+
delimiter,
431+
&mut buffer,
432+
max_size,
433+
)
434+
.await
435+
.unwrap();
436+
437+
assert_eq!(
438+
buffer.as_ref(),
439+
expected_line.as_slice(),
440+
"Line {} should match expected content. Got: {:?}, Expected: {:?}",
441+
i,
442+
String::from_utf8_lossy(&buffer),
443+
String::from_utf8_lossy(expected_line)
444+
);
445+
446+
assert!(
447+
result.successfully_read.is_some(),
448+
"Should find delimiter for line {}",
449+
i
450+
);
451+
}
452+
}
453+
454+
#[tokio::test]
455+
async fn test_single_byte_delimiter_boundary() {
456+
// Test single-byte delimiter (should work without any special handling)
457+
test_delimiter_boundary_split_helper(b"\n", 5).await;
458+
}
459+
460+
#[tokio::test]
461+
async fn test_two_byte_delimiter_boundary() {
462+
// Test two-byte delimiter (CRLF case)
463+
test_delimiter_boundary_split_helper(b"\r\n", 5).await;
464+
}
465+
466+
#[tokio::test]
467+
async fn test_three_byte_delimiter_boundary() {
468+
test_delimiter_boundary_split_helper(b"|||", 5).await;
469+
}
470+
471+
#[tokio::test]
472+
async fn test_four_byte_delimiter_boundary() {
473+
test_delimiter_boundary_split_helper(b"<|>|", 5).await;
474+
}
475+
476+
#[tokio::test]
477+
async fn test_five_byte_delimiter_boundary() {
478+
test_delimiter_boundary_split_helper(b"<<>>>", 5).await;
479+
}
265480
}

0 commit comments

Comments (0)