Single pass truncation (#6914)

pakrym-oai · web-flow · commit 92e304673374 · 2025-11-19T16:56:37.000-08:00
diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs
@@ -185,6 +185,7 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
     if s.is_empty() {
         return String::new();
     }
+
     let total_chars = s.chars().count();
     let max_bytes = policy.byte_budget();
 
@@ -204,24 +205,55 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
     let total_bytes = s.len();
 
     let (left_budget, right_budget) = split_budget(max_bytes);
-    let prefix_end = pick_prefix_end(s, left_budget);
-    let mut suffix_start = pick_suffix_start(s, right_budget);
-    if suffix_start < prefix_end {
-        suffix_start = prefix_end;
-    }
 
-    let left_chars = s[..prefix_end].chars().count();
-    let right_chars = s[suffix_start..].chars().count();
-    let removed_chars = total_chars
-        .saturating_sub(left_chars)
-        .saturating_sub(right_chars);
+    let (removed_chars, left, right) = split_string(s, left_budget, right_budget);
 
     let marker = format_truncation_marker(
         policy,
         removed_units_for_source(policy, total_bytes.saturating_sub(max_bytes), removed_chars),
     );
 
-    assemble_truncated_output(&s[..prefix_end], &s[suffix_start..], &marker)
+    assemble_truncated_output(left, right, &marker)
+}
+
+fn split_string(s: &str, beginning_bytes: usize, end_bytes: usize) -> (usize, &str, &str) {
+    if s.is_empty() {
+        return (0, "", "");
+    }
+
+    let len = s.len();
+    let tail_start_target = len.saturating_sub(end_bytes);
+    let mut prefix_end = 0usize;
+    let mut suffix_start = len;
+    let mut removed_chars = 0usize;
+    let mut suffix_started = false;
+
+    for (idx, ch) in s.char_indices() {
+        let char_end = idx + ch.len_utf8();
+        if char_end <= beginning_bytes {
+            prefix_end = char_end;
+            continue;
+        }
+
+        if idx >= tail_start_target {
+            if !suffix_started {
+                suffix_start = idx;
+                suffix_started = true;
+            }
+            continue;
+        }
+
+        removed_chars = removed_chars.saturating_add(1);
+    }
+
+    if suffix_start < prefix_end {
+        suffix_start = prefix_end;
+    }
+
+    let before = &s[..prefix_end];
+    let after = &s[suffix_start..];
+
+    (removed_chars, before, after)
 }
 
 fn format_truncation_marker(policy: TruncationPolicy, removed_count: u64) -> String {
@@ -270,42 +302,54 @@ fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
         / (APPROX_BYTES_PER_TOKEN as u64)
 }
 
-fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
-    if input.len() <= max_len {
-        return input;
-    }
-    let mut end = max_len;
-    while end > 0 && !input.is_char_boundary(end) {
-        end -= 1;
-    }
-    &input[..end]
-}
-
-fn pick_prefix_end(s: &str, left_budget: usize) -> usize {
-    truncate_on_boundary(s, left_budget).len()
-}
-
-fn pick_suffix_start(s: &str, right_budget: usize) -> usize {
-    let start_tail = s.len().saturating_sub(right_budget);
-    let mut idx = start_tail.min(s.len());
-    while idx < s.len() && !s.is_char_boundary(idx) {
-        idx += 1;
-    }
-    idx
-}
-
 #[cfg(test)]
 mod tests {
 
     use super::TruncationPolicy;
     use super::approx_token_count;
     use super::formatted_truncate_text;
+    use super::split_string;
     use super::truncate_function_output_items_with_policy;
     use super::truncate_text;
     use super::truncate_with_token_budget;
     use codex_protocol::models::FunctionCallOutputContentItem;
     use pretty_assertions::assert_eq;
 
+    #[test]
+    fn split_string_works() {
+        assert_eq!(split_string("hello world", 5, 5), (1, "hello", "world"));
+        assert_eq!(split_string("abc", 0, 0), (3, "", ""));
+    }
+
+    #[test]
+    fn split_string_handles_empty_string() {
+        assert_eq!(split_string("", 4, 4), (0, "", ""));
+    }
+
+    #[test]
+    fn split_string_only_keeps_prefix_when_tail_budget_is_zero() {
+        assert_eq!(split_string("abcdef", 3, 0), (3, "abc", ""));
+    }
+
+    #[test]
+    fn split_string_only_keeps_suffix_when_prefix_budget_is_zero() {
+        assert_eq!(split_string("abcdef", 0, 3), (3, "", "def"));
+    }
+
+    #[test]
+    fn split_string_handles_overlapping_budgets_without_removal() {
+        assert_eq!(split_string("abcdef", 4, 4), (0, "abcd", "ef"));
+    }
+
+    #[test]
+    fn split_string_respects_utf8_boundaries() {
+        assert_eq!(split_string("😀abc😀", 5, 5), (1, "😀a", "c😀"));
+
+        assert_eq!(split_string("😀😀😀😀😀", 1, 1), (5, "", ""));
+        assert_eq!(split_string("😀😀😀😀😀", 7, 7), (3, "😀", "😀"));
+        assert_eq!(split_string("😀😀😀😀😀", 8, 8), (1, "😀😀", "😀😀"));
+    }
+
     #[test]
     fn truncate_bytes_less_than_placeholder_returns_placeholder() {
         let content = "example output";