Skip to content

Commit 92e3046

Browse files
authored
Single pass truncation (#6914)
1 parent 65c13f1 commit 92e3046

File tree

1 file changed

+79
-35
lines changed

1 file changed

+79
-35
lines changed

codex-rs/core/src/truncate.rs

Lines changed: 79 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -185,6 +185,7 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
185185
if s.is_empty() {
186186
return String::new();
187187
}
188+
188189
let total_chars = s.chars().count();
189190
let max_bytes = policy.byte_budget();
190191

@@ -204,24 +205,55 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
204205
let total_bytes = s.len();
205206

206207
let (left_budget, right_budget) = split_budget(max_bytes);
207-
let prefix_end = pick_prefix_end(s, left_budget);
208-
let mut suffix_start = pick_suffix_start(s, right_budget);
209-
if suffix_start < prefix_end {
210-
suffix_start = prefix_end;
211-
}
212208

213-
let left_chars = s[..prefix_end].chars().count();
214-
let right_chars = s[suffix_start..].chars().count();
215-
let removed_chars = total_chars
216-
.saturating_sub(left_chars)
217-
.saturating_sub(right_chars);
209+
let (removed_chars, left, right) = split_string(s, left_budget, right_budget);
218210

219211
let marker = format_truncation_marker(
220212
policy,
221213
removed_units_for_source(policy, total_bytes.saturating_sub(max_bytes), removed_chars),
222214
);
223215

224-
assemble_truncated_output(&s[..prefix_end], &s[suffix_start..], &marker)
216+
assemble_truncated_output(left, right, &marker)
217+
}
218+
219+
/// Splits `s` into a retained head and a retained tail in one forward scan.
///
/// * The head is the longest run of whole characters that ends at or before
///   byte offset `beginning_bytes`.
/// * The tail begins at the first character that is not part of the head and
///   whose starting byte offset is at least `s.len() - end_bytes` (saturating
///   at zero), and runs to the end of `s`.
/// * Every character in between is dropped. A character that straddles either
///   cut is dropped rather than split, so both slices are valid UTF-8.
///
/// Returns `(removed_chars, head, tail)`, where `removed_chars` counts dropped
/// characters (not bytes). When the two budgets overlap, the head takes
/// priority and the tail starts where the head ends, so no byte is returned
/// twice.
fn split_string(s: &str, beginning_bytes: usize, end_bytes: usize) -> (usize, &str, &str) {
    if s.is_empty() {
        return (0, "", "");
    }

    let len = s.len();
    let tail_start_target = len.saturating_sub(end_bytes);
    let mut prefix_end = 0usize;
    // Stays at `len` (empty tail) unless a tail character is found below.
    let mut suffix_start = len;
    let mut removed_chars = 0usize;

    for (idx, ch) in s.char_indices() {
        let char_end = idx + ch.len_utf8();
        if char_end <= beginning_bytes {
            // Character fits entirely inside the head budget.
            prefix_end = char_end;
        } else if idx >= tail_start_target {
            // Byte offsets only grow, so every remaining character also
            // belongs to the tail; there is no need to scan any further.
            suffix_start = idx;
            break;
        } else {
            removed_chars += 1;
        }
    }

    // The tail can only start at a character that already failed the head
    // check, so it never begins before the head ends.
    debug_assert!(prefix_end <= suffix_start);

    (removed_chars, &s[..prefix_end], &s[suffix_start..])
}
226258

227259
fn format_truncation_marker(policy: TruncationPolicy, removed_count: u64) -> String {
@@ -270,42 +302,54 @@ fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
270302
/ (APPROX_BYTES_PER_TOKEN as u64)
271303
}
272304

273-
/// Returns the longest prefix of `input` that is at most `max_len` bytes long
/// and ends on a UTF-8 character boundary.
///
/// `input` is returned unchanged when it already fits within `max_len` bytes.
fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
    if input.len() <= max_len {
        return input;
    }

    // Walk backwards from the byte limit to the nearest boundary. Offset 0 is
    // always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&offset| input.is_char_boundary(offset))
        .unwrap_or(0);

    &input[..cut]
}
283-
284-
fn pick_prefix_end(s: &str, left_budget: usize) -> usize {
285-
truncate_on_boundary(s, left_budget).len()
286-
}
287-
288-
/// Returns the byte offset at which the kept suffix of `s` begins.
///
/// The search starts `right_budget` bytes before the end of `s` (or at offset
/// 0 when the budget exceeds the length) and moves forward to the first UTF-8
/// character boundary. `s.len()` is returned when no boundary precedes the end.
fn pick_suffix_start(s: &str, right_budget: usize) -> usize {
    let len = s.len();
    // `saturating_sub` never exceeds `len`, so no further clamping is needed.
    let earliest = len.saturating_sub(right_budget);

    (earliest..len)
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(len)
}
296-
297305
#[cfg(test)]
298306
mod tests {
299307

300308
use super::TruncationPolicy;
301309
use super::approx_token_count;
302310
use super::formatted_truncate_text;
311+
use super::split_string;
303312
use super::truncate_function_output_items_with_policy;
304313
use super::truncate_text;
305314
use super::truncate_with_token_budget;
306315
use codex_protocol::models::FunctionCallOutputContentItem;
307316
use pretty_assertions::assert_eq;
308317

318+
#[test]
fn split_string_works() {
    // The single character between the kept head and tail is the only removal.
    let with_gap = split_string("hello world", 5, 5);
    assert_eq!(with_gap, (1, "hello", "world"));

    // Zero budgets on both sides drop every character.
    let nothing_kept = split_string("abc", 0, 0);
    assert_eq!(nothing_kept, (3, "", ""));
}
323+
324+
#[test]
fn split_string_handles_empty_string() {
    // Empty input yields empty slices and no removals, whatever the budgets.
    let (removed, head, tail) = split_string("", 4, 4);
    assert_eq!((removed, head, tail), (0, "", ""));
}
328+
329+
#[test]
fn split_string_only_keeps_prefix_when_tail_budget_is_zero() {
    // With no tail budget, everything after the three-byte head is removed.
    let outcome = split_string("abcdef", 3, 0);
    let expected = (3, "abc", "");
    assert_eq!(outcome, expected);
}
333+
334+
#[test]
fn split_string_only_keeps_suffix_when_prefix_budget_is_zero() {
    // With no head budget, everything before the three-byte tail is removed.
    let outcome = split_string("abcdef", 0, 3);
    let expected = (3, "", "def");
    assert_eq!(outcome, expected);
}
338+
339+
#[test]
fn split_string_handles_overlapping_budgets_without_removal() {
    // Budgets of 4 + 4 exceed the 6-byte input: the head takes its full share,
    // the tail gets the remainder, and nothing is removed or duplicated.
    let (removed, head, tail) = split_string("abcdef", 4, 4);
    assert_eq!((removed, head, tail), (0, "abcd", "ef"));
}
343+
344+
#[test]
fn split_string_respects_utf8_boundaries() {
    // Each emoji is four bytes; a budget that lands inside one must drop it
    // whole rather than slice through it.
    let five_emoji = "😀😀😀😀😀";

    // (input, budget applied to both sides, expected result)
    let cases = [
        ("😀abc😀", 5, (1, "😀a", "c😀")),
        (five_emoji, 1, (5, "", "")),
        (five_emoji, 7, (3, "😀", "😀")),
        (five_emoji, 8, (1, "😀😀", "😀😀")),
    ];

    for (input, budget, expected) in cases {
        assert_eq!(split_string(input, budget, budget), expected);
    }
}
352+
309353
#[test]
310354
fn truncate_bytes_less_than_placeholder_returns_placeholder() {
311355
let content = "example output";

0 commit comments

Comments
 (0)