Commit 2a26e29

jif-oai authored and JeffCarpenter committed
feat: use actual tokenizer for unified_exec truncation (openai#5514)
1 parent 1793f23 commit 2a26e29

File tree

4 files changed, +68 -41 lines changed


codex-rs/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

codex-rs/core/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ codex-rmcp-client = { workspace = true }
 codex-async-utils = { workspace = true }
 codex-utils-string = { workspace = true }
 codex-utils-pty = { workspace = true }
+codex-utils-tokenizer = { workspace = true }
 dirs = { workspace = true }
 dunce = { workspace = true }
 env-flags = { workspace = true }

codex-rs/core/src/truncate.rs

Lines changed: 60 additions & 40 deletions
@@ -1,18 +1,35 @@
 //! Utilities for truncating large chunks of output while preserving a prefix
 //! and suffix on UTF-8 boundaries.
 
+use codex_utils_tokenizer::Tokenizer;
+
 /// Truncate the middle of a UTF-8 string to at most `max_bytes` bytes,
 /// preserving the beginning and the end. Returns the possibly truncated
-/// string and `Some(original_token_count)` (estimated at 4 bytes/token)
+/// string and `Some(original_token_count)` (counted with the local tokenizer;
+/// falls back to a 4-bytes-per-token estimate if the tokenizer cannot load)
 /// if truncation occurred; otherwise returns the original string and `None`.
 pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>) {
     if s.len() <= max_bytes {
         return (s.to_string(), None);
     }
 
-    let est_tokens = (s.len() as u64).div_ceil(4);
+    // Build a tokenizer for counting (default to o200k_base; fall back to cl100k_base).
+    // If both fail, fall back to a 4-bytes-per-token estimate.
+    let tok = Tokenizer::try_default().ok();
+    let token_count = |text: &str| -> u64 {
+        if let Some(ref t) = tok {
+            t.count(text) as u64
+        } else {
+            (text.len() as u64).div_ceil(4)
+        }
+    };
+
+    let total_tokens = token_count(s);
     if max_bytes == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
@@ -50,13 +67,17 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
         idx
     }
 
-    let mut guess_tokens = est_tokens;
+    // Iterate to stabilize marker length → keep budget → boundaries.
+    let mut guess_tokens: u64 = 1;
     for _ in 0..4 {
         let marker = format!("…{guess_tokens} tokens truncated…");
         let marker_len = marker.len();
         let keep_budget = max_bytes.saturating_sub(marker_len);
         if keep_budget == 0 {
-            return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+            return (
+                format!("…{total_tokens} tokens truncated…"),
+                Some(total_tokens),
+            );
         }
 
         let left_budget = keep_budget / 2;
@@ -67,59 +88,72 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
             suffix_start = prefix_end;
         }
 
-        let kept_content_bytes = prefix_end + (s.len() - suffix_start);
-        let truncated_content_bytes = s.len().saturating_sub(kept_content_bytes);
-        let new_tokens = (truncated_content_bytes as u64).div_ceil(4);
+        // Tokens actually removed (middle slice) using the real tokenizer.
+        let removed_tokens = token_count(&s[prefix_end..suffix_start]);
 
-        if new_tokens == guess_tokens {
-            let mut out = String::with_capacity(marker_len + kept_content_bytes + 1);
+        // If the number of digits in the token count does not change the marker length,
+        // we can finalize output.
+        let final_marker = format!("…{removed_tokens} tokens truncated…");
+        if final_marker.len() == marker_len {
+            let kept_content_bytes = prefix_end + (s.len() - suffix_start);
+            let mut out = String::with_capacity(final_marker.len() + kept_content_bytes + 1);
             out.push_str(&s[..prefix_end]);
-            out.push_str(&marker);
+            out.push_str(&final_marker);
             out.push('\n');
             out.push_str(&s[suffix_start..]);
-            return (out, Some(est_tokens));
+            return (out, Some(total_tokens));
        }
 
-        guess_tokens = new_tokens;
+        guess_tokens = removed_tokens;
     }
 
+    // Fallback build after iterations: compute with the last guess.
     let marker = format!("…{guess_tokens} tokens truncated…");
     let marker_len = marker.len();
     let keep_budget = max_bytes.saturating_sub(marker_len);
     if keep_budget == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     let left_budget = keep_budget / 2;
     let right_budget = keep_budget - left_budget;
     let prefix_end = pick_prefix_end(s, left_budget);
-    let suffix_start = pick_suffix_start(s, right_budget);
+    let mut suffix_start = pick_suffix_start(s, right_budget);
+    if suffix_start < prefix_end {
+        suffix_start = prefix_end;
+    }
 
     let mut out = String::with_capacity(marker_len + prefix_end + (s.len() - suffix_start) + 1);
     out.push_str(&s[..prefix_end]);
     out.push_str(&marker);
     out.push('\n');
     out.push_str(&s[suffix_start..]);
-    (out, Some(est_tokens))
+    (out, Some(total_tokens))
 }
 
 #[cfg(test)]
 mod tests {
     use super::truncate_middle;
+    use codex_utils_tokenizer::Tokenizer;
 
     #[test]
     fn truncate_middle_no_newlines_fallback() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
         let max_bytes = 32;
         let (out, original) = truncate_middle(s, max_bytes);
         assert!(out.starts_with("abc"));
         assert!(out.contains("tokens truncated"));
         assert!(out.ends_with("XYZ*"));
-        assert_eq!(original, Some((s.len() as u64).div_ceil(4)));
+        assert_eq!(original, Some(tok.count(s) as u64));
     }
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
@@ -131,50 +165,36 @@ mod tests {
         assert!(out.starts_with("001\n002\n003\n004\n"));
         assert!(out.contains("tokens truncated"));
         assert!(out.ends_with("017\n018\n019\n020\n"));
-        assert_eq!(tokens, Some(20));
+        assert_eq!(tokens, Some(tok.count(&s) as u64));
     }
 
     #[test]
     fn truncate_middle_handles_utf8_content() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "😀😀😀😀😀😀😀😀😀😀\nsecond line with ascii text\n";
         let max_bytes = 32;
         let (out, tokens) = truncate_middle(s, max_bytes);
 
         assert!(out.contains("tokens truncated"));
         assert!(!out.contains('\u{fffd}'));
-        assert_eq!(tokens, Some((s.len() as u64).div_ceil(4)));
+        assert_eq!(tokens, Some(tok.count(s) as u64));
     }
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries_2() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         // Build a multi-line string of 20 numbered lines (each "NNN\n").
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
         }
-        // Total length: 20 lines * 4 bytes per line = 80 bytes.
         assert_eq!(s.len(), 80);
 
-        // Choose a cap that forces truncation while leaving room for
-        // a few lines on each side after accounting for the marker.
         let max_bytes = 64;
-        // Expect exact output: first 4 lines, marker, last 4 lines, and correct token estimate (80/4 = 20).
-        assert_eq!(
-            truncate_middle(&s, max_bytes),
-            (
-                r#"001
-002
-003
-004
-…12 tokens truncated…
-017
-018
-019
-020
-"#
-                .to_string(),
-                Some(20)
-            )
-        );
+        let (out, total) = truncate_middle(&s, max_bytes);
+        assert!(out.starts_with("001\n002\n003\n004\n"));
+        assert!(out.contains("tokens truncated"));
+        assert!(out.ends_with("017\n018\n019\n020\n"));
+        assert_eq!(total, Some(tok.count(&s) as u64));
     }
 }
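
Why truncate_middle iterates: the "…N tokens truncated…" marker embeds the removed-token count N, but N depends on how much content survives once the marker claims its share of max_bytes, which in turn depends on the marker's own byte width. Below is a minimal, self-contained sketch of that fixed-point loop; it substitutes the 4-bytes-per-token fallback estimate for the real Tokenizer, and the input string and budget are made up for illustration:

fn main() {
    // 200 bytes of content to squeeze under a 64-byte cap.
    let s = "x".repeat(200);
    let max_bytes: usize = 64;

    let mut guess: u64 = 1;
    for _ in 0..4 {
        let marker = format!("…{guess} tokens truncated…");
        // Bytes left for kept content once the marker takes its share.
        let keep_budget = max_bytes.saturating_sub(marker.len());
        let removed_bytes = s.len().saturating_sub(keep_budget);
        // Stand-in for token_count(&s[prefix_end..suffix_start]) in the real code.
        let removed_tokens = (removed_bytes as u64).div_ceil(4);
        let final_marker = format!("…{removed_tokens} tokens truncated…");
        if final_marker.len() == marker.len() {
            // The digit width of N no longer moves the marker length, so the
            // reported count is consistent with the layout it produced.
            println!("stable: {final_marker}");
            return;
        }
        guess = removed_tokens;
    }
    println!("no fixed point after 4 passes; the real code builds from the last guess");
}

Since the marker length only changes when N gains or loses a digit, the loop settles almost immediately in practice, which is why four iterations plus the fallback build afterwards are sufficient.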

codex-rs/utils/tokenizer/src/lib.rs

Lines changed: 6 additions & 1 deletion
@@ -55,8 +55,13 @@ impl Tokenizer {
         Ok(Self { inner })
     }
 
+    /// Default to `O200kBase`
+    pub fn try_default() -> Result<Self, TokenizerError> {
+        Self::new(EncodingKind::O200kBase)
+    }
+
     /// Build a tokenizer using an `OpenAI` model name (maps to an encoding).
-    /// Falls back to the `o200k_base` encoding when the model is unknown.
+    /// Falls back to the `O200kBase` encoding when the model is unknown.
     pub fn for_model(model: &str) -> Result<Self, TokenizerError> {
         match tiktoken_rs::get_bpe_from_model(model) {
             Ok(inner) => Ok(Self { inner }),
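
For context, a minimal usage sketch of the new constructor. It assumes codex-utils-tokenizer is available as a dependency; the count signature is inferred from the call sites in truncate.rs above (it is cast to u64 there, so it presumably returns usize):

use codex_utils_tokenizer::Tokenizer;

fn main() {
    let text = "unified_exec output can get very long";
    match Tokenizer::try_default() {
        // o200k_base loaded: report an exact token count.
        Ok(tok) => println!("{} tokens", tok.count(text)),
        // Encoding unavailable: fall back to the same 4-bytes-per-token
        // estimate that truncate_middle uses.
        Err(_) => println!("~{} tokens (estimated)", (text.len() as u64).div_ceil(4)),
    }
}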

0 commit comments