 //! Utilities for truncating large chunks of output while preserving a prefix
 //! and suffix on UTF-8 boundaries.
 
+use codex_utils_tokenizer::Tokenizer;
+
 /// Truncate the middle of a UTF-8 string to at most `max_bytes` bytes,
 /// preserving the beginning and the end. Returns the possibly truncated
-/// string and `Some(original_token_count)` (estimated at 4 bytes/token)
+/// string and `Some(original_token_count)` (counted with the local tokenizer;
+/// falls back to a 4-bytes-per-token estimate if the tokenizer cannot load)
 /// if truncation occurred; otherwise returns the original string and `None`.
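+///
+/// # Example
+///
+/// ```ignore
+/// // Hypothetical usage; the reported token count depends on the tokenizer backend.
+/// let big = "x".repeat(10_000);
+/// let (out, original_tokens) = truncate_middle(&big, 64);
+/// assert!(out.contains("tokens truncated"));
+/// assert!(original_tokens.is_some());
+/// ```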
 pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>) {
     if s.len() <= max_bytes {
         return (s.to_string(), None);
     }
 
-    let est_tokens = (s.len() as u64).div_ceil(4);
+    // Build a tokenizer for counting (default to o200k_base; fall back to cl100k_base).
+    // If both fail, fall back to a 4-bytes-per-token estimate.
+    let tok = Tokenizer::try_default().ok();
+    let token_count = |text: &str| -> u64 {
+        if let Some(ref t) = tok {
+            t.count(text) as u64
+        } else {
+            (text.len() as u64).div_ceil(4)
+        }
+    };
+
+    let total_tokens = token_count(s);
     if max_bytes == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
@@ -50,13 +67,17 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
         idx
     }
 
-    let mut guess_tokens = est_tokens;
+    // Iterate to stabilize marker length → keep budget → boundaries.
+    let mut guess_tokens: u64 = 1;
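+    // The marker's byte length depends on the digit count of `guess_tokens`:
+    // e.g. "…12 tokens truncated…" is 25 bytes (each "…" is 3 bytes in UTF-8),
+    // and every extra digit adds one byte.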
     for _ in 0..4 {
         let marker = format!("…{guess_tokens} tokens truncated…");
         let marker_len = marker.len();
         let keep_budget = max_bytes.saturating_sub(marker_len);
         if keep_budget == 0 {
-            return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+            return (
+                format!("…{total_tokens} tokens truncated…"),
+                Some(total_tokens),
+            );
         }
 
         let left_budget = keep_budget / 2;
@@ -67,59 +88,72 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
             suffix_start = prefix_end;
         }
 
-        let kept_content_bytes = prefix_end + (s.len() - suffix_start);
-        let truncated_content_bytes = s.len().saturating_sub(kept_content_bytes);
-        let new_tokens = (truncated_content_bytes as u64).div_ceil(4);
+        // Tokens actually removed (middle slice) using the real tokenizer.
+        let removed_tokens = token_count(&s[prefix_end..suffix_start]);
 
-        if new_tokens == guess_tokens {
-            let mut out = String::with_capacity(marker_len + kept_content_bytes + 1);
+        // If the number of digits in the token count does not change the marker length,
+        // we can finalize output.
+        let final_marker = format!("…{removed_tokens} tokens truncated…");
+        if final_marker.len() == marker_len {
+            let kept_content_bytes = prefix_end + (s.len() - suffix_start);
+            let mut out = String::with_capacity(final_marker.len() + kept_content_bytes + 1);
             out.push_str(&s[..prefix_end]);
-            out.push_str(&marker);
+            out.push_str(&final_marker);
             out.push('\n');
             out.push_str(&s[suffix_start..]);
-            return (out, Some(est_tokens));
+            return (out, Some(total_tokens));
         }
 
-        guess_tokens = new_tokens;
+        guess_tokens = removed_tokens;
     }
 
+    // Fallback build after iterations: compute with the last guess.
     let marker = format!("…{guess_tokens} tokens truncated…");
     let marker_len = marker.len();
     let keep_budget = max_bytes.saturating_sub(marker_len);
     if keep_budget == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     let left_budget = keep_budget / 2;
     let right_budget = keep_budget - left_budget;
     let prefix_end = pick_prefix_end(s, left_budget);
-    let suffix_start = pick_suffix_start(s, right_budget);
+    let mut suffix_start = pick_suffix_start(s, right_budget);
+    if suffix_start < prefix_end {
+        suffix_start = prefix_end;
+    }
 
     let mut out = String::with_capacity(marker_len + prefix_end + (s.len() - suffix_start) + 1);
     out.push_str(&s[..prefix_end]);
     out.push_str(&marker);
     out.push('\n');
     out.push_str(&s[suffix_start..]);
-    (out, Some(est_tokens))
+    (out, Some(total_tokens))
 }
 
 #[cfg(test)]
 mod tests {
     use super::truncate_middle;
+    use codex_utils_tokenizer::Tokenizer;
 
     #[test]
     fn truncate_middle_no_newlines_fallback() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
         let max_bytes = 32;
         let (out, original) = truncate_middle(s, max_bytes);
         assert!(out.starts_with("abc"));
         assert!(out.contains("tokens truncated"));
         assert!(out.ends_with("XYZ*"));
-        assert_eq!(original, Some((s.len() as u64).div_ceil(4)));
+        assert_eq!(original, Some(tok.count(s) as u64));
     }
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
@@ -131,50 +165,36 @@ mod tests {
         assert!(out.starts_with("001\n002\n003\n004\n"));
         assert!(out.contains("tokens truncated"));
         assert!(out.ends_with("017\n018\n019\n020\n"));
-        assert_eq!(tokens, Some(20));
+        assert_eq!(tokens, Some(tok.count(&s) as u64));
     }
 
     #[test]
     fn truncate_middle_handles_utf8_content() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "😀😀😀😀😀😀😀😀😀😀\nsecond line with ascii text\n";
         let max_bytes = 32;
         let (out, tokens) = truncate_middle(s, max_bytes);
 
         assert!(out.contains("tokens truncated"));
         assert!(!out.contains('\u{fffd}'));
-        assert_eq!(tokens, Some((s.len() as u64).div_ceil(4)));
+        assert_eq!(tokens, Some(tok.count(s) as u64));
     }
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries_2() {
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         // Build a multi-line string of 20 numbered lines (each "NNN\n").
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
         }
-        // Total length: 20 lines * 4 bytes per line = 80 bytes.
         assert_eq!(s.len(), 80);
 
-        // Choose a cap that forces truncation while leaving room for
-        // a few lines on each side after accounting for the marker.
         let max_bytes = 64;
-        // Expect exact output: first 4 lines, marker, last 4 lines, and correct token estimate (80/4 = 20).
-        assert_eq!(
-            truncate_middle(&s, max_bytes),
-            (
-                r#"001
-002
-003
-004
-…12 tokens truncated…
-017
-018
-019
-020
-"#
-                .to_string(),
-                Some(20)
-            )
-        );
+        let (out, total) = truncate_middle(&s, max_bytes);
+        assert!(out.starts_with("001\n002\n003\n004\n"));
+        assert!(out.contains("tokens truncated"));
+        assert!(out.ends_with("017\n018\n019\n020\n"));
+        assert_eq!(total, Some(tok.count(&s) as u64));
     }
 }