Skip to content

Commit 7fff78b

Browse files
committed
Fix: Rust UTF-8 iterator doctest
1 parent 85af5b5 commit 7fff78b

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

rust/stringzilla.rs

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1226,14 +1226,33 @@ where
12261226
///
12271227
/// # Examples
12281228
///
1229+
/// Processing pure ASCII text (most common case, single chunk):
12291230
/// ```
12301231
/// use stringzilla::stringzilla as sz;
1231-
/// let text = "Hello 世界";
1232+
/// let text = "Hello World!";
12321233
/// let mut runes = [0u32; 16];
12331234
/// let (bytes, count) = sz::utf8_unpack_chunk(text.as_bytes(), &mut runes);
1234-
/// assert_eq!(count, 8); // 6 ASCII + 2 CJK characters
1235+
/// assert_eq!(count, 12); // All 12 ASCII characters
1236+
/// assert_eq!(bytes, 12); // 12 bytes consumed
12351237
/// assert_eq!(runes[0], 'H' as u32);
1236-
/// assert_eq!(runes[6], '世' as u32);
1238+
/// assert_eq!(runes[11], '!' as u32);
1239+
/// ```
1240+
///
1241+
/// For mixed ASCII/multibyte text, SIMD implementations may process homogeneous
1242+
/// chunks separately. Call repeatedly to process the entire string:
1243+
/// ```
1244+
/// use stringzilla::stringzilla as sz;
1245+
/// let text = "Hi世界"; // 2 ASCII + 2 CJK
1246+
/// let bytes = text.as_bytes();
1247+
/// let mut runes = [0u32; 16];
1248+
/// let mut all_runes = Vec::new();
1249+
/// let mut offset = 0;
1250+
/// while offset < bytes.len() {
1251+
/// let (consumed, count) = sz::utf8_unpack_chunk(&bytes[offset..], &mut runes);
1252+
/// all_runes.extend_from_slice(&runes[..count]);
1253+
/// offset += consumed;
1254+
/// }
1255+
/// assert_eq!(all_runes.len(), 4); // 2 ASCII + 2 CJK = 4 codepoints
12371256
/// ```
12381257
///
12391258
pub fn utf8_unpack_chunk(text: &[u8], runes: &mut [u32]) -> (usize, usize) {

0 commit comments

Comments
 (0)