@@ -1226,14 +1226,33 @@ where
12261226///
12271227/// # Examples
12281228///
1229+ /// Processing pure ASCII text (most common case, single chunk):
12291230/// ```
12301231/// use stringzilla::stringzilla as sz;
1231- /// let text = "Hello 世界 ";
1232+ /// let text = "Hello World!";
12321233/// let mut runes = [0u32; 16];
12331234/// let (bytes, count) = sz::utf8_unpack_chunk(text.as_bytes(), &mut runes);
1234- /// assert_eq!(count, 8); // 6 ASCII + 2 CJK characters
1235+ /// assert_eq!(count, 12); // All 12 ASCII characters
1236+ /// assert_eq!(bytes, 12); // 12 bytes consumed
12351237/// assert_eq!(runes[0], 'H' as u32);
1236- /// assert_eq!(runes[6], '世' as u32);
1238+ /// assert_eq!(runes[11], '!' as u32);
1239+ /// ```
1240+ ///
1241+ /// For mixed ASCII/multibyte text, SIMD implementations may process homogeneous
1242+ /// chunks separately, so one call may not consume the whole input. Call repeatedly, advancing by the returned byte count, to process the entire string:
1243+ /// ```
1244+ /// use stringzilla::stringzilla as sz;
1245+ /// let text = "Hi世界"; // 2 ASCII + 2 CJK
1246+ /// let bytes = text.as_bytes();
1247+ /// let mut runes = [0u32; 16];
1248+ /// let mut all_runes = Vec::new();
1249+ /// let mut offset = 0;
1250+ /// while offset < bytes.len() {
1251+ /// let (consumed, count) = sz::utf8_unpack_chunk(&bytes[offset..], &mut runes);
1252+ /// all_runes.extend_from_slice(&runes[..count]);
1253+ /// offset += consumed;
1254+ /// }
1255+ /// assert_eq!(all_runes.len(), 4); // 2 ASCII + 2 CJK = 4 codepoints
12371256/// ```
12381257///
12391258pub fn utf8_unpack_chunk ( text : & [ u8 ] , runes : & mut [ u32 ] ) -> ( usize , usize ) {
0 commit comments