@@ -397,4 +397,165 @@ mod tests {
397397 paragraph_breaks. len( )
398398 ) ;
399399 }
400+
401+ #[ test]
402+ fn test_get_sentence_boundaries_with_multibyte_cjk ( ) {
403+ // Test with Japanese and Chinese characters (multi-byte UTF-8)
404+ let text = "日本語です。\n \n 中文文章。" ;
405+ let boundaries = get_sentence_boundaries ( "en" , text) ;
406+
407+ // Should have sentences and paragraph break
408+ assert ! (
409+ boundaries. len( ) >= 2 ,
410+ "Expected at least 2 boundaries, got {}" ,
411+ boundaries. len( )
412+ ) ;
413+
414+ // Verify indices don't overlap and are monotonically increasing
415+ for i in 1 ..boundaries. len ( ) {
416+ assert ! (
417+ boundaries[ i] . start_index >= boundaries[ i - 1 ] . end_index,
418+ "Boundary {} starts at {} but previous ends at {}" ,
419+ i,
420+ boundaries[ i] . start_index,
421+ boundaries[ i - 1 ] . end_index
422+ ) ;
423+ }
424+
425+ // Verify text can be reconstructed (most important for multi-byte UTF-8)
426+ let reconstructed: String = boundaries. iter ( ) . map ( |b| b. text ) . collect ( ) ;
427+ assert_eq ! (
428+ reconstructed, text,
429+ "Reconstructed text doesn't match original.\n Original: {:?}\n Reconstructed: {:?}" ,
430+ text, reconstructed
431+ ) ;
432+ }
433+
434+ #[ test]
435+ fn test_get_sentence_boundaries_with_emoji ( ) {
436+ // Test with emoji characters (4-byte UTF-8)
437+ let text = "Hello world 👋.\n \n Goodbye 👋." ;
438+ let boundaries = get_sentence_boundaries ( "en" , text) ;
439+
440+ // Should have sentences and paragraph break
441+ assert ! (
442+ boundaries. len( ) >= 2 ,
443+ "Expected at least 2 boundaries, got {}" ,
444+ boundaries. len( )
445+ ) ;
446+
447+ // Verify text can be reconstructed (critical for emoji)
448+ let reconstructed: String = boundaries. iter ( ) . map ( |b| b. text ) . collect ( ) ;
449+ assert_eq ! (
450+ reconstructed, text,
451+ "Reconstructed text doesn't match original with emojis"
452+ ) ;
453+
454+ // Verify boundary indices are valid
455+ for boundary in & boundaries {
456+ assert ! (
457+ boundary. start_index <= boundary. end_index,
458+ "Invalid boundary: start_index {} > end_index {}" ,
459+ boundary. start_index,
460+ boundary. end_index
461+ ) ;
462+ }
463+ }
464+
465+ #[ test]
466+ fn test_get_sentence_boundaries_with_mixed_scripts ( ) {
467+ // Test with mixed ASCII, Latin extended, and CJK
468+ let text = "English text. Café résumé.\n \n 日本語のテキスト。" ;
469+ let boundaries = get_sentence_boundaries ( "en" , text) ;
470+
471+ // Verify text reconstruction
472+ let reconstructed: String = boundaries. iter ( ) . map ( |b| b. text ) . collect ( ) ;
473+ assert_eq ! (
474+ reconstructed, text,
475+ "Mixed script text failed reconstruction"
476+ ) ;
477+
478+ // Verify indices are properly ordered
479+ for i in 1 ..boundaries. len ( ) {
480+ assert ! (
481+ boundaries[ i] . start_index >= boundaries[ i - 1 ] . end_index,
482+ "Boundaries not ordered correctly at index {}" ,
483+ i
484+ ) ;
485+ }
486+
487+ // Verify all boundaries have valid indices
488+ for ( i, boundary) in boundaries. iter ( ) . enumerate ( ) {
489+ assert ! (
490+ boundary. start_index <= boundary. end_index,
491+ "Boundary {} has invalid indices: {} > {}" ,
492+ i,
493+ boundary. start_index,
494+ boundary. end_index
495+ ) ;
496+ }
497+ }
498+
499+ #[ test]
500+ fn test_get_sentence_boundaries_character_vs_byte_offsets ( ) {
501+ // This test specifically validates that character indices (start_index/end_index)
502+ // are correctly distinguished from byte offsets (start_byte/end_byte)
503+ let text = "Short. 日本語." ;
504+ let boundaries = get_sentence_boundaries ( "en" , text) ;
505+
506+ // Count actual characters in original text
507+ let total_chars = text. chars ( ) . count ( ) ;
508+ let total_bytes = text. len ( ) ;
509+
510+ // Verify that the character indices sum correctly
511+ // All boundaries together should cover all characters
512+ let mut last_char_end = 0 ;
513+ for boundary in & boundaries {
514+ // Each boundary should start where previous ended (no gaps)
515+ assert_eq ! (
516+ boundary. start_index, last_char_end,
517+ "Gap in character indices: expected start {}, got {}" ,
518+ last_char_end, boundary. start_index
519+ ) ;
520+ last_char_end = boundary. end_index ;
521+ }
522+ assert_eq ! (
523+ last_char_end, total_chars,
524+ "Character coverage mismatch: ended at {}, total chars = {}" ,
525+ last_char_end, total_chars
526+ ) ;
527+
528+ // Similarly verify byte offsets
529+ let mut last_byte_end = 0 ;
530+ for boundary in & boundaries {
531+ assert_eq ! (
532+ boundary. start_byte, last_byte_end,
533+ "Gap in byte indices: expected start {}, got {}" ,
534+ last_byte_end, boundary. start_byte
535+ ) ;
536+ last_byte_end = boundary. end_byte ;
537+ }
538+ assert_eq ! (
539+ last_byte_end, total_bytes,
540+ "Byte coverage mismatch: ended at {}, total bytes = {}" ,
541+ last_byte_end, total_bytes
542+ ) ;
543+ }
544+
545+ #[ test]
546+ fn test_segment_with_multibyte_characters ( ) {
547+ // Test basic segment function with multi-byte characters
548+ let text = "日本語です。中文文章。" ;
549+ let sentences = segment ( "en" , text) ;
550+
551+ // Should segment into sentences
552+ assert ! ( sentences. len( ) > 0 , "Should find at least one sentence" ) ;
553+
554+ // Verify reconstruction
555+ let reconstructed: String = sentences. join ( "" ) ;
556+ assert_eq ! (
557+ reconstructed, text,
558+ "Segment reconstruction failed for multi-byte text"
559+ ) ;
560+ }
400561}
0 commit comments