@@ -558,4 +558,142 @@ mod tests {
558558 "Segment reconstruction failed for multi-byte text"
559559 ) ;
560560 }
561+
562+ #[ test]
563+ fn test_boundary_symbol_detection_with_trailing_space ( ) {
564+ // This test reproduces the bug from issue #35:
565+ // boundary_symbols are not detected if space follows boundary_symbol
566+ let text = "Hello world. This is a test.Another test. And another test." ;
567+ let boundaries = get_sentence_boundaries ( "en" , text) ;
568+
569+ // Verify we have at least 4 boundaries (4 sentences)
570+ assert ! (
571+ boundaries. len( ) >= 4 ,
572+ "Expected at least 4 boundaries, got {}" ,
573+ boundaries. len( )
574+ ) ;
575+
576+ // Check each boundary has the correct boundary_symbol
577+ let non_paragraph: Vec < _ > = boundaries
578+ . iter ( )
579+ . filter ( |b| !b. is_paragraph_break )
580+ . collect ( ) ;
581+
582+ // All 4 sentences should have period as boundary symbol
583+ for ( i, boundary) in non_paragraph. iter ( ) . enumerate ( ) {
584+ assert ! (
585+ boundary. boundary_symbol. is_some( ) ,
586+ "Boundary {} should have a boundary_symbol, got None" ,
587+ i
588+ ) ;
589+ assert_eq ! (
590+ boundary. boundary_symbol. as_deref( ) ,
591+ Some ( "." ) ,
592+ "Boundary {} should have period as symbol" ,
593+ i
594+ ) ;
595+ }
596+ }
597+
598+ #[ test]
599+ fn test_boundary_symbol_with_multiple_trailing_spaces ( ) {
600+ // Test that multiple trailing spaces don't prevent symbol detection
601+ let text = "Hello. This is another. Yet another." ;
602+ let boundaries = get_sentence_boundaries ( "en" , text) ;
603+
604+ let non_paragraph = boundaries
605+ . iter ( )
606+ . filter ( |b| !b. is_paragraph_break )
607+ . collect :: < Vec < _ > > ( ) ;
608+
609+ // All sentence boundaries should have period symbol
610+ for ( i, boundary) in non_paragraph. iter ( ) . enumerate ( ) {
611+ assert_eq ! (
612+ boundary. boundary_symbol. as_deref( ) ,
613+ Some ( "." ) ,
614+ "Boundary {} should have period symbol despite trailing spaces" ,
615+ i
616+ ) ;
617+ }
618+ }
619+
620+ #[ test]
621+ fn test_boundary_symbol_with_mixed_terminators ( ) {
622+ // Test various sentence terminators with trailing spaces
623+ let text = "Hello! How are you? I'm fine. Yes." ;
624+ let boundaries = get_sentence_boundaries ( "en" , text) ;
625+
626+ let non_paragraph = boundaries
627+ . iter ( )
628+ . filter ( |b| !b. is_paragraph_break )
629+ . collect :: < Vec < _ > > ( ) ;
630+
631+ // Verify we detect the different terminators
632+ let symbols: Vec < _ > = non_paragraph
633+ . iter ( )
634+ . filter_map ( |b| b. boundary_symbol . as_deref ( ) )
635+ . collect ( ) ;
636+
637+ assert ! (
638+ symbols. iter( ) . any( |& s| s == "!" ) ,
639+ "Should detect exclamation mark"
640+ ) ;
641+ assert ! (
642+ symbols. iter( ) . any( |& s| s == "?" ) ,
643+ "Should detect question mark"
644+ ) ;
645+ assert ! ( symbols. iter( ) . any( |& s| s == "." ) , "Should detect period" ) ;
646+ }
647+
648+ #[ test]
649+ fn test_boundary_symbol_with_cjk_terminator ( ) {
650+ // Test with CJK full stop (。) which is a valid sentence terminator
651+ let text = "日本語です。中文です。" ;
652+ let boundaries = get_sentence_boundaries ( "en" , text) ;
653+
654+ let non_paragraph = boundaries
655+ . iter ( )
656+ . filter ( |b| !b. is_paragraph_break )
657+ . collect :: < Vec < _ > > ( ) ;
658+
659+ // Should detect CJK terminators
660+ let with_cjk_stop = non_paragraph
661+ . iter ( )
662+ . filter ( |b| b. boundary_symbol . as_deref ( ) == Some ( "。" ) )
663+ . count ( ) ;
664+
665+ assert ! (
666+ with_cjk_stop >= 1 ,
667+ "Should detect at least one CJK full stop, got {}" ,
668+ with_cjk_stop
669+ ) ;
670+ }
671+
672+ #[ test]
673+ fn test_boundary_symbol_with_tabs_and_spaces ( ) {
674+ // Test with mixed whitespace (tabs and spaces)
675+ let text = "First sentence.\t \n Second sentence. Third one!" ;
676+ let boundaries = get_sentence_boundaries ( "en" , text) ;
677+
678+ let non_paragraph = boundaries
679+ . iter ( )
680+ . filter ( |b| !b. is_paragraph_break )
681+ . collect :: < Vec < _ > > ( ) ;
682+
683+ // All should have boundary symbols
684+ for ( i, boundary) in non_paragraph. iter ( ) . enumerate ( ) {
685+ assert ! (
686+ boundary. boundary_symbol. is_some( ) ,
687+ "Boundary {} should have boundary_symbol despite mixed whitespace" ,
688+ i
689+ ) ;
690+ }
691+
692+ // Verify reconstruction still works
693+ let reconstructed: String = boundaries. iter ( ) . map ( |b| b. text ) . collect ( ) ;
694+ assert_eq ! (
695+ reconstructed, text,
696+ "Text reconstruction failed with mixed whitespace"
697+ ) ;
698+ }
561699}
0 commit comments