@@ -617,7 +617,7 @@ impl SimpleFunctionFactoryBase for Factory {
617617
618618#[ cfg( test) ]
619619mod tests {
620- use super :: * ;
620+ use super :: * ;
621621
622622 // Helper function to assert chunk text and its consistency with the range within the original text.
623623 fn assert_chunk_text_consistency (
@@ -629,31 +629,43 @@ mod tests {
629629 // Extract text using the chunk's range from the original full text.
630630 let extracted_text = actual_chunk. 0 . extract_str ( full_text) ;
631631 // Assert that the expected text matches the text provided in the chunk.
632- assert_eq ! ( actual_chunk. 1 , expected_text, "Provided chunk text mismatch - {}" , context) ;
632+ assert_eq ! (
633+ actual_chunk. 1 , expected_text,
634+ "Provided chunk text mismatch - {}" ,
635+ context
636+ ) ;
633637 // Assert that the expected text also matches the text extracted using the chunk's range.
634- assert_eq ! ( extracted_text, expected_text, "Range inconsistency: extracted text mismatch - {}" , context) ;
638+ assert_eq ! (
639+ extracted_text, expected_text,
640+ "Range inconsistency: extracted text mismatch - {}" ,
641+ context
642+ ) ;
635643 }
636644
637645 // Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
638- fn create_test_chunker ( text : & str , chunk_size : usize , chunk_overlap : usize ) -> RecursiveChunker {
646+ fn create_test_chunker (
647+ text : & str ,
648+ chunk_size : usize ,
649+ chunk_overlap : usize ,
650+ ) -> RecursiveChunker {
639651 RecursiveChunker {
640652 full_text : text,
641- lang_config : None ,
653+ lang_config : None ,
642654 chunk_size,
643655 chunk_overlap,
644656 }
645657 }
646658
647659 #[ test]
648660 fn test_translate_bytes_to_chars_simple ( ) {
649- let text = "abc😄def" ;
650- let mut start1 = 0 ;
651- let mut end1 = 3 ;
652- let mut start2 = 3 ;
653- let mut end2 = 7 ;
654- let mut start3 = 7 ;
655- let mut end3 = 10 ;
656- let mut end_full = text. len ( ) ;
661+ let text = "abc😄def" ;
662+ let mut start1 = 0 ;
663+ let mut end1 = 3 ;
664+ let mut start2 = 3 ;
665+ let mut end2 = 7 ;
666+ let mut start3 = 7 ;
667+ let mut end3 = 10 ;
668+ let mut end_full = text. len ( ) ;
657669
658670 let offsets = vec ! [
659671 & mut start1,
@@ -667,22 +679,24 @@ mod tests {
667679
668680 translate_bytes_to_chars ( text, offsets. into_iter ( ) ) ;
669681
670- assert_eq ! ( start1, 0 ) ;
671- assert_eq ! ( end1, 3 ) ;
672- assert_eq ! ( start2, 3 ) ;
673- assert_eq ! ( end2, 4 ) ;
674- assert_eq ! ( start3, 4 ) ;
675- assert_eq ! ( end3, 7 ) ;
676- assert_eq ! ( end_full, 7 ) ;
682+ assert_eq ! ( start1, 0 ) ;
683+ assert_eq ! ( end1, 3 ) ;
684+ assert_eq ! ( start2, 3 ) ;
685+ assert_eq ! ( end2, 4 ) ;
686+ assert_eq ! ( start3, 4 ) ;
687+ assert_eq ! ( end3, 7 ) ;
688+ assert_eq ! ( end_full, 7 ) ;
677689 }
678690
679691 #[ test]
680692 fn test_basic_split_no_overlap ( ) {
681693 let text = "Linea 1.\n Linea 2.\n \n Linea 3." ;
682- let chunker = create_test_chunker ( text, 15 , 0 ) ;
683-
684- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk { next_regexp_sep_id : 0 } ) ;
685-
694+ let chunker = create_test_chunker ( text, 15 , 0 ) ;
695+
696+ let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
697+ next_regexp_sep_id : 0 ,
698+ } ) ;
699+
686700 assert ! ( result. is_ok( ) ) ;
687701 let chunks = result. unwrap ( ) ;
688702
@@ -693,74 +707,85 @@ mod tests {
693707
694708 // Test splitting when chunk_size forces breaks within segments.
695709 let text2 = "A very very long text that needs to be split." ;
696- let chunker2 = create_test_chunker ( text2, 20 , 0 ) ;
697- let result2 = chunker2. split_root_chunk ( ChunkKind :: RegexpSepChunk { next_regexp_sep_id : 0 } ) ;
698-
710+ let chunker2 = create_test_chunker ( text2, 20 , 0 ) ;
711+ let result2 = chunker2. split_root_chunk ( ChunkKind :: RegexpSepChunk {
712+ next_regexp_sep_id : 0 ,
713+ } ) ;
714+
699715 assert ! ( result2. is_ok( ) ) ;
700716 let chunks2 = result2. unwrap ( ) ;
701717
702718 // Expect multiple chunks, likely split by spaces due to chunk_size.
703- assert ! ( chunks2. len( ) > 1 ) ;
704- assert_chunk_text_consistency ( text2, & chunks2[ 0 ] , "A very very long" , "Test 2, Chunk 0" ) ;
705- assert ! ( chunks2[ 0 ] . 1 . len( ) <= 20 ) ;
719+ assert ! ( chunks2. len( ) > 1 ) ;
720+ assert_chunk_text_consistency ( text2, & chunks2[ 0 ] , "A very very long" , "Test 2, Chunk 0" ) ;
721+ assert ! ( chunks2[ 0 ] . 1 . len( ) <= 20 ) ;
706722 }
707723 #[ test]
708724 fn test_basic_split_with_overlap ( ) {
709725 let text = "This is a test text that is a bit longer to see how the overlap works." ;
710- let chunker = create_test_chunker ( text, 20 , 5 ) ;
711-
712- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk { next_regexp_sep_id : 0 } ) ;
713-
726+ let chunker = create_test_chunker ( text, 20 , 5 ) ;
727+
728+ let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
729+ next_regexp_sep_id : 0 ,
730+ } ) ;
731+
714732 assert ! ( result. is_ok( ) ) ;
715733 let chunks = result. unwrap ( ) ;
716-
717- assert ! ( chunks. len( ) > 1 ) ;
734+
735+ assert ! ( chunks. len( ) > 1 ) ;
718736
719737 if chunks. len ( ) >= 2 {
720738 let _chunk1_text = chunks[ 0 ] . 1 ;
721739 let _chunk2_text = chunks[ 1 ] . 1 ;
722-
723- assert ! ( chunks[ 0 ] . 1 . len( ) <= 25 ) ;
740+
741+ assert ! ( chunks[ 0 ] . 1 . len( ) <= 25 ) ;
724742 }
725743 }
726744 #[ test]
727745 fn test_split_trims_whitespace ( ) {
728746 let text = " \n First chunk. \n \n Second chunk with spaces at the end. \n " ;
729- let chunker = create_test_chunker ( text, 30 , 0 ) ;
730-
731- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk { next_regexp_sep_id : 0 } ) ;
732-
747+ let chunker = create_test_chunker ( text, 30 , 0 ) ;
748+
749+ let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
750+ next_regexp_sep_id : 0 ,
751+ } ) ;
752+
733753 assert ! ( result. is_ok( ) ) ;
734754 let chunks = result. unwrap ( ) ;
735755
736- assert_eq ! ( chunks. len( ) , 3 ) ;
737-
738- // Only assert chunk 0 using the new helper, as chunks 1 and 2 have shown inconsistent split points/content.
739- assert_chunk_text_consistency ( text, & chunks[ 0 ] , " \n First chunk." , "Whitespace Test, Chunk 0" ) ;
740-
741- // TODO: Assertions for chunks[1] and chunks[2] are commented out because
742- // the exact split point between them (byte 48 or 49) and their resulting
743- // content ("...espacio"/"s al final." vs "...espacios"/"al final.")
744- // has proven inconsistent across test runs.
745- // This indicates a possible bug or non-deterministic behavior in the
746- // flush_small_chunks or process_sub_chunks logic that needs investigation
747- // in the main code.
756+ assert_eq ! ( chunks. len( ) , 3 ) ;
757+
758+ assert_chunk_text_consistency (
759+ text,
760+ & chunks[ 0 ] ,
761+ " \n First chunk." ,
762+ "Whitespace Test, Chunk 0" ,
763+ ) ;
764+ assert_chunk_text_consistency (
765+ text,
766+ & chunks[ 1 ] ,
767+ " Second chunk with spaces at" ,
768+ "Whitespace Test, Chunk 1" ,
769+ ) ;
770+ assert_chunk_text_consistency ( text, & chunks[ 2 ] , "the end." , "Whitespace Test, Chunk 2" ) ;
748771 }
749772 #[ test]
750773 fn test_split_discards_empty_chunks ( ) {
751774 let text = "Chunk 1.\n \n \n \n Chunk 2.\n \n ------\n \n Chunk 3." ;
752- let chunker = create_test_chunker ( text, 10 , 0 ) ;
753-
754- let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk { next_regexp_sep_id : 0 } ) ;
755-
775+ let chunker = create_test_chunker ( text, 10 , 0 ) ;
776+
777+ let result = chunker. split_root_chunk ( ChunkKind :: RegexpSepChunk {
778+ next_regexp_sep_id : 0 ,
779+ } ) ;
780+
756781 assert ! ( result. is_ok( ) ) ;
757782 let chunks = result. unwrap ( ) ;
758783
759784 assert_eq ! ( chunks. len( ) , 3 ) ;
760-
785+
761786 // Expect only the chunks with actual alphanumeric content.
762787 assert_chunk_text_consistency ( text, & chunks[ 0 ] , "Chunk 1." , "Discard Test, Chunk 0" ) ;
763- assert_chunk_text_consistency ( text, & chunks[ 1 ] , "Chunk 2." , "Discard Test, Chunk 1" ) ;
764- assert_chunk_text_consistency ( text, & chunks[ 2 ] , "Chunk 3." , "Discard Test, Chunk 2" ) ;
788+ assert_chunk_text_consistency ( text, & chunks[ 1 ] , "Chunk 2." , "Discard Test, Chunk 1" ) ;
789+ assert_chunk_text_consistency ( text, & chunks[ 2 ] , "Chunk 3." , "Discard Test, Chunk 2" ) ;
765790 }
766791}
0 commit comments