Skip to content

Commit 60d7081

Browse files
authored
test: uncomment logic to assert more chunks - they're stable (#402)
1 parent dc190d5 commit 60d7081

File tree

1 file changed

+86
-61
lines changed

1 file changed

+86
-61
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 86 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -617,7 +617,7 @@ impl SimpleFunctionFactoryBase for Factory {
617617

618618
#[cfg(test)]
619619
mod tests {
620-
use super::*;
620+
use super::*;
621621

622622
// Helper function to assert chunk text and its consistency with the range within the original text.
623623
fn assert_chunk_text_consistency(
@@ -629,31 +629,43 @@ mod tests {
629629
// Extract text using the chunk's range from the original full text.
630630
let extracted_text = actual_chunk.0.extract_str(full_text);
631631
// Assert that the expected text matches the text provided in the chunk.
632-
assert_eq!(actual_chunk.1, expected_text, "Provided chunk text mismatch - {}", context);
632+
assert_eq!(
633+
actual_chunk.1, expected_text,
634+
"Provided chunk text mismatch - {}",
635+
context
636+
);
633637
// Assert that the expected text also matches the text extracted using the chunk's range.
634-
assert_eq!(extracted_text, expected_text, "Range inconsistency: extracted text mismatch - {}", context);
638+
assert_eq!(
639+
extracted_text, expected_text,
640+
"Range inconsistency: extracted text mismatch - {}",
641+
context
642+
);
635643
}
636644

637645
// Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
638-
fn create_test_chunker(text: &str, chunk_size: usize, chunk_overlap: usize) -> RecursiveChunker {
646+
fn create_test_chunker(
647+
text: &str,
648+
chunk_size: usize,
649+
chunk_overlap: usize,
650+
) -> RecursiveChunker {
639651
RecursiveChunker {
640652
full_text: text,
641-
lang_config: None,
653+
lang_config: None,
642654
chunk_size,
643655
chunk_overlap,
644656
}
645657
}
646658

647659
#[test]
648660
fn test_translate_bytes_to_chars_simple() {
649-
let text = "abc😄def";
650-
let mut start1 = 0;
651-
let mut end1 = 3;
652-
let mut start2 = 3;
653-
let mut end2 = 7;
654-
let mut start3 = 7;
655-
let mut end3 = 10;
656-
let mut end_full = text.len();
661+
let text = "abc😄def";
662+
let mut start1 = 0;
663+
let mut end1 = 3;
664+
let mut start2 = 3;
665+
let mut end2 = 7;
666+
let mut start3 = 7;
667+
let mut end3 = 10;
668+
let mut end_full = text.len();
657669

658670
let offsets = vec![
659671
&mut start1,
@@ -667,22 +679,24 @@ mod tests {
667679

668680
translate_bytes_to_chars(text, offsets.into_iter());
669681

670-
assert_eq!(start1, 0);
671-
assert_eq!(end1, 3);
672-
assert_eq!(start2, 3);
673-
assert_eq!(end2, 4);
674-
assert_eq!(start3, 4);
675-
assert_eq!(end3, 7);
676-
assert_eq!(end_full, 7);
682+
assert_eq!(start1, 0);
683+
assert_eq!(end1, 3);
684+
assert_eq!(start2, 3);
685+
assert_eq!(end2, 4);
686+
assert_eq!(start3, 4);
687+
assert_eq!(end3, 7);
688+
assert_eq!(end_full, 7);
677689
}
678690

679691
#[test]
680692
fn test_basic_split_no_overlap() {
681693
let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
682-
let chunker = create_test_chunker(text, 15, 0);
683-
684-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });
685-
694+
let chunker = create_test_chunker(text, 15, 0);
695+
696+
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
697+
next_regexp_sep_id: 0,
698+
});
699+
686700
assert!(result.is_ok());
687701
let chunks = result.unwrap();
688702

@@ -693,74 +707,85 @@ mod tests {
693707

694708
// Test splitting when chunk_size forces breaks within segments.
695709
let text2 = "A very very long text that needs to be split.";
696-
let chunker2 = create_test_chunker(text2, 20, 0);
697-
let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });
698-
710+
let chunker2 = create_test_chunker(text2, 20, 0);
711+
let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk {
712+
next_regexp_sep_id: 0,
713+
});
714+
699715
assert!(result2.is_ok());
700716
let chunks2 = result2.unwrap();
701717

702718
// Expect multiple chunks, likely split by spaces due to chunk_size.
703-
assert!(chunks2.len() > 1);
704-
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
705-
assert!(chunks2[0].1.len() <= 20);
719+
assert!(chunks2.len() > 1);
720+
assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
721+
assert!(chunks2[0].1.len() <= 20);
706722
}
707723
#[test]
708724
fn test_basic_split_with_overlap() {
709725
let text = "This is a test text that is a bit longer to see how the overlap works.";
710-
let chunker = create_test_chunker(text, 20, 5);
711-
712-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });
713-
726+
let chunker = create_test_chunker(text, 20, 5);
727+
728+
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
729+
next_regexp_sep_id: 0,
730+
});
731+
714732
assert!(result.is_ok());
715733
let chunks = result.unwrap();
716-
717-
assert!(chunks.len() > 1);
734+
735+
assert!(chunks.len() > 1);
718736

719737
if chunks.len() >= 2 {
720738
let _chunk1_text = chunks[0].1;
721739
let _chunk2_text = chunks[1].1;
722-
723-
assert!(chunks[0].1.len() <= 25);
740+
741+
assert!(chunks[0].1.len() <= 25);
724742
}
725743
}
726744
#[test]
727745
fn test_split_trims_whitespace() {
728746
let text = " \n First chunk. \n\n Second chunk with spaces at the end. \n";
729-
let chunker = create_test_chunker(text, 30, 0);
730-
731-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });
732-
747+
let chunker = create_test_chunker(text, 30, 0);
748+
749+
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
750+
next_regexp_sep_id: 0,
751+
});
752+
733753
assert!(result.is_ok());
734754
let chunks = result.unwrap();
735755

736-
assert_eq!(chunks.len(), 3);
737-
738-
// Only assert chunk 0 using the new helper, as chunks 1 and 2 have shown inconsistent split points/content.
739-
assert_chunk_text_consistency(text, &chunks[0], " \n First chunk.", "Whitespace Test, Chunk 0");
740-
741-
// TODO: Assertions for chunks[1] and chunks[2] are commented out because
742-
// the exact split point between them (byte 48 or 49) and their resulting
743-
// content ("...espacio"/"s al final." vs "...espacios"/"al final.")
744-
// has proven inconsistent across test runs.
745-
// This indicates a possible bug or non-deterministic behavior in the
746-
// flush_small_chunks or process_sub_chunks logic that needs investigation
747-
// in the main code.
756+
assert_eq!(chunks.len(), 3);
757+
758+
assert_chunk_text_consistency(
759+
text,
760+
&chunks[0],
761+
" \n First chunk.",
762+
"Whitespace Test, Chunk 0",
763+
);
764+
assert_chunk_text_consistency(
765+
text,
766+
&chunks[1],
767+
" Second chunk with spaces at",
768+
"Whitespace Test, Chunk 1",
769+
);
770+
assert_chunk_text_consistency(text, &chunks[2], "the end.", "Whitespace Test, Chunk 2");
748771
}
749772
#[test]
750773
fn test_split_discards_empty_chunks() {
751774
let text = "Chunk 1.\n\n \n\nChunk 2.\n\n------\n\nChunk 3.";
752-
let chunker = create_test_chunker(text, 10, 0);
753-
754-
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk { next_regexp_sep_id: 0 });
755-
775+
let chunker = create_test_chunker(text, 10, 0);
776+
777+
let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
778+
next_regexp_sep_id: 0,
779+
});
780+
756781
assert!(result.is_ok());
757782
let chunks = result.unwrap();
758783

759784
assert_eq!(chunks.len(), 3);
760-
785+
761786
// Expect only the chunks with actual alphanumeric content.
762787
assert_chunk_text_consistency(text, &chunks[0], "Chunk 1.", "Discard Test, Chunk 0");
763-
assert_chunk_text_consistency(text, &chunks[1], "Chunk 2.", "Discard Test, Chunk 1");
764-
assert_chunk_text_consistency(text, &chunks[2], "Chunk 3.", "Discard Test, Chunk 2");
788+
assert_chunk_text_consistency(text, &chunks[1], "Chunk 2.", "Discard Test, Chunk 1");
789+
assert_chunk_text_consistency(text, &chunks[2], "Chunk 3.", "Discard Test, Chunk 2");
765790
}
766791
}

0 commit comments

Comments
 (0)