-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Add recursive chunker #126866
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add recursive chunker #126866
Changes from 1 commit
5167b21
7d9e07c
8418223
0685124
f40947a
6f649fc
c8a5f0c
6f337a8
6035d76
29498f7
0d6b461
3edf75e
3ac8b94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,7 @@ | |
|
|
||
| public class RecursiveChunkerTests extends ESTestCase { | ||
|
|
||
| private final List<String> TEST_SEPARATORS = List.of("\n\n", "\n", "\f", "\t", "#"); | ||
| private final List<String> TEST_SEPARATORS = List.of("\n", "\f", "\t", "#"); | ||
| private final String TEST_SENTENCE = "This is a test sentence that has ten total words. "; | ||
|
|
||
| public void testChunkWithInvalidChunkingSettings() { | ||
|
|
@@ -142,6 +142,45 @@ public void testChunkLongDocument() { | |
| assertExpectedChunksGenerated(input, settings, expectedChunks); | ||
| } | ||
|
|
||
| public void testMarkdownChunking() { | ||
| int numSentences = randomIntBetween(10, 50); | ||
| List<String> separators = SeparatorSet.MARKDOWN.getSeparators(); | ||
| List<String> validHeaders = List.of( | ||
| "# Header\n", | ||
| "## Header\n", | ||
| "### Header\n", | ||
| "#### Header\n", | ||
| "##### Header\n", | ||
| "###### Header\n", | ||
| "Header\n-\n", | ||
| "Header\n=\n" | ||
| ); | ||
| List<String> validSplittersAfterSentences = validHeaders.stream().map(header -> "\n" + header).toList(); | ||
| List<String> splittersAfterSentences = new ArrayList<>(); | ||
| for (int i = 0; i < numSentences - 1; i++) { | ||
| splittersAfterSentences.add(randomFrom(validSplittersAfterSentences)); | ||
| } | ||
| RecursiveChunkingSettings settings = generateChunkingSettings(15, separators); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because of the small chunk size, the generated chunks will never contain more than one sentence. Can you structure the test so that some chunks contain multiple heading sections? For example, if the chunk size were 100 words and given the document: […] In this case, given an ordered list of separators, I would expect: […] Please add tests on longer documents that capture the hierarchical nature of the chunker. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Discussed offline with Dave. Adding this to the existing long-document tests, which randomly generate a document, would essentially require rewriting the chunking logic in the test file in order to compute the expected chunk boundaries. We've instead decided it makes sense to add a new test with a smaller, fixed-length document to cover this case. |
||
| String input = generateTestText(numSentences, splittersAfterSentences); | ||
| String leadingHeader = randomFrom(validHeaders); | ||
| input = leadingHeader + input; | ||
|
|
||
| List<Chunker.ChunkOffset> expectedChunks = new ArrayList<>(); | ||
| int currentOffset = 0; | ||
| for (int i = 0; i < numSentences; i++) { | ||
| int chunkLength = TEST_SENTENCE.length(); | ||
| if (i == 0) { | ||
| chunkLength += leadingHeader.length(); | ||
| } else { | ||
| chunkLength += splittersAfterSentences.get(i - 1).length(); | ||
| } | ||
| expectedChunks.add(new Chunker.ChunkOffset(currentOffset, currentOffset + chunkLength)); | ||
| currentOffset += chunkLength; | ||
| } | ||
|
|
||
| assertExpectedChunksGenerated(input, settings, expectedChunks); | ||
| } | ||
|
|
||
| private void assertExpectedChunksGenerated(String input, RecursiveChunkingSettings settings, List<Chunker.ChunkOffset> expectedChunks) { | ||
| RecursiveChunker chunker = new RecursiveChunker(); | ||
| List<Chunker.ChunkOffset> chunks = chunker.chunk(input, settings); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.