1010import org .elasticsearch .inference .ChunkingSettings ;
1111import org .elasticsearch .test .ESTestCase ;
1212
13+ import java .util .ArrayList ;
1314import java .util .List ;
1415
1516public class RecursiveChunkerTests extends ESTestCase {
@@ -45,7 +46,7 @@ public void testChunkInputShorterThanMaxChunkSize() {
4546 assertExpectedChunksGenerated (input , settings , List .of (new Chunker .ChunkOffset (0 , input .length ())));
4647 }
4748
48- public void testChunkInputRequiresOneSplitWithNoMerges () {
49+ public void testChunkInputRequiresOneSplit () {
4950 List <String > separators = generateRandomSeparators ();
5051 RecursiveChunkingSettings settings = generateChunkingSettings (10 , separators );
5152 String input = generateTestText (2 , List .of (separators .getFirst ()));
@@ -57,23 +58,7 @@ public void testChunkInputRequiresOneSplitWithNoMerges() {
5758 );
5859 }
5960
// Test deleted by this change (every line below carries the '-' diff marker).
// It built settings with generateChunkingSettings(20, separators) and a three-sentence input whose two
// separators are both separators.getFirst(), then expected exactly two chunks.
// NOTE(review): presumably 20 is the max chunk size and it is large enough for two sentences to share a
// chunk -- confirm against generateChunkingSettings / generateTestText, which are not visible here.
60- public void testChunkInputRequiresOneSplitWithMerges () {
61- List <String > separators = generateRandomSeparators ();
62- RecursiveChunkingSettings settings = generateChunkingSettings (20 , separators );
63- String input = generateTestText (3 , List .of (separators .getFirst (), separators .getFirst ()));
64-
// End of the first expected chunk: two sentence lengths plus one first-level separator length.
65- var expectedFirstChunkOffsetEnd = TEST_SENTENCE .length () * 2 + separators .getFirst ().length ();
66- assertExpectedChunksGenerated (
67- input ,
68- settings ,
69- List .of (
// The second expected chunk starts where the first one ends and runs to the end of the input.
70- new Chunker .ChunkOffset (0 , expectedFirstChunkOffsetEnd ),
71- new Chunker .ChunkOffset (expectedFirstChunkOffsetEnd , input .length ())
72- )
73- );
74- }
75-
76- public void testChunkInputRequiresMultipleSplitsWithNoMerges () {
61+ public void testChunkInputRequiresMultipleSplits () {
7762 var separators = generateRandomSeparators ();
7863 RecursiveChunkingSettings settings = generateChunkingSettings (15 , separators );
7964 String input = generateTestText (4 , List .of (separators .get (1 ), separators .getFirst (), separators .get (1 )));
@@ -93,22 +78,6 @@ public void testChunkInputRequiresMultipleSplitsWithNoMerges() {
9378 );
9479 }
9580
// Test deleted by this change (every line below carries the '-' diff marker).
// It built settings with generateChunkingSettings(25, separators) and a four-sentence input whose
// separators are, in order, separators.get(1), separators.getFirst(), separators.get(1), then expected
// exactly two chunks.
// NOTE(review): presumably 25 is the max chunk size -- confirm against generateChunkingSettings, which is
// not visible here.
96- public void testChunkInputRequiresMultipleSplitsWithMerges () {
97- var separators = generateRandomSeparators ();
98- RecursiveChunkingSettings settings = generateChunkingSettings (25 , separators );
99- String input = generateTestText (4 , List .of (separators .get (1 ), separators .getFirst (), separators .get (1 )));
100-
// End of the first expected chunk: two sentence lengths plus the length of the second-level separator
// (separators.get(1)) that sits between the first two sentences.
101- var expectedFirstChunkOffsetEnd = TEST_SENTENCE .length () * 2 + separators .get (1 ).length ();
102- assertExpectedChunksGenerated (
103- input ,
104- settings ,
105- List .of (
// The second expected chunk starts where the first one ends and runs to the end of the input.
106- new Chunker .ChunkOffset (0 , expectedFirstChunkOffsetEnd ),
107- new Chunker .ChunkOffset (expectedFirstChunkOffsetEnd , input .length ())
108- )
109- );
110- }
111-
11281 public void testChunkInputRequiresBackupChunkingStrategy () {
11382 var separators = generateRandomSeparators ();
11483 RecursiveChunkingSettings settings = generateChunkingSettings (10 , separators );
@@ -149,6 +118,30 @@ public void testChunkWithRegexSeparator() {
149118 );
150119 }
151120
// Test added by this change (every line below carries the '+' diff marker).
// Builds a long input from a random number of sentences (randomIntBetween(50, 100)), each followed by a
// randomly chosen separator except the last, and expects the chunker to return one chunk per sentence.
// NOTE(review): assumes generateTestText emits TEST_SENTENCE once per sentence followed by the matching
// entry of the splitter list, and that the 15 passed to generateChunkingSettings is a max chunk size small
// enough to prevent two sentences sharing a chunk -- confirm against those helpers, not visible here.
121+ public void testChunkLongDocument () {
122+ int numSentences = randomIntBetween (50 , 100 );
123+ List <String > separators = generateRandomSeparators ();
124+ List <String > splittersAfterSentences = new ArrayList <>();
// One separator per gap between adjacent sentences, hence numSentences - 1 entries.
125+ for (int i = 0 ; i < numSentences - 1 ; i ++) {
126+ splittersAfterSentences .add (randomFrom (separators ));
127+ }
128+ RecursiveChunkingSettings settings = generateChunkingSettings (15 , separators );
129+ String input = generateTestText (numSentences , splittersAfterSentences );
130+
// Build the expected offsets by walking the sentences in order.
131+ List <Chunker .ChunkOffset > expectedChunks = new ArrayList <>();
132+ int currentOffset = 0 ;
133+ for (int i = 0 ; i < numSentences ; i ++) {
134+ int chunkLength = TEST_SENTENCE .length ();
// Every chunk after the first also spans the separator that precedes its sentence.
135+ if (i > 0 ) {
136+ chunkLength += splittersAfterSentences .get (i - 1 ).length ();
137+ }
138+ expectedChunks .add (new Chunker .ChunkOffset (currentOffset , currentOffset + chunkLength ));
// Expected chunks are contiguous: the next one starts where this one ends.
139+ currentOffset += chunkLength ;
140+ }
141+
142+ assertExpectedChunksGenerated (input , settings , expectedChunks );
143+ }
144+
152145 private void assertExpectedChunksGenerated (String input , RecursiveChunkingSettings settings , List <Chunker .ChunkOffset > expectedChunks ) {
153146 RecursiveChunker chunker = new RecursiveChunker ();
154147 List <Chunker .ChunkOffset > chunks = chunker .chunk (input , settings );
0 commit comments