Skip to content

Commit a8a247f

Browse files
[8.19] Adding merging logic to recursive chunking and renaming SeparatorSet to SeparatorGroup (#131103) (#131227)
* Adding merging logic to recursive chunking and renaming SeparatorSet to SeparatorGroup (#131103) * Adding merging logic to recursive chunking and renaming SeparatorSet to SeparatorGroup * Update docs/changelog/131103.yaml * Delete docs/changelog/131103.yaml --------- Co-authored-by: Elastic Machine <[email protected]> * Removing getFirst calls --------- Co-authored-by: Elastic Machine <[email protected]>
1 parent 9396aaf commit a8a247f

File tree

6 files changed

+87
-25
lines changed

6 files changed

+87
-25
lines changed

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/ChunkingSettingsOptions.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public enum ChunkingSettingsOptions {
1212
MAX_CHUNK_SIZE("max_chunk_size"),
1313
OVERLAP("overlap"),
1414
SENTENCE_OVERLAP("sentence_overlap"),
15-
SEPARATOR_SET("separator_set"),
15+
SEPARATOR_GROUP("separator_group"),
1616
SEPARATORS("separators");
1717

1818
private final String chunkingSettingsOption;

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunker.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,10 @@ private List<ChunkOffset> chunk(String input, ChunkOffset offset, List<String> s
6060
return chunkWithBackupChunker(input, offset, maxChunkSize);
6161
}
6262

63-
var potentialChunks = splitTextBySeparatorRegex(input, offset, separators.get(separatorIndex));
63+
var potentialChunks = mergeChunkOffsetsUpToMaxChunkSize(
64+
splitTextBySeparatorRegex(input, offset, separators.get(separatorIndex)),
65+
maxChunkSize
66+
);
6467
var actualChunks = new ArrayList<ChunkOffset>();
6568
for (var potentialChunk : potentialChunks) {
6669
if (isChunkWithinMaxSize(potentialChunk, maxChunkSize)) {
@@ -104,6 +107,33 @@ private List<ChunkOffsetAndCount> splitTextBySeparatorRegex(String input, ChunkO
104107
return chunkOffsets;
105108
}
106109

110+
private List<ChunkOffsetAndCount> mergeChunkOffsetsUpToMaxChunkSize(List<ChunkOffsetAndCount> chunkOffsets, int maxChunkSize) {
111+
if (chunkOffsets.size() < 2) {
112+
return chunkOffsets;
113+
}
114+
115+
List<ChunkOffsetAndCount> mergedOffsetsAndCounts = new ArrayList<>();
116+
var mergedChunk = chunkOffsets.get(0);
117+
for (int i = 1; i < chunkOffsets.size(); i++) {
118+
var chunkOffsetAndCountToMerge = chunkOffsets.get(i);
119+
var potentialMergedChunk = new ChunkOffsetAndCount(
120+
new ChunkOffset(mergedChunk.chunkOffset.start(), chunkOffsetAndCountToMerge.chunkOffset.end()),
121+
mergedChunk.wordCount + chunkOffsetAndCountToMerge.wordCount
122+
);
123+
if (isChunkWithinMaxSize(potentialMergedChunk, maxChunkSize)) {
124+
mergedChunk = potentialMergedChunk;
125+
} else {
126+
mergedOffsetsAndCounts.add(mergedChunk);
127+
mergedChunk = chunkOffsets.get(i);
128+
}
129+
130+
if (i == chunkOffsets.size() - 1) {
131+
mergedOffsetsAndCounts.add(mergedChunk);
132+
}
133+
}
134+
return mergedOffsetsAndCounts;
135+
}
136+
107137
private List<ChunkOffset> chunkWithBackupChunker(String input, ChunkOffset offset, int maxChunkSize) {
108138
var chunks = new SentenceBoundaryChunker().chunk(
109139
input.substring(offset.start(), offset.end()),

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunkingSettings.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public class RecursiveChunkingSettings implements ChunkingSettings {
3636
private static final Set<String> VALID_KEYS = Set.of(
3737
ChunkingSettingsOptions.STRATEGY.toString(),
3838
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
39-
ChunkingSettingsOptions.SEPARATOR_SET.toString(),
39+
ChunkingSettingsOptions.SEPARATOR_GROUP.toString(),
4040
ChunkingSettingsOptions.SEPARATORS.toString()
4141
);
4242

@@ -45,7 +45,7 @@ public class RecursiveChunkingSettings implements ChunkingSettings {
4545

4646
public RecursiveChunkingSettings(int maxChunkSize, List<String> separators) {
4747
this.maxChunkSize = maxChunkSize;
48-
this.separators = separators == null ? SeparatorSet.PLAINTEXT.getSeparators() : separators;
48+
this.separators = separators == null ? SeparatorGroup.PLAINTEXT.getSeparators() : separators;
4949
}
5050

5151
public RecursiveChunkingSettings(StreamInput in) throws IOException {
@@ -72,12 +72,12 @@ public static RecursiveChunkingSettings fromMap(Map<String, Object> map) {
7272
validationException
7373
);
7474

75-
SeparatorSet separatorSet = ServiceUtils.extractOptionalEnum(
75+
SeparatorGroup separatorGroup = ServiceUtils.extractOptionalEnum(
7676
map,
77-
ChunkingSettingsOptions.SEPARATOR_SET.toString(),
77+
ChunkingSettingsOptions.SEPARATOR_GROUP.toString(),
7878
ModelConfigurations.CHUNKING_SETTINGS,
79-
SeparatorSet::fromString,
80-
EnumSet.allOf(SeparatorSet.class),
79+
SeparatorGroup::fromString,
80+
EnumSet.allOf(SeparatorGroup.class),
8181
validationException
8282
);
8383

@@ -88,12 +88,12 @@ public static RecursiveChunkingSettings fromMap(Map<String, Object> map) {
8888
validationException
8989
);
9090

91-
if (separators != null && separatorSet != null) {
91+
if (separators != null && separatorGroup != null) {
9292
validationException.addValidationError("Recursive chunking settings can not have both separators and separator_set");
9393
}
9494

95-
if (separatorSet != null) {
96-
separators = separatorSet.getSeparators();
95+
if (separatorGroup != null) {
96+
separators = separatorGroup.getSeparators();
9797
} else if (separators != null && separators.isEmpty()) {
9898
validationException.addValidationError("Recursive chunking settings can not have an empty list of separators");
9999
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/SeparatorSet.java renamed to x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/SeparatorGroup.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,17 @@
1010
import java.util.List;
1111
import java.util.Locale;
1212

13-
public enum SeparatorSet {
13+
public enum SeparatorGroup {
1414
PLAINTEXT("plaintext"),
1515
MARKDOWN("markdown");
1616

1717
private final String name;
1818

19-
SeparatorSet(String name) {
19+
SeparatorGroup(String name) {
2020
this.name = name;
2121
}
2222

23-
public static SeparatorSet fromString(String name) {
23+
public static SeparatorGroup fromString(String name) {
2424
return valueOf(name.trim().toUpperCase(Locale.ROOT));
2525
}
2626

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunkerTests.java

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public void testChunkInputShorterThanMaxChunkSize() {
4646
assertExpectedChunksGenerated(input, settings, List.of(new Chunker.ChunkOffset(0, input.length())));
4747
}
4848

49-
public void testChunkInputRequiresOneSplit() {
49+
public void testChunkInputRequiresOneSplitWithoutMerges() {
5050
List<String> separators = generateRandomSeparators();
5151
RecursiveChunkingSettings settings = generateChunkingSettings(10, separators);
5252
String input = generateTestText(2, List.of(separators.get(0)));
@@ -58,7 +58,23 @@ public void testChunkInputRequiresOneSplit() {
5858
);
5959
}
6060

61-
public void testChunkInputRequiresMultipleSplits() {
61+
public void testChunkInputRequiresOneSplitWithMerges() {
62+
List<String> separators = generateRandomSeparators();
63+
RecursiveChunkingSettings settings = generateChunkingSettings(20, separators);
64+
String input = generateTestText(3, List.of(separators.get(0), separators.get(0)));
65+
66+
var expectedFirstChunkOffsetEnd = TEST_SENTENCE.length() * 2 + separators.get(0).length();
67+
assertExpectedChunksGenerated(
68+
input,
69+
settings,
70+
List.of(
71+
new Chunker.ChunkOffset(0, expectedFirstChunkOffsetEnd),
72+
new Chunker.ChunkOffset(expectedFirstChunkOffsetEnd, input.length())
73+
)
74+
);
75+
}
76+
77+
public void testChunkInputRequiresMultipleSplitsWithoutMerges() {
6278
var separators = generateRandomSeparators();
6379
RecursiveChunkingSettings settings = generateChunkingSettings(15, separators);
6480
String input = generateTestText(4, List.of(separators.get(1), separators.get(0), separators.get(1)));
@@ -78,6 +94,22 @@ public void testChunkInputRequiresMultipleSplits() {
7894
);
7995
}
8096

97+
public void testChunkInputRequiresMultipleSplitsWithMerges() {
98+
var separators = generateRandomSeparators();
99+
RecursiveChunkingSettings settings = generateChunkingSettings(25, separators);
100+
String input = generateTestText(4, List.of(separators.get(1), separators.get(0), separators.get(1)));
101+
102+
var expectedFirstChunkOffsetEnd = TEST_SENTENCE.length() * 2 + separators.get(1).length();
103+
assertExpectedChunksGenerated(
104+
input,
105+
settings,
106+
List.of(
107+
new Chunker.ChunkOffset(0, expectedFirstChunkOffsetEnd),
108+
new Chunker.ChunkOffset(expectedFirstChunkOffsetEnd, input.length())
109+
)
110+
);
111+
}
112+
81113
public void testChunkInputDoesNotSplitWhenNoLongerExceedingMaxChunkSize() {
82114
var separators = randomSubsetOf(3, TEST_SEPARATORS);
83115
RecursiveChunkingSettings settings = generateChunkingSettings(25, separators);
@@ -166,7 +198,7 @@ public void testChunkLongDocument() {
166198

167199
public void testMarkdownChunking() {
168200
int numSentences = randomIntBetween(10, 50);
169-
List<String> separators = SeparatorSet.MARKDOWN.getSeparators();
201+
List<String> separators = SeparatorGroup.MARKDOWN.getSeparators();
170202
List<String> validHeaders = List.of(
171203
"# Header\n",
172204
"## Header\n",

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunkingSettingsTests.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,15 @@ public void testFromMapValidSettingsWithSeparators() {
3232
assertEquals(separators, settings.getSeparators());
3333
}
3434

35-
public void testFromMapValidSettingsWithSeparatorSet() {
35+
public void testFromMapValidSettingsWithSeparatorGroup() {
3636
var maxChunkSize = randomIntBetween(10, 300);
37-
var separatorSet = randomFrom(SeparatorSet.values());
38-
Map<String, Object> validSettings = buildChunkingSettingsMap(maxChunkSize, Optional.of(separatorSet.name()), Optional.empty());
37+
var separatorGroup = randomFrom(SeparatorGroup.values());
38+
Map<String, Object> validSettings = buildChunkingSettingsMap(maxChunkSize, Optional.of(separatorGroup.name()), Optional.empty());
3939

4040
RecursiveChunkingSettings settings = RecursiveChunkingSettings.fromMap(validSettings);
4141

4242
assertEquals(maxChunkSize, settings.getMaxChunkSize());
43-
assertEquals(separatorSet.getSeparators(), settings.getSeparators());
43+
assertEquals(separatorGroup.getSeparators(), settings.getSeparators());
4444
}
4545

4646
public void testFromMapMaxChunkSizeTooSmall() {
@@ -55,7 +55,7 @@ public void testFromMapMaxChunkSizeTooLarge() {
5555
assertThrows(ValidationException.class, () -> RecursiveChunkingSettings.fromMap(invalidSettings));
5656
}
5757

58-
public void testFromMapInvalidSeparatorSet() {
58+
public void testFromMapInvalidSeparatorGroup() {
5959
Map<String, Object> invalidSettings = buildChunkingSettingsMap(randomIntBetween(10, 300), Optional.of("invalid"), Optional.empty());
6060

6161
assertThrows(ValidationException.class, () -> RecursiveChunkingSettings.fromMap(invalidSettings));
@@ -68,7 +68,7 @@ public void testFromMapInvalidSettingKey() {
6868
assertThrows(ValidationException.class, () -> RecursiveChunkingSettings.fromMap(invalidSettings));
6969
}
7070

71-
public void testFromMapBothSeparatorsAndSeparatorSet() {
71+
public void testFromMapBothSeparatorsAndSeparatorGroup() {
7272
Map<String, Object> invalidSettings = buildChunkingSettingsMap(
7373
randomIntBetween(10, 300),
7474
Optional.of("default"),
@@ -86,13 +86,13 @@ public void testFromMapEmptySeparators() {
8686

8787
private Map<String, Object> buildChunkingSettingsMap(
8888
int maxChunkSize,
89-
Optional<String> separatorSet,
89+
Optional<String> separatorGroup,
9090
Optional<List<String>> separators
9191
) {
9292
Map<String, Object> settingsMap = new HashMap<>();
9393
settingsMap.put(ChunkingSettingsOptions.STRATEGY.toString(), ChunkingStrategy.RECURSIVE.toString());
9494
settingsMap.put(ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(), maxChunkSize);
95-
separatorSet.ifPresent(s -> settingsMap.put(ChunkingSettingsOptions.SEPARATOR_SET.toString(), s));
95+
separatorGroup.ifPresent(s -> settingsMap.put(ChunkingSettingsOptions.SEPARATOR_GROUP.toString(), s));
9696
separators.ifPresent(strings -> settingsMap.put(ChunkingSettingsOptions.SEPARATORS.toString(), strings));
9797
return settingsMap;
9898
}

0 commit comments

Comments
 (0)