Skip to content

Commit cf3c77a

Browse files
Add upper and lower max chunk size limits to ChunkingSettings (#115130)
* Add upper and lower max chunk size limits to ChunkingSettings * Fix ServiceUtils tests --------- Co-authored-by: Elastic Machine <[email protected]>
1 parent bd54bff commit cf3c77a

File tree

10 files changed

+117
-42
lines changed

10 files changed

+117
-42
lines changed

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunkingSettings.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
public class SentenceBoundaryChunkingSettings implements ChunkingSettings {
3030
public static final String NAME = "SentenceBoundaryChunkingSettings";
3131
private static final ChunkingStrategy STRATEGY = ChunkingStrategy.SENTENCE;
32+
private static final int MAX_CHUNK_SIZE_LOWER_LIMIT = 20;
33+
private static final int MAX_CHUNK_SIZE_UPPER_LIMIT = 300;
3234
private static final Set<String> VALID_KEYS = Set.of(
3335
ChunkingSettingsOptions.STRATEGY.toString(),
3436
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
@@ -62,9 +64,11 @@ public static SentenceBoundaryChunkingSettings fromMap(Map<String, Object> map)
6264
);
6365
}
6466

65-
Integer maxChunkSize = ServiceUtils.extractRequiredPositiveInteger(
67+
Integer maxChunkSize = ServiceUtils.extractRequiredPositiveIntegerBetween(
6668
map,
6769
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
70+
MAX_CHUNK_SIZE_LOWER_LIMIT,
71+
MAX_CHUNK_SIZE_UPPER_LIMIT,
6872
ModelConfigurations.CHUNKING_SETTINGS,
6973
validationException
7074
);

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/WordBoundaryChunkingSettings.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
public class WordBoundaryChunkingSettings implements ChunkingSettings {
2929
public static final String NAME = "WordBoundaryChunkingSettings";
3030
private static final ChunkingStrategy STRATEGY = ChunkingStrategy.WORD;
31+
private static final int MAX_CHUNK_SIZE_LOWER_LIMIT = 10;
32+
private static final int MAX_CHUNK_SIZE_UPPER_LIMIT = 300;
3133
private static final Set<String> VALID_KEYS = Set.of(
3234
ChunkingSettingsOptions.STRATEGY.toString(),
3335
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
@@ -56,9 +58,11 @@ public static WordBoundaryChunkingSettings fromMap(Map<String, Object> map) {
5658
);
5759
}
5860

59-
Integer maxChunkSize = ServiceUtils.extractRequiredPositiveInteger(
61+
Integer maxChunkSize = ServiceUtils.extractRequiredPositiveIntegerBetween(
6062
map,
6163
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
64+
MAX_CHUNK_SIZE_LOWER_LIMIT,
65+
MAX_CHUNK_SIZE_UPPER_LIMIT,
6266
ModelConfigurations.CHUNKING_SETTINGS,
6367
validationException
6468
);

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/ServiceUtils.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,32 @@ public static Integer extractRequiredPositiveIntegerLessThanOrEqualToMax(
435435
return field;
436436
}
437437

438+
public static Integer extractRequiredPositiveIntegerBetween(
439+
Map<String, Object> map,
440+
String settingName,
441+
int minValue,
442+
int maxValue,
443+
String scope,
444+
ValidationException validationException
445+
) {
446+
Integer field = extractRequiredPositiveInteger(map, settingName, scope, validationException);
447+
448+
if (field != null && field < minValue) {
449+
validationException.addValidationError(
450+
ServiceUtils.mustBeGreaterThanOrEqualNumberErrorMessage(settingName, scope, field, minValue)
451+
);
452+
return null;
453+
}
454+
if (field != null && field > maxValue) {
455+
validationException.addValidationError(
456+
ServiceUtils.mustBeLessThanOrEqualNumberErrorMessage(settingName, scope, field, maxValue)
457+
);
458+
return null;
459+
}
460+
461+
return field;
462+
}
463+
438464
public static Integer extractOptionalPositiveInteger(
439465
Map<String, Object> map,
440466
String settingName,

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/ChunkingSettingsBuilderTests.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,25 +38,27 @@ public void testValidChunkingSettingsMap() {
3838
}
3939

4040
private Map<Map<String, Object>, ChunkingSettings> chunkingSettingsMapToChunkingSettings() {
41-
var maxChunkSize = randomNonNegativeInt();
42-
var overlap = randomIntBetween(1, maxChunkSize / 2);
41+
var maxChunkSizeWordBoundaryChunkingSettings = randomIntBetween(10, 300);
42+
var overlap = randomIntBetween(1, maxChunkSizeWordBoundaryChunkingSettings / 2);
43+
var maxChunkSizeSentenceBoundaryChunkingSettings = randomIntBetween(20, 300);
44+
4345
return Map.of(
4446
Map.of(
4547
ChunkingSettingsOptions.STRATEGY.toString(),
4648
ChunkingStrategy.WORD.toString(),
4749
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
48-
maxChunkSize,
50+
maxChunkSizeWordBoundaryChunkingSettings,
4951
ChunkingSettingsOptions.OVERLAP.toString(),
5052
overlap
5153
),
52-
new WordBoundaryChunkingSettings(maxChunkSize, overlap),
54+
new WordBoundaryChunkingSettings(maxChunkSizeWordBoundaryChunkingSettings, overlap),
5355
Map.of(
5456
ChunkingSettingsOptions.STRATEGY.toString(),
5557
ChunkingStrategy.SENTENCE.toString(),
5658
ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(),
57-
maxChunkSize
59+
maxChunkSizeSentenceBoundaryChunkingSettings
5860
),
59-
new SentenceBoundaryChunkingSettings(maxChunkSize, 1)
61+
new SentenceBoundaryChunkingSettings(maxChunkSizeSentenceBoundaryChunkingSettings, 1)
6062
);
6163
}
6264
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/ChunkingSettingsTests.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ public static ChunkingSettings createRandomChunkingSettings() {
2121

2222
switch (randomStrategy) {
2323
case WORD -> {
24-
var maxChunkSize = randomNonNegativeInt();
24+
var maxChunkSize = randomIntBetween(10, 300);
2525
return new WordBoundaryChunkingSettings(maxChunkSize, randomIntBetween(1, maxChunkSize / 2));
2626
}
2727
case SENTENCE -> {
28-
return new SentenceBoundaryChunkingSettings(randomNonNegativeInt(), randomBoolean() ? 0 : 1);
28+
return new SentenceBoundaryChunkingSettings(randomIntBetween(20, 300), randomBoolean() ? 0 : 1);
2929
}
3030
default -> throw new IllegalArgumentException("Unsupported random strategy [" + randomStrategy + "]");
3131
}
@@ -38,13 +38,13 @@ public static Map<String, Object> createRandomChunkingSettingsMap() {
3838

3939
switch (randomStrategy) {
4040
case WORD -> {
41-
var maxChunkSize = randomNonNegativeInt();
41+
var maxChunkSize = randomIntBetween(10, 300);
4242
chunkingSettingsMap.put(ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(), maxChunkSize);
4343
chunkingSettingsMap.put(ChunkingSettingsOptions.OVERLAP.toString(), randomIntBetween(1, maxChunkSize / 2));
4444

4545
}
4646
case SENTENCE -> {
47-
chunkingSettingsMap.put(ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(), randomNonNegativeInt());
47+
chunkingSettingsMap.put(ChunkingSettingsOptions.MAX_CHUNK_SIZE.toString(), randomIntBetween(20, 300));
4848
}
4949
default -> {
5050
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunkerTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,8 @@ public void testChunkSplitLargeChunkSizesWithChunkingSettings() {
318318
}
319319

320320
public void testInvalidChunkingSettingsProvided() {
321-
ChunkingSettings chunkingSettings = new WordBoundaryChunkingSettings(randomNonNegativeInt(), randomNonNegativeInt());
321+
var maxChunkSize = randomIntBetween(10, 300);
322+
ChunkingSettings chunkingSettings = new WordBoundaryChunkingSettings(maxChunkSize, randomIntBetween(1, maxChunkSize / 2));
322323
assertThrows(IllegalArgumentException.class, () -> { new SentenceBoundaryChunker().chunk(TEST_TEXT, chunkingSettings); });
323324
}
324325

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunkingSettingsTests.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import org.elasticsearch.common.io.stream.Writeable;
1212
import org.elasticsearch.inference.ChunkingStrategy;
1313
import org.elasticsearch.test.AbstractWireSerializingTestCase;
14-
import org.elasticsearch.test.ESTestCase;
1514

1615
import java.io.IOException;
1716
import java.util.HashMap;
@@ -28,14 +27,14 @@ public void testMaxChunkSizeNotProvided() {
2827
}
2928

3029
public void testInvalidInputsProvided() {
31-
var chunkingSettingsMap = buildChunkingSettingsMap(Optional.of(randomNonNegativeInt()));
30+
var chunkingSettingsMap = buildChunkingSettingsMap(Optional.of(randomIntBetween(20, 300)));
3231
chunkingSettingsMap.put(randomAlphaOfLength(10), randomNonNegativeInt());
3332

3433
assertThrows(ValidationException.class, () -> { SentenceBoundaryChunkingSettings.fromMap(chunkingSettingsMap); });
3534
}
3635

3736
public void testValidInputsProvided() {
38-
int maxChunkSize = randomNonNegativeInt();
37+
int maxChunkSize = randomIntBetween(20, 300);
3938
SentenceBoundaryChunkingSettings settings = SentenceBoundaryChunkingSettings.fromMap(
4039
buildChunkingSettingsMap(Optional.of(maxChunkSize))
4140
);
@@ -59,12 +58,12 @@ protected Writeable.Reader<SentenceBoundaryChunkingSettings> instanceReader() {
5958

6059
@Override
6160
protected SentenceBoundaryChunkingSettings createTestInstance() {
62-
return new SentenceBoundaryChunkingSettings(randomNonNegativeInt(), randomBoolean() ? 0 : 1);
61+
return new SentenceBoundaryChunkingSettings(randomIntBetween(20, 300), randomBoolean() ? 0 : 1);
6362
}
6463

6564
@Override
6665
protected SentenceBoundaryChunkingSettings mutateInstance(SentenceBoundaryChunkingSettings instance) throws IOException {
67-
var chunkSize = randomValueOtherThan(instance.maxChunkSize, ESTestCase::randomNonNegativeInt);
66+
var chunkSize = randomValueOtherThan(instance.maxChunkSize, () -> randomIntBetween(20, 300));
6867
return new SentenceBoundaryChunkingSettings(chunkSize, instance.sentenceOverlap);
6968
}
7069
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/WordBoundaryChunkerTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ public void testNumberOfChunksWithWordBoundaryChunkingSettings() {
136136
}
137137

138138
public void testInvalidChunkingSettingsProvided() {
139-
ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(randomNonNegativeInt(), 0);
139+
ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(randomIntBetween(20, 300), 0);
140140
assertThrows(IllegalArgumentException.class, () -> { new WordBoundaryChunker().chunk(TEST_TEXT, chunkingSettings); });
141141
}
142142

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/chunking/WordBoundaryChunkingSettingsTests.java

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import java.io.IOException;
1616
import java.util.HashMap;
17-
import java.util.List;
1817
import java.util.Map;
1918
import java.util.Optional;
2019

@@ -28,27 +27,28 @@ public void testMaxChunkSizeNotProvided() {
2827

2928
public void testOverlapNotProvided() {
3029
assertThrows(ValidationException.class, () -> {
31-
WordBoundaryChunkingSettings.fromMap(buildChunkingSettingsMap(Optional.of(randomNonNegativeInt()), Optional.empty()));
30+
WordBoundaryChunkingSettings.fromMap(buildChunkingSettingsMap(Optional.of(randomIntBetween(10, 300)), Optional.empty()));
3231
});
3332
}
3433

3534
public void testInvalidInputsProvided() {
36-
var chunkingSettingsMap = buildChunkingSettingsMap(Optional.of(randomNonNegativeInt()), Optional.of(randomNonNegativeInt()));
35+
var maxChunkSize = randomIntBetween(10, 300);
36+
var chunkingSettingsMap = buildChunkingSettingsMap(Optional.of(maxChunkSize), Optional.of(randomIntBetween(1, maxChunkSize / 2)));
3737
chunkingSettingsMap.put(randomAlphaOfLength(10), randomNonNegativeInt());
3838

3939
assertThrows(ValidationException.class, () -> { WordBoundaryChunkingSettings.fromMap(chunkingSettingsMap); });
4040
}
4141

4242
public void testOverlapGreaterThanHalfMaxChunkSize() {
43-
var maxChunkSize = randomNonNegativeInt();
43+
var maxChunkSize = randomIntBetween(10, 300);
4444
var overlap = randomIntBetween((maxChunkSize / 2) + 1, maxChunkSize);
4545
assertThrows(ValidationException.class, () -> {
4646
WordBoundaryChunkingSettings.fromMap(buildChunkingSettingsMap(Optional.of(maxChunkSize), Optional.of(overlap)));
4747
});
4848
}
4949

5050
public void testValidInputsProvided() {
51-
int maxChunkSize = randomNonNegativeInt();
51+
int maxChunkSize = randomIntBetween(10, 300);
5252
int overlap = randomIntBetween(1, maxChunkSize / 2);
5353
WordBoundaryChunkingSettings settings = WordBoundaryChunkingSettings.fromMap(
5454
buildChunkingSettingsMap(Optional.of(maxChunkSize), Optional.of(overlap))
@@ -75,29 +75,14 @@ protected Writeable.Reader<WordBoundaryChunkingSettings> instanceReader() {
7575

7676
@Override
7777
protected WordBoundaryChunkingSettings createTestInstance() {
78-
var maxChunkSize = randomNonNegativeInt();
78+
var maxChunkSize = randomIntBetween(10, 300);
7979
return new WordBoundaryChunkingSettings(maxChunkSize, randomIntBetween(1, maxChunkSize / 2));
8080
}
8181

8282
@Override
8383
protected WordBoundaryChunkingSettings mutateInstance(WordBoundaryChunkingSettings instance) throws IOException {
84-
var valueToMutate = randomFrom(List.of(ChunkingSettingsOptions.MAX_CHUNK_SIZE, ChunkingSettingsOptions.OVERLAP));
85-
var maxChunkSize = instance.maxChunkSize;
86-
var overlap = instance.overlap;
87-
88-
if (valueToMutate.equals(ChunkingSettingsOptions.MAX_CHUNK_SIZE)) {
89-
while (maxChunkSize == instance.maxChunkSize) {
90-
maxChunkSize = randomNonNegativeInt();
91-
}
92-
93-
if (overlap > maxChunkSize / 2) {
94-
overlap = randomIntBetween(1, maxChunkSize / 2);
95-
}
96-
} else if (valueToMutate.equals(ChunkingSettingsOptions.OVERLAP)) {
97-
while (overlap == instance.overlap) {
98-
overlap = randomIntBetween(1, maxChunkSize / 2);
99-
}
100-
}
84+
var maxChunkSize = randomValueOtherThan(instance.maxChunkSize, () -> randomIntBetween(10, 300));
85+
var overlap = randomValueOtherThan(instance.overlap, () -> randomIntBetween(1, maxChunkSize / 2));
10186

10287
return new WordBoundaryChunkingSettings(maxChunkSize, overlap);
10388
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/services/ServiceUtilsTests.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,60 @@ public void testExtractRequiredPositiveIntegerLessThanOrEqualToMax_AddsErrorWhen
605605
assertThat(validation.validationErrors().get(1), is("[scope] does not contain the required setting [not_key]"));
606606
}
607607

608+
public void testExtractRequiredPositiveIntegerBetween_ReturnsValueWhenValueIsBetweenMinAndMax() {
609+
var minValue = randomNonNegativeInt();
610+
var maxValue = randomIntBetween(minValue + 2, minValue + 10);
611+
testExtractRequiredPositiveIntegerBetween_Successful(minValue, maxValue, randomIntBetween(minValue + 1, maxValue - 1));
612+
}
613+
614+
public void testExtractRequiredPositiveIntegerBetween_ReturnsValueWhenValueIsEqualToMin() {
615+
var minValue = randomNonNegativeInt();
616+
var maxValue = randomIntBetween(minValue + 1, minValue + 10);
617+
testExtractRequiredPositiveIntegerBetween_Successful(minValue, maxValue, minValue);
618+
}
619+
620+
public void testExtractRequiredPositiveIntegerBetween_ReturnsValueWhenValueIsEqualToMax() {
621+
var minValue = randomNonNegativeInt();
622+
var maxValue = randomIntBetween(minValue + 1, minValue + 10);
623+
testExtractRequiredPositiveIntegerBetween_Successful(minValue, maxValue, maxValue);
624+
}
625+
626+
private void testExtractRequiredPositiveIntegerBetween_Successful(int minValue, int maxValue, int actualValue) {
627+
var validation = new ValidationException();
628+
validation.addValidationError("previous error");
629+
Map<String, Object> map = modifiableMap(Map.of("key", actualValue));
630+
var parsedInt = ServiceUtils.extractRequiredPositiveIntegerBetween(map, "key", minValue, maxValue, "scope", validation);
631+
632+
assertThat(validation.validationErrors(), hasSize(1));
633+
assertNotNull(parsedInt);
634+
assertThat(parsedInt, is(actualValue));
635+
assertTrue(map.isEmpty());
636+
}
637+
638+
public void testExtractRequiredIntBetween_AddsErrorForValueBelowMin() {
639+
var minValue = randomNonNegativeInt();
640+
var maxValue = randomIntBetween(minValue, minValue + 10);
641+
testExtractRequiredIntBetween_Unsuccessful(minValue, maxValue, minValue - 1);
642+
}
643+
644+
public void testExtractRequiredIntBetween_AddsErrorForValueAboveMax() {
645+
var minValue = randomNonNegativeInt();
646+
var maxValue = randomIntBetween(minValue, minValue + 10);
647+
testExtractRequiredIntBetween_Unsuccessful(minValue, maxValue, maxValue + 1);
648+
}
649+
650+
private void testExtractRequiredIntBetween_Unsuccessful(int minValue, int maxValue, int actualValue) {
651+
var validation = new ValidationException();
652+
validation.addValidationError("previous error");
653+
Map<String, Object> map = modifiableMap(Map.of("key", actualValue));
654+
var parsedInt = ServiceUtils.extractRequiredPositiveIntegerBetween(map, "key", minValue, maxValue, "scope", validation);
655+
656+
assertThat(validation.validationErrors(), hasSize(2));
657+
assertNull(parsedInt);
658+
assertTrue(map.isEmpty());
659+
assertThat(validation.validationErrors().get(1), containsString("Invalid value"));
660+
}
661+
608662
public void testExtractOptionalEnum_ReturnsNull_WhenFieldDoesNotExist() {
609663
var validation = new ValidationException();
610664
Map<String, Object> map = modifiableMap(Map.of("key", "value"));

0 commit comments

Comments
 (0)