Skip to content

Commit 6523643

Browse files
Merge branch 'main' of github.com:elastic/elasticsearch into cs-request-params
2 parents 70be427 + daf4fca commit 6523643

File tree

27 files changed

+1339
-117
lines changed

27 files changed

+1339
-117
lines changed

docs/changelog/126866.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 126866
2+
summary: Add recursive chunker
3+
area: Machine Learning
4+
type: enhancement
5+
issues: []

libs/simdvec/src/test/java/org/elasticsearch/simdvec/internal/vectorization/ES91OSQVectorScorerTests.java

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,21 @@ public void testScore() throws Exception {
125125
scores2
126126
);
127127
for (int j = 0; j < ES91OSQVectorsScorer.BULK_SIZE; j++) {
128-
if (scores1[j] > (maxDims * Short.MAX_VALUE)) {
129-
int diff = (int) (scores1[j] - scores2[j]);
130-
assertThat("defaultScores: " + scores1[j] + " bulkScores: " + scores2[j], Math.abs(diff), lessThan(65));
131-
} else if (scores1[j] > (maxDims * Byte.MAX_VALUE)) {
132-
int diff = (int) (scores1[j] - scores2[j]);
133-
assertThat("defaultScores: " + scores1[j] + " bulkScores: " + scores2[j], Math.abs(diff), lessThan(9));
128+
if (scores1[j] == scores2[j]) {
129+
continue;
130+
}
131+
if (scores1[j] > (maxDims * Byte.MAX_VALUE)) {
132+
float diff = Math.abs(scores1[j] - scores2[j]);
133+
assertThat(
134+
"defaultScores: " + scores1[j] + " bulkScores: " + scores2[j],
135+
diff / scores1[j],
136+
lessThan(1e-5f)
137+
);
138+
assertThat(
139+
"defaultScores: " + scores1[j] + " bulkScores: " + scores2[j],
140+
diff / scores2[j],
141+
lessThan(1e-5f)
142+
);
134143
} else {
135144
assertEquals(scores1[j], scores2[j], 1e-2f);
136145
}

muted-tests.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,18 @@ tests:
577577
- class: org.elasticsearch.server.cli.MachineDependentHeapTests
578578
method: testMlOnlyOptions
579579
issue: https://github.com/elastic/elasticsearch/issues/129236
580+
- class: org.elasticsearch.upgrades.RunningSnapshotIT
581+
method: testRunningSnapshotCompleteAfterUpgrade {upgradedNodes=1}
582+
issue: https://github.com/elastic/elasticsearch/issues/129644
583+
- class: org.elasticsearch.upgrades.RunningSnapshotIT
584+
method: testRunningSnapshotCompleteAfterUpgrade {upgradedNodes=2}
585+
issue: https://github.com/elastic/elasticsearch/issues/129645
586+
- class: org.elasticsearch.upgrades.RunningSnapshotIT
587+
method: testRunningSnapshotCompleteAfterUpgrade {upgradedNodes=3}
588+
issue: https://github.com/elastic/elasticsearch/issues/129646
589+
- class: org.elasticsearch.test.apmintegration.TracesApmIT
590+
method: testApmIntegration
591+
issue: https://github.com/elastic/elasticsearch/issues/129651
580592

581593
# Examples:
582594
#

server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
package org.elasticsearch.search.query;
1111

12+
import org.apache.lucene.util.VectorUtil;
1213
import org.elasticsearch.cluster.metadata.IndexMetadata;
1314
import org.elasticsearch.common.settings.Settings;
1415
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
@@ -30,10 +31,11 @@ public class VectorIT extends ESIntegTestCase {
3031
private static final String VECTOR_FIELD = "vector";
3132
private static final String NUM_ID_FIELD = "num_id";
3233

33-
private static void randomVector(float[] vector) {
34+
private static void randomVector(float[] vector, int constant) {
3435
for (int i = 0; i < vector.length; i++) {
35-
vector[i] = randomFloat();
36+
vector[i] = randomFloat() * constant;
3637
}
38+
VectorUtil.l2normalize(vector);
3739
}
3840

3941
@Before
@@ -43,6 +45,7 @@ public void setup() throws IOException {
4345
.startObject("properties")
4446
.startObject(VECTOR_FIELD)
4547
.field("type", "dense_vector")
48+
.field("similarity", "dot_product")
4649
.startObject("index_options")
4750
.field("type", "hnsw")
4851
.endObject()
@@ -59,20 +62,21 @@ public void setup() throws IOException {
5962
.build();
6063
prepareCreate(INDEX_NAME).setMapping(mapping).setSettings(settings).get();
6164
ensureGreen(INDEX_NAME);
65+
float[] vector = new float[16];
6266
for (int i = 0; i < 150; i++) {
63-
float[] vector = new float[8];
64-
randomVector(vector);
67+
randomVector(vector, i % 25 + 1);
6568
prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource(VECTOR_FIELD, vector, NUM_ID_FIELD, i).get();
6669
}
6770
forceMerge(true);
6871
refresh(INDEX_NAME);
6972
}
7073

7174
public void testFilteredQueryStrategy() {
72-
float[] vector = new float[8];
73-
randomVector(vector);
75+
float[] vector = new float[16];
76+
randomVector(vector, 25);
77+
int upperLimit = 35;
7478
var query = new KnnSearchBuilder(VECTOR_FIELD, vector, 1, 1, null, null).addFilterQuery(
75-
QueryBuilders.rangeQuery(NUM_ID_FIELD).lte(30)
79+
QueryBuilders.rangeQuery(NUM_ID_FIELD).lte(35)
7680
);
7781
assertResponse(client().prepareSearch(INDEX_NAME).setKnnSearch(List.of(query)).setSize(1).setProfile(true), acornResponse -> {
7882
assertNotEquals(0, acornResponse.getHits().getHits().length);
@@ -116,6 +120,8 @@ public void testFilteredQueryStrategy() {
116120
assertTrue(
117121
"fanoutVectorOps [" + fanoutVectorOpsSum + "] is not gt acornVectorOps [" + vectorOpsSum + "]",
118122
fanoutVectorOpsSum > vectorOpsSum
123+
// if both switch to brute-force due to excessive exploration, they will both be equal to upperLimit + 1
124+
|| (fanoutVectorOpsSum == vectorOpsSum && vectorOpsSum == upperLimit + 1)
119125
);
120126
});
121127
});

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ static TransportVersion def(int id) {
198198
public static final TransportVersion IDP_CUSTOM_SAML_ATTRIBUTES_ALLOW_LIST_8_19 = def(8_841_0_50);
199199
public static final TransportVersion SETTINGS_IN_DATA_STREAMS_8_19 = def(8_841_0_51);
200200
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_REMOVE_ERROR_PARSING_8_19 = def(8_841_0_52);
201-
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE_8_19 = def(8_841_0_53);
201+
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_EMBEDDING_BATCH_SIZE_8_19 = def(8_841_0_53);
202+
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE_8_19 = def(8_841_0_54);
202203
public static final TransportVersion V_9_0_0 = def(9_000_0_09);
203204
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
204205
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -305,7 +306,8 @@ static TransportVersion def(int id) {
305306
public static final TransportVersion STATE_PARAM_GET_SNAPSHOT = def(9_100_0_00);
306307
public static final TransportVersion PROJECT_ID_IN_SNAPSHOTS_DELETIONS_AND_REPO_CLEANUP = def(9_101_0_00);
307308
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_REMOVE_ERROR_PARSING = def(9_102_0_00);
308-
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE = def(9_103_0_00);
309+
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_EMBEDDING_BATCH_SIZE = def(9_103_0_00);
310+
public static final TransportVersion ML_INFERENCE_CUSTOM_SERVICE_INPUT_TYPE = def(9_104_0_00);
309311

310312
/*
311313
* STOP! READ THIS FIRST! No, really,

server/src/main/java/org/elasticsearch/inference/ChunkingStrategy.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
public enum ChunkingStrategy {
1717
WORD("word"),
1818
SENTENCE("sentence"),
19+
RECURSIVE("recursive"),
1920
NONE("none");
2021

2122
private final String chunkingStrategy;

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceNamedWriteablesProvider.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.elasticsearch.xpack.core.inference.results.TextEmbeddingFloatResults;
2828
import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
2929
import org.elasticsearch.xpack.inference.chunking.NoneChunkingSettings;
30+
import org.elasticsearch.xpack.inference.chunking.RecursiveChunkingSettings;
3031
import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunkingSettings;
3132
import org.elasticsearch.xpack.inference.chunking.WordBoundaryChunkingSettings;
3233
import org.elasticsearch.xpack.inference.common.amazon.AwsSecretSettings;
@@ -567,6 +568,9 @@ private static void addChunkingSettingsNamedWriteables(List<NamedWriteableRegist
567568
SentenceBoundaryChunkingSettings::new
568569
)
569570
);
571+
namedWriteables.add(
572+
new NamedWriteableRegistry.Entry(ChunkingSettings.class, RecursiveChunkingSettings.NAME, RecursiveChunkingSettings::new)
573+
);
570574
}
571575

572576
private static void addInferenceResultsNamedWriteables(List<NamedWriteableRegistry.Entry> namedWriteables) {

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/ChunkerBuilder.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public static Chunker fromChunkingStrategy(ChunkingStrategy chunkingStrategy) {
1919
case NONE -> NoopChunker.INSTANCE;
2020
case WORD -> new WordBoundaryChunker();
2121
case SENTENCE -> new SentenceBoundaryChunker();
22+
case RECURSIVE -> new RecursiveChunker();
2223
};
2324
}
2425
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.inference.chunking;
9+
10+
import com.ibm.icu.text.BreakIterator;
11+
12+
public class ChunkerUtils { // helper(s) shared by the chunkers in this package
13+
14+
// Counts the boundaries between the boundary preceding start and end (inclusive) whose rule status is not WORD_NONE. setText() should be applied to wordIterator before using this function.
15+
static int countWords(int start, int end, BreakIterator wordIterator) {
16+
assert start < end; // precondition; only enforced when JVM assertions are enabled
17+
wordIterator.preceding(start); // move the iterator to the boundary before start, i.e. the start of the current word
18+
19+
int boundary = wordIterator.current(); // position is read via current(), not preceding()'s return value. NOTE(review): presumably because preceding() can return DONE at the start of the text — confirm
20+
int wordCount = 0;
21+
while (boundary != BreakIterator.DONE && boundary <= end) { // stop when the iterator is exhausted or the boundary is past end
22+
int wordStatus = wordIterator.getRuleStatus(); // status of the boundary the iterator is currently on
23+
if (wordStatus != BreakIterator.WORD_NONE) { // WORD_NONE boundaries are not counted (per ICU docs: spaces and most punctuation)
24+
wordCount++;
25+
}
26+
boundary = wordIterator.next(); // advance; next() yields DONE once the text is exhausted
27+
}
28+
29+
return wordCount;
30+
}
31+
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/ChunkingSettingsBuilder.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ public static ChunkingSettings fromMap(Map<String, Object> settings, boolean ret
4848
case NONE -> NoneChunkingSettings.INSTANCE;
4949
case WORD -> WordBoundaryChunkingSettings.fromMap(new HashMap<>(settings));
5050
case SENTENCE -> SentenceBoundaryChunkingSettings.fromMap(new HashMap<>(settings));
51+
case RECURSIVE -> RecursiveChunkingSettings.fromMap(new HashMap<>(settings));
5152
};
5253
}
5354
}

0 commit comments

Comments
 (0)