Skip to content

Commit baaa658

Browse files
Merge branch 'main' into likeListPushdown_v3
2 parents 38b8e54 + f48a3c3 commit baaa658

File tree

78 files changed

+3528
-1553
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+3528
-1553
lines changed

benchmarks/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ dependencies {
4141
}
4242
api(project(':libs:h3'))
4343
api(project(':modules:aggregations'))
44+
implementation project(':modules:mapper-extras');
4445
api(project(':x-pack:plugin:esql-core'))
4546
api(project(':x-pack:plugin:core'))
4647
api(project(':x-pack:plugin:esql'))

benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/MapperServiceFactory.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.index.mapper.ProvidedIdFieldMapper;
3030
import org.elasticsearch.index.similarity.SimilarityService;
3131
import org.elasticsearch.indices.IndicesModule;
32+
import org.elasticsearch.plugins.MapperPlugin;
3233
import org.elasticsearch.script.Script;
3334
import org.elasticsearch.script.ScriptCompiler;
3435
import org.elasticsearch.script.ScriptContext;
@@ -38,11 +39,16 @@
3839
import java.io.IOException;
3940
import java.io.UncheckedIOException;
4041
import java.util.Collections;
42+
import java.util.List;
4143
import java.util.Map;
4244

4345
public class MapperServiceFactory {
4446

4547
public static MapperService create(String mappings) {
48+
return create(mappings, Collections.emptyList());
49+
}
50+
51+
public static MapperService create(String mappings, List<MapperPlugin> mapperPlugins) {
4652
Settings settings = Settings.builder()
4753
.put("index.number_of_replicas", 0)
4854
.put("index.number_of_shards", 1)
@@ -51,7 +57,7 @@ public static MapperService create(String mappings) {
5157
.build();
5258
IndexMetadata meta = IndexMetadata.builder("index").settings(settings).build();
5359
IndexSettings indexSettings = new IndexSettings(meta, settings);
54-
MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();
60+
MapperRegistry mapperRegistry = new IndicesModule(mapperPlugins).getMapperRegistry();
5561

5662
SimilarityService similarityService = new SimilarityService(indexSettings, null, Map.of());
5763
BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(indexSettings, BitsetFilterCache.Listener.NOOP);

benchmarks/src/main/java/org/elasticsearch/benchmark/vector/OSQScorerBenchmark.java

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,10 @@ public void scoreFromArray(Blackhole bh) throws IOException {
126126
in.readFloats(corrections, 0, corrections.length);
127127
int addition = Short.toUnsignedInt(in.readShort());
128128
float score = scorer.score(
129-
result,
129+
result.lowerInterval(),
130+
result.upperInterval(),
131+
result.quantizedComponentSum(),
132+
result.additionalCorrection(),
130133
VectorSimilarityFunction.EUCLIDEAN,
131134
centroidDp,
132135
corrections[0],
@@ -150,7 +153,10 @@ public void scoreFromMemorySegmentOnlyVector(Blackhole bh) throws IOException {
150153
in.readFloats(corrections, 0, corrections.length);
151154
int addition = Short.toUnsignedInt(in.readShort());
152155
float score = scorer.score(
153-
result,
156+
result.lowerInterval(),
157+
result.upperInterval(),
158+
result.quantizedComponentSum(),
159+
result.additionalCorrection(),
154160
VectorSimilarityFunction.EUCLIDEAN,
155161
centroidDp,
156162
corrections[0],
@@ -175,7 +181,10 @@ public void scoreFromMemorySegmentOnlyVectorBulk(Blackhole bh) throws IOExceptio
175181
in.readFloats(corrections, 0, corrections.length);
176182
int addition = Short.toUnsignedInt(in.readShort());
177183
float score = scorer.score(
178-
result,
184+
result.lowerInterval(),
185+
result.upperInterval(),
186+
result.quantizedComponentSum(),
187+
result.additionalCorrection(),
179188
VectorSimilarityFunction.EUCLIDEAN,
180189
centroidDp,
181190
corrections[0],
@@ -196,7 +205,16 @@ public void scoreFromMemorySegmentAllBulk(Blackhole bh) throws IOException {
196205
for (int j = 0; j < numQueries; j++) {
197206
in.seek(0);
198207
for (int i = 0; i < numVectors; i += 16) {
199-
scorer.scoreBulk(binaryQueries[j], result, VectorSimilarityFunction.EUCLIDEAN, centroidDp, scratchScores);
208+
scorer.scoreBulk(
209+
binaryQueries[j],
210+
result.lowerInterval(),
211+
result.upperInterval(),
212+
result.quantizedComponentSum(),
213+
result.additionalCorrection(),
214+
VectorSimilarityFunction.EUCLIDEAN,
215+
centroidDp,
216+
scratchScores
217+
);
200218
bh.consume(scratchScores);
201219
}
202220
}

benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.common.logging.LogConfigurator;
1616
import org.elasticsearch.index.mapper.MapperService;
1717
import org.elasticsearch.index.mapper.SourceToParse;
18+
import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin;
1819
import org.elasticsearch.xcontent.XContentBuilder;
1920
import org.elasticsearch.xcontent.XContentFactory;
2021
import org.elasticsearch.xcontent.XContentType;
@@ -34,6 +35,7 @@
3435
import org.openjdk.jmh.infra.Blackhole;
3536

3637
import java.io.IOException;
38+
import java.util.List;
3739
import java.util.Random;
3840
import java.util.concurrent.TimeUnit;
3941

@@ -66,7 +68,7 @@ public class OptimizedTextBenchmark {
6668
private SourceToParse[] sources;
6769

6870
private String randomValue(int length) {
69-
final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
71+
final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
7072
Random random = new Random();
7173
StringBuilder builder = new StringBuilder(length);
7274
for (int i = 0; i < length; i++) {
@@ -83,17 +85,17 @@ public void setup() throws IOException {
8385
"dynamic": false,
8486
"properties": {
8587
"field": {
86-
"type": "keyword"
88+
"type": "match_only_text"
8789
}
8890
}
8991
}
9092
}
91-
""");
93+
""", List.of(new MapperExtrasPlugin()));
9294

9395
sources = new SourceToParse[nDocs];
9496
for (int i = 0; i < nDocs; i++) {
9597
XContentBuilder b = XContentFactory.jsonBuilder();
96-
b.startObject().field("field", randomValue(8)).endObject();
98+
b.startObject().field("field", randomValue(512)).endObject();
9799
sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
98100
}
99101
}

docs/changelog/119967.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 119967
2+
summary: Add `index_options` to `semantic_text` field mappings
3+
area: Mapping
4+
type: enhancement
5+
issues: [ ]

docs/changelog/127636.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pr: 127636
2+
summary: Disallow mixed quoted/unquoted patterns in FROM
3+
area: ES|QL
4+
type: breaking
5+
issues:
6+
- 122651
7+
breaking:
8+
title: Disallow mixed quoted/unquoted patterns in FROM
9+
area: ES|QL
10+
details: "Previously, the ES|QL grammar allowed users to individually quote constituent strings in index patterns\
11+
\ such as \"remote_cluster\":\"index_name\". This would allow users to write complex malformed index patterns\
12+
\ that often slip through the grammar and the subsequent validation. This could result in runtime errors\
13+
\ that can be misleading. This change simplifies the grammar to reject such malformed index patterns early\
14+
\ at the parsing stage, allowing users to write simpler queries and see more relevant and meaningful\
15+
\ errors."
16+
impact: "Users can write queries with simpler index patterns and see more meaningful and relevant errors."
17+
notable: false

docs/changelog/129507.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 129507
2+
summary: Using a temp `IndexService` for template validation
3+
area: Indices APIs
4+
type: bug
5+
issues:
6+
- 129473

docs/changelog/129509.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 129509
2+
summary: Fix NPE in `SemanticTextHighlighter`
3+
area: Search
4+
type: bug
5+
issues:
6+
- 129501

docs/changelog/129548.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 129548
2+
summary: Fix NPE in `flat_bbq` scorer when all vectors are missing
3+
area: Vector Search
4+
type: bug
5+
issues: []

docs/reference/elasticsearch/mapping-reference/semantic-text.md

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ service.
2828

2929
Using `semantic_text`, you won’t need to specify how to generate embeddings for
3030
your data, or how to index it. The {{infer}} endpoint automatically determines
31-
the embedding generation, indexing, and query to use.
31+
the embedding generation, indexing, and query to use.
3232
Newly created indices with `semantic_text` fields using dense embeddings will be
3333
[quantized](/reference/elasticsearch/mapping-reference/dense-vector.md#dense-vector-quantization)
3434
to `bbq_hnsw` automatically.
@@ -111,6 +111,33 @@ the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/ope
111111
to create the endpoint. If not specified, the {{infer}} endpoint defined by
112112
`inference_id` will be used at both index and query time.
113113

114+
`index_options`
115+
: (Optional, object) Specifies the index options to override default values
116+
for the field. Currently, `dense_vector` index options are supported.
117+
For text embeddings, `index_options` may match any allowed
118+
[dense_vector index options](/reference/elasticsearch/mapping-reference/dense-vector.md#dense-vector-index-options).
119+
120+
An example of how to set `index_options` for a `semantic_text` field:
121+
122+
```console
123+
PUT my-index-000004
124+
{
125+
"mappings": {
126+
"properties": {
127+
"inference_field": {
128+
"type": "semantic_text",
129+
"inference_id": "my-text-embedding-endpoint",
130+
"index_options": {
131+
"dense_vector": {
132+
"type": "int4_flat"
133+
}
134+
}
135+
}
136+
}
137+
}
138+
}
139+
```
140+
114141
`chunking_settings`
115142
: (Optional, object) Settings for chunking text into smaller passages.
116143
If specified, these will override the chunking settings set in the {{infer-cap}}
@@ -138,8 +165,10 @@ To completely disable chunking, use the `none` chunking strategy.
138165
or `1`. Required for `sentence` type chunking settings
139166

140167
::::{warning}
141-
If the input exceeds the maximum token limit of the underlying model, some services (such as OpenAI) may return an
142-
error. In contrast, the `elastic` and `elasticsearch` services will automatically truncate the input to fit within the
168+
If the input exceeds the maximum token limit of the underlying model, some
169+
services (such as OpenAI) may return an
170+
error. In contrast, the `elastic` and `elasticsearch` services will
171+
automatically truncate the input to fit within the
143172
model's limit.
144173
::::
145174

@@ -173,7 +202,8 @@ For more details on chunking and how to configure chunking settings,
173202
see [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference)
174203
in the Inference API documentation.
175204

176-
You can pre-chunk the input by sending it to Elasticsearch as an array of strings.
205+
You can pre-chunk the input by sending it to Elasticsearch as an array of
206+
strings.
177207
Example:
178208

179209
```console
@@ -203,15 +233,20 @@ PUT test-index/_doc/1
203233
```
204234

205235
1. The text is pre-chunked and provided as an array of strings.
206-
Each element in the array represents a single chunk that will be sent directly to the inference service without further chunking.
236+
Each element in the array represents a single chunk that will be sent
237+
directly to the inference service without further chunking.
207238

208239
**Important considerations**:
209240

210-
* When providing pre-chunked input, ensure that you set the chunking strategy to `none` to avoid additional processing.
211-
* Each chunk should be sized carefully, staying within the token limit of the inference service and the underlying model.
212-
* If a chunk exceeds the model's token limit, the behavior depends on the service:
213-
* Some services (such as OpenAI) will return an error.
214-
* Others (such as `elastic` and `elasticsearch`) will automatically truncate the input.
241+
* When providing pre-chunked input, ensure that you set the chunking strategy to
242+
`none` to avoid additional processing.
243+
* Each chunk should be sized carefully, staying within the token limit of the
244+
inference service and the underlying model.
245+
* If a chunk exceeds the model's token limit, the behavior depends on the
246+
service:
247+
* Some services (such as OpenAI) will return an error.
248+
* Others (such as `elastic` and `elasticsearch`) will automatically truncate
249+
the input.
215250

216251
Refer
217252
to [this tutorial](docs-content://solutions/search/semantic-search/semantic-search-semantic-text.md)

0 commit comments

Comments
 (0)