Commit 2ceaa36

Merge branch 'bbq_hnsw-disk-rescoring' into bbq_hnsw-bfloat16
2 parents: feb7aee + 878fa71

573 files changed: +12621 additions, -5837 deletions


.buildkite/scripts/generate-pr-performance-benchmark.sh

Lines changed: 2 additions & 0 deletions
@@ -46,6 +46,7 @@ steps:
         CONFIGURATION_NAME: ${GITHUB_PR_COMMENT_VAR_BENCHMARK}
         ENV_ID: ${env_id_baseline}
         REVISION: ${merge_base}
+        BENCHMARK_TYPE: baseline
   - label: Trigger contender benchmark with ${GITHUB_PR_TRIGGERED_SHA:0:7}
     trigger: elasticsearch-performance-esbench-pr
     build:
@@ -56,6 +57,7 @@ steps:
         ENV_ID: ${env_id_contender}
         ES_REPO_URL: https://github.com/${GITHUB_PR_OWNER}/${GITHUB_PR_REPO}.git
         REVISION: ${GITHUB_PR_TRIGGERED_SHA}
+        BENCHMARK_TYPE: contender
   - wait: ~
   - label: Update PR comment and Buildkite annotation
     command: |

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ build/
 **/.local*
 .vagrant/
 /logs/
+**/target/
 
 # osx stuff
 .DS_Store

benchmarks/src/main/java/org/elasticsearch/benchmark/bytes/RecyclerBytesStreamOutputBenchmark.java

Lines changed: 5 additions & 47 deletions
@@ -10,6 +10,7 @@
 package org.elasticsearch.benchmark.bytes;
 
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.benchmark.common.util.UTF8StringBytesBenchmark;
 import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
 import org.elasticsearch.common.recycler.Recycler;
 import org.openjdk.jmh.annotations.Benchmark;
@@ -65,10 +66,10 @@ public void initResults() throws IOException {
         // We use weights to generate certain sized UTF-8 characters and vInts. However, there is still some non-determinism which could
         // impact direct comparisons run-to-run
 
-        shortString = generateAsciiString(20);
-        longString = generateAsciiString(100);
-        nonAsciiString = generateUtf8String(200);
-        veryLongString = generateAsciiString(800);
+        shortString = UTF8StringBytesBenchmark.generateAsciiString(20);
+        longString = UTF8StringBytesBenchmark.generateAsciiString(100);
+        nonAsciiString = UTF8StringBytesBenchmark.generateUTF8String(200);
+        veryLongString = UTF8StringBytesBenchmark.generateAsciiString(800);
         // vint values for benchmarking
         vints = new int[1000];
         for (int i = 0; i < vints.length; i++) {
@@ -143,49 +144,6 @@ public void writeVInt() throws IOException {
         }
     }
 
-    public static String generateAsciiString(int n) {
-        ThreadLocalRandom random = ThreadLocalRandom.current();
-        StringBuilder sb = new StringBuilder(n);
-
-        for (int i = 0; i < n; i++) {
-            int ascii = random.nextInt(128);
-            sb.append((char) ascii);
-        }
-
-        return sb.toString();
-    }
-
-    public static String generateUtf8String(int n) {
-        ThreadLocalRandom random = ThreadLocalRandom.current();
-        StringBuilder sb = new StringBuilder(n);
-
-        for (int i = 0; i < n; i++) {
-            int codePoint;
-            int probability = random.nextInt(100);
-
-            if (probability < 85) {
-                // 1-byte UTF-8 (ASCII range)
-                // 0x0000 to 0x007F
-                codePoint = random.nextInt(0x0080);
-            } else if (probability < 95) {
-                // 2-byte UTF-8
-                // 0x0080 to 0x07FF
-                codePoint = random.nextInt(0x0080, 0x0800);
-            } else {
-                // 3-byte UTF-8
-                // 0x0800 to 0xFFFF
-                do {
-                    codePoint = random.nextInt(0x0800, 0x10000);
-                    // Skip surrogate pairs (0xD800-0xDFFF)
-                } while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
-            }
-
-            sb.appendCodePoint(codePoint);
-        }
-
-        return sb.toString();
-    }
-
     private record BenchmarkRecycler(AtomicReference<BytesRef> bytesRef) implements Recycler<BytesRef> {
 
         @Override

benchmarks/src/main/java/org/elasticsearch/benchmark/common/util/UTF8StringBytesBenchmark.java

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.benchmark.common.util;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.common.UUIDs;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 3)
+@Measurement(iterations = 3)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(value = 1)
+public class UTF8StringBytesBenchmark {
+
+    @State(Scope.Thread)
+    public static class StringState {
+        @Param({ "uuid", "short", "long", "nonAscii", "veryLong" })
+        String stringType;
+
+        String string;
+        BytesRef bytes;
+
+        @Setup
+        public void setup() {
+            string = switch (stringType) {
+                case "uuid" -> UUIDs.base64UUID();
+                case "short" -> generateAsciiString(20);
+                case "long" -> generateAsciiString(100);
+                case "nonAscii" -> generateUTF8String(200);
+                case "veryLong" -> generateAsciiString(1000);
+                default -> throw new IllegalArgumentException("Unknown stringType: " + stringType);
+            };
+            bytes = getBytes(string);
+        }
+    }
+
+    @Benchmark
+    public BytesRef getBytesJDK(StringState state) {
+        byte[] bytes = state.string.getBytes(StandardCharsets.UTF_8);
+        return new BytesRef(bytes, 0, bytes.length);
+    }
+
+    @Benchmark
+    public BytesRef getBytesUnicodeUtils(StringState state) {
+        String string = state.string;
+        int length = string.length();
+        int size = UnicodeUtil.calcUTF16toUTF8Length(string, 0, length);
+        byte[] out = new byte[size];
+        UnicodeUtil.UTF16toUTF8(string, 0, length, out, 0);
+        return new BytesRef(out, 0, out.length);
+    }
+
+    @Benchmark
+    public BytesRef getBytesByteBufferEncoder(StringState state) {
+        var byteBuff = StandardCharsets.UTF_8.encode(state.string);
+        assert byteBuff.hasArray();
+        return new BytesRef(byteBuff.array(), byteBuff.arrayOffset() + byteBuff.position(), byteBuff.remaining());
+    }
+
+    @Benchmark
+    public String getStringJDK(StringState state) {
+        BytesRef bytes = state.bytes;
+        return new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8);
+    }
+
+    @Benchmark
+    public String getStringByteBufferDecoder(StringState state) {
+        BytesRef bytes = state.bytes;
+        var byteBuff = ByteBuffer.wrap(bytes.bytes, bytes.offset, bytes.length);
+        return StandardCharsets.UTF_8.decode(byteBuff).toString();
+    }
+
+    private static BytesRef getBytes(String string) {
+        int before = ThreadLocalRandom.current().nextInt(0, 50);
+        int after = ThreadLocalRandom.current().nextInt(0, 50);
+        byte[] stringBytes = string.getBytes(StandardCharsets.UTF_8);
+        byte[] finalBytes = new byte[before + after + stringBytes.length];
+        System.arraycopy(stringBytes, 0, finalBytes, before, stringBytes.length);
+        return new BytesRef(finalBytes, before, stringBytes.length);
+    }
+
+    public static String generateAsciiString(int n) {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        StringBuilder sb = new StringBuilder(n);
+
+        for (int i = 0; i < n; i++) {
+            int ascii = random.nextInt(128);
+            sb.append((char) ascii);
+        }
+
+        return sb.toString();
+    }
+
+    public static String generateUTF8String(int n) {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        StringBuilder sb = new StringBuilder(n);
+
+        for (int i = 0; i < n; i++) {
+            int codePoint;
+            int probability = random.nextInt(100);
+
+            if (probability < 85) {
+                // 1-byte UTF-8 (ASCII range)
+                // 0x0000 to 0x007F
+                codePoint = random.nextInt(0x0080);
+            } else if (probability < 95) {
+                // 2-byte UTF-8
+                // 0x0080 to 0x07FF
+                codePoint = random.nextInt(0x0080, 0x0800);
+            } else {
+                // 3-byte UTF-8
+                // 0x0800 to 0xFFFF
+                do {
+                    codePoint = random.nextInt(0x0800, 0x10000);
+                    // Skip surrogate pairs (0xD800-0xDFFF)
+                } while (codePoint >= 0xD800 && codePoint <= 0xDFFF);
+            }
+
+            sb.appendCodePoint(codePoint);
+        }
+
+        return sb.toString();
+    }
+}

build-tools/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java

Lines changed: 1 addition & 1 deletion
@@ -1407,7 +1407,7 @@ private void createConfiguration() {
         // Limit the number of allocated processors for all nodes in the cluster by default.
         // This is to ensure that the tests run consistently across different environments.
         String processorCount = shouldConfigureTestClustersWithOneProcessor() ? "1" : "2";
-        if (getVersion().onOrAfter("7.6.0")) {
+        if (getVersion().onOrAfter("7.4.0")) {
             baseConfig.put("node.processors", processorCount);
         } else {
            baseConfig.put("processors", processorCount);

docs/changelog/135886.yaml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+pr: 135886
+summary: Provide defaults for index sort settings
+area: Mapping
+type: bug
+issues:
+ - 129062

docs/changelog/136066.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 136066
+summary: Simulate shards moved by explicit commands
+area: Allocation
+type: enhancement
+issues: []

docs/changelog/136119.yaml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+pr: 136119
+summary: Fix logsdb settings provider mapping filters
+area: Logs
+type: bug
+issues:
+ - 136107

docs/reference/elasticsearch/mapping-reference/semantic-text.md

Lines changed: 25 additions & 0 deletions
@@ -611,6 +611,31 @@ PUT test-index
 }
 ```
 
+## Querying `semantic_text` fields [querying-semantic-text-fields]
+
+You can query `semantic_text` fields using the following query types:
+
+- Match query: The recommended method for querying `semantic_text` fields. You can use [Query DSL](/reference/query-languages/query-dsl/query-dsl-match-query.md) or [ES|QL](/reference/query-languages/esql/functions-operators/search-functions.md#esql-match) syntax.
+  <!--
+  Refer to examples of match queries on `semantic_text` fields.
+  -->
+
+- [kNN query](/reference/query-languages/query-dsl/query-dsl-knn-query.md): Finds the nearest vectors to a query vector using a similarity metric, mainly for advanced or combined search use cases.
+  <!--
+  Refer to examples of kNN queries on `semantic_text` fields.
+  -->
+
+- [Sparse vector query](/reference/query-languages/query-dsl/query-dsl-sparse-vector-query.md): Executes searches using sparse vectors generated by a sparse retrieval model such as [ELSER](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-elser.md).
+  <!--
+  Refer to examples of sparse vector queries on `semantic_text` fields.
+  -->
+
+- [Semantic query](/reference/query-languages/query-dsl/query-dsl-semantic-query.md): We don't recommend this legacy query type for _new_ projects, because the alternatives in this list enable more flexibility and customization. The `semantic` query remains available to support existing implementations.
+  <!--
+  Refer to examples of semantic queries on `semantic_text` fields.
+  -->
+
+
 ## Troubleshooting semantic_text fields [troubleshooting-semantic-text-fields]
 
 If you want to verify that your embeddings look correct, you can view the
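
Note: the new querying section above recommends the match query as the primary way to search `semantic_text` fields. As a minimal sketch of that usage (the index name `my-index` and field name `my_semantic_field` are hypothetical, not part of this commit), a Query DSL match query against a `semantic_text` field can look like:

```
GET my-index/_search
{
  "query": {
    "match": {
      "my_semantic_field": "Which query types work with semantic_text fields?"
    }
  }
}
```

The field's associated inference endpoint generates the query embedding at search time, so the request only needs the plain query text.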

docs/reference/elasticsearch/mapping-reference/sparse-vector.md

Lines changed: 2 additions & 2 deletions
@@ -95,13 +95,13 @@ This ensures that:
 * The tokens that are kept are frequent enough and have significant scoring.
 * Very infrequent tokens that may not have as high of a score are removed.
 
-## Accessing `dense_vector` fields in search responses
+## Accessing `sparse_vector` fields in search responses
 ```{applies_to}
 stack: ga 9.2
 serverless: ga
 ```
 
-By default, `dense_vector` fields are **not included in `_source`** in responses from the `_search`, `_msearch`, `_get`, and `_mget` APIs.
+By default, `sparse_vector` fields are **not included in `_source`** in responses from the `_search`, `_msearch`, `_get`, and `_mget` APIs.
 This helps reduce response size and improve performance, especially in scenarios where vectors are used solely for similarity scoring and not required in the output.
 
 To retrieve vector values explicitly, you can use:
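
Note: the hunk ends before the list of retrieval options. As an illustration only (index and field names are hypothetical, and the exact options listed in the doc are not shown in this diff), the `fields` option can request a `sparse_vector` field that is excluded from `_source` by default:

```
GET my-index/_search
{
  "query": { "match_all": {} },
  "fields": ["my_sparse_vector_field"]
}
```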
