Skip to content

Commit 921d7dc

Browse files
Merge branch 'main' into es-134088-fix
2 parents 963dc60 + 45842f8 commit 921d7dc

File tree

7 files changed

+357
-542
lines changed

7 files changed

+357
-542
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.benchmark.vector;
11+
12+
import org.elasticsearch.common.logging.LogConfigurator;
13+
import org.elasticsearch.index.codec.vectors.cluster.NeighborHood;
14+
import org.openjdk.jmh.annotations.Benchmark;
15+
import org.openjdk.jmh.annotations.BenchmarkMode;
16+
import org.openjdk.jmh.annotations.Fork;
17+
import org.openjdk.jmh.annotations.Measurement;
18+
import org.openjdk.jmh.annotations.Mode;
19+
import org.openjdk.jmh.annotations.OutputTimeUnit;
20+
import org.openjdk.jmh.annotations.Param;
21+
import org.openjdk.jmh.annotations.Scope;
22+
import org.openjdk.jmh.annotations.Setup;
23+
import org.openjdk.jmh.annotations.State;
24+
import org.openjdk.jmh.annotations.Warmup;
25+
import org.openjdk.jmh.infra.Blackhole;
26+
27+
import java.io.IOException;
28+
import java.util.Random;
29+
import java.util.concurrent.TimeUnit;
30+
31+
/**
 * JMH benchmark that runs the two neighbourhood computations exposed by {@link NeighborHood},
 * {@code computeNeighborhoodsBruteForce} and {@code computeNeighborhoodsGraph}, over the same
 * randomly generated vectors so their timings can be compared.
 * <p>
 * Results are reported as average time per invocation, in seconds. The benchmark is parameterised
 * by the number of vectors ({@link #numVectors}) and their dimensionality ({@link #dims}).
 */
@BenchmarkMode(Mode.AverageTime)
32+
@OutputTimeUnit(TimeUnit.SECONDS)
33+
@State(Scope.Benchmark)
34+
// first iteration is complete garbage, so make sure we really warmup
35+
@Warmup(iterations = 1, time = 1)
36+
// real iterations. not useful to spend tons of time here, better to fork more
37+
@Measurement(iterations = 3, time = 1)
38+
// engage some noise reduction
39+
@Fork(value = 1)
40+
public class ComputeNeighboursBenchmark {
41+
42+
static {
43+
LogConfigurator.configureESLogging(); // native access requires logging to be initialized
44+
}
45+
46+
/** Number of input vectors generated in {@link #setup()}; one benchmark run per listed value. */
@Param({ "1000", "2000", "3000", "5000", "10000", "20000", "50000" })
47+
int numVectors;
48+
49+
/**
 * Number of components in each generated vector; one benchmark run per listed value.
 * NOTE(review): 782 is an unusual dimensionality (768 is a common embedding size) - confirm it is intended.
 */
@Param({ "384", "782", "1024" })
50+
int dims;
51+
52+
/** Input data shared by both benchmarks: {@code numVectors} rows of {@code dims} floats, filled by {@link #setup()}. */
float[][] vectors;
53+
/**
 * Passed as the second argument to both neighbourhood computations.
 * Presumably the number of neighbouring clusters kept per cluster - verify against {@link NeighborHood}.
 */
int clusterPerNeighbour = 128;
54+
55+
/**
 * Allocates {@link #vectors} for the current {@link #numVectors} / {@link #dims} combination and fills
 * every component with {@link Random#nextFloat()}, i.e. a value in {@code [0.0, 1.0)}.
 * <p>
 * The generator uses a fixed seed (123), so the same parameter combination always produces the same data.
 * Nothing in this body visibly throws {@link IOException}; the clause is part of the declared signature only.
 */
@Setup
56+
public void setup() throws IOException {
57+
Random random = new Random(123);
58+
vectors = new float[numVectors][dims];
59+
for (float[] vector : vectors) {
60+
for (int i = 0; i < dims; i++) {
61+
vector[i] = random.nextFloat();
62+
}
63+
}
64+
}
65+
66+
/**
 * Measures {@code NeighborHood.computeNeighborhoodsBruteForce} on the generated vectors.
 * The forked JVM is started with the {@code jdk.incubator.vector} module added.
 *
 * @param bh sink for the result, so the JIT cannot eliminate the computation as dead code
 */
@Benchmark
67+
@Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
68+
public void bruteForce(Blackhole bh) {
69+
bh.consume(NeighborHood.computeNeighborhoodsBruteForce(vectors, clusterPerNeighbour));
70+
}
71+
72+
/**
 * Measures {@code NeighborHood.computeNeighborhoodsGraph} on the generated vectors.
 * The forked JVM is started with the {@code jdk.incubator.vector} module added.
 *
 * @param bh sink for the result, so the JIT cannot eliminate the computation as dead code
 * @throws IOException declared by this method; presumably propagated from the graph computation - confirm
 */
@Benchmark
73+
@Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
74+
public void graph(Blackhole bh) throws IOException {
75+
bh.consume(NeighborHood.computeNeighborhoodsGraph(vectors, clusterPerNeighbour));
76+
}
77+
}

docs/reference/elasticsearch/mapping-reference/doc-values.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ PUT my-index-000001
7373
1. The `status_code` field has `doc_values` enabled by default.
7474
2. The `session_id` has `doc_values` disabled, but can still be queried.
7575

76+
## Multi-valued doc values note
77+
78+
Elasticsearch supports storing multi-valued fields at index time. Multi-valued fields can be provided as a JSON array. However, in the doc values format, the values aren't stored in the order in which they were provided at index time. Additionally, duplicates may be lost.
79+
This implementation detail of doc values is visible when features directly interact with doc values, which may be the case, for example, in ES|QL or in aggregations in the search API. Note that `_source` always returns arrays exactly as they were provided at index time.
80+
81+
How the ordering differs depends on whether the array is mapped as a `keyword` or a numeric field type. In the case of the `keyword` field type, the values of a multi-valued field are ordered lexicographically for each document and duplicates are lost. If retaining duplicates is important, then the `counted_keyword` field type should be used.
82+
In the case of numeric field types (e.g. `long`, `double`, `scaled_float`), the values of a multi-valued field are ordered in natural order for each document and duplicates are retained.
7683

7784

7885

muted-tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -480,9 +480,6 @@ tests:
480480
- class: org.elasticsearch.xpack.esql.qa.multi_node.EsqlClientYamlIT
481481
method: test {p0=esql/60_usage/Basic ESQL usage output (telemetry) non-snapshot version}
482482
issue: https://github.com/elastic/elasticsearch/issues/133461
483-
- class: org.elasticsearch.xpack.esql.action.TimeSeriesRateIT
484-
method: testRateWithTimeBucketAndClusterMultipleMetricsByMin
485-
issue: https://github.com/elastic/elasticsearch/issues/133478
486483
- class: org.elasticsearch.xpack.esql.action.LookupJoinTypesIT
487484
method: testLookupJoinOthers
488485
issue: https://github.com/elastic/elasticsearch/issues/133480
@@ -528,6 +525,9 @@ tests:
528525
- class: org.elasticsearch.xpack.esql.qa.single_node.GenerativeForkIT
529526
method: test {csv-spec:inlinestats.MultiIndexInlinestatsOfMultiTypedField}
530527
issue: https://github.com/elastic/elasticsearch/issues/133973
528+
- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
529+
method: test {p0=search/510_range_query_out_of_bounds/Test range query for float field with out of bounds lower limit}
530+
issue: https://github.com/elastic/elasticsearch/issues/134184
531531

532532
# Examples:
533533
#

server/src/main/java/org/elasticsearch/index/codec/vectors/cluster/KMeansLocal.java

Lines changed: 13 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -139,40 +139,40 @@ private static int getBestCentroidFromNeighbours(
139139
NeighborHood neighborhood,
140140
float[] distances
141141
) {
142-
final int limit = neighborhood.neighbors.length - 3;
142+
final int limit = neighborhood.neighbors().length - 3;
143143
int bestCentroidOffset = centroidIdx;
144144
assert centroidIdx >= 0 && centroidIdx < centroids.length;
145145
float minDsq = VectorUtil.squareDistance(vector, centroids[centroidIdx]);
146146
int i = 0;
147147
for (; i < limit; i += 4) {
148-
if (minDsq < neighborhood.maxIntraDistance) {
148+
if (minDsq < neighborhood.maxIntraDistance()) {
149149
// if the distance found is smaller than the maximum intra-cluster distance
150150
// we don't consider it for further re-assignment
151151
return bestCentroidOffset;
152152
}
153153
ESVectorUtil.squareDistanceBulk(
154154
vector,
155-
centroids[neighborhood.neighbors[i]],
156-
centroids[neighborhood.neighbors[i + 1]],
157-
centroids[neighborhood.neighbors[i + 2]],
158-
centroids[neighborhood.neighbors[i + 3]],
155+
centroids[neighborhood.neighbors()[i]],
156+
centroids[neighborhood.neighbors()[i + 1]],
157+
centroids[neighborhood.neighbors()[i + 2]],
158+
centroids[neighborhood.neighbors()[i + 3]],
159159
distances
160160
);
161161
for (int j = 0; j < distances.length; j++) {
162162
float dsq = distances[j];
163163
if (dsq < minDsq) {
164164
minDsq = dsq;
165-
bestCentroidOffset = neighborhood.neighbors[i + j];
165+
bestCentroidOffset = neighborhood.neighbors()[i + j];
166166
}
167167
}
168168
}
169-
for (; i < neighborhood.neighbors.length; i++) {
170-
if (minDsq < neighborhood.maxIntraDistance) {
169+
for (; i < neighborhood.neighbors().length; i++) {
170+
if (minDsq < neighborhood.maxIntraDistance()) {
171171
// if the distance found is smaller than the maximum intra-cluster distance
172172
// we don't consider it for further re-assignment
173173
return bestCentroidOffset;
174174
}
175-
int offset = neighborhood.neighbors[i];
175+
int offset = neighborhood.neighbors()[i];
176176
// float score = neighborhood.scores[i];
177177
assert offset >= 0 && offset < centroids.length : "Invalid neighbor offset: " + offset;
178178
// compute the distance to the centroid
@@ -210,52 +210,6 @@ private static int getBestCentroid(float[][] centroids, float[] vector, float[]
210210
return bestCentroidOffset;
211211
}
212212

213-
private NeighborHood[] computeNeighborhoods(float[][] centers, int clustersPerNeighborhood) {
214-
int k = centers.length;
215-
assert k > clustersPerNeighborhood;
216-
NeighborQueue[] neighborQueues = new NeighborQueue[k];
217-
for (int i = 0; i < k; i++) {
218-
neighborQueues[i] = new NeighborQueue(clustersPerNeighborhood, true);
219-
}
220-
final float[] scores = new float[4];
221-
final int limit = k - 3;
222-
for (int i = 0; i < k - 1; i++) {
223-
float[] center = centers[i];
224-
int j = i + 1;
225-
for (; j < limit; j += 4) {
226-
ESVectorUtil.squareDistanceBulk(center, centers[j], centers[j + 1], centers[j + 2], centers[j + 3], scores);
227-
for (int h = 0; h < 4; h++) {
228-
neighborQueues[j + h].insertWithOverflow(i, scores[h]);
229-
neighborQueues[i].insertWithOverflow(j + h, scores[h]);
230-
}
231-
}
232-
for (; j < k; j++) {
233-
float dsq = VectorUtil.squareDistance(center, centers[j]);
234-
neighborQueues[j].insertWithOverflow(i, dsq);
235-
neighborQueues[i].insertWithOverflow(j, dsq);
236-
}
237-
}
238-
239-
NeighborHood[] neighborhoods = new NeighborHood[k];
240-
for (int i = 0; i < k; i++) {
241-
NeighborQueue queue = neighborQueues[i];
242-
if (queue.size() == 0) {
243-
// no neighbors, skip
244-
neighborhoods[i] = NeighborHood.EMPTY;
245-
continue;
246-
}
247-
// consume the queue into the neighbors array and get the maximum intra-cluster distance
248-
int[] neighbors = new int[queue.size()];
249-
float maxIntraDistance = queue.topScore();
250-
int iter = 0;
251-
while (queue.size() > 0) {
252-
neighbors[neighbors.length - ++iter] = queue.pop();
253-
}
254-
neighborhoods[i] = new NeighborHood(neighbors, maxIntraDistance);
255-
}
256-
return neighborhoods;
257-
}
258-
259213
private void assignSpilled(
260214
FloatVectorValues vectors,
261215
KMeansIntermediate kmeansIntermediate,
@@ -299,8 +253,8 @@ private void assignSpilled(
299253
if (neighborhoods != null) {
300254
assert neighborhoods[currAssignment] != null;
301255
NeighborHood neighborhood = neighborhoods[currAssignment];
302-
centroidCount = neighborhood.neighbors.length;
303-
centroidOrds = c -> neighborhood.neighbors[c];
256+
centroidCount = neighborhood.neighbors().length;
257+
centroidOrds = c -> neighborhood.neighbors()[c];
304258
} else {
305259
centroidCount = centroids.length - 1;
306260
centroidOrds = c -> c < currAssignment ? c : c + 1; // skip the current centroid
@@ -344,10 +298,6 @@ private void assignSpilled(
344298
}
345299
}
346300

347-
record NeighborHood(int[] neighbors, float maxIntraDistance) {
348-
static final NeighborHood EMPTY = new NeighborHood(new int[0], Float.POSITIVE_INFINITY);
349-
}
350-
351301
/**
352302
* cluster using a lloyd k-means algorithm that is not neighbor aware
353303
*
@@ -390,7 +340,7 @@ private void doCluster(FloatVectorValues vectors, KMeansIntermediate kMeansInter
390340
NeighborHood[] neighborhoods = null;
391341
// if there are very few centroids, don't bother with neighborhoods or neighbor aware clustering
392342
if (neighborAware && centroids.length > clustersPerNeighborhood) {
393-
neighborhoods = computeNeighborhoods(centroids, clustersPerNeighborhood);
343+
neighborhoods = NeighborHood.computeNeighborhoods(centroids, clustersPerNeighborhood);
394344
}
395345
cluster(vectors, kMeansIntermediate, neighborhoods);
396346
if (neighborAware && soarLambda >= 0) {

0 commit comments

Comments
 (0)