Skip to content

Commit 308ad0c

Browse files
benwtrent and Rassyan
authored
[8.x] Add docvalue_fields Support for dense_vector Fields (#114484) (#116491)
* Add `docvalue_fields` Support for `dense_vector` Fields (#114484) Currently, `dense_vector` fields don't support `docvalue_fields`. This adds that support for debugging purposes. Users can inspect the raw values of their vectors even if the source is disabled. Co-authored-by: Mayya Sharipova <[email protected]> (cherry picked from commit c8a8d4d) * fixing for backport --------- Co-authored-by: Rassyan <[email protected]>
1 parent 8adb2c4 commit 308ad0c

File tree

8 files changed

+323
-6
lines changed

8 files changed

+323
-6
lines changed

docs/changelog/114484.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 114484
2+
summary: Add `docvalue_fields` Support for `dense_vector` Fields
3+
area: Search
4+
type: enhancement
5+
issues:
6+
- 108470
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
setup:
2+
- requires:
3+
capabilities:
4+
- method: POST
5+
path: /_search
6+
capabilities: [ dense_vector_docvalue_fields ]
7+
test_runner_features: [ capabilities, close_to ]
8+
reason: Capability required to run test
9+
- do:
10+
indices.create:
11+
index: test
12+
body:
13+
mappings:
14+
properties:
15+
name:
16+
type: keyword
17+
vector1:
18+
type: dense_vector
19+
element_type: float
20+
dims: 5
21+
index: true
22+
vector2:
23+
type: dense_vector
24+
element_type: float
25+
dims: 5
26+
index: false
27+
vector3:
28+
type: dense_vector
29+
element_type: byte
30+
dims: 5
31+
index: true
32+
vector4:
33+
type: dense_vector
34+
element_type: byte
35+
dims: 5
36+
index: false
37+
vector5:
38+
type: dense_vector
39+
element_type: bit
40+
dims: 40
41+
index: true
42+
vector6:
43+
type: dense_vector
44+
element_type: bit
45+
dims: 40
46+
index: false
47+
- do:
48+
index:
49+
index: test
50+
id: "1"
51+
body:
52+
name: cow.jpg
53+
vector1: [230.0, 300.33, -34.8988, 15.555, -200.0]
54+
vector2: [130.0, 115.0, -1.02, 15.555, -100.0]
55+
vector3: [-1, 100, -13, 15, -128]
56+
vector4: [-1, 50, -1, 1, 120]
57+
vector5: [1, 111, -13, 15, -128]
58+
vector6: [-1, 11, 0, 12, 111]
59+
- do:
60+
index:
61+
index: test
62+
id: "2"
63+
body:
64+
name: moose.jpg
65+
vector1: [-0.5, 100.0, -13, 14.8, -156.0]
66+
vector4: [-1, 50, -1, 1, 120]
67+
vector5: [1, 111, -13, 15, -128]
68+
vector6: null
69+
- do:
70+
index:
71+
index: test
72+
id: "3"
73+
body:
74+
name: rabbit.jpg
75+
vector2: [130.0, 115.0, -1.02, 15.555, -100.0]
76+
vector3: [-1, 100, -13, 15, -128]
77+
78+
- do:
79+
indices.refresh: {}
80+
81+
---
82+
"Enable docvalue_fields parameter for dense_vector fields":
83+
- requires:
84+
capabilities:
85+
- method: POST
86+
path: /_search
87+
capabilities: [ dense_vector_docvalue_fields ]
88+
test_runner_features: capabilities
89+
reason: "Support for dense vector doc value fields capability required"
90+
- do:
91+
search:
92+
_source: false
93+
index: test
94+
body:
95+
docvalue_fields: [name, vector1, vector2, vector3, vector4, vector5, vector6]
96+
sort: name
97+
98+
99+
- match: {hits.hits.0._id: "1"}
100+
- match: {hits.hits.0.fields.name.0: "cow.jpg"}
101+
102+
- length: {hits.hits.0.fields.vector1.0: 5}
103+
- length: {hits.hits.0.fields.vector2.0: 5}
104+
- length: {hits.hits.0.fields.vector3.0: 5}
105+
- length: {hits.hits.0.fields.vector4.0: 5}
106+
- length: {hits.hits.0.fields.vector5.0: 5}
107+
- length: {hits.hits.0.fields.vector6.0: 5}
108+
109+
- close_to: { hits.hits.0.fields.vector1.0.0: { value: 230.0, error: 0.001 } }
110+
- close_to: { hits.hits.0.fields.vector1.0.1: { value: 300.33, error: 0.001 } }
111+
- close_to: { hits.hits.0.fields.vector1.0.2: { value: -34.8988, error: 0.001 } }
112+
- close_to: { hits.hits.0.fields.vector1.0.3: { value: 15.555, error: 0.001 } }
113+
- close_to: { hits.hits.0.fields.vector1.0.4: { value: -200.0, error: 0.001 } }
114+
115+
- close_to: { hits.hits.0.fields.vector2.0.0: { value: 130.0, error: 0.001 } }
116+
- close_to: { hits.hits.0.fields.vector2.0.1: { value: 115.0, error: 0.001 } }
117+
- close_to: { hits.hits.0.fields.vector2.0.2: { value: -1.02, error: 0.001 } }
118+
- close_to: { hits.hits.0.fields.vector2.0.3: { value: 15.555, error: 0.001 } }
119+
- close_to: { hits.hits.0.fields.vector2.0.4: { value: -100.0, error: 0.001 } }
120+
121+
- match: {hits.hits.0.fields.vector3.0: [-1, 100, -13, 15, -128]}
122+
- match: {hits.hits.0.fields.vector4.0: [-1, 50, -1, 1, 120]}
123+
- match: {hits.hits.0.fields.vector5.0: [1, 111, -13, 15, -128]}
124+
- match: {hits.hits.0.fields.vector6.0: [-1, 11, 0, 12, 111]}
125+
126+
127+
- match: {hits.hits.1._id: "2"}
128+
- match: {hits.hits.1.fields.name.0: "moose.jpg"}
129+
130+
- length: {hits.hits.1.fields.vector1.0: 5}
131+
- length: {hits.hits.1.fields.vector4.0: 5}
132+
- length: {hits.hits.1.fields.vector5.0: 5}
133+
- match: {hits.hits.1.fields.vector2: null}
134+
- match: {hits.hits.1.fields.vector3: null}
135+
- match: {hits.hits.1.fields.vector6: null}
136+
137+
- close_to: { hits.hits.1.fields.vector1.0.0: { value: -0.5, error: 0.001 } }
138+
- close_to: { hits.hits.1.fields.vector1.0.1: { value: 100.0, error: 0.001 } }
139+
- close_to: { hits.hits.1.fields.vector1.0.2: { value: -13, error: 0.001 } }
140+
- close_to: { hits.hits.1.fields.vector1.0.3: { value: 14.8, error: 0.001 } }
141+
- close_to: { hits.hits.1.fields.vector1.0.4: { value: -156.0, error: 0.001 } }
142+
143+
- match: {hits.hits.1.fields.vector4.0: [-1, 50, -1, 1, 120]}
144+
- match: {hits.hits.1.fields.vector5.0: [1, 111, -13, 15, -128]}
145+
146+
147+
- match: {hits.hits.2._id: "3"}
148+
- match: {hits.hits.2.fields.name.0: "rabbit.jpg"}
149+
150+
- length: {hits.hits.2.fields.vector2.0: 5}
151+
- length: {hits.hits.2.fields.vector3.0: 5}
152+
- match: {hits.hits.2.fields.vector1: null}
153+
- match: {hits.hits.2.fields.vector4: null}
154+
- match: {hits.hits.2.fields.vector5: null}
155+
- match: {hits.hits.2.fields.vector6: null}
156+
157+
- close_to: { hits.hits.2.fields.vector2.0.0: { value: 130.0, error: 0.001 } }
158+
- close_to: { hits.hits.2.fields.vector2.0.1: { value: 115.0, error: 0.001 } }
159+
- close_to: { hits.hits.2.fields.vector2.0.2: { value: -1.02, error: 0.001 } }
160+
- close_to: { hits.hits.2.fields.vector2.0.3: { value: 15.555, error: 0.001 } }
161+
- close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } }
162+
163+
- match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]}

server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1903,9 +1903,7 @@ protected Object parseSourceValue(Object value) {
19031903

19041904
@Override
19051905
public DocValueFormat docValueFormat(String format, ZoneId timeZone) {
1906-
throw new IllegalArgumentException(
1907-
"Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations"
1908-
);
1906+
return DocValueFormat.DENSE_VECTOR;
19091907
}
19101908

19111909
@Override

server/src/main/java/org/elasticsearch/index/mapper/vectors/VectorDVLeafFieldData.java

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,14 @@
1010
package org.elasticsearch.index.mapper.vectors;
1111

1212
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.index.ByteVectorValues;
1314
import org.apache.lucene.index.DocValues;
15+
import org.apache.lucene.index.FloatVectorValues;
1416
import org.apache.lucene.index.LeafReader;
17+
import org.apache.lucene.search.DocIdSetIterator;
18+
import org.apache.lucene.util.BytesRef;
1519
import org.elasticsearch.index.IndexVersion;
20+
import org.elasticsearch.index.fielddata.FormattedDocValues;
1621
import org.elasticsearch.index.fielddata.LeafFieldData;
1722
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
1823
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.ElementType;
@@ -23,8 +28,12 @@
2328
import org.elasticsearch.script.field.vectors.ByteBinaryDenseVectorDocValuesField;
2429
import org.elasticsearch.script.field.vectors.ByteKnnDenseVectorDocValuesField;
2530
import org.elasticsearch.script.field.vectors.KnnDenseVectorDocValuesField;
31+
import org.elasticsearch.search.DocValueFormat;
2632

2733
import java.io.IOException;
34+
import java.util.Arrays;
35+
36+
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
2837

2938
final class VectorDVLeafFieldData implements LeafFieldData {
3039

@@ -76,4 +85,115 @@ public DocValuesScriptFieldFactory getScriptFieldFactory(String name) {
7685
}
7786
}
7887

88+
@Override
89+
public FormattedDocValues getFormattedValues(DocValueFormat format) {
90+
int dims = elementType == ElementType.BIT ? this.dims / Byte.SIZE : this.dims;
91+
return switch (elementType) {
92+
case BYTE, BIT -> new FormattedDocValues() {
93+
private byte[] vector = new byte[dims];
94+
private ByteVectorValues byteVectorValues; // use when indexed
95+
private BinaryDocValues binary; // use when not indexed
96+
{
97+
try {
98+
if (indexed) {
99+
byteVectorValues = reader.getByteVectorValues(field);
100+
} else {
101+
binary = DocValues.getBinary(reader, field);
102+
}
103+
} catch (IOException e) {
104+
throw new IllegalStateException("Cannot load doc values", e);
105+
}
106+
107+
}
108+
109+
@Override
110+
public boolean advanceExact(int docId) throws IOException {
111+
if (indexed) {
112+
if (iteratorAdvanceExact(byteVectorValues, docId) == false) {
113+
return false;
114+
}
115+
vector = byteVectorValues.vectorValue();
116+
} else {
117+
if (binary == null || binary.advanceExact(docId) == false) {
118+
return false;
119+
}
120+
BytesRef ref = binary.binaryValue();
121+
System.arraycopy(ref.bytes, ref.offset, vector, 0, dims);
122+
}
123+
return true;
124+
}
125+
126+
@Override
127+
public int docValueCount() {
128+
return 1;
129+
}
130+
131+
public Object nextValue() {
132+
Byte[] vectorValue = new Byte[dims];
133+
for (int i = 0; i < dims; i++) {
134+
vectorValue[i] = vector[i];
135+
}
136+
return vectorValue;
137+
}
138+
};
139+
case FLOAT -> new FormattedDocValues() {
140+
float[] vector = new float[dims];
141+
private FloatVectorValues floatVectorValues; // use when indexed
142+
private BinaryDocValues binary; // use when not indexed
143+
{
144+
try {
145+
if (indexed) {
146+
floatVectorValues = reader.getFloatVectorValues(field);
147+
} else {
148+
binary = DocValues.getBinary(reader, field);
149+
}
150+
} catch (IOException e) {
151+
throw new IllegalStateException("Cannot load doc values", e);
152+
}
153+
154+
}
155+
156+
@Override
157+
public boolean advanceExact(int docId) throws IOException {
158+
if (indexed) {
159+
if (iteratorAdvanceExact(floatVectorValues, docId) == false) {
160+
return false;
161+
}
162+
vector = floatVectorValues.vectorValue();
163+
} else {
164+
if (binary == null || binary.advanceExact(docId) == false) {
165+
return false;
166+
}
167+
BytesRef ref = binary.binaryValue();
168+
VectorEncoderDecoder.decodeDenseVector(indexVersion, ref, vector);
169+
}
170+
return true;
171+
}
172+
173+
@Override
174+
public int docValueCount() {
175+
return 1;
176+
}
177+
178+
@Override
179+
public Object nextValue() {
180+
return Arrays.copyOf(vector, vector.length);
181+
}
182+
};
183+
};
184+
}
185+
186+
private static boolean iteratorAdvanceExact(DocIdSetIterator iterator, int docId) throws IOException {
187+
if (iterator == null) return false;
188+
int currentDoc = iterator.docID();
189+
if (currentDoc == NO_MORE_DOCS || docId < currentDoc) {
190+
return false;
191+
} else if (docId > currentDoc) {
192+
currentDoc = iterator.advance(docId);
193+
if (currentDoc != docId) {
194+
return false;
195+
}
196+
}
197+
return true;
198+
}
79199
}

server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ private SearchCapabilities() {}
2828
private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source";
2929
/** Support Byte and Float with Bit dot product. */
3030
private static final String BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY = "byte_float_bit_dot_product";
31+
/** Support docvalue_fields parameter for `dense_vector` field. */
32+
private static final String DENSE_VECTOR_DOCVALUE_FIELDS = "dense_vector_docvalue_fields";
3133
/** Support kql query. */
3234
private static final String KQL_QUERY_SUPPORTED = "kql_query";
3335

@@ -37,7 +39,8 @@ private static Set<String> capabilities() {
3739
Set<String> capabilities = Set.of(
3840
RANGE_REGEX_INTERVAL_QUERY_CAPABILITY,
3941
BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY,
40-
BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY
42+
BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY,
43+
DENSE_VECTOR_DOCVALUE_FIELDS
4144
);
4245

4346
if (Build.current().isSnapshot()) {

server/src/main/java/org/elasticsearch/search/DocValueFormat.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,31 @@ public String toString() {
168168
}
169169
};
170170

171+
DocValueFormat DENSE_VECTOR = DenseVectorDocValueFormat.INSTANCE;
172+
173+
/**
174+
* Singleton, stateless formatter, for dense vector values, no need to actually format anything
175+
*/
176+
class DenseVectorDocValueFormat implements DocValueFormat {
177+
178+
public static final DocValueFormat INSTANCE = new DenseVectorDocValueFormat();
179+
180+
private DenseVectorDocValueFormat() {}
181+
182+
@Override
183+
public String getWriteableName() {
184+
return "dense_vector";
185+
}
186+
187+
@Override
188+
public void writeTo(StreamOutput out) {}
189+
190+
@Override
191+
public String toString() {
192+
return "dense_vector";
193+
}
194+
};
195+
171196
DocValueFormat BINARY = BinaryDocValueFormat.INSTANCE;
172197

173198
/**

server/src/main/java/org/elasticsearch/search/SearchModule.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,6 +1020,7 @@ private void registerValueFormats() {
10201020
registerValueFormat(DocValueFormat.IP.getWriteableName(), in -> DocValueFormat.IP);
10211021
registerValueFormat(DocValueFormat.RAW.getWriteableName(), in -> DocValueFormat.RAW);
10221022
registerValueFormat(DocValueFormat.BINARY.getWriteableName(), in -> DocValueFormat.BINARY);
1023+
registerValueFormat(DocValueFormat.DENSE_VECTOR.getWriteableName(), in -> DocValueFormat.DENSE_VECTOR);
10231024
registerValueFormat(DocValueFormat.UNSIGNED_LONG_SHIFTED.getWriteableName(), in -> DocValueFormat.UNSIGNED_LONG_SHIFTED);
10241025
registerValueFormat(DocValueFormat.TIME_SERIES_ID.getWriteableName(), in -> DocValueFormat.TIME_SERIES_ID);
10251026
registerValueFormat(TS_ROUTING_HASH_DOC_VALUE_FORMAT.getWriteableName(), in -> TS_ROUTING_HASH_DOC_VALUE_FORMAT);

0 commit comments

Comments
 (0)