-
Notifications
You must be signed in to change notification settings - Fork 25.6k
ES|QL dense vector field type support #126456
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
6439422
c7e48a0
9e40cd4
03f6a92
d84c8dd
9ff5ed8
2754697
d196be0
975b4db
ae9fa5f
9896adf
dfa420e
d983495
5707e25
a8c8a6a
d827c6d
d8e139d
0539aff
7bbe7ee
7d1f8b7
8870fef
ad84f13
2619fe6
b23d5a9
6bb8e62
8013e47
e0e34f4
c30f5d9
a2fbb13
4833ce5
e8878a0
93d45fc
a6b0f6c
cd462b8
0675a42
7f95d6a
2cff4e7
efc3fb6
f8741ca
c951ee7
ba0a6b9
f5889b2
4b2126e
afe79f7
8049a58
217ce84
74fe676
af965a1
53ec29c
60da0fe
06cbea9
8f07e08
53bcbc7
fe82007
a63ca2c
c5d6ddc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,15 +51,20 @@ | |
import org.elasticsearch.index.fielddata.FieldDataContext; | ||
import org.elasticsearch.index.fielddata.IndexFieldData; | ||
import org.elasticsearch.index.mapper.ArraySourceValueFetcher; | ||
import org.elasticsearch.index.mapper.BlockDocValuesReader; | ||
import org.elasticsearch.index.mapper.BlockLoader; | ||
import org.elasticsearch.index.mapper.BlockSourceReader; | ||
import org.elasticsearch.index.mapper.DocumentParserContext; | ||
import org.elasticsearch.index.mapper.FieldMapper; | ||
import org.elasticsearch.index.mapper.MappedFieldType; | ||
import org.elasticsearch.index.mapper.Mapper; | ||
import org.elasticsearch.index.mapper.MapperBuilderContext; | ||
import org.elasticsearch.index.mapper.MapperParsingException; | ||
import org.elasticsearch.index.mapper.MappingParser; | ||
import org.elasticsearch.index.mapper.NumberFieldMapper; | ||
import org.elasticsearch.index.mapper.SimpleMappedFieldType; | ||
import org.elasticsearch.index.mapper.SourceLoader; | ||
import org.elasticsearch.index.mapper.SourceValueFetcher; | ||
import org.elasticsearch.index.mapper.TextSearchInfo; | ||
import org.elasticsearch.index.mapper.ValueFetcher; | ||
import org.elasticsearch.index.query.SearchExecutionContext; | ||
|
@@ -89,6 +94,7 @@ | |
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
import java.util.function.Function; | ||
import java.util.function.Supplier; | ||
import java.util.stream.Stream; | ||
|
@@ -2077,6 +2083,18 @@ protected Object parseSourceValue(Object value) { | |
}; | ||
} | ||
|
||
private SourceValueFetcher sourceValueFetcher(Set<String> sourcePaths) { | ||
return new SourceValueFetcher(sourcePaths, null) { | ||
@Override | ||
protected Object parseSourceValue(Object value) { | ||
if (value.equals("")) { | ||
return null; | ||
} | ||
return NumberFieldMapper.NumberType.FLOAT.parse(value, false); | ||
} | ||
}; | ||
} | ||
|
||
@Override | ||
public DocValueFormat docValueFormat(String format, ZoneId timeZone) { | ||
return DocValueFormat.DENSE_VECTOR; | ||
|
@@ -2311,6 +2329,20 @@ int getVectorDimensions() { | |
ElementType getElementType() { | ||
return elementType; | ||
} | ||
|
||
@Override | ||
public BlockLoader blockLoader(MappedFieldType.BlockLoaderContext blContext) { | ||
if (elementType != ElementType.FLOAT) { | ||
throw new UnsupportedOperationException("Only float dense vectors are supported for now"); | ||
|
||
} | ||
|
||
if (indexed) { | ||
return new BlockDocValuesReader.DenseVectorBlockLoader(name()); | ||
} | ||
|
||
BlockSourceReader.LeafIteratorLookup lookup = BlockSourceReader.lookupMatchingAll(); | ||
return new BlockSourceReader.DoublesBlockLoader(sourceValueFetcher(blContext.sourcePaths(name())), lookup); | ||
} | ||
} | ||
|
||
private final IndexOptions indexOptions; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,7 @@ | |
import static org.elasticsearch.common.logging.LoggerMessageFormat.format; | ||
import static org.elasticsearch.xpack.esql.CsvTestUtils.ExpectedResults; | ||
import static org.elasticsearch.xpack.esql.CsvTestUtils.Type; | ||
import static org.elasticsearch.xpack.esql.CsvTestUtils.Type.DENSE_VECTOR; | ||
import static org.elasticsearch.xpack.esql.CsvTestUtils.Type.UNSIGNED_LONG; | ||
import static org.elasticsearch.xpack.esql.CsvTestUtils.logMetaData; | ||
import static org.elasticsearch.xpack.esql.core.util.DateUtils.UTC_DATE_TIME_FORMATTER; | ||
|
@@ -145,6 +146,10 @@ private static void assertMetadata( | |
// Type.asType translates all bytes references into keywords | ||
continue; | ||
} | ||
if (blockType == Type.DOUBLE && expectedType == DENSE_VECTOR) { | ||
// DENSE_VECTOR is internally represented as a double block | ||
|
||
continue; | ||
} | ||
if (blockType == Type.NULL) { | ||
// Null pages don't have any real type information beyond "it's all null, man" | ||
continue; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
id:l, vector:dense_vector | ||
0, [1.0, 2.0, 3.0] | ||
1, [4.0, 5.0, 6.0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this might be very nitpicky - but can we add another There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mvOrdered does not impact retrieval as it's used as an optimization in some cases. But adding unordered data uncovered a small fix that needed to be done to support multivalued style fields: 4b2126e |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
|
||
retrieveDenseVectorData | ||
required_capability: dense_vector_field_type | ||
|
||
FROM dense_vector | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add more csv tests here with commands we know should be supported 🙈 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added one in 93d45fc. I'm not sure what this would catch, as the inner representation of dense_vector is a DoubleBlock and that is extensively tested for other fields. I'm sure we will keep adding tests once we include arithmetic operators, conversion, etc to dense_vector. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we expected that the items in a Do we expect the functions/commands that take multi-valued fields apply to If I understand it right, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. It's crucial that the vector dimensions match between different vectors, so they can be compared.
I've used MV as this seemed a supported way of internally representing a double array. From a user perspective, it's a different data type - it's a
We were thinking on adding a
Besides that, there should be no relation between the two. They are different data types that have the same representation (an array of elements).
It would probably help to differentiate the two data types if We can provide support for multivalued functions, but most of them will not make sense in the context of a dense_vector (MV_APPEND, MV_CONCAT, MV_DEDUPE, MV_SORT, MV_SUM). Others can be useful even though they are not necessarily vector related (MV_COUNT, MV_FIRST,MV_MEDIAN, MV_MAX, MV_MIN), but supporting those could confuse ESQL users.
Correct.
We could support equality. Binary comparisons like greater / less than makes no sense for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for adding the tests - this is addressed from my POV. |
||
| KEEP id, vector | ||
| SORT id | ||
; | ||
|
||
id:l | vector:dense_vector | ||
0 | [1.0, 2.0, 3.0] | ||
1 | [4.0, 5.0, 6.0] | ||
; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,18 +63,7 @@ | |
"type" : "keyword" | ||
}, | ||
"salary_change": { | ||
"type": "float", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changing CSV loading made this a problem, as there were parsing exceptions when trying to index float numeric data into integers. I didn't see a convenient way out of this and decided to remove as this field is not being tested. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we can keep this unchanged, that will be great, this is a good example of nested fields. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is just changed for the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Is it easier if we make another copy of employees's schema and data for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The problem is that changing how the CSV tests load data impacted this dataset. Before this change, multivalues were being uploaded as arrays of strings, which is something we don't want to do for dense_vectors as that is not a format supported on the It seemed like too much work to change the actual dataset when that particular field is not actually used in the tests. I'm open to other solutions here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @fang-xing-esql are these fields used in other tests somehow? |
||
"fields": { | ||
"int": { | ||
"type": "integer" | ||
}, | ||
"long": { | ||
"type": "long" | ||
}, | ||
"keyword": { | ||
"type" : "keyword" | ||
} | ||
} | ||
"type": "float" | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"properties": { | ||
"id": { | ||
"type": "long" | ||
}, | ||
"vector": { | ||
"type": "dense_vector", | ||
"similarity": "l2_norm" | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a BlockLoader for dense vectors, that uses
FloatVectorValues
to retrieve indexed vector data.