Skip to content

Commit 3863edc

Browse files
jimcziJeremyDahlgren
authored andcommitted
Fix handling of empty sparse_vector excluded from _source (elastic#133736)
When a sparse_vector is excluded from _source, retrieval of empty values was incorrect. We now return an empty map when rehydrating from _source instead of omitting the value. Closes elastic#133184 Closes elastic#133508 Closes elastic#133508
1 parent a00c6b7 commit 3863edc

File tree

4 files changed

+105
-29
lines changed

4 files changed

+105
-29
lines changed

muted-tests.yml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,6 @@ tests:
522522
- class: org.elasticsearch.xpack.ml.integration.TextEmbeddingQueryIT
523523
method: testModelWithPrefixStrings
524524
issue: https://github.com/elastic/elasticsearch/issues/133138
525-
- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
526-
method: test {p0=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
527-
issue: https://github.com/elastic/elasticsearch/issues/133184
528525
- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
529526
method: test {p0=search.vectors/45_knn_search_byte/Vector rescoring has no effect for non-quantized vectors and provides same results as non-rescored knn}
530527
issue: https://github.com/elastic/elasticsearch/issues/133187
@@ -558,9 +555,6 @@ tests:
558555
- class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
559556
method: test {p0=search/160_exists_query/Test exists query on date field in empty index}
560557
issue: https://github.com/elastic/elasticsearch/issues/133439
561-
- class: org.elasticsearch.multiproject.test.CoreWithMultipleProjectsClientYamlTestSuiteIT
562-
method: test {yaml=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
563-
issue: https://github.com/elastic/elasticsearch/issues/133442
564558
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
565559
method: test {p0=esql/60_usage/Basic ESQL usage output (telemetry) non-snapshot version}
566560
issue: https://github.com/elastic/elasticsearch/issues/133449
@@ -582,9 +576,6 @@ tests:
582576
- class: org.elasticsearch.xpack.esql.qa.mixed.MixedClusterEsqlSpecIT
583577
method: test {csv-spec:spatial.ConvertFromStringParseError}
584578
issue: https://github.com/elastic/elasticsearch/issues/133507
585-
- class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
586-
method: test {p0=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
587-
issue: https://github.com/elastic/elasticsearch/issues/133508
588579
- class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
589580
method: test {p0=search/10_source_filtering/no filtering}
590581
issue: https://github.com/elastic/elasticsearch/issues/133561

server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.elasticsearch.common.settings.Settings;
1515
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
1616
import org.elasticsearch.index.query.QueryBuilders;
17+
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
1718
import org.elasticsearch.search.vectors.KnnSearchBuilder;
1819
import org.elasticsearch.test.ESIntegTestCase;
1920
import org.elasticsearch.xcontent.XContentBuilder;
@@ -22,6 +23,7 @@
2223

2324
import java.io.IOException;
2425
import java.util.List;
26+
import java.util.Map;
2527

2628
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
2729

@@ -178,4 +180,47 @@ public void testHnswEarlyTerminationQuery() {
178180
});
179181
}
180182

183+
public void testSparseVectorExists() throws IOException {
184+
String indexName = "sparse_vector_index";
185+
XContentBuilder mapping = XContentFactory.jsonBuilder()
186+
.startObject()
187+
.startObject("properties")
188+
.startObject("id")
189+
.field("type", "long")
190+
.endObject()
191+
.startObject(VECTOR_FIELD)
192+
.field("type", "sparse_vector")
193+
.endObject()
194+
.startObject("embeddings")
195+
.field("type", "sparse_vector")
196+
.endObject()
197+
.endObject()
198+
.endObject();
199+
Settings settings = Settings.builder()
200+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 10)
201+
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
202+
.build();
203+
prepareCreate("sparse_vector_index").setMapping(mapping).setSettings(settings).get();
204+
int loops = 10;
205+
for (int i = 0; i < loops; i++) {
206+
prepareIndex(indexName).setSource(VECTOR_FIELD, List.of(Map.of("dim", 1.0f), Map.of("dim", 12.0f)), "id", 1).get();
207+
prepareIndex(indexName).setSource(VECTOR_FIELD, Map.of("dim", 2.0f), "id", 2).get();
208+
prepareIndex(indexName).setSource(VECTOR_FIELD, List.of(), "id", 3).get();
209+
prepareIndex(indexName).setSource(VECTOR_FIELD, Map.of(), "id", 4).get();
210+
refresh(indexName);
211+
}
212+
TermsAggregationBuilder builder = new TermsAggregationBuilder("agg").field("id").size(1000);
213+
for (int i = 0; i < 10; i++) {
214+
assertResponse(
215+
client().prepareSearch(indexName)
216+
.setQuery(QueryBuilders.existsQuery(VECTOR_FIELD))
217+
.setTrackTotalHits(true)
218+
.setSize(30)
219+
.addAggregation(builder),
220+
resp -> {
221+
assertEquals(3 * loops, resp.getHits().getTotalHits().value());
222+
}
223+
);
224+
}
225+
}
181226
}

server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@
1313
import org.apache.lucene.index.IndexableField;
1414
import org.apache.lucene.index.LeafReader;
1515
import org.apache.lucene.index.PostingsEnum;
16+
import org.apache.lucene.index.Term;
1617
import org.apache.lucene.index.TermsEnum;
18+
import org.apache.lucene.search.IndexSearcher;
1719
import org.apache.lucene.search.MatchNoDocsQuery;
1820
import org.apache.lucene.search.Query;
21+
import org.apache.lucene.search.ScoreMode;
22+
import org.apache.lucene.search.TermQuery;
1923
import org.apache.lucene.util.BytesRef;
2024
import org.elasticsearch.common.logging.DeprecationCategory;
2125
import org.elasticsearch.common.lucene.Lucene;
@@ -29,6 +33,7 @@
2933
import org.elasticsearch.index.fielddata.IndexFieldData;
3034
import org.elasticsearch.index.mapper.DocumentParserContext;
3135
import org.elasticsearch.index.mapper.FieldMapper;
36+
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
3237
import org.elasticsearch.index.mapper.MappedFieldType;
3338
import org.elasticsearch.index.mapper.MapperBuilderContext;
3439
import org.elasticsearch.index.mapper.MappingParserContext;
@@ -430,6 +435,7 @@ private static class SparseVectorSyntheticFieldLoader implements SourceLoader.Sy
430435
private final String leafName;
431436

432437
private TermsEnum termsDocEnum;
438+
private boolean hasValue;
433439

434440
private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
435441
this.fullPath = fullPath;
@@ -443,39 +449,60 @@ public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
443449

444450
@Override
445451
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
446-
var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
447-
if (fieldInfos == null || fieldInfos.hasTermVectors() == false) {
448-
return null;
452+
// Use an exists query on _field_names to distinguish documents with no value
453+
// from those containing an empty map.
454+
var existsQuery = new TermQuery(new Term(FieldNamesFieldMapper.NAME, fullPath));
455+
var searcher = new IndexSearcher(leafReader);
456+
searcher.setQueryCache(null);
457+
var scorer = searcher.createWeight(existsQuery, ScoreMode.COMPLETE_NO_SCORES, 0).scorer(searcher.getLeafContexts().getFirst());
458+
if (scorer == null) {
459+
return docId -> false;
449460
}
461+
462+
var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
463+
boolean hasTermVectors = fieldInfos != null && fieldInfos.hasTermVectors();
450464
return docId -> {
451-
var terms = leafReader.termVectors().get(docId, fullPath);
452-
if (terms == null) {
453-
return false;
465+
termsDocEnum = null;
466+
467+
if (scorer.iterator().docID() < docId) {
468+
scorer.iterator().advance(docId);
454469
}
455-
termsDocEnum = terms.iterator();
456-
if (termsDocEnum.next() == null) {
457-
termsDocEnum = null;
458-
return false;
470+
if (scorer.iterator().docID() != docId) {
471+
return hasValue = false;
459472
}
460-
return true;
473+
474+
if (hasTermVectors == false) {
475+
return hasValue = true;
476+
}
477+
478+
var terms = leafReader.termVectors().get(docId, fullPath);
479+
if (terms != null) {
480+
termsDocEnum = terms.iterator();
481+
if (termsDocEnum.next() == null) {
482+
termsDocEnum = null;
483+
}
484+
}
485+
return hasValue = true;
461486
};
462487
}
463488

464489
@Override
465490
public boolean hasValue() {
466-
return termsDocEnum != null;
491+
return hasValue;
467492
}
468493

469494
@Override
470495
public void write(XContentBuilder b) throws IOException {
471-
assert termsDocEnum != null;
472-
PostingsEnum reuse = null;
496+
assert hasValue;
473497
b.startObject(leafName);
474-
do {
475-
reuse = termsDocEnum.postings(reuse);
476-
reuse.nextDoc();
477-
b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
478-
} while (termsDocEnum.next() != null);
498+
if (termsDocEnum != null) {
499+
PostingsEnum reuse = null;
500+
do {
501+
reuse = termsDocEnum.postings(reuse);
502+
reuse.nextDoc();
503+
b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
504+
} while (termsDocEnum.next() != null);
505+
}
479506
b.endObject();
480507
}
481508

@@ -485,7 +512,10 @@ public void write(XContentBuilder b) throws IOException {
485512
* @throws IOException if reading fails
486513
*/
487514
private Map<String, Float> copyAsMap() throws IOException {
488-
assert termsDocEnum != null;
515+
assert hasValue;
516+
if (termsDocEnum == null) {
517+
return Map.of();
518+
}
489519
Map<String, Float> tokenMap = new LinkedHashMap<>();
490520
PostingsEnum reuse = null;
491521
do {

server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,16 @@ protected void minimalMapping(XContentBuilder b) throws IOException {
107107
b.field("type", "sparse_vector");
108108
}
109109

110+
@Override
111+
protected boolean supportsEmptyInputArray() {
112+
return false;
113+
}
114+
115+
@Override
116+
protected boolean addsValueWhenNotSupplied() {
117+
return true;
118+
}
119+
110120
protected void minimalFieldMappingPreviousIndexDefaultsIncluded(XContentBuilder b) throws IOException {
111121
b.field("type", "sparse_vector");
112122
b.field("store", false);

0 commit comments

Comments
 (0)