diff --git a/muted-tests.yml b/muted-tests.yml
index 1db0e2958e479..cdbc29768853c 100644
--- a/muted-tests.yml
+++ b/muted-tests.yml
@@ -522,9 +522,6 @@ tests:
 - class: org.elasticsearch.xpack.ml.integration.TextEmbeddingQueryIT
   method: testModelWithPrefixStrings
   issue: https://github.com/elastic/elasticsearch/issues/133138
-- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
-  method: test {p0=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
-  issue: https://github.com/elastic/elasticsearch/issues/133184
 - class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
   method: test {p0=search.vectors/45_knn_search_byte/Vector rescoring has no effect for non-quantized vectors and provides same results as non-rescored knn}
   issue: https://github.com/elastic/elasticsearch/issues/133187
@@ -558,9 +555,6 @@ tests:
 - class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
   method: test {p0=search/160_exists_query/Test exists query on date field in empty index}
   issue: https://github.com/elastic/elasticsearch/issues/133439
-- class: org.elasticsearch.multiproject.test.CoreWithMultipleProjectsClientYamlTestSuiteIT
-  method: test {yaml=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
-  issue: https://github.com/elastic/elasticsearch/issues/133442
 - class: org.elasticsearch.xpack.test.rest.XPackRestIT
   method: test {p0=esql/60_usage/Basic ESQL usage output (telemetry) non-snapshot version}
   issue: https://github.com/elastic/elasticsearch/issues/133449
@@ -582,9 +576,6 @@ tests:
 - class: org.elasticsearch.xpack.esql.qa.mixed.MixedClusterEsqlSpecIT
   method: test {csv-spec:spatial.ConvertFromStringParseError}
   issue: https://github.com/elastic/elasticsearch/issues/133507
-- class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
-  method: test {p0=search.vectors/90_sparse_vector/Indexing and searching multi-value sparse vectors in >=8.15}
-  issue: https://github.com/elastic/elasticsearch/issues/133508
 - class: org.elasticsearch.test.rest.yaml.RcsCcsCommonYamlTestSuiteIT
   method: test {p0=search/10_source_filtering/no filtering}
   issue: https://github.com/elastic/elasticsearch/issues/133561
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java
index 3dabe1b37b43e..496bea95b7d65 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/search/query/VectorIT.java
@@ -14,6 +14,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
 import org.elasticsearch.search.vectors.KnnSearchBuilder;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.xcontent.XContentBuilder;
@@ -22,6 +23,7 @@
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
 
@@ -178,4 +180,52 @@ public void testHnswEarlyTerminationQuery() {
         });
     }
 
+    /**
+     * Indexes four documents per round (a list of token maps, a single token map, an empty array and an
+     * empty map) and asserts that an exists query on the sparse_vector field matches exactly three of
+     * them per round, however often the search is repeated.
+     */
+    public void testSparseVectorExists() throws IOException {
+        String indexName = "sparse_vector_index";
+        XContentBuilder mapping = XContentFactory.jsonBuilder()
+            .startObject()
+            .startObject("properties")
+            .startObject("id")
+            .field("type", "long")
+            .endObject()
+            .startObject(VECTOR_FIELD)
+            .field("type", "sparse_vector")
+            .endObject()
+            .startObject("embeddings")
+            .field("type", "sparse_vector")
+            .endObject()
+            .endObject()
+            .endObject();
+        Settings settings = Settings.builder()
+            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 10)
+            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
+            .build();
+        prepareCreate(indexName).setMapping(mapping).setSettings(settings).get();
+        int loops = 10;
+        for (int i = 0; i < loops; i++) {
+            prepareIndex(indexName).setSource(VECTOR_FIELD, List.of(Map.of("dim", 1.0f), Map.of("dim", 12.0f)), "id", 1).get();
+            prepareIndex(indexName).setSource(VECTOR_FIELD, Map.of("dim", 2.0f), "id", 2).get();
+            prepareIndex(indexName).setSource(VECTOR_FIELD, List.of(), "id", 3).get();
+            prepareIndex(indexName).setSource(VECTOR_FIELD, Map.of(), "id", 4).get();
+            refresh(indexName);
+        }
+        TermsAggregationBuilder builder = new TermsAggregationBuilder("agg").field("id").size(1000);
+        for (int i = 0; i < 10; i++) {
+            assertResponse(
+                client().prepareSearch(indexName)
+                    .setQuery(QueryBuilders.existsQuery(VECTOR_FIELD))
+                    .setTrackTotalHits(true)
+                    .setSize(30)
+                    .addAggregation(builder),
+                resp -> {
+                    assertEquals(3 * loops, resp.getHits().getTotalHits().value());
+                }
+            );
+        }
+    }
 }
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
index ef76540525898..e8f2bd5b8ddf6 100644
--- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
+++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
@@ -13,9 +13,13 @@
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.logging.DeprecationCategory;
 import org.elasticsearch.common.lucene.Lucene;
@@ -29,6 +33,7 @@
 import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.mapper.DocumentParserContext;
 import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperBuilderContext;
 import org.elasticsearch.index.mapper.MappingParserContext;
@@ -430,6 +435,7 @@ private static class SparseVectorSyntheticFieldLoader implements SourceLoader.Sy
         private final String leafName;
 
         private TermsEnum termsDocEnum;
+        private boolean hasValue;
 
         private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
             this.fullPath = fullPath;
@@ -443,39 +449,63 @@ public Stream> storedFieldLoaders() {
 
         @Override
         public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
-            var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
-            if (fieldInfos == null || fieldInfos.hasTermVectors() == false) {
-                return null;
+            // Use an exists query on _field_names to distinguish documents with no value
+            // from those containing an empty map.
+            var existsQuery = new TermQuery(new Term(FieldNamesFieldMapper.NAME, fullPath));
+            var searcher = new IndexSearcher(leafReader);
+            searcher.setQueryCache(null);
+            var scorer = searcher.createWeight(existsQuery, ScoreMode.COMPLETE_NO_SCORES, 0).scorer(searcher.getLeafContexts().getFirst());
+            if (scorer == null) {
+                return docId -> false;
             }
+
+            var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
+            boolean hasTermVectors = fieldInfos != null && fieldInfos.hasTermVectors();
+            // Fetch the iterator once per leaf rather than on every per-document call.
+            var existsIterator = scorer.iterator();
             return docId -> {
-                var terms = leafReader.termVectors().get(docId, fullPath);
-                if (terms == null) {
-                    return false;
+                termsDocEnum = null;
+
+                if (existsIterator.docID() < docId) {
+                    existsIterator.advance(docId);
                 }
-                termsDocEnum = terms.iterator();
-                if (termsDocEnum.next() == null) {
-                    termsDocEnum = null;
-                    return false;
+                if (existsIterator.docID() != docId) {
+                    return hasValue = false;
                 }
-                return true;
+
+                if (hasTermVectors == false) {
+                    // The field exists for this document but there are no term vectors to read: write() emits an empty object.
+                    return hasValue = true;
+                }
+
+                var terms = leafReader.termVectors().get(docId, fullPath);
+                if (terms != null) {
+                    termsDocEnum = terms.iterator();
+                    if (termsDocEnum.next() == null) {
+                        termsDocEnum = null;
+                    }
+                }
+                return hasValue = true;
             };
         }
 
         @Override
         public boolean hasValue() {
-            return termsDocEnum != null;
+            return hasValue;
         }
 
         @Override
         public void write(XContentBuilder b) throws IOException {
-            assert termsDocEnum != null;
-            PostingsEnum reuse = null;
+            assert hasValue;
             b.startObject(leafName);
-            do {
-                reuse = termsDocEnum.postings(reuse);
-                reuse.nextDoc();
-                b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
-            } while (termsDocEnum.next() != null);
+            if (termsDocEnum != null) {
+                PostingsEnum reuse = null;
+                do {
+                    reuse = termsDocEnum.postings(reuse);
+                    reuse.nextDoc();
+                    b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
+                } while (termsDocEnum.next() != null);
+            }
             b.endObject();
         }
 
@@ -485,7 +515,10 @@ public void write(XContentBuilder b) throws IOException {
          * @throws IOException if reading fails
          */
         private Map copyAsMap() throws IOException {
-            assert termsDocEnum != null;
+            assert hasValue;
+            if (termsDocEnum == null) {
+                return Map.of();
+            }
             Map tokenMap = new LinkedHashMap<>();
             PostingsEnum reuse = null;
             do {
diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java
index 4363606db10ba..5448fd792625a 100644
--- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java
+++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java
@@ -107,6 +107,16 @@ protected void minimalMapping(XContentBuilder b) throws IOException {
         b.field("type", "sparse_vector");
     }
 
+    @Override
+    protected boolean supportsEmptyInputArray() {
+        return false;
+    }
+
+    @Override
+    protected boolean addsValueWhenNotSupplied() {
+        return true;
+    }
+
     protected void minimalFieldMappingPreviousIndexDefaultsIncluded(XContentBuilder b) throws IOException {
         b.field("type", "sparse_vector");
         b.field("store", false);