From 4983fdb76424b428eb5690522c3eb46b7660599e Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 17 Jun 2025 17:39:19 -0400 Subject: [PATCH 1/2] Fix NPE in flat_bbq scorer when all vectors are missing (#129548) It is possible to get all the way down to the knn format reader and there be no vectors in the index. This execution path is possible if utilizing nested queries (which bypasses the higher level checks in `KnnFloatVectorQuery#approximateSearch`). bbq_flat should check for the existence of vectors before attempting to create the scorer. (cherry picked from commit 80667d0c8a80c88e900cac6f2006f0471ee6f234) --- docs/changelog/129548.yaml | 5 ++ .../es816/ES816BinaryFlatVectorsScorer.java | 3 + .../ES816BinaryQuantizedVectorsReader.java | 2 +- .../es818/ES818BinaryFlatVectorsScorer.java | 3 + .../ES818BinaryQuantizedVectorsReader.java | 2 +- ...S816BinaryQuantizedVectorsFormatTests.java | 65 +++++++++++++++++++ ...S818BinaryQuantizedVectorsFormatTests.java | 65 +++++++++++++++++++ 7 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 docs/changelog/129548.yaml diff --git a/docs/changelog/129548.yaml b/docs/changelog/129548.yaml new file mode 100644 index 0000000000000..cb5b95faef1a7 --- /dev/null +++ b/docs/changelog/129548.yaml @@ -0,0 +1,5 @@ +pr: 129548 +summary: Fix NPE in `flat_bbq` scorer when all vectors are missing +area: Vector Search +type: bug +issues: [] diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java index 5f3d09f03e95a..511ecde5a4978 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java @@ -59,6 +59,9 @@ public RandomVectorScorer getRandomVectorScorer( float[] target ) throws IOException { if (vectorValues instanceof RandomAccessBinarizedByteVectorValues binarizedVectors) { + assert binarizedVectors.getQuantizer() != null + : "BinarizedByteVectorValues must have a quantizer for ES816BinaryFlatVectorsScorer"; + assert binarizedVectors.size() > 0 : "BinarizedByteVectorValues must have at least one vector for ES816BinaryFlatVectorsScorer"; BinaryQuantizer quantizer = binarizedVectors.getQuantizer(); float[] centroid = binarizedVectors.getCentroid(); // FIXME: precompute this once? diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java index 55f9bec577d82..ee98cf87ebdd0 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java @@ -154,7 +154,7 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { FieldEntry fi = fields.get(field); - if (fi == null) { + if (fi == null || fi.size() == 0) { return null; } return vectorScorer.getRandomVectorScorer( diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java index 318ef5b28138c..91c4cd27d5b4b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java @@ -64,6 +64,9 @@ public RandomVectorScorer getRandomVectorScorer( float[] target ) throws IOException { if (vectorValues instanceof RandomAccessBinarizedByteVectorValues binarizedVectors) { + assert binarizedVectors.getQuantizer() != null + : "BinarizedByteVectorValues must have a quantizer for ES816BinaryFlatVectorsScorer"; + assert binarizedVectors.size() > 0 : "BinarizedByteVectorValues must have at least one vector for ES816BinaryFlatVectorsScorer"; OptimizedScalarQuantizer quantizer = binarizedVectors.getQuantizer(); float[] centroid = binarizedVectors.getCentroid(); // We make a copy as the quantization process mutates the input diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java index 4a0bcf165a462..bd74aa8330485 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java @@ -153,7 +153,7 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { FieldEntry fi = fields.get(field); - if (fi == null) { + if (fi == null || fi.size() == 0) { return null; } return vectorScorer.getRandomVectorScorer( diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index 0a26b74b6be27..a5d445aca9e4a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; @@ -31,18 +32,30 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SoftDeletesRetentionMergePolicy; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Locale; import static java.lang.String.format; @@ -67,6 +80,58 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { }; } + static String encodeInts(int[] i) { + return Arrays.toString(i); + } + + static BitSetProducer parentFilter(IndexReader r) throws IOException { + // Create a filter that defines "parent" documents in the index + BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(r, parentsFilter); + return parentsFilter; + } + + Document makeParent(int[] children) { + Document parent = new Document(); + parent.add(newStringField("docType", "_parent", Field.Store.NO)); + parent.add(newStringField("id", encodeInts(children), Field.Store.YES)); + return parent; + } + + public void testEmptyDiversifiedChildSearch() throws Exception { + String fieldName = "field"; + int dims = random().nextInt(4, 65); + float[] vector = randomVector(dims); + VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; + try (Directory d = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); + try (IndexWriter w = new IndexWriter(d, iwc)) { + List toAdd = new ArrayList<>(); + for (int j = 1; j <= 5; j++) { + Document doc = new Document(); + doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); + doc.add(newStringField("id", Integer.toString(j), Field.Store.YES)); + toAdd.add(doc); + } + toAdd.add(makeParent(new int[] { 1, 2, 3, 4, 5 })); + w.addDocuments(toAdd); + w.addDocuments(List.of(makeParent(new int[] { 6, 7, 8, 9, 10 }))); + w.deleteDocuments(new FieldExistsQuery(fieldName), new TermQuery(new Term("id", encodeInts(new int[] { 1, 2, 3, 4, 5 })))); + w.flush(); + w.commit(); + w.forceMerge(1); + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + BitSetProducer parentFilter = parentFilter(searcher.getIndexReader()); + Query query = new DiversifyingChildrenFloatKnnVectorQuery(fieldName, vector, null, 1, parentFilter); + assertTrue(searcher.search(query, 1).scoreDocs.length == 0); + } + } + + } + } + public void testSearch() throws Exception { String fieldName = "field"; int numVectors = random().nextInt(99, 500); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 8725f519bd56f..30be54f8b1187 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; @@ -31,18 +32,30 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SoftDeletesRetentionMergePolicy; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Locale; import static java.lang.String.format; @@ -67,6 +80,58 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { }; } + static String encodeInts(int[] i) { + return Arrays.toString(i); + } + + static BitSetProducer parentFilter(IndexReader r) throws IOException { + // Create a filter that defines "parent" documents in the index + BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(r, parentsFilter); + return parentsFilter; + } + + Document makeParent(int[] children) { + Document parent = new Document(); + parent.add(newStringField("docType", "_parent", Field.Store.NO)); + parent.add(newStringField("id", encodeInts(children), Field.Store.YES)); + return parent; + } + + public void testEmptyDiversifiedChildSearch() throws Exception { + String fieldName = "field"; + int dims = random().nextInt(4, 65); + float[] vector = randomVector(dims); + VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; + try (Directory d = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); + try (IndexWriter w = new IndexWriter(d, iwc)) { + List toAdd = new ArrayList<>(); + for (int j = 1; j <= 5; j++) { + Document doc = new Document(); + doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); + doc.add(newStringField("id", Integer.toString(j), Field.Store.YES)); + toAdd.add(doc); + } + toAdd.add(makeParent(new int[] { 1, 2, 3, 4, 5 })); + w.addDocuments(toAdd); + w.addDocuments(List.of(makeParent(new int[] { 6, 7, 8, 9, 10 }))); + w.deleteDocuments(new FieldExistsQuery(fieldName), new TermQuery(new Term("id", encodeInts(new int[] { 1, 2, 3, 4, 5 })))); + w.flush(); + w.commit(); + w.forceMerge(1); + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + BitSetProducer parentFilter = parentFilter(searcher.getIndexReader()); + Query query = new DiversifyingChildrenFloatKnnVectorQuery(fieldName, vector, null, 1, parentFilter); + assertTrue(searcher.search(query, 1).scoreDocs.length == 0); + } + } + + } + } + public void testSearch() throws Exception { String fieldName = "field"; int numVectors = random().nextInt(99, 500); From 1022b05c6bd6496645ae26833f1a4b34f52a6703 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 18 Jun 2025 08:57:50 -0400 Subject: [PATCH 2/2] fixing compilation --- .../vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java | 2 +- .../vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index a5d445aca9e4a..332338bb1ee35 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -104,7 +104,7 @@ public void testEmptyDiversifiedChildSearch() throws Exception { float[] vector = randomVector(dims); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; try (Directory d = newDirectory()) { - IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(getCodec()); iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); try (IndexWriter w = new IndexWriter(d, iwc)) { List toAdd = new ArrayList<>(); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 30be54f8b1187..54def78927540 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -104,7 +104,7 @@ public void testEmptyDiversifiedChildSearch() throws Exception { float[] vector = randomVector(dims); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; try (Directory d = newDirectory()) { - IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(getCodec()); iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); try (IndexWriter w = new IndexWriter(d, iwc)) { List toAdd = new ArrayList<>();