CNDB-13499: Optimize simple BM25 by deferring PrK creation (#1662)

michaeljmarshall · web-flow · commit 830613ed553d · 2025-03-31T09:01:16.000-05:00
### What is the issue Fixes: riptano/cndb#13499 ### What does this PR fix and why was it fixed This commit has two key optimizations. First, we defer materializing the PrimaryKey in simple BM25 queries until we know that the PrimaryKey is among the best scored rows in the sstable. This buys us two things. The most important is that we can defer reading the PrK's token from disk. The second is that we materialize one fewer object per row, which saves us essentially O(n) memory. Second, we defer creating the PrimaryKeyWithSortKey objects by using the jvector NodeQueue to sort based on a long packed by an index (int) and a score (float). This is a more compact way to sort because it takes less space and uses a slightly better sort algorithm for our use case since it is unlikely that we'll need to consume all of the rows being sorted. Initial testing on a 1 million document table shows that this optimization improves query latency by about 40 percent.
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcher.java
@@ -37,6 +37,7 @@
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.Slice;
 import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.dht.AbstractBounds;
@@ -45,6 +46,7 @@
 import org.apache.cassandra.index.sai.QueryContext;
 import org.apache.cassandra.index.sai.SSTableContext;
 import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.PrimaryKeyMap;
 import org.apache.cassandra.index.sai.disk.TermsIterator;
 import org.apache.cassandra.index.sai.disk.format.IndexComponentType;
 import org.apache.cassandra.index.sai.disk.format.Version;
@@ -55,11 +57,13 @@
 import org.apache.cassandra.index.sai.plan.Expression;
 import org.apache.cassandra.index.sai.plan.Orderer;
 import org.apache.cassandra.index.sai.utils.BM25Utils;
-import org.apache.cassandra.index.sai.utils.BM25Utils.DocTF;
+import org.apache.cassandra.index.sai.utils.BM25Utils.EagerDocTF;
 import org.apache.cassandra.index.sai.utils.PrimaryKey;
+import org.apache.cassandra.index.sai.utils.PrimaryKeyWithScore;
 import org.apache.cassandra.index.sai.utils.PrimaryKeyWithSortKey;
 import org.apache.cassandra.index.sai.utils.RowIdWithByteComparable;
 import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.sstable.SSTableId;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.util.FileHandle;
@@ -200,20 +204,23 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderBy(Orderer orderer, Express
         var docLengthsReader = new DocLengthsReader(docLengths, docLengthsMeta);
 
         // Wrap the iterator with resource management
-        var it = new AbstractIterator<DocTF>() { // Anonymous class extends AbstractIterator
+        var it = new AbstractIterator<BM25Utils.DocTF>() { // Anonymous class extends AbstractIterator
             private boolean closed;
 
             @Override
-            protected DocTF computeNext()
+            protected BM25Utils.DocTF computeNext()
             {
                 try
                 {
                     int rowId = merged.nextPosting();
                     if (rowId == PostingList.END_OF_STREAM)
                         return endOfData();
+                    // Reads from disk.
                     int docLength = docLengthsReader.get(rowId); // segment-local rowid
-                    var pk = pkm.primaryKeyFromRowId(segmentRowIdOffset + rowId); // sstable-global rowid
-                    return new DocTF(pk, docLength, merged.frequencies());
+                    // We defer creating the primary key because it reads the token from disk, which is only needed
+                    // for the top rows just before they are materialized from disk, so we wait until after scoring
+                    // and sorting to read the token.
+                    return new LazyDocTF(pkm, segmentRowIdOffset + rowId, docLength, merged.frequencies());
                 }
                 catch (IOException e)
                 {
@@ -232,7 +239,7 @@ public void close()
         return bm25Internal(it, queryTerms, documentFrequencies);
     }
 
-    private CloseableIterator<PrimaryKeyWithSortKey> bm25Internal(CloseableIterator<DocTF> keyIterator,
+    private CloseableIterator<PrimaryKeyWithSortKey> bm25Internal(CloseableIterator<BM25Utils.DocTF> keyIterator,
                                                                   List<ByteBuffer> queryTerms,
                                                                   Map<ByteBuffer, Long> documentFrequencies)
     {
@@ -269,7 +276,7 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(SSTableReader rea
         }
         var analyzer = indexContext.getAnalyzerFactory().create();
         var it = keys.stream()
-                     .map(pk -> DocTF.createFromDocument(pk, readColumn(sstable, pk), analyzer, queryTerms))
+                     .map(pk -> EagerDocTF.createFromDocument(pk, readColumn(sstable, pk), analyzer, queryTerms))
                      .filter(Objects::nonNull)
                      .iterator();
         return bm25Internal(CloseableIterator.wrap(it), queryTerms, documentFrequencies);
@@ -334,4 +341,50 @@ public void close()
             FileUtils.closeQuietly(source, currentPostingList);
         }
     }
+
+    /**
+     * A {@link BM25Utils.DocTF} that keeps the sstable row id and looks up the {@link PrimaryKey} only when asked.
+     */
+    private static class LazyDocTF implements BM25Utils.DocTF
+    {
+        private final PrimaryKeyMap pkm;
+        private final long sstableRowId;
+        private final int docLength;
+        private final Map<ByteBuffer, Integer> frequencies;
+
+        LazyDocTF(PrimaryKeyMap pkm, long sstableRowId, int docLength, Map<ByteBuffer, Integer> frequencies)
+        {
+            this.pkm = pkm;
+            this.sstableRowId = sstableRowId;
+            this.docLength = docLength;
+            this.frequencies = frequencies;
+        }
+
+        @Override
+        public int getTermFrequency(ByteBuffer term)
+        {
+            return frequencies.getOrDefault(term, 0);
+        }
+
+        @Override
+        public int termCount()
+        {
+            return docLength;
+        }
+
+        @Override
+        public PrimaryKeyWithSortKey primaryKey(IndexContext context, Memtable source, float score)
+        {
+            // LazyDocTF is only created by the sstable searcher, so a Memtable source is not supported
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public PrimaryKeyWithSortKey primaryKey(IndexContext context, SSTableId<?> source, float score)
+        {
+            // The primary key is looked up from the row id here, when a scored key is requested, even though the
+            // token might not be needed until we know this row has the best score. (Perhaps this should be lazy too?)
+            return new PrimaryKeyWithScore(context, source, pkm.primaryKeyFromRowId(sstableRowId), score);
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java
@@ -324,7 +324,7 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
         var docStats = computeDocumentFrequencies(queryContext, queryTerms);
         var analyzer = indexContext.getAnalyzerFactory().create();
         var it = Streams.stream(intersectedIterator)
-                 .map(pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
+                 .map(pk -> BM25Utils.EagerDocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
                  .filter(Objects::nonNull)
                  .iterator();
 
@@ -393,7 +393,7 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(QueryContext quer
         var queryTerms = orderer.getQueryTerms();
         var docStats = computeDocumentFrequencies(queryContext, queryTerms);
         var it = keys.stream()
-                     .map(pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
+                     .map(pk -> BM25Utils.EagerDocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
                      .filter(Objects::nonNull)
                      .iterator();
         return BM25Utils.computeScores(CloseableIterator.wrap(it),
diff --git a/src/java/org/apache/cassandra/index/sai/utils/BM25Utils.java b/src/java/org/apache/cassandra/index/sai/utils/BM25Utils.java
@@ -21,20 +21,21 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
 import javax.annotation.Nullable;
 
+import io.github.jbellis.jvector.graph.NodeQueue;
+import io.github.jbellis.jvector.util.BoundedLongHeap;
 import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.index.sai.IndexContext;
 import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
 import org.apache.cassandra.io.sstable.SSTableId;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.CloseableIterator;
 
 public class BM25Utils
@@ -60,15 +61,28 @@ public DocStats(Map<ByteBuffer, Long> frequencies, long docCount)
     }
 
     /**
-     * Term frequencies within a single document.  All instances of a term are counted.
+     * Term frequencies within a single document.  All instances of a term are counted. Allows us to optimize for
+     * the sstable use case, which is able to skip some reads from disk as well as some memory allocations.
      */
-    public static class DocTF
+    public interface DocTF
+    {
+        int getTermFrequency(ByteBuffer term);
+        int termCount();
+        PrimaryKeyWithSortKey primaryKey(IndexContext context, Memtable source, float score);
+        PrimaryKeyWithSortKey primaryKey(IndexContext context, SSTableId<?> source, float score);
+    }
+
+    /**
+     * Term frequencies within a single document.  All instances of a term are counted. It is eager in that the
+     * PrimaryKey is already created.
+     */
+    public static class EagerDocTF implements DocTF
     {
         private final PrimaryKey pk;
         private final Map<ByteBuffer, Integer> frequencies;
         private final int termCount;
 
-        public DocTF(PrimaryKey pk, int termCount, Map<ByteBuffer, Integer> frequencies)
+        public EagerDocTF(PrimaryKey pk, int termCount, Map<ByteBuffer, Integer> frequencies)
         {
             this.pk = pk;
             this.frequencies = frequencies;
@@ -80,6 +94,21 @@ public int getTermFrequency(ByteBuffer term)
             return frequencies.getOrDefault(term, 0);
         }
 
+        public int termCount()
+        {
+            return termCount;
+        }
+
+        public PrimaryKeyWithSortKey primaryKey(IndexContext context, Memtable source, float score)
+        {
+            return new PrimaryKeyWithScore(context, source, pk, score);
+        }
+
+        public PrimaryKeyWithSortKey primaryKey(IndexContext context, SSTableId<?> source, float score)
+        {
+            return new PrimaryKeyWithScore(context, source, pk, score);
+        }
+
         @Nullable
         public static DocTF createFromDocument(PrimaryKey pk,
                                                Cell<?> cell,
@@ -111,7 +140,7 @@ public static DocTF createFromDocument(PrimaryKey pk,
             if (queryTerms.size() > frequencies.size())
                 return null;
 
-            return new DocTF(pk, count, frequencies);
+            return new EagerDocTF(pk, count, frequencies);
         }
     }
 
@@ -121,6 +150,8 @@ public static CloseableIterator<PrimaryKeyWithSortKey> computeScores(CloseableIt
                                                                          IndexContext indexContext,
                                                                          Object source)
     {
+        assert source instanceof Memtable || source instanceof SSTableId : "Invalid source " + source.getClass();
+
         // data structures for document stats and frequencies
         ArrayList<DocTF> documents = new ArrayList<>();
         double totalTermCount = 0;
@@ -130,18 +161,20 @@ public static CloseableIterator<PrimaryKeyWithSortKey> computeScores(CloseableIt
         {
             var tf = docIterator.next();
             documents.add(tf);
-            totalTermCount += tf.termCount;
+            totalTermCount += tf.termCount();
         }
+
         if (documents.isEmpty())
             return CloseableIterator.emptyIterator();
 
         // Calculate average document length
         double avgDocLength = totalTermCount / documents.size();
 
-        // Calculate BM25 scores
-        var scoredDocs = new ArrayList<PrimaryKeyWithScore>(documents.size());
-        for (var doc : documents)
+        // Calculate BM25 scores. Uses a nodequeue that avoids additional allocations and has heap time complexity
+        var nodeQueue = new NodeQueue(new BoundedLongHeap(documents.size()), NodeQueue.Order.MAX_HEAP);
+        for (int i = 0; i < documents.size(); i++)
         {
+            var doc = documents.get(i);
             double score = 0.0;
             for (var queryTerm : queryTerms)
             {
@@ -150,45 +183,55 @@ public static CloseableIterator<PrimaryKeyWithSortKey> computeScores(CloseableIt
                 // we shouldn't have more hits for a term than we counted total documents
                 assert df <= docStats.docCount : String.format("df=%d, totalDocs=%d", df, docStats.docCount);
 
-                double normalizedTf = tf / (tf + K1 * (1 - B + B * doc.termCount / avgDocLength));
+                double normalizedTf = tf / (tf + K1 * (1 - B + B * doc.termCount() / avgDocLength));
                 double idf = Math.log(1 + (docStats.docCount - df + 0.5) / (df + 0.5));
                 double deltaScore = normalizedTf * idf;
                 assert deltaScore >= 0 : String.format("BM25 score for tf=%d, df=%d, tc=%d, totalDocs=%d is %f",
-                                                       tf, df, doc.termCount, docStats.docCount, deltaScore);
+                                                       tf, df, doc.termCount(), docStats.docCount, deltaScore);
                 score += deltaScore;
             }
-            if (source instanceof Memtable)
-                scoredDocs.add(new PrimaryKeyWithScore(indexContext, (Memtable) source, doc.pk, (float) score));
-            else if (source instanceof SSTableId)
-                scoredDocs.add(new PrimaryKeyWithScore(indexContext, (SSTableId) source, doc.pk, (float) score));
-            else
-                throw new IllegalArgumentException("Invalid source " + source.getClass());
+            nodeQueue.push(i, (float) score);
         }
 
-        // sort by score (PKWS implements Comparator correctly for us)
-        Collections.sort(scoredDocs);
+        return new NodeQueueDocTFIterator(nodeQueue, documents, indexContext, source, docIterator);
+    }
 
-        return new CloseableIterator<>()
+    private static class NodeQueueDocTFIterator extends AbstractIterator<PrimaryKeyWithSortKey>
+    {
+        private final NodeQueue nodeQueue;
+        private final List<DocTF> documents;
+        private final IndexContext indexContext;
+        private final Object source;
+        private final CloseableIterator<DocTF> docIterator;
+
+        NodeQueueDocTFIterator(NodeQueue nodeQueue, List<DocTF> documents, IndexContext indexContext, Object source, CloseableIterator<DocTF> docIterator)
         {
-            private final Iterator<PrimaryKeyWithScore> iterator = scoredDocs.iterator();
+            this.nodeQueue = nodeQueue;
+            this.documents = documents;
+            this.indexContext = indexContext;
+            this.source = source;
+            this.docIterator = docIterator;
+        }
 
-            @Override
-            public boolean hasNext()
-            {
-                return iterator.hasNext();
-            }
+        @Override
+        protected PrimaryKeyWithSortKey computeNext()
+        {
+            if (nodeQueue.size() == 0)
+                return endOfData();
 
-            @Override
-            public PrimaryKeyWithSortKey next()
-            {
-                return iterator.next();
-            }
+            var score = nodeQueue.topScore();
+            var node = nodeQueue.pop();
+            var doc = documents.get(node);
+            if (source instanceof Memtable)
+                return doc.primaryKey(indexContext, (Memtable) source, score);
+            else
+                return doc.primaryKey(indexContext, (SSTableId<?>) source, score);
+        }
 
-            @Override
-            public void close()
-            {
-                FileUtils.closeQuietly(docIterator);
-            }
-        };
+        @Override
+        public void close()
+        {
+            FileUtils.closeQuietly(docIterator);
+        }
     }
 }