Skip to content

Commit e2ef6e2

Browse files
CNDB-14797: Upgrade to jvector 4.0.0-rc.1 (#1866)
### What is the issue Fixes riptano/cndb#14797 CNDB test PR riptano/cndb#14799 ### What does this PR fix and why was it fixed Here is the list of commits included in the upgrade: datastax/jvector@4.0.0-beta.5...4.0.0-rc.1 Here is a copy of the relevant `CHANGELOG.md` lines: > #### [4.0.0-rc.1](datastax/jvector@4.0.0-beta.6...4.0.0-rc.1) > > - Fix issue when calling cleanup while concurrently executing searches [`#483`](datastax/jvector#483) > - Improve the efficiency and memory usage of GraphIndexBuilder.cleanup [`#477`](datastax/jvector#477) > - add PQ training benchmark [`#482`](datastax/jvector#482) > - Remove extraneous character from datasets.yml [`#484`](datastax/jvector#484) > - Upgrade YAML files to v5 after the format was introduced in the last update [`#478`](datastax/jvector#478) > - New chunked memory-mapped reader that supports >2GB files [`61bffbe`](datastax/jvector@61bffbe) > - release 4.0.0-rc.1 [`6737596`](datastax/jvector@6737596) > - Fix comparison in TestADCGraphIndex [`b637f65`](datastax/jvector@b637f65) > > #### [4.0.0-beta.6](datastax/jvector@4.0.0-beta.5...4.0.0-beta.6) > > > 13 June 2025 > - Add a new graph node using a search score [`#473`](datastax/jvector#473) > - chore(release): Bump tag version and update changelog [`#471`](datastax/jvector#471) > - Sequential disk writer (#475). Upgrades file format from 4 to 5 [`d0ccb32`](datastax/jvector@d0ccb32) > - Allow empty sections in datasets.yml & add colbert-1M.yml [`2bf5f9a`](datastax/jvector@2bf5f9a) > - chore (release): Start release version 4.0.0-beta.6 [`9a453a3`](datastax/jvector@9a453a3) > >
1 parent f23b0bf commit e2ef6e2

File tree

4 files changed

+17
-12
lines changed

4 files changed

+17
-12
lines changed

build.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@
754754
<dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0" />
755755
<dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0" />
756756
<dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0" />
757-
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.5" />
757+
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-rc.1" />
758758
<dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
759759

760760
<dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import io.github.jbellis.jvector.graph.GraphSearcher;
3232
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
3333
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
34+
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
3435
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
3536
import io.github.jbellis.jvector.quantization.BQVectors;
3637
import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -238,12 +239,12 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
238239
{
239240
var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);
240241
var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
241-
ssp = new SearchScoreProvider(asf, rr);
242+
ssp = new DefaultSearchScoreProvider(asf, rr);
242243
}
243244
else if (compressedVectors == null)
244245
{
245246
// no compression, so we ignore isRerankless (except for setting rerankK to limit)
246-
ssp = new SearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
247+
ssp = new DefaultSearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
247248
}
248249
else
249250
{
@@ -254,7 +255,7 @@ else if (compressedVectors == null)
254255
: similarityFunction;
255256
var asf = compressedVectors.precomputedScoreFunctionFor(queryVector, sf);
256257
var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
257-
ssp = new SearchScoreProvider(asf, rr);
258+
ssp = new DefaultSearchScoreProvider(asf, rr);
258259
}
259260
long start = System.nanoTime();
260261
var result = searcher.search(ssp, limit, rerankK, threshold, context.getAnnRerankFloor(), ordinalsMap.ignoringDeleted(acceptBits));

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import io.github.jbellis.jvector.graph.disk.feature.Feature;
5050
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
5151
import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
52+
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
5253
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
5354
import io.github.jbellis.jvector.quantization.BinaryQuantization;
5455
import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -341,7 +342,7 @@ public CloseableIterator<SearchResult.NodeScore> search(QueryContext context, Ve
341342
searcher.setView(builder.getGraph().getView());
342343
try
343344
{
344-
var ssf = SearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
345+
var ssf = DefaultSearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
345346
long start = System.nanoTime();
346347
var result = searcher.search(ssf, limit, rerankK, threshold, 0.0f, bits);
347348
long elapsed = System.nanoTime() - start;
@@ -439,10 +440,10 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
439440
try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
440441
var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
441442
var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
443+
.withStartOffset(termsOffset)
442444
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
443445
.withMapper(ordinalMapper)
444446
.with(new InlineVectors(vectorValues.dimension()))
445-
.withStartOffset(termsOffset)
446447
.build())
447448
{
448449
SAICodecUtils.writeHeader(pqOutput);

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@
7676
import org.apache.cassandra.index.sai.disk.format.Version;
7777
import org.apache.cassandra.index.sai.disk.v1.SegmentMetadata;
7878
import org.apache.cassandra.index.sai.disk.v2.V2VectorPostingsWriter;
79-
import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
8079
import org.apache.cassandra.index.sai.disk.v5.V5OnDiskFormat;
8180
import org.apache.cassandra.index.sai.disk.v5.V5VectorPostingsWriter;
8281
import org.apache.cassandra.index.sai.disk.v5.V5VectorPostingsWriter.Structure;
@@ -211,23 +210,27 @@ else if (compressor instanceof BinaryQuantization)
211210
indexConfig.getNeighborhoodOverflow(1.2f),
212211
indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
213212
indexConfig.isHierarchyEnabled() && jvectorVersion >= 4,
214-
compactionSimdPool, compactionFjp);
213+
true, // We always refine during compaction
214+
compactionSimdPool,
215+
compactionFjp);
215216

216217
termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
217218
termsOffset = (termsFile.exists() ? termsFile.length() : 0)
218219
+ SAICodecUtils.headerSize();
219220
// placeholder writer, will be replaced at flush time when we finalize the index contents
220-
writer = createTermsWriterBuilder().withMapper(new OrdinalMapper.IdentityMapper(maxRowsInGraph)).build();
221+
writer = createTermsWriter(new OrdinalMapper.IdentityMapper(maxRowsInGraph));
221222
writer.getOutput().seek(termsFile.length()); // position at the end of the previous segment before writing our own header
222223
SAICodecUtils.writeHeader(SAICodecUtils.toLuceneOutput(writer.getOutput()));
223224
}
224225

225-
private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
226+
private OnDiskGraphIndexWriter createTermsWriter(OrdinalMapper ordinalMapper) throws IOException
226227
{
227228
return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
228229
.withStartOffset(termsOffset)
229230
.with(new InlineVectors(dimension))
230-
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion());
231+
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
232+
.withMapper(ordinalMapper)
233+
.build();
231234
}
232235

233236
@Override
@@ -446,7 +449,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
446449
}
447450

448451
// Recreate the writer with the final ordinalMapper
449-
writer = createTermsWriterBuilder().withMapper(ordinalMapper.get()).build();
452+
writer = createTermsWriter(ordinalMapper.get());
450453

451454
// write the graph edge lists and optionally fused adc features
452455
var start = System.nanoTime();

0 commit comments

Comments
 (0)