Commit 44f9087

michaeljmarshall authored and driftx committed
CNDB-14797: Upgrade to jvector 4.0.0-rc1 (#1866)
### What is the issue

Fixes riptano/cndb#14797

CNDB test PR: riptano/cndb#14799

### What does this PR fix and why was it fixed

Here is the list of commits included in the upgrade: datastax/jvector@4.0.0-beta.5...4.0.0-rc.1

Here is a copy of the relevant `CHANGELOG.md` lines:

> #### [4.0.0-rc.1](datastax/jvector@4.0.0-beta.6...4.0.0-rc.1)
>
> - Fix issue when calling cleanup while concurrently executing searches [`#483`](datastax/jvector#483)
> - Improve the efficiency and memory usage of GraphIndexBuilder.cleanup [`#477`](datastax/jvector#477)
> - Add PQ training benchmark [`#482`](datastax/jvector#482)
> - Remove extraneous character from datasets.yml [`#484`](datastax/jvector#484)
> - Upgrade YAML files to v5 after the format was introduced in the last update [`#478`](datastax/jvector#478)
> - New chunked memory-mapped reader that supports >2GB files [`61bffbe`](datastax/jvector@61bffbe)
> - Release 4.0.0-rc.1 [`6737596`](datastax/jvector@6737596)
> - Fix comparison in TestADCGraphIndex [`b637f65`](datastax/jvector@b637f65)
>
> #### [4.0.0-beta.6](datastax/jvector@4.0.0-beta.5...4.0.0-beta.6)
>
> 13 June 2025
>
> - Add a new graph node using a search score [`#473`](datastax/jvector#473)
> - chore(release): Bump tag version and update changelog [`#471`](datastax/jvector#471)
> - Sequential disk writer (#475); upgrades file format from 4 to 5 [`d0ccb32`](datastax/jvector@d0ccb32)
> - Allow empty sections in datasets.yml & add colbert-1M.yml [`2bf5f9a`](datastax/jvector@2bf5f9a)
> - chore(release): Start release version 4.0.0-beta.6 [`9a453a3`](datastax/jvector@9a453a3)
1 parent 7b5fce3 · commit 44f9087

4 files changed (+17, -12 lines)

.build/parent-pom-template.xml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1248,7 +1248,7 @@
       <dependency>
         <groupId>io.github.jbellis</groupId>
         <artifactId>jvector</artifactId>
-        <version>4.0.0-beta.5</version>
+        <version>4.0.0-rc.1</version>
       </dependency>
       <dependency>
         <groupId>com.carrotsearch.randomizedtesting</groupId>
```

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 4 additions & 3 deletions
```diff
@@ -31,6 +31,7 @@
 import io.github.jbellis.jvector.graph.GraphSearcher;
 import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
+import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
 import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
 import io.github.jbellis.jvector.quantization.BQVectors;
 import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -240,12 +241,12 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
         {
             var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);
             var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
-            ssp = new SearchScoreProvider(asf, rr);
+            ssp = new DefaultSearchScoreProvider(asf, rr);
         }
         else if (compressedVectors == null)
         {
             // no compression, so we ignore isRerankless (except for setting rerankK to limit)
-            ssp = new SearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
+            ssp = new DefaultSearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
         }
         else
         {
@@ -256,7 +257,7 @@ else if (compressedVectors == null)
                       : similarityFunction;
             var asf = compressedVectors.precomputedScoreFunctionFor(queryVector, sf);
             var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
-            ssp = new SearchScoreProvider(asf, rr);
+            ssp = new DefaultSearchScoreProvider(asf, rr);
         }
         long start = nanoTime();
         var result = searcher.search(ssp, limit, rerankK, threshold, context.getAnnRerankFloor(), ordinalsMap.ignoringDeleted(acceptBits));
```
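The rename here is mechanical: `SearchScoreProvider` survives as the declared type, while the concrete class constructed is now `DefaultSearchScoreProvider`. Below is a minimal sketch of the two call shapes this diff exercises; the helper class and method names are hypothetical, and only the constructors actually visible in the hunks above are assumed to exist.

```java
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
import io.github.jbellis.jvector.graph.similarity.ScoreFunction;
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;

// Hypothetical helper illustrating the post-upgrade call shapes.
public final class ScoreProviderSketch
{
    // Approximate scores for graph traversal plus an exact reranker;
    // passing a null reranker reproduces the "rerankless" branch above.
    static SearchScoreProvider approximate(ScoreFunction.ApproximateScoreFunction asf,
                                           ScoreFunction.ExactScoreFunction reranker)
    {
        return new DefaultSearchScoreProvider(asf, reranker);
    }

    // Uncompressed vectors: a single exact score function, no separate reranker.
    static SearchScoreProvider exactOnly(ScoreFunction.ExactScoreFunction esf)
    {
        return new DefaultSearchScoreProvider(esf);
    }
}
```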

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 3 additions & 3 deletions
```diff
@@ -49,7 +49,7 @@
 import io.github.jbellis.jvector.graph.disk.feature.Feature;
 import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
 import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
-import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
+import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
 import io.github.jbellis.jvector.quantization.BinaryQuantization;
 import io.github.jbellis.jvector.quantization.CompressedVectors;
 import io.github.jbellis.jvector.quantization.ProductQuantization;
@@ -342,7 +342,7 @@ public CloseableIterator<SearchResult.NodeScore> search(QueryContext context, Ve
         searcher.setView(builder.getGraph().getView());
         try
         {
-            var ssf = SearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
+            var ssf = DefaultSearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
             long start = nanoTime();
             var result = searcher.search(ssf, limit, rerankK, threshold, 0.0f, bits);
             long elapsed = nanoTime() - start;
@@ -440,10 +440,10 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
         try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
              var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
              var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
+                               .withStartOffset(termsOffset)
                                .withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
                                .withMapper(ordinalMapper)
                                .with(new InlineVectors(vectorValues.dimension()))
-                               .withStartOffset(termsOffset)
                                .build())
         {
             SAICodecUtils.writeHeader(pqOutput);
```
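The static factory moves with the rename too: `exact(...)` is now invoked on `DefaultSearchScoreProvider`. Here is a small sketch of the exact-scoring search path used above, with every input passed as a parameter since the surrounding class state isn't shown in this diff; the parameter types and import paths are assumptions based on the jvector 4.x API.

```java
import io.github.jbellis.jvector.graph.GraphSearcher;
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
import io.github.jbellis.jvector.graph.SearchResult;
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
import io.github.jbellis.jvector.util.Bits;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.github.jbellis.jvector.vector.types.VectorFloat;

public final class ExactSearchSketch
{
    // Exact scores are computed directly against the in-memory vectors,
    // so the rerank floor is pinned to 0.0f, matching the call above.
    static SearchResult search(GraphSearcher searcher,
                               VectorFloat<?> queryVector,
                               VectorSimilarityFunction similarityFunction,
                               RandomAccessVectorValues vectorValues,
                               int limit, int rerankK, float threshold, Bits bits)
    {
        var ssp = DefaultSearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
        return searcher.search(ssp, limit, rerankK, threshold, 0.0f, bits);
    }
}
```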

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 9 additions & 5 deletions
```diff
@@ -211,23 +211,27 @@ else if (compressor instanceof BinaryQuantization)
                                   indexConfig.getNeighborhoodOverflow(1.2f),
                                   indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
                                   indexConfig.isHierarchyEnabled() && jvectorVersion >= 4,
-                                  compactionSimdPool, compactionFjp);
+                                  true, // We always refine during compaction
+                                  compactionSimdPool,
+                                  compactionFjp);
 
         termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
         termsOffset = (termsFile.exists() ? termsFile.length() : 0)
                       + SAICodecUtils.headerSize();
         // placeholder writer, will be replaced at flush time when we finalize the index contents
-        writer = createTermsWriterBuilder().withMapper(new OrdinalMapper.IdentityMapper(maxRowsInGraph)).build();
+        writer = createTermsWriter(new OrdinalMapper.IdentityMapper(maxRowsInGraph));
         writer.getOutput().seek(termsFile.length()); // position at the end of the previous segment before writing our own header
         SAICodecUtils.writeHeader(SAICodecUtils.toLuceneOutput(writer.getOutput()));
     }
 
-    private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
+    private OnDiskGraphIndexWriter createTermsWriter(OrdinalMapper ordinalMapper) throws IOException
     {
         return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
                .withStartOffset(termsOffset)
                .with(new InlineVectors(dimension))
-               .withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion());
+               .withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
+               .withMapper(ordinalMapper)
+               .build();
     }
 
     @Override
@@ -446,7 +450,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
         }
 
         // Recreate the writer with the final ordinalMapper
-        writer = createTermsWriterBuilder().withMapper(ordinalMapper.get()).build();
+        writer = createTermsWriter(ordinalMapper.get());
 
         // write the graph edge lists and optionally fused adc features
         var start = nanoTime();
```
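Beyond the writer refactor, the one functional change in this file is the new boolean threaded into the graph builder construction. The hunk doesn't show the constructor's leading arguments, so the sketch below is a guess at the overall shape; only the tail visible in the diff (overflow, alpha, hierarchy flag, the new refine flag, and the two pools) is grounded, and everything else is an assumption.

```java
import java.util.concurrent.ForkJoinPool;
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;

public final class CompactionBuilderSketch
{
    // Hypothetical wrapper; bsp, M, and beamWidth stand in for leading
    // constructor arguments that are not visible in the hunk above.
    static GraphIndexBuilder newCompactionBuilder(BuildScoreProvider bsp, int dimension,
                                                  int M, int beamWidth,
                                                  float neighborOverflow, float alpha,
                                                  boolean hierarchy,
                                                  ForkJoinPool simdPool, ForkJoinPool fjp)
    {
        return new GraphIndexBuilder(bsp, dimension, M, beamWidth,
                                     neighborOverflow, alpha,
                                     hierarchy,
                                     true, // new flag in this jvector version: always refine during compaction
                                     simdPool, fjp);
    }
}
```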
