Skip to content

Commit 61f57a6

Browse files
CNDB-12937: enable jvector 4; remove jvector 3 write support (#1685)
### What is the issue riptano/cndb#12937. This is a follow up on #1676, which updated the jvector dependency but didn't update the jvector version in the actual graph builder. ### What does this PR fix and why was it fixed * Adds jvector version configuration, defaulted to 4 * Tests jvector versions 2 and 4 to ensure we can query sstables with both kinds of graphs * Makes the `enable_hierarchy` config backwards compatible by ignoring it if the jvector version is too low * Removes the ability to write FusedADC since it does not work with the latest jvector version (discovered when writing compatibility tests) * Does not remove FusedADC read path since there could be indexes based on it in production * Replaces call to deprecated `graph.size()` method with `graph.size(0)`
1 parent c02e486 commit 61f57a6

File tree

7 files changed

+92
-55
lines changed

7 files changed

+92
-55
lines changed

build.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@
754754
<dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0-5ea8bb4f21" />
755755
<dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0-5ea8bb4f21" />
756756
<dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0-5ea8bb4f21" />
757-
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.2" />
757+
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.3" />
758758
<dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
759759

760760
<dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">

src/java/org/apache/cassandra/index/sai/disk/v3/V3OnDiskFormat.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,14 @@ public class V3OnDiskFormat extends V2OnDiskFormat
5050
public static volatile boolean WRITE_JVECTOR3_FORMAT = Boolean.parseBoolean(System.getProperty("cassandra.sai.write_jv3_format", "false"));
5151
public static final boolean ENABLE_LTM_CONSTRUCTION = Boolean.parseBoolean(System.getProperty("cassandra.sai.ltm_construction", "true"));
5252

53-
public static final int JVECTOR_2_VERSION = 2;
53+
// These are built to be backwards and forwards compatible. Not final only for testing.
54+
public static int JVECTOR_VERSION = Integer.parseInt(System.getProperty("cassandra.sai.jvector_version", "4"));
55+
static
56+
{
57+
// JVector 3 is not compatible with the latest jvector changes, so we fail fast if the config is enabled.
58+
assert JVECTOR_VERSION != 3 : "JVector version 3 is no longer supported";
59+
assert !WRITE_JVECTOR3_FORMAT : "JVector version 3 is no longer supported";
60+
}
5461

5562
private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
5663

@@ -110,10 +117,4 @@ public Set<IndexComponentType> perIndexComponentTypes(AbstractType<?> validator)
110117
return VECTOR_COMPONENTS_V3;
111118
return super.perIndexComponentTypes(validator);
112119
}
113-
114-
@VisibleForTesting
115-
public static void enableJVector3Format()
116-
{
117-
WRITE_JVECTOR3_FORMAT = true;
118-
}
119120
}

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,8 @@ public long ramBytesUsed()
191191

192192
public int size()
193193
{
194-
return graph.size();
194+
// The base layer of the graph has all nodes.
195+
return graph.size(0);
195196
}
196197

197198
/**
@@ -221,6 +222,8 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
221222
{
222223
var view = (GraphIndex.ScoringView) searcher.getView();
223224
SearchScoreProvider ssp;
225+
// FusedADC can no longer be written due to jvector upgrade. However, it's possible these index files
226+
// still exist, so we have to support them.
224227
if (features.contains(FeatureId.FUSED_ADC))
225228
{
226229
var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
import org.apache.cassandra.utils.CloseableIterator;
9494
import org.apache.lucene.util.StringHelper;
9595

96-
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
96+
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;
9797

9898
public class CassandraOnHeapGraph<T> implements Accountable
9999
{
@@ -156,13 +156,18 @@ public CassandraOnHeapGraph(IndexContext context, boolean forSearching, Memtable
156156
vectorsByKey = forSearching ? new NonBlockingHashMap<>() : null;
157157
invalidVectorBehavior = forSearching ? InvalidVectorBehavior.FAIL : InvalidVectorBehavior.IGNORE;
158158

159+
// This is only a warning since it's not a fatal error to write without hierarchy
160+
if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
161+
logger.warn("Hierarchical graphs configured but node configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
162+
"Skipping setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());
163+
159164
builder = new GraphIndexBuilder(vectorValues,
160165
similarityFunction,
161166
indexConfig.getAnnMaxDegree(),
162167
indexConfig.getConstructionBeamWidth(),
163168
indexConfig.getNeighborhoodOverflow(1.0f), // no overflow means add will be a bit slower but flush will be faster
164169
indexConfig.getAlpha(dimension > 3 ? 1.2f : 2.0f),
165-
indexConfig.isHierarchyEnabled());
170+
indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4);
166171
searchers = ThreadLocal.withInitial(() -> new GraphSearcherAccessManager(new GraphSearcher(builder.getGraph())));
167172
}
168173

@@ -427,7 +432,7 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
427432
try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
428433
var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
429434
var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
430-
.withVersion(JVECTOR_2_VERSION) // always write old-version format since we're not using the new features
435+
.withVersion(JVECTOR_VERSION)
431436
.withMapper(ordinalMapper)
432437
.with(new InlineVectors(vectorValues.dimension()))
433438
.withStartOffset(termsOffset)
@@ -566,14 +571,14 @@ private long writePQ(SequentialWriter writer, V5VectorPostingsWriter.RemappedPos
566571
return writer.position();
567572

568573
// save (outside the synchronized block, this is io-bound not CPU)
569-
cv.write(writer, JVECTOR_2_VERSION);
574+
cv.write(writer, JVECTOR_VERSION);
570575
return writer.position();
571576
}
572577

573578
static void writePqHeader(DataOutput writer, boolean unitVectors, CompressionType type)
574579
throws IOException
575580
{
576-
if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT)
581+
if (V3OnDiskFormat.JVECTOR_VERSION >= 3)
577582
{
578583
// version and optional fields
579584
writer.writeInt(CassandraDiskAnn.PQ_MAGIC);

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
4242
import io.github.jbellis.jvector.graph.disk.feature.Feature;
4343
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
44-
import io.github.jbellis.jvector.graph.disk.feature.FusedADC;
4544
import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
4645
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
4746
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter;
@@ -91,7 +90,7 @@
9190

9291
import static java.lang.Math.max;
9392
import static java.lang.Math.min;
94-
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
93+
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;
9594

9695
public class CompactionGraph implements Closeable, Accountable
9796
{
@@ -200,13 +199,17 @@ else if (compressor instanceof BinaryQuantization)
200199
{
201200
throw new IllegalArgumentException("Unsupported compressor: " + compressor);
202201
}
202+
if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
203+
logger.warn("Hierarchical graphs configured but node configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
204+
"Skipping setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());
205+
203206
builder = new GraphIndexBuilder(bsp,
204207
dimension,
205208
indexConfig.getAnnMaxDegree(),
206209
indexConfig.getConstructionBeamWidth(),
207210
indexConfig.getNeighborhoodOverflow(1.2f),
208211
indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
209-
indexConfig.isHierarchyEnabled(),
212+
indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4,
210213
compactionSimdPool, compactionFjp);
211214

212215
termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
@@ -220,19 +223,10 @@ else if (compressor instanceof BinaryQuantization)
220223

221224
private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
222225
{
223-
var indexConfig = context.getIndexWriterConfig();
224-
var writerBuilder = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
225-
.withStartOffset(termsOffset)
226-
.with(new InlineVectors(dimension));
227-
if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT && compressor instanceof ProductQuantization)
228-
{
229-
writerBuilder = writerBuilder.with(new FusedADC(indexConfig.getAnnMaxDegree(), (ProductQuantization) compressor));
230-
}
231-
else
232-
{
233-
writerBuilder = writerBuilder.withVersion(JVECTOR_2_VERSION);
234-
}
235-
return writerBuilder;
226+
return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
227+
.withStartOffset(termsOffset)
228+
.with(new InlineVectors(dimension))
229+
.withVersion(JVECTOR_VERSION);
236230
}
237231

238232
@Override
@@ -401,7 +395,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
401395
// write PQ (time to do this is negligible, don't bother doing it async)
402396
long pqOffset = pqOutput.getFilePointer();
403397
CassandraOnHeapGraph.writePqHeader(pqOutput.asSequentialWriter(), unitVectors, VectorCompression.CompressionType.PRODUCT_QUANTIZATION);
404-
compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_2_VERSION); // VSTODO old version until we add APQ
398+
compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_VERSION); // VSTODO old version until we add APQ
405399
long pqLength = pqOutput.getFilePointer() - pqOffset;
406400

407401
// write postings asynchronously while we run cleanup()
@@ -455,18 +449,9 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
455449

456450
// write the graph edge lists and optionally fused adc features
457451
var start = System.nanoTime();
458-
if (writer.getFeatureSet().contains(FeatureId.FUSED_ADC))
459-
{
460-
try (var view = builder.getGraph().getView())
461-
{
462-
var supplier = Feature.singleStateFactory(FeatureId.FUSED_ADC, ordinal -> new FusedADC.State(view, (PQVectors) compressedVectors, ordinal));
463-
writer.write(supplier);
464-
}
465-
}
466-
else
467-
{
468-
writer.write(Map.of());
469-
}
452+
// Required because jvector 3 wrote the fused adc map here. We no longer write jvector 3, but we still
453+
// write out the empty map.
454+
writer.write(Map.of());
470455
SAICodecUtils.writeFooter(writer.getOutput(), writer.checksum());
471456
logger.info("Writing graph took {}ms", (System.nanoTime() - start) / 1_000_000);
472457
long termsLength = writer.getOutput().position() - termsOffset;

test/unit/org/apache/cassandra/index/sai/cql/VectorDotProductWithLengthTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ public class VectorDotProductWithLengthTest extends VectorTester
3333
public void setup() throws Throwable
3434
{
3535
super.setup();
36-
V3OnDiskFormat.enableJVector3Format(); // we are testing unit vector detection which is part of the v3 changes
36+
// we are testing unit vector detection which is part of the v3 changes, but continues in all subsequent versions
37+
assert V3OnDiskFormat.JVECTOR_VERSION >= 3 : "This test assumes JVector version 3 or greater";
3738
}
3839

3940
// This tests our detection of unit-length vectors used with dot product and PQ.

test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import org.apache.cassandra.index.sai.disk.format.Version;
4949
import org.apache.cassandra.index.sai.disk.v1.IndexWriterConfig;
5050
import org.apache.cassandra.index.sai.disk.v1.SegmentBuilder;
51+
import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
5152
import org.apache.cassandra.index.sai.disk.vector.CassandraOnHeapGraph;
5253
import org.apache.cassandra.index.sai.disk.vector.VectorSourceModel;
5354
import org.apache.cassandra.index.sai.plan.QueryController;
@@ -1120,8 +1121,22 @@ public void testPartitionKeyRestrictionCombinedWithSearchPredicate() throws Thro
11201121
}
11211122

11221123
@Test
1123-
public void newJVectorOptionsTest()
1124+
public void newJVectorOptionsTestVersion2()
11241125
{
1126+
newJVectorOptionsTest(2);
1127+
}
1128+
// We skip version 3 since it isn't supported anymore
1129+
@Test
1130+
public void newJVectorOptionsTestVersion4()
1131+
{
1132+
newJVectorOptionsTest(4);
1133+
}
1134+
1135+
public void newJVectorOptionsTest(int version)
1136+
{
1137+
// Configure the version to ensure we don't fail for settings that are unsupported on earlier versions of jvector
1138+
V3OnDiskFormat.JVECTOR_VERSION = version;
1139+
11251140
// This test ensures that we can set and retrieve new jvector parameters
11261141
// (neighborhood_overflow, alpha, enable_hierarchy), and that they are honored at index build time.
11271142

@@ -1142,20 +1157,20 @@ public void newJVectorOptionsTest()
11421157
+ " 'alpha' : '1.8' "
11431158
+ '}');
11441159

1145-
// Insert some data
1146-
execute("INSERT INTO %s (pk, txt, vec) VALUES (0, 'row0', [1.0, 2.0, 3.0, 4.0])");
1147-
execute("INSERT INTO %s (pk, txt, vec) VALUES (1, 'row1', [2.0, 2.5, 3.5, 4.5])");
1148-
execute("INSERT INTO %s (pk, txt, vec) VALUES (2, 'row2', [5.0, 1.0, 1.0, 1.0])");
1149-
// Run basic query
1150-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1160+
// Insert many rows
1161+
for (int i = 0; i < 2000; i++)
1162+
execute("INSERT INTO %s (pk, txt, vec) VALUES (?, ?, ?)", i, "row" + i, randomVectorBoxed(4));
1163+
1164+
// Run basic query to confirm we can, no need to validate results
1165+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11511166
// Confirm that we can flush with custom options
11521167
flush();
1153-
// Run basic query
1154-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1168+
// Run basic query to confirm we can, no need to validate results
1169+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11551170
// Confirm that we can compact with custom options
11561171
compact();
1157-
// Run basic query
1158-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1172+
// Run basic query to confirm we can, no need to validate results
1173+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11591174

11601175
// Confirm that the config picks up our custom settings.
11611176
StorageAttachedIndex saiIndex =
@@ -1172,4 +1187,31 @@ public void newJVectorOptionsTest()
11721187
assertEquals(VectorSimilarityFunction.EUCLIDEAN, config.getSimilarityFunction());
11731188
}
11741189

1190+
@Test
1191+
public void testMultiVersionJVectorCompatibility() throws Throwable
1192+
{
1193+
createTable("CREATE TABLE %s (pk int, vec vector<float, 4>, PRIMARY KEY(pk))");
1194+
createIndex("CREATE CUSTOM INDEX ON %s(vec) USING 'StorageAttachedIndex'");
1195+
1196+
// Note that we do not test the multi-version path where compaction produces different sstables, which is
1197+
// the norm in CNDB. If we had a way to compact individual sstables, we could.
1198+
disableCompaction();
1199+
1200+
// Create index files for each valid version
1201+
for (int version = 2; version <= V3OnDiskFormat.JVECTOR_VERSION; version++)
1202+
{
1203+
// Version 3 is no longer supported, so there is mild risk that it isn't covered here, but we can't write
1204+
// it any more, so there isn't much we can do.
1205+
if (version == 3)
1206+
continue;
1207+
V3OnDiskFormat.JVECTOR_VERSION = version;
1208+
for (int i = 0; i < CassandraOnHeapGraph.MIN_PQ_ROWS; i++)
1209+
execute("INSERT INTO %s (pk, vec) VALUES (?, ?)", i, randomVectorBoxed(4));
1210+
flush();
1211+
}
1212+
1213+
// Run basic query to confirm we can, no need to validate results
1214+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
1215+
}
1216+
11751217
}

0 commit comments

Comments
 (0)