Skip to content

Commit 61f57a6

Browse files
CNDB-12937: enable jvector 4; remove jvector 3 write support (#1685)
### What is the issue riptano/cndb#12937. This is a follow up on #1676, which updated the jvector dependency but didn't update the jvector version in the actual graph builder. ### What does this PR fix and why was it fixed * Adds jvector version configuration, defaulted to 4 * Tests jvector versions 2 and 4 to ensure we can query sstables with both kinds of graphs * Makes the `enable_hierarchy` config backwards compatible by ignoring it if the jvector version is too low * Removes the ability to write FusedADC since it does not work with the latest jvector version (discovered when writing compatibility tests) * Does not remove FusedADC read path since there could be indexes based on it in production * Replaces call to deprecated `graph.size()` method with `graph.size(0)`
1 parent c02e486 commit 61f57a6

File tree

7 files changed

+92
-55
lines changed

7 files changed

+92
-55
lines changed

build.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@
754754
<dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0-5ea8bb4f21" />
755755
<dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0-5ea8bb4f21" />
756756
<dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0-5ea8bb4f21" />
757-
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.2" />
757+
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.3" />
758758
<dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
759759

760760
<dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">

src/java/org/apache/cassandra/index/sai/disk/v3/V3OnDiskFormat.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,14 @@ public class V3OnDiskFormat extends V2OnDiskFormat
5050
public static volatile boolean WRITE_JVECTOR3_FORMAT = Boolean.parseBoolean(System.getProperty("cassandra.sai.write_jv3_format", "false"));
5151
public static final boolean ENABLE_LTM_CONSTRUCTION = Boolean.parseBoolean(System.getProperty("cassandra.sai.ltm_construction", "true"));
5252

53-
public static final int JVECTOR_2_VERSION = 2;
53+
// These are built to be backwards and forwards compatible. Not final only for testing.
54+
public static int JVECTOR_VERSION = Integer.parseInt(System.getProperty("cassandra.sai.jvector_version", "4"));
55+
static
56+
{
57+
// JVector 3 is not compatible with the latest jvector changes, so we fail fast if the config is enabled.
58+
assert JVECTOR_VERSION != 3 : "JVector version 3 is no longer supported";
59+
assert !WRITE_JVECTOR3_FORMAT : "JVector version 3 is no longer supported";
60+
}
5461

5562
private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
5663

@@ -110,10 +117,4 @@ public Set<IndexComponentType> perIndexComponentTypes(AbstractType<?> validator)
110117
return VECTOR_COMPONENTS_V3;
111118
return super.perIndexComponentTypes(validator);
112119
}
113-
114-
@VisibleForTesting
115-
public static void enableJVector3Format()
116-
{
117-
WRITE_JVECTOR3_FORMAT = true;
118-
}
119120
}

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,8 @@ public long ramBytesUsed()
191191

192192
public int size()
193193
{
194-
return graph.size();
194+
// The base layer of the graph has all nodes.
195+
return graph.size(0);
195196
}
196197

197198
/**
@@ -221,6 +222,8 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
221222
{
222223
var view = (GraphIndex.ScoringView) searcher.getView();
223224
SearchScoreProvider ssp;
225+
// FusedADC can no longer be written due to jvector upgrade. However, it's possible these index files
226+
// still exist, so we have to support them.
224227
if (features.contains(FeatureId.FUSED_ADC))
225228
{
226229
var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
import org.apache.cassandra.utils.CloseableIterator;
9494
import org.apache.lucene.util.StringHelper;
9595

96-
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
96+
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;
9797

9898
public class CassandraOnHeapGraph<T> implements Accountable
9999
{
@@ -156,13 +156,18 @@ public CassandraOnHeapGraph(IndexContext context, boolean forSearching, Memtable
156156
vectorsByKey = forSearching ? new NonBlockingHashMap<>() : null;
157157
invalidVectorBehavior = forSearching ? InvalidVectorBehavior.FAIL : InvalidVectorBehavior.IGNORE;
158158

159+
// This is only a warning since it's not a fatal error to write without hierarchy
160+
if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
161+
logger.warn("Hierarchical graphs configured but node configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
162+
"Skipping setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());
163+
159164
builder = new GraphIndexBuilder(vectorValues,
160165
similarityFunction,
161166
indexConfig.getAnnMaxDegree(),
162167
indexConfig.getConstructionBeamWidth(),
163168
indexConfig.getNeighborhoodOverflow(1.0f), // no overflow means add will be a bit slower but flush will be faster
164169
indexConfig.getAlpha(dimension > 3 ? 1.2f : 2.0f),
165-
indexConfig.isHierarchyEnabled());
170+
indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4);
166171
searchers = ThreadLocal.withInitial(() -> new GraphSearcherAccessManager(new GraphSearcher(builder.getGraph())));
167172
}
168173

@@ -427,7 +432,7 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
427432
try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
428433
var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
429434
var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
430-
.withVersion(JVECTOR_2_VERSION) // always write old-version format since we're not using the new features
435+
.withVersion(JVECTOR_VERSION)
431436
.withMapper(ordinalMapper)
432437
.with(new InlineVectors(vectorValues.dimension()))
433438
.withStartOffset(termsOffset)
@@ -566,14 +571,14 @@ private long writePQ(SequentialWriter writer, V5VectorPostingsWriter.RemappedPos
566571
return writer.position();
567572

568573
// save (outside the synchronized block, this is io-bound not CPU)
569-
cv.write(writer, JVECTOR_2_VERSION);
574+
cv.write(writer, JVECTOR_VERSION);
570575
return writer.position();
571576
}
572577

573578
static void writePqHeader(DataOutput writer, boolean unitVectors, CompressionType type)
574579
throws IOException
575580
{
576-
if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT)
581+
if (V3OnDiskFormat.JVECTOR_VERSION >= 3)
577582
{
578583
// version and optional fields
579584
writer.writeInt(CassandraDiskAnn.PQ_MAGIC);

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
4242
import io.github.jbellis.jvector.graph.disk.feature.Feature;
4343
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
44-
import io.github.jbellis.jvector.graph.disk.feature.FusedADC;
4544
import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
4645
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
4746
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter;
@@ -91,7 +90,7 @@
9190

9291
import static java.lang.Math.max;
9392
import static java.lang.Math.min;
94-
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
93+
import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;
9594

9695
public class CompactionGraph implements Closeable, Accountable
9796
{
@@ -200,13 +199,17 @@ else if (compressor instanceof BinaryQuantization)
200199
{
201200
throw new IllegalArgumentException("Unsupported compressor: " + compressor);
202201
}
202+
if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
203+
logger.warn("Hierarchical graphs configured but node configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
204+
"Skipping setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());
205+
203206
builder = new GraphIndexBuilder(bsp,
204207
dimension,
205208
indexConfig.getAnnMaxDegree(),
206209
indexConfig.getConstructionBeamWidth(),
207210
indexConfig.getNeighborhoodOverflow(1.2f),
208211
indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
209-
indexConfig.isHierarchyEnabled(),
212+
indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4,
210213
compactionSimdPool, compactionFjp);
211214

212215
termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
@@ -220,19 +223,10 @@ else if (compressor instanceof BinaryQuantization)
220223

221224
private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
222225
{
223-
var indexConfig = context.getIndexWriterConfig();
224-
var writerBuilder = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
225-
.withStartOffset(termsOffset)
226-
.with(new InlineVectors(dimension));
227-
if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT && compressor instanceof ProductQuantization)
228-
{
229-
writerBuilder = writerBuilder.with(new FusedADC(indexConfig.getAnnMaxDegree(), (ProductQuantization) compressor));
230-
}
231-
else
232-
{
233-
writerBuilder = writerBuilder.withVersion(JVECTOR_2_VERSION);
234-
}
235-
return writerBuilder;
226+
return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
227+
.withStartOffset(termsOffset)
228+
.with(new InlineVectors(dimension))
229+
.withVersion(JVECTOR_VERSION);
236230
}
237231

238232
@Override
@@ -401,7 +395,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
401395
// write PQ (time to do this is negligible, don't bother doing it async)
402396
long pqOffset = pqOutput.getFilePointer();
403397
CassandraOnHeapGraph.writePqHeader(pqOutput.asSequentialWriter(), unitVectors, VectorCompression.CompressionType.PRODUCT_QUANTIZATION);
404-
compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_2_VERSION); // VSTODO old version until we add APQ
398+
compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_VERSION); // VSTODO old version until we add APQ
405399
long pqLength = pqOutput.getFilePointer() - pqOffset;
406400

407401
// write postings asynchronously while we run cleanup()
@@ -455,18 +449,9 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
455449

456450
// write the graph edge lists and optionally fused adc features
457451
var start = System.nanoTime();
458-
if (writer.getFeatureSet().contains(FeatureId.FUSED_ADC))
459-
{
460-
try (var view = builder.getGraph().getView())
461-
{
462-
var supplier = Feature.singleStateFactory(FeatureId.FUSED_ADC, ordinal -> new FusedADC.State(view, (PQVectors) compressedVectors, ordinal));
463-
writer.write(supplier);
464-
}
465-
}
466-
else
467-
{
468-
writer.write(Map.of());
469-
}
452+
// Required because jvector 3 wrote the fused adc map here. We no longer write jvector 3, but we still
453+
// write out the empty map.
454+
writer.write(Map.of());
470455
SAICodecUtils.writeFooter(writer.getOutput(), writer.checksum());
471456
logger.info("Writing graph took {}ms", (System.nanoTime() - start) / 1_000_000);
472457
long termsLength = writer.getOutput().position() - termsOffset;

test/unit/org/apache/cassandra/index/sai/cql/VectorDotProductWithLengthTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ public class VectorDotProductWithLengthTest extends VectorTester
3333
public void setup() throws Throwable
3434
{
3535
super.setup();
36-
V3OnDiskFormat.enableJVector3Format(); // we are testing unit vector detection which is part of the v3 changes
36+
// we are testing unit vector detection which is part of the v3 changes, but continues in all subsequent versions
37+
assert V3OnDiskFormat.JVECTOR_VERSION >= 3 : "This test assumes JVector version 3 or greater";
3738
}
3839

3940
// This tests our detection of unit-length vectors used with dot product and PQ.

test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import org.apache.cassandra.index.sai.disk.format.Version;
4949
import org.apache.cassandra.index.sai.disk.v1.IndexWriterConfig;
5050
import org.apache.cassandra.index.sai.disk.v1.SegmentBuilder;
51+
import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
5152
import org.apache.cassandra.index.sai.disk.vector.CassandraOnHeapGraph;
5253
import org.apache.cassandra.index.sai.disk.vector.VectorSourceModel;
5354
import org.apache.cassandra.index.sai.plan.QueryController;
@@ -1120,8 +1121,22 @@ public void testPartitionKeyRestrictionCombinedWithSearchPredicate() throws Thro
11201121
}
11211122

11221123
@Test
1123-
public void newJVectorOptionsTest()
1124+
public void newJVectorOptionsTestVersion2()
11241125
{
1126+
newJVectorOptionsTest(2);
1127+
}
1128+
// We skip version 3 since it isn't supported anymore
1129+
@Test
1130+
public void newJVectorOptionsTestVersion4()
1131+
{
1132+
newJVectorOptionsTest(4);
1133+
}
1134+
1135+
public void newJVectorOptionsTest(int version)
1136+
{
1137+
// Configure the version to ensure we don't fail for settings that are unsupported on earlier versions of jvector
1138+
V3OnDiskFormat.JVECTOR_VERSION = version;
1139+
11251140
// This test ensures that we can set and retrieve new jvector parameters
11261141
// (neighborhood_overflow, alpha, enable_hierarchy), and that they are honored at index build time.
11271142

@@ -1142,20 +1157,20 @@ public void newJVectorOptionsTest()
11421157
+ " 'alpha' : '1.8' "
11431158
+ '}');
11441159

1145-
// Insert some data
1146-
execute("INSERT INTO %s (pk, txt, vec) VALUES (0, 'row0', [1.0, 2.0, 3.0, 4.0])");
1147-
execute("INSERT INTO %s (pk, txt, vec) VALUES (1, 'row1', [2.0, 2.5, 3.5, 4.5])");
1148-
execute("INSERT INTO %s (pk, txt, vec) VALUES (2, 'row2', [5.0, 1.0, 1.0, 1.0])");
1149-
// Run basic query
1150-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1160+
// Insert many rows
1161+
for (int i = 0; i < 2000; i++)
1162+
execute("INSERT INTO %s (pk, txt, vec) VALUES (?, ?, ?)", i, "row" + i, randomVectorBoxed(4));
1163+
1164+
// Run basic query to confirm we can, no need to validate results
1165+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11511166
// Confirm that we can flush with custom options
11521167
flush();
1153-
// Run basic query
1154-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1168+
// Run basic query to confirm we can, no need to validate results
1169+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11551170
// Confirm that we can compact with custom options
11561171
compact();
1157-
// Run basic query
1158-
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
1172+
// Run basic query to confirm we can, no need to validate results
1173+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
11591174

11601175
// Confirm that the config picks up our custom settings.
11611176
StorageAttachedIndex saiIndex =
@@ -1172,4 +1187,31 @@ public void newJVectorOptionsTest()
11721187
assertEquals(VectorSimilarityFunction.EUCLIDEAN, config.getSimilarityFunction());
11731188
}
11741189

1190+
@Test
1191+
public void testMultiVersionJVectorCompatibility() throws Throwable
1192+
{
1193+
createTable("CREATE TABLE %s (pk int, vec vector<float, 4>, PRIMARY KEY(pk))");
1194+
createIndex("CREATE CUSTOM INDEX ON %s(vec) USING 'StorageAttachedIndex'");
1195+
1196+
// Note that we do not test the multi-version path where compaction produces different sstables, which is
1197+
// the norm in CNDB. If we had a way to compact individual sstables, we could.
1198+
disableCompaction();
1199+
1200+
// Create index files for each valid version
1201+
for (int version = 2; version <= V3OnDiskFormat.JVECTOR_VERSION; version++)
1202+
{
1203+
// Version 3 is no longer supported, so there is mild risk that it isn't covered here, but we can't write
1204+
// it any more, so there isn't much we can do.
1205+
if (version == 3)
1206+
continue;
1207+
V3OnDiskFormat.JVECTOR_VERSION = version;
1208+
for (int i = 0; i < CassandraOnHeapGraph.MIN_PQ_ROWS; i++)
1209+
execute("INSERT INTO %s (pk, vec) VALUES (?, ?)", i, randomVectorBoxed(4));
1210+
flush();
1211+
}
1212+
1213+
// Run basic query to confirm we can, no need to validate results
1214+
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
1215+
}
1216+
11751217
}

0 commit comments

Comments
 (0)