Commit 5cb23da

CNDB-12937: Update jvector to 4.0.0-beta.2; add new graph construction parameters to index config (#1676)
### What is the issue

Fixes: riptano/cndb#12937

### What does this PR fix and why was it fixed

This pull request upgrades jvector from **4.0.0-beta.1** to **4.0.0-beta.2** and introduces three new configuration options that influence graph construction:

1. `neighborhood_overflow`
2. `alpha`
3. `enable_hierarchy`

The defaults for these hyperparameters differ between the in-memory (memtable) and on-disk (compaction) builds, but when they are configured explicitly they are applied uniformly to graphs built by a memtable and by compaction.

**Details**

- **jvector 4.0.0-beta.2**
  - Minor changes to the graph index architecture, including some code now under `...disk.feature...` packages.
  - Removed the old `CachingGraphIndex` code in Cassandra, which is no longer used.
  - New constructor arguments for controlling `neighborhood_overflow`, `alpha`, and hierarchical levels in the HNSW graph.
- **New configuration options**
  1. **`neighborhood_overflow`**: A `float` >= 1.0 controlling how aggressively the graph tries to insert extra neighbors on each HNSW layer.
  2. **`alpha`**: A `float` > 0 used in the neighbor selection phase for HNSW.
  3. **`enable_hierarchy`**: A `boolean` indicating whether HNSW should allow multiple layers (true) or a single-layer approximate graph (false). Defaults to false.

I manually verified that the jvector upgrade is backwards compatible, meaning that when we build using the new jvector version we can still read with an old jvector version, so I did not create a new SAI on-disk file format version.
1 parent 08557df commit 5cb23da
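
As a usage illustration (not part of this commit), here is a minimal Java sketch of how the new options are parsed and how the per-call-site defaults resolve. The index name, vector dimension, and option values below are hypothetical; the `IndexWriterConfig` calls match the API added in this change.

```java
import java.util.Map;

import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.FloatType;
import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.index.sai.disk.v1.IndexWriterConfig;

public class VectorOptionsSketch
{
    public static void main(String[] args)
    {
        // Hypothetical 128-dimensional float vector column type.
        AbstractType<?> vectorType = VectorType.getInstance(FloatType.instance, 128);

        // Options arrive as strings, exactly as written in the index DDL's options map.
        Map<String, String> options = Map.of(
                IndexWriterConfig.NEIGHBORHOOD_OVERFLOW, "1.2",
                IndexWriterConfig.ALPHA, "1.4",
                IndexWriterConfig.ENABLE_HIERARCHY, "true");

        // fromOptions validates the values (overflow >= 1.0, alpha > 0, hierarchy true/false)
        // and throws InvalidRequestException when a value is out of range or unparseable.
        IndexWriterConfig config = IndexWriterConfig.fromOptions("ann_idx", vectorType, options);

        // When an option is omitted, the caller-supplied default wins; this is how the
        // memtable and compaction builders can use different fallbacks for the same schema.
        float overflow = config.getNeighborhoodOverflow(1.0f); // 1.2f here; 1.0f if unset
        float alpha = config.getAlpha(1.2f);                   // 1.4f here; 1.2f if unset
        boolean hierarchy = config.isHierarchyEnabled();       // true; defaults to false
        System.out.printf("%s %s %s%n", overflow, alpha, hierarchy);
    }
}
```

At the CQL level the same strings would simply be supplied in the index's `WITH OPTIONS` map alongside the existing vector options such as `similarity_function`.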

File tree

13 files changed: +261 -85 lines changed

build.xml

Lines changed: 1 addition & 1 deletion
@@ -754,7 +754,7 @@
     <dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0-5ea8bb4f21" />
     <dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0-5ea8bb4f21" />
     <dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0-5ea8bb4f21" />
-    <dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.1" />
+    <dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.2" />
     <dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
 
     <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">

src/java/org/apache/cassandra/config/CassandraRelevantProperties.java

Lines changed: 0 additions & 3 deletions
@@ -362,9 +362,6 @@ public enum CassandraRelevantProperties
     /** Controls the hnsw vector cache size, in bytes, per index segment. 0 to disable */
     SAI_HNSW_VECTOR_CACHE_BYTES("cassandra.sai.vector_search.vector_cache_bytes", String.valueOf(4 * 1024 * 1024)),
 
-    /** Whether to allow the user to specify custom options to the hnsw index */
-    SAI_HNSW_ALLOW_CUSTOM_PARAMETERS("cassandra.sai.hnsw.allow_custom_parameters", "false"),
-
     /** Whether to validate terms that will be SAI indexed at the coordinator */
     SAI_VALIDATE_TERMS_AT_COORDINATOR("cassandra.sai.validate_terms_at_coordinator", "true"),

src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java

Lines changed: 3 additions & 0 deletions
@@ -206,6 +206,9 @@ public List<SecondaryIndexBuilder> getParallelIndexBuildTasks(ColumnFamilyStore
             IndexWriterConfig.POSTING_LIST_LVL_SKIP_OPTION,
             IndexWriterConfig.MAXIMUM_NODE_CONNECTIONS,
             IndexWriterConfig.CONSTRUCTION_BEAM_WIDTH,
+            IndexWriterConfig.NEIGHBORHOOD_OVERFLOW,
+            IndexWriterConfig.ALPHA,
+            IndexWriterConfig.ENABLE_HIERARCHY,
             IndexWriterConfig.SIMILARITY_FUNCTION,
             IndexWriterConfig.SOURCE_MODEL,
             IndexWriterConfig.OPTIMIZE_FOR,

src/java/org/apache/cassandra/index/sai/disk/v1/IndexWriterConfig.java

Lines changed: 97 additions & 10 deletions
@@ -45,6 +45,9 @@ public class IndexWriterConfig
 
     public static final String MAXIMUM_NODE_CONNECTIONS = "maximum_node_connections";
     public static final String CONSTRUCTION_BEAM_WIDTH = "construction_beam_width";
+    public static final String NEIGHBORHOOD_OVERFLOW = "neighborhood_overflow";
+    public static final String ALPHA = "alpha";
+    public static final String ENABLE_HIERARCHY = "enable_hierarchy";
     public static final String SIMILARITY_FUNCTION = "similarity_function";
     public static final String SOURCE_MODEL = "source_model";
     public static final String OPTIMIZE_FOR = "optimize_for"; // unused, retained for compatibility w/ old schemas
@@ -54,6 +57,7 @@ public class IndexWriterConfig
 
     public static final int DEFAULT_MAXIMUM_NODE_CONNECTIONS = 16;
     public static final int DEFAULT_CONSTRUCTION_BEAM_WIDTH = 100;
+    public static final boolean DEFAULT_ENABLE_HIERARCHY = false;
 
     public static final int MAX_TOP_K = SAI_VECTOR_SEARCH_MAX_TOP_K.getInt();
 
@@ -87,6 +91,10 @@ public class IndexWriterConfig
     private final VectorSimilarityFunction similarityFunction;
     private final VectorSourceModel sourceModel;
 
+    private final Float neighborhoodOverflow; // default varies for in memory/compaction build
+    private final Float alpha; // default varies for in memory/compaction build
+    private final boolean enableHierarchy; // defaults to false
+
     public IndexWriterConfig(String indexName,
                              int bkdPostingsSkip,
                              int bkdPostingsMinLeaves)
@@ -108,6 +116,21 @@ public IndexWriterConfig(String indexName,
                              int constructionBeamWidth,
                              VectorSimilarityFunction similarityFunction,
                              VectorSourceModel sourceModel)
+    {
+        this(indexName, bkdPostingsSkip, bkdPostingsMinLeaves, maximumNodeConnections, constructionBeamWidth,
+             similarityFunction, sourceModel, null, null, false);
+    }
+
+    public IndexWriterConfig(String indexName,
+                             int bkdPostingsSkip,
+                             int bkdPostingsMinLeaves,
+                             int maximumNodeConnections,
+                             int constructionBeamWidth,
+                             VectorSimilarityFunction similarityFunction,
+                             VectorSourceModel sourceModel,
+                             Float neighborhoodOverflow,
+                             Float alpha,
+                             boolean enableHierarchy)
     {
         this.indexName = indexName;
         this.bkdPostingsSkip = bkdPostingsSkip;
@@ -116,6 +139,9 @@ public IndexWriterConfig(String indexName,
         this.constructionBeamWidth = constructionBeamWidth;
         this.similarityFunction = similarityFunction;
         this.sourceModel = sourceModel;
+        this.neighborhoodOverflow = neighborhoodOverflow;
+        this.alpha = alpha;
+        this.enableHierarchy = enableHierarchy;
     }
 
     public String getIndexName()
@@ -163,6 +189,21 @@ public VectorSourceModel getSourceModel()
         return sourceModel;
     }
 
+    public float getNeighborhoodOverflow(float defaultValue)
+    {
+        return neighborhoodOverflow == null ? defaultValue : neighborhoodOverflow;
+    }
+
+    public float getAlpha(float defaultValue)
+    {
+        return alpha == null ? defaultValue : alpha;
+    }
+
+    public boolean isHierarchyEnabled()
+    {
+        return enableHierarchy;
+    }
+
     public static IndexWriterConfig fromOptions(String indexName, AbstractType<?> type, Map<String, String> options)
     {
         int minLeaves = DEFAULT_POSTING_LIST_MIN_LEAVES;
@@ -172,6 +213,10 @@ public static IndexWriterConfig fromOptions(String indexName, AbstractType<?> ty
         VectorSourceModel sourceModel = DEFAULT_SOURCE_MODEL;
         VectorSimilarityFunction similarityFunction = sourceModel.defaultSimilarityFunction; // don't leave null in case no options at all are given
 
+        Float neighborhoodOverflow = null;
+        Float alpha = null;
+        boolean enableHierarchy = DEFAULT_ENABLE_HIERARCHY;
+
         if (options.get(POSTING_LIST_LVL_MIN_LEAVES) != null || options.get(POSTING_LIST_LVL_SKIP_OPTION) != null)
         {
             if (TypeUtil.isLiteral(type))
@@ -213,16 +258,16 @@ else if (options.get(MAXIMUM_NODE_CONNECTIONS) != null ||
                  options.get(CONSTRUCTION_BEAM_WIDTH) != null ||
                  options.get(OPTIMIZE_FOR) != null ||
                  options.get(SIMILARITY_FUNCTION) != null ||
-                 options.get(SOURCE_MODEL) != null)
+                 options.get(SOURCE_MODEL) != null ||
+                 options.get(NEIGHBORHOOD_OVERFLOW) != null ||
+                 options.get(ALPHA) != null ||
+                 options.get(ENABLE_HIERARCHY) != null)
         {
             if (!type.isVector())
                 throw new InvalidRequestException(String.format("CQL type %s cannot have vector options", type.asCQL3Type()));
 
             if (options.containsKey(MAXIMUM_NODE_CONNECTIONS))
             {
-                if (!CassandraRelevantProperties.SAI_HNSW_ALLOW_CUSTOM_PARAMETERS.getBoolean())
-                    throw new InvalidRequestException(String.format("Maximum node connections cannot be set without enabling %s", CassandraRelevantProperties.SAI_HNSW_ALLOW_CUSTOM_PARAMETERS.name()));
-
                 try
                 {
                     maximumNodeConnections = Integer.parseInt(options.get(MAXIMUM_NODE_CONNECTIONS));
@@ -237,9 +282,6 @@ else if (options.get(MAXIMUM_NODE_CONNECTIONS) != null ||
             }
             if (options.containsKey(CONSTRUCTION_BEAM_WIDTH))
            {
-                if (!CassandraRelevantProperties.SAI_HNSW_ALLOW_CUSTOM_PARAMETERS.getBoolean())
-                    throw new InvalidRequestException(String.format("Construction beam width cannot be set without enabling %s", CassandraRelevantProperties.SAI_HNSW_ALLOW_CUSTOM_PARAMETERS.name()));
-
                 try
                 {
                     queueSize = Integer.parseInt(options.get(CONSTRUCTION_BEAM_WIDTH));
@@ -285,9 +327,51 @@ else if (options.get(MAXIMUM_NODE_CONNECTIONS) != null ||
             {
                 similarityFunction = sourceModel.defaultSimilarityFunction;
             }
+
+            if (options.containsKey(NEIGHBORHOOD_OVERFLOW))
+            {
+                try
+                {
+                    neighborhoodOverflow = Float.parseFloat(options.get(NEIGHBORHOOD_OVERFLOW));
+                    if (neighborhoodOverflow < 1.0f)
+                        throw new InvalidRequestException(String.format("Neighborhood overflow for index %s must be >= 1.0, was %s",
+                                                                        indexName, neighborhoodOverflow));
+                }
+                catch (NumberFormatException e)
+                {
+                    throw new InvalidRequestException(String.format("Neighborhood overflow %s is not a valid float for index %s",
+                                                                    options.get(NEIGHBORHOOD_OVERFLOW), indexName));
+                }
+            }
+
+            if (options.containsKey(ALPHA))
+            {
+                try
+                {
+                    alpha = Float.parseFloat(options.get(ALPHA));
+                    if (alpha <= 0)
+                        throw new InvalidRequestException(String.format("Alpha for index %s must be > 0, was %s",
+                                                                        indexName, alpha));
+                }
+                catch (NumberFormatException e)
+                {
+                    throw new InvalidRequestException(String.format("Alpha %s is not a valid float for index %s",
+                                                                    options.get(ALPHA), indexName));
+                }
+            }
+
+            if (options.containsKey(ENABLE_HIERARCHY))
+            {
+                String value = options.get(ENABLE_HIERARCHY).toLowerCase();
+                if (!value.equals("true") && !value.equals("false"))
+                    throw new InvalidRequestException(String.format("Enable hierarchy must be 'true' or 'false' for index %s, was '%s'",
+                                                                    indexName, value));
+                enableHierarchy = Boolean.parseBoolean(value);
+            }
         }
 
-        return new IndexWriterConfig(indexName, skip, minLeaves, maximumNodeConnections, queueSize, similarityFunction, sourceModel);
+        return new IndexWriterConfig(indexName, skip, minLeaves, maximumNodeConnections, queueSize,
+                                     similarityFunction, sourceModel, neighborhoodOverflow, alpha, enableHierarchy);
     }
 
     public static IndexWriterConfig defaultConfig(String indexName)
@@ -310,12 +394,15 @@ public static IndexWriterConfig emptyConfig()
     @Override
     public String toString()
     {
-        return String.format("IndexWriterConfig{%s=%d, %s=%d, %s=%d, %s=%d, %s=%s, %s=%s}",
+        return String.format("IndexWriterConfig{%s=%d, %s=%d, %s=%d, %s=%d, %s=%s, %s=%s, %s=%f, %s=%f, %s=%b}",
                              POSTING_LIST_LVL_SKIP_OPTION, bkdPostingsSkip,
                              POSTING_LIST_LVL_MIN_LEAVES, bkdPostingsMinLeaves,
                              MAXIMUM_NODE_CONNECTIONS, maximumNodeConnections,
                              CONSTRUCTION_BEAM_WIDTH, constructionBeamWidth,
                              SIMILARITY_FUNCTION, similarityFunction,
-                             SOURCE_MODEL, sourceModel);
+                             SOURCE_MODEL, sourceModel,
+                             NEIGHBORHOOD_OVERFLOW, neighborhoodOverflow,
+                             ALPHA, alpha,
+                             ENABLE_HIERARCHY, enableHierarchy);
     }
 }
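
To make the default handling above concrete, here is a small illustrative sketch (not from the commit). The fallback values are the ones passed by `CassandraOnHeapGraph` and `CompactionGraph` later in this diff; `GraphDefaultsSketch` and `showDefaults` are hypothetical names introduced only for this example.

```java
import org.apache.cassandra.index.sai.disk.v1.IndexWriterConfig;

class GraphDefaultsSketch
{
    // Illustrative only: how the nullable fields plus default-taking getters let the
    // two build paths apply different fallbacks for the same index configuration.
    static void showDefaults(IndexWriterConfig config, int dimension)
    {
        // Memtable (on-heap) build: overflow 1.0, alpha 1.2 (or 2.0 when dimension <= 3).
        float memtableOverflow = config.getNeighborhoodOverflow(1.0f);
        float memtableAlpha = config.getAlpha(dimension > 3 ? 1.2f : 2.0f);

        // Compaction build: overflow 1.2, alpha 1.2 (or 1.4 when dimension <= 3).
        float compactionOverflow = config.getNeighborhoodOverflow(1.2f);
        float compactionAlpha = config.getAlpha(dimension > 3 ? 1.2f : 1.4f);

        // Explicitly configured options override both fallbacks; enable_hierarchy defaults to false.
        System.out.printf("memtable %s/%s, compaction %s/%s, hierarchy=%s%n",
                          memtableOverflow, memtableAlpha,
                          compactionOverflow, compactionAlpha,
                          config.isHierarchyEnabled());
    }
}
```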

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 2 additions & 25 deletions
@@ -29,8 +29,7 @@
 
 import io.github.jbellis.jvector.graph.GraphIndex;
 import io.github.jbellis.jvector.graph.GraphSearcher;
-import io.github.jbellis.jvector.graph.disk.CachingGraphIndex;
-import io.github.jbellis.jvector.graph.disk.FeatureId;
+import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
 import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
 import io.github.jbellis.jvector.quantization.BQVectors;
@@ -57,7 +56,6 @@
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.CloseableIterator;
 
-import static java.lang.Math.min;
 
 public class CassandraDiskAnn
 {
@@ -94,7 +92,7 @@ public CassandraDiskAnn(SSTableContext sstableContext, SegmentMetadata.Component
         graphHandle = indexFiles.termsData();
         var rawGraph = OnDiskGraphIndex.load(graphHandle::createReader, termsMetadata.offset);
         features = rawGraph.getFeatureSet();
-        graph = V3OnDiskFormat.ENABLE_EDGES_CACHE ? cachingGraphFor(rawGraph) : rawGraph;
+        graph = rawGraph;
 
         long pqSegmentOffset = this.componentMetadatas.get(IndexComponentType.PQ).offset;
         try (var pqFile = indexFiles.pq();
@@ -175,27 +173,6 @@ public ProductQuantization getPQ()
         return pq;
     }
 
-    private GraphIndex cachingGraphFor(OnDiskGraphIndex rawGraph)
-    {
-        // cache edges around the entry point
-        // we can easily hold 1% of the edges in memory for typical index sizes, but
-        // there is a lot of redundancy in the nodes we observe in practice around the entry point
-        // (only 10%-20% are unique), so use 5% as our target.
-        //
-        // 32**3 = 32k, which would be 4MB if all the nodes are unique, so 3 levels deep is a safe upper bound
-        int distance = min(logBaseX(0.05d * rawGraph.size(), rawGraph.maxDegree()), 3);
-        var result = new CachingGraphIndex(rawGraph, distance);
-        logger.debug("Cached {}@{} to distance {} in {}B",
-                     this, graphHandle.path(), distance, result.ramBytesUsed());
-        return result;
-    }
-
-    private static int logBaseX(double val, double base) {
-        if (base <= 1.0d || val <= 1.0d)
-            return 0;
-        return (int)Math.floor(Math.log(val) / Math.log(base));
-    }
-
     public long ramBytesUsed()
     {
         return graph.ramBytesUsed();

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 6 additions & 5 deletions
@@ -43,11 +43,11 @@
 import io.github.jbellis.jvector.graph.GraphSearcher;
 import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
 import io.github.jbellis.jvector.graph.SearchResult;
-import io.github.jbellis.jvector.graph.disk.Feature;
-import io.github.jbellis.jvector.graph.disk.FeatureId;
-import io.github.jbellis.jvector.graph.disk.InlineVectors;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter;
 import io.github.jbellis.jvector.graph.disk.OrdinalMapper;
+import io.github.jbellis.jvector.graph.disk.feature.Feature;
+import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
+import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
 import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
 import io.github.jbellis.jvector.quantization.BinaryQuantization;
 import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -157,8 +157,9 @@ public CassandraOnHeapGraph(IndexContext context, boolean forSearching, Memtable
                                          similarityFunction,
                                          indexConfig.getAnnMaxDegree(),
                                          indexConfig.getConstructionBeamWidth(),
-                                         1.0f, // no overflow means add will be a bit slower but flush will be faster
-                                         dimension > 3 ? 1.2f : 2.0f);
+                                         indexConfig.getNeighborhoodOverflow(1.0f), // no overflow means add will be a bit slower but flush will be faster
+                                         indexConfig.getAlpha(dimension > 3 ? 1.2f : 2.0f),
+                                         indexConfig.isHierarchyEnabled());
         searchers = ThreadLocal.withInitial(() -> new GraphSearcherAccessManager(new GraphSearcher(builder.getGraph())));
     }

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 8 additions & 8 deletions
@@ -39,10 +39,10 @@
 
 import io.github.jbellis.jvector.graph.GraphIndexBuilder;
 import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
-import io.github.jbellis.jvector.graph.disk.Feature;
-import io.github.jbellis.jvector.graph.disk.FeatureId;
-import io.github.jbellis.jvector.graph.disk.FusedADC;
-import io.github.jbellis.jvector.graph.disk.InlineVectors;
+import io.github.jbellis.jvector.graph.disk.feature.Feature;
+import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
+import io.github.jbellis.jvector.graph.disk.feature.FusedADC;
+import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter;
 import io.github.jbellis.jvector.graph.disk.OrdinalMapper;
@@ -204,8 +204,9 @@ else if (compressor instanceof BinaryQuantization)
                                         dimension,
                                         indexConfig.getAnnMaxDegree(),
                                         indexConfig.getConstructionBeamWidth(),
-                                        1.2f,
-                                        dimension > 3 ? 1.2f : 1.4f,
+                                        indexConfig.getNeighborhoodOverflow(1.2f),
+                                        indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
+                                        indexConfig.isHierarchyEnabled(),
                                         compactionSimdPool, compactionFjp);
 
         termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
@@ -316,8 +317,7 @@ public InsertionResult maybeAddVector(ByteBuffer term, int segmentRowId) throws
             compressedVectors = new MutablePQVectors((ProductQuantization) compressor);
             compactionFjp.submit(() -> {
                 IntStream.range(0, encodedVectorCount)
-                         // FIXME parallel is disabled until 4.0.0 beta2 (encodeAndSet is not threadsafe before then)
-                         // .parallel()
+                         .parallel()
                          .forEach(i -> {
                              var v = vectorsByOrdinal.get(i);
                              if (v == null)

test/unit/org/apache/cassandra/index/sai/cql/VectorCompactionTest.java

Lines changed: 3 additions & 1 deletion
@@ -77,7 +77,9 @@ public void testPQRefine()
             flush();
         }
 
-        CompactionGraph.PQ_TRAINING_SIZE = 2 * MIN_PQ_ROWS;
+        // TODO deterimine proper design for PQ training on vectors of dimension < 100.
+        // see https://github.com/riptano/cndb/issues/13630
+        // CompactionGraph.PQ_TRAINING_SIZE = 2 * MIN_PQ_ROWS;
         compact();
 
         // Confirm we can query the data with reasonable recall

test/unit/org/apache/cassandra/index/sai/cql/VectorTester.java

Lines changed: 2 additions & 1 deletion
@@ -84,7 +84,8 @@ public static double rawIndexedRecall(Collection<float[]> rawVectors, float[] ra
                                               16,
                                               100,
                                               1.2f,
-                                              1.4f);
+                                              1.4f,
+                                              false);
 
         for (float[] raw : rawVectors)
         {
