Skip to content

Commit e2ef6e2

Browse files
CNDB-14797: Upgrade to jvector 4.0.0-rc.1 (#1866)
### What is the issue Fixes riptano/cndb#14797 CNDB test PR riptano/cndb#14799 ### What does this PR fix and why was it fixed Here is the list of commits included in the upgrade: datastax/jvector@4.0.0-beta.5...4.0.0-rc.1 Here is a copy of the relevant `CHANGELOG.md` lines: > #### [4.0.0-rc.1](datastax/jvector@4.0.0-beta.6...4.0.0-rc.1) > > - Fix issue when calling cleanup while concurrently executing searches [`#483`](datastax/jvector#483) > - Improve the efficiency and memory usage of GraphIndexBuilder.cleanup [`#477`](datastax/jvector#477) > - add PQ training benchmark [`#482`](datastax/jvector#482) > - Remove extraneous character from datasets.yml [`#484`](datastax/jvector#484) > - Upgrade YAML files to v5 after the format was introduced in the last update [`#478`](datastax/jvector#478) > - New chunked memory-mapped reader that supports >2GB files [`61bffbe`](datastax/jvector@61bffbe) > - release 4.0.0-rc.1 [`6737596`](datastax/jvector@6737596) > - Fix comparison in TestADCGraphIndex [`b637f65`](datastax/jvector@b637f65) > > #### [4.0.0-beta.6](datastax/jvector@4.0.0-beta.5...4.0.0-beta.6) > > > 13 June 2025 > - Add a new graph node using a search score [`#473`](datastax/jvector#473) > - chore(release): Bump tag version and update changelog [`#471`](datastax/jvector#471) > - Sequential disk writer (#475). Upgrades file format from 4 to 5 [`d0ccb32`](datastax/jvector@d0ccb32) > - Allow empty sections in datasets.yml & add colbert-1M.yml [`2bf5f9a`](datastax/jvector@2bf5f9a) > - chore (release): Start release version 4.0.0-beta.6 [`9a453a3`](datastax/jvector@9a453a3) > >
1 parent f23b0bf commit e2ef6e2

File tree

4 files changed

+17
-12
lines changed

4 files changed

+17
-12
lines changed

build.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@
754754
<dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0" />
755755
<dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0" />
756756
<dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0" />
757-
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.5" />
757+
<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-rc.1" />
758758
<dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
759759

760760
<dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraDiskAnn.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import io.github.jbellis.jvector.graph.GraphSearcher;
3232
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
3333
import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
34+
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
3435
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
3536
import io.github.jbellis.jvector.quantization.BQVectors;
3637
import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -238,12 +239,12 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
238239
{
239240
var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);
240241
var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
241-
ssp = new SearchScoreProvider(asf, rr);
242+
ssp = new DefaultSearchScoreProvider(asf, rr);
242243
}
243244
else if (compressedVectors == null)
244245
{
245246
// no compression, so we ignore isRerankless (except for setting rerankK to limit)
246-
ssp = new SearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
247+
ssp = new DefaultSearchScoreProvider(view.rerankerFor(queryVector, similarityFunction));
247248
}
248249
else
249250
{
@@ -254,7 +255,7 @@ else if (compressedVectors == null)
254255
: similarityFunction;
255256
var asf = compressedVectors.precomputedScoreFunctionFor(queryVector, sf);
256257
var rr = isRerankless ? null : view.rerankerFor(queryVector, similarityFunction);
257-
ssp = new SearchScoreProvider(asf, rr);
258+
ssp = new DefaultSearchScoreProvider(asf, rr);
258259
}
259260
long start = System.nanoTime();
260261
var result = searcher.search(ssp, limit, rerankK, threshold, context.getAnnRerankFloor(), ordinalsMap.ignoringDeleted(acceptBits));

src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import io.github.jbellis.jvector.graph.disk.feature.Feature;
5050
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
5151
import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
52+
import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider;
5253
import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider;
5354
import io.github.jbellis.jvector.quantization.BinaryQuantization;
5455
import io.github.jbellis.jvector.quantization.CompressedVectors;
@@ -341,7 +342,7 @@ public CloseableIterator<SearchResult.NodeScore> search(QueryContext context, Ve
341342
searcher.setView(builder.getGraph().getView());
342343
try
343344
{
344-
var ssf = SearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
345+
var ssf = DefaultSearchScoreProvider.exact(queryVector, similarityFunction, vectorValues);
345346
long start = System.nanoTime();
346347
var result = searcher.search(ssf, limit, rerankK, threshold, 0.0f, bits);
347348
long elapsed = System.nanoTime() - start;
@@ -439,10 +440,10 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
439440
try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
440441
var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
441442
var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
443+
.withStartOffset(termsOffset)
442444
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
443445
.withMapper(ordinalMapper)
444446
.with(new InlineVectors(vectorValues.dimension()))
445-
.withStartOffset(termsOffset)
446447
.build())
447448
{
448449
SAICodecUtils.writeHeader(pqOutput);

src/java/org/apache/cassandra/index/sai/disk/vector/CompactionGraph.java

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@
7676
import org.apache.cassandra.index.sai.disk.format.Version;
7777
import org.apache.cassandra.index.sai.disk.v1.SegmentMetadata;
7878
import org.apache.cassandra.index.sai.disk.v2.V2VectorPostingsWriter;
79-
import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
8079
import org.apache.cassandra.index.sai.disk.v5.V5OnDiskFormat;
8180
import org.apache.cassandra.index.sai.disk.v5.V5VectorPostingsWriter;
8281
import org.apache.cassandra.index.sai.disk.v5.V5VectorPostingsWriter.Structure;
@@ -211,23 +210,27 @@ else if (compressor instanceof BinaryQuantization)
211210
indexConfig.getNeighborhoodOverflow(1.2f),
212211
indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
213212
indexConfig.isHierarchyEnabled() && jvectorVersion >= 4,
214-
compactionSimdPool, compactionFjp);
213+
true, // We always refine during compaction
214+
compactionSimdPool,
215+
compactionFjp);
215216

216217
termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
217218
termsOffset = (termsFile.exists() ? termsFile.length() : 0)
218219
+ SAICodecUtils.headerSize();
219220
// placeholder writer, will be replaced at flush time when we finalize the index contents
220-
writer = createTermsWriterBuilder().withMapper(new OrdinalMapper.IdentityMapper(maxRowsInGraph)).build();
221+
writer = createTermsWriter(new OrdinalMapper.IdentityMapper(maxRowsInGraph));
221222
writer.getOutput().seek(termsFile.length()); // position at the end of the previous segment before writing our own header
222223
SAICodecUtils.writeHeader(SAICodecUtils.toLuceneOutput(writer.getOutput()));
223224
}
224225

225-
private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
226+
private OnDiskGraphIndexWriter createTermsWriter(OrdinalMapper ordinalMapper) throws IOException
226227
{
227228
return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
228229
.withStartOffset(termsOffset)
229230
.with(new InlineVectors(dimension))
230-
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion());
231+
.withVersion(Version.current().onDiskFormat().jvectorFileFormatVersion())
232+
.withMapper(ordinalMapper)
233+
.build();
231234
}
232235

233236
@Override
@@ -446,7 +449,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
446449
}
447450

448451
// Recreate the writer with the final ordinalMapper
449-
writer = createTermsWriterBuilder().withMapper(ordinalMapper.get()).build();
452+
writer = createTermsWriter(ordinalMapper.get());
450453

451454
// write the graph edge lists and optionally fused adc features
452455
var start = System.nanoTime();

0 commit comments

Comments
 (0)