Skip to content

Commit 37372a7

Browse files
michaeljmarshalldriftx
authored andcommitted
CNDB-14501: Enable checksum for jvector index files starting at version EC (#1841)
Fixes: riptano/cndb#14501 CNDB test pr: riptano/cndb#14660 This PR introduces vector index checksum validation starting with files in version `EC` at above. For non terms files, we compute the checksum correctly. For terms files, we skip validation of the checksum because it is not computed correctly. There is a new issue to track fixing the checksum logic on those TERMS files riptano/cndb#14656. I implemented the version-dependent validation by overriding `V7OnDiskFormat#validateIndexComponent`. This allows us to validate new files while skipping the old ones. In doing so, I removed the misleading `IndexFeatureSet` method for checksums. The modified comment describes why that feature didn't make sense as an index-level feature.
1 parent 761851f commit 37372a7

File tree

16 files changed

+1388
-57
lines changed

16 files changed

+1388
-57
lines changed

src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import com.google.common.annotations.VisibleForTesting;
3535
import com.google.common.collect.ImmutableSet;
3636
import com.google.common.collect.Lists;
37+
import com.google.common.collect.Sets;
3738

3839
import org.slf4j.Logger;
3940
import org.slf4j.LoggerFactory;
@@ -411,9 +412,18 @@ public boolean validateSSTableAttachedIndexes(Collection<SSTableReader> sstables
411412

412413
for (SSTableReader sstable : sstables)
413414
{
414-
IndexDescriptor indexDescriptor = contextManager.getOrLoadIndexDescriptor(sstable, indices);
415+
logger.debug("Validating {} SAI indices for {}", indices.size(), sstable.getFilename());
416+
if (indices.isEmpty())
417+
{
418+
logger.debug("No SAI indices to validate for {}", sstable.getFilename());
419+
return true;
420+
}
421+
422+
// For validation, we need to load a fresh descriptor to ensure we see the current state of files
423+
IndexDescriptor indexDescriptor = IndexDescriptor.load(sstable, contexts());
415424
IndexComponents.ForRead perSSTableComponents = indexDescriptor.perSSTableComponents();
416425

426+
logger.debug("Per-SSTable components complete: {}", indexDescriptor.perSSTableComponents().isComplete());
417427
if (indexDescriptor.perSSTableComponents().isComplete())
418428
{
419429
perSSTableComponents.validateComponents(sstable, baseCfs.getTracker(), validateChecksum, true);
@@ -422,6 +432,7 @@ public boolean validateSSTableAttachedIndexes(Collection<SSTableReader> sstables
422432
{
423433
IndexComponents.ForRead perIndexComponents = indexDescriptor.perIndexComponents(index.getIndexContext());
424434

435+
logger.debug("Per-index components complete for {}: {}", index.getIndexContext().getIndexName(), perIndexComponents.isComplete());
425436
if (perIndexComponents.isComplete())
426437
perIndexComponents.validateComponents(sstable, baseCfs.getTracker(), validateChecksum, true);
427438
else if (throwOnIncomplete)
@@ -555,4 +566,12 @@ public void reset()
555566
indices.forEach(index -> index.makeIndexNonQueryable());
556567
onSSTableChanged(baseCfs.getLiveSSTables(), Collections.emptySet(), indices, false);
557568
}
569+
570+
private Set<IndexContext> contexts()
571+
{
572+
Set<IndexContext> contexts = Sets.newHashSetWithExpectedSize(indices.size());
573+
for (StorageAttachedIndex index : indices)
574+
contexts.add(index.getIndexContext());
575+
return contexts;
576+
}
558577
}

src/java/org/apache/cassandra/index/sai/disk/format/IndexDescriptor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ public class IndexDescriptor
8585
// TODO Because indexes can be added at any time to existing data, the Version of a column index
8686
// may not match the Version of the base sstable. OnDiskFormat + IndexFeatureSet + IndexDescriptor
8787
// was not designed with this in mind, leading to some awkwardness, notably in IFS where some features
88-
// are per-sstable (`isRowAware`) and some are per-column (`hasVectorIndexChecksum`).
88+
// are per-sstable (`isRowAware`) and some are per-column (`hasTermsHistogram`).
8989

9090
public final Descriptor descriptor;
9191

src/java/org/apache/cassandra/index/sai/disk/format/IndexFeatureSet.java

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,6 @@ public interface IndexFeatureSet
3737
*/
3838
boolean isRowAware();
3939

40-
/**
41-
* @return true if vector index files include a checksum at the end
42-
*/
43-
boolean hasVectorIndexChecksum();
44-
4540
/**
4641
* @return true if index metadata contains term histograms for fast cardinality estimation
4742
*/
@@ -64,7 +59,6 @@ public interface IndexFeatureSet
6459
class Accumulator
6560
{
6661
boolean isRowAware = true;
67-
boolean hasVectorIndexChecksum = true;
6862
boolean hasTermsHistogram = true;
6963
boolean complete = false;
7064

@@ -83,8 +77,6 @@ public void accumulate(IndexFeatureSet indexFeatureSet)
8377
assert !complete : "Cannot accumulate after complete has been called";
8478
if (!indexFeatureSet.isRowAware())
8579
isRowAware = false;
86-
if (!indexFeatureSet.hasVectorIndexChecksum())
87-
hasVectorIndexChecksum = false;
8880
if (!indexFeatureSet.hasTermsHistogram())
8981
hasTermsHistogram = false;
9082
}
@@ -106,12 +98,6 @@ public boolean isRowAware()
10698
return isRowAware;
10799
}
108100

109-
@Override
110-
public boolean hasVectorIndexChecksum()
111-
{
112-
return hasVectorIndexChecksum;
113-
}
114-
115101
@Override
116102
public boolean hasTermsHistogram()
117103
{

src/java/org/apache/cassandra/index/sai/disk/format/Version.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ public class Version implements Comparable<Version>
5252
public static final Version AA = new Version("aa", V1OnDiskFormat.instance, Version::aaFileNameFormat);
5353
// Stargazer
5454
public static final Version BA = new Version("ba", V2OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ba"));
55-
// Converged Cassandra with JVector
55+
// Converged Cassandra with JVector with file format version 2
56+
// Note: vector index checksums for TERMS files were computed in two different ways for this version. As such,
57+
// we do not validate checksums for this version or any subsequent version until EC.
5658
public static final Version CA = new Version("ca", V3OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ca"));
5759
// NOTE: use DB to prevent collisions with upstream file formats
5860
// Encode trie entries using their AbstractType to ensure trie entries are sorted for range queries and are prefix free.
@@ -61,7 +63,9 @@ public class Version implements Comparable<Version>
6163
public static final Version DC = new Version("dc", V5OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "dc"));
6264
// histograms in index metadata
6365
public static final Version EB = new Version("eb", V6OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "eb"));
64-
// term frequencies index component (support for BM25)
66+
// term frequencies index component (support for BM25); bump jvector file format version to 4
67+
// Start validating vector index component checksums, except for the TERMS_FILE because it's checksum is non-standard
68+
// and isn't easily validated when an sstable index has multiple segments within the TERMS_FILE.
6569
public static final Version EC = new Version("ec", V7OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ec"));
6670
// total terms count serialization in index metadata
6771
public static final Version ED = new Version("ed", V7OnDiskFormat.instance, (c, i, g) -> stargazerFileNameFormat(c, i, g, "ed"));

src/java/org/apache/cassandra/index/sai/disk/v1/V1OnDiskFormat.java

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,6 @@ public boolean isRowAware()
133133
return false;
134134
}
135135

136-
@Override
137-
public boolean hasVectorIndexChecksum()
138-
{
139-
return false;
140-
}
141-
142136
@Override
143137
public boolean hasTermsHistogram()
144138
{
@@ -236,13 +230,10 @@ public void validateIndexComponent(IndexComponent.ForRead component, boolean che
236230
if (component.isCompletionMarker())
237231
return;
238232

239-
// starting with v3, vector components include proper headers and checksum; skip for earlier versions
233+
// We do not validate vector components until V7, so we skip for earlier versions
240234
IndexContext context = component.parent().context();
241-
if (isVectorDataComponent(context, component.componentType())
242-
&& !component.parent().onDiskFormat().indexFeatureSet().hasVectorIndexChecksum())
243-
{
235+
if (isVectorDataComponent(context, component.componentType()))
244236
return;
245-
}
246237

247238
Version earliest = getExpectedEarliestVersion(context, component.componentType());
248239
try (IndexInput input = component.openInput())

src/java/org/apache/cassandra/index/sai/disk/v2/V2OnDiskFormat.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,6 @@ public boolean isRowAware()
7474
return true;
7575
}
7676

77-
@Override
78-
public boolean hasVectorIndexChecksum()
79-
{
80-
return false;
81-
}
82-
8377
@Override
8478
public boolean hasTermsHistogram()
8579
{

src/java/org/apache/cassandra/index/sai/disk/v3/V3OnDiskFormat.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,6 @@ public boolean isRowAware()
8282
return true;
8383
}
8484

85-
@Override
86-
public boolean hasVectorIndexChecksum()
87-
{
88-
return false;
89-
}
90-
9185
@Override
9286
public boolean hasTermsHistogram()
9387
{

src/java/org/apache/cassandra/index/sai/disk/v6/V6OnDiskFormat.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,6 @@ public boolean isRowAware()
3333
return true;
3434
}
3535

36-
@Override
37-
public boolean hasVectorIndexChecksum()
38-
{
39-
return false;
40-
}
41-
4236
@Override
4337
public boolean hasTermsHistogram()
4438
{

src/java/org/apache/cassandra/index/sai/disk/v7/V7OnDiskFormat.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,30 @@
1818

1919
package org.apache.cassandra.index.sai.disk.v7;
2020

21+
import java.io.IOException;
22+
import java.io.UncheckedIOException;
23+
import java.lang.invoke.MethodHandles;
2124
import java.util.EnumSet;
2225
import java.util.Set;
2326

27+
import org.slf4j.Logger;
28+
import org.slf4j.LoggerFactory;
29+
2430
import org.apache.cassandra.db.marshal.AbstractType;
31+
import org.apache.cassandra.index.sai.IndexContext;
32+
import org.apache.cassandra.index.sai.disk.format.IndexComponent;
2533
import org.apache.cassandra.index.sai.disk.format.IndexComponentType;
34+
import org.apache.cassandra.index.sai.disk.format.Version;
2635
import org.apache.cassandra.index.sai.disk.v6.V6OnDiskFormat;
36+
import org.apache.cassandra.index.sai.utils.SAICodecUtils;
2737
import org.apache.cassandra.index.sai.utils.TypeUtil;
38+
import org.apache.cassandra.utils.Throwables;
39+
import org.apache.lucene.store.IndexInput;
2840

2941
public class V7OnDiskFormat extends V6OnDiskFormat
3042
{
43+
private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
44+
3145
public static final V7OnDiskFormat instance = new V7OnDiskFormat();
3246

3347
private static final Set<IndexComponentType> LITERAL_COMPONENTS = EnumSet.of(IndexComponentType.COLUMN_COMPLETION_MARKER,
@@ -56,4 +70,45 @@ public int jvectorFileFormatVersion()
5670
// compatibility.
5771
return 4;
5872
}
73+
74+
@Override
75+
public void validateIndexComponent(IndexComponent.ForRead component, boolean checksum)
76+
{
77+
if (component.isCompletionMarker())
78+
return;
79+
80+
IndexContext context = component.parent().context();
81+
if (context != null && context.isVector() && component.parent().version().onOrAfter(Version.EC))
82+
{
83+
try (IndexInput input = component.openInput())
84+
{
85+
// We can't validate TERMS_DATA with checksum because the checksum was computed incorrectly through
86+
// V7. See https://github.com/riptano/cndb/issues/14656. We can still call the basic validate method
87+
// which does not check the checksum. (The issue is in the way the checksum was computed. It didn't
88+
// include the header/footer bytes, and for multi-segment builds, it didn't include the bytes from
89+
// all previous segments, which is the design for all index components to date.)
90+
if (!checksum || component.componentType() == IndexComponentType.TERMS_DATA)
91+
SAICodecUtils.validate(input, getExpectedEarliestVersion(context, component.componentType()));
92+
else
93+
SAICodecUtils.validateChecksum(input, getExpectedEarliestVersion(context, component.componentType()));
94+
}
95+
catch (Throwable e)
96+
{
97+
logger.warn(component.parent().logMessage("{} failed for index component {} on SSTable {}"),
98+
(checksum ? "Checksum validation" : "Validation"),
99+
component,
100+
component.parent().descriptor(),
101+
e);
102+
if (e instanceof IOException)
103+
throw new UncheckedIOException((IOException) e);
104+
if (e.getCause() instanceof IOException)
105+
throw new UncheckedIOException((IOException) e.getCause());
106+
throw Throwables.unchecked(e);
107+
}
108+
}
109+
else
110+
{
111+
super.validateIndexComponent(component, checksum);
112+
}
113+
}
59114
}

src/java/org/apache/cassandra/index/sai/utils/SAICodecUtils.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ public static void writeFooter(IndexOutput out) throws IOException
7575
writeChecksum(out);
7676
}
7777

78+
// Warning: this method produces an incomplete checksum when using other Lucene tooling because it computes
79+
// the checksum without including the FOOTER_MAGIC and 0. See https://github.com/riptano/cndb/issues/14501.
7880
public static void writeFooter(RandomAccessWriter braw, long checksum) throws IOException
7981
{
8082
var out = toLuceneOutput(braw);

0 commit comments

Comments
 (0)