Skip to content

Commit 7653af9

Browse files
authored
CNDB-13480: NullPointerException when using ORDER BY BM25 after deleting a row (#1653)
### What is the issue see CNDB-13480 ### What does this PR fix and why was it fixed Exclude deleted rows while computing terms frequencies and preparing for computing the scores
1 parent f38360e commit 7653af9

File tree

3 files changed

+79
-16
lines changed

3 files changed

+79
-16
lines changed

src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import org.apache.cassandra.db.memtable.Memtable;
4646
import org.apache.cassandra.db.memtable.ShardBoundaries;
4747
import org.apache.cassandra.db.memtable.TrieMemtable;
48+
import org.apache.cassandra.db.rows.Cell;
4849
import org.apache.cassandra.db.rows.Row;
4950
import org.apache.cassandra.dht.AbstractBounds;
5051
import org.apache.cassandra.dht.Bounds;
@@ -266,7 +267,15 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
266267
// Compute BM25 scores
267268
var docStats = computeDocumentFrequencies(queryContext, queryTerms);
268269
var analyzer = indexContext.getAnalyzerFactory().create();
269-
var it = Iterators.transform(intersectedIterator, pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms));
270+
var it = Iterators.filter(Iterators.transform(intersectedIterator, pk ->
271+
{
272+
Cell<?> cellForKey = getCellForKey(pk);
273+
if (cellForKey == null)
274+
// skip deleted rows
275+
return null;
276+
return BM25Utils.DocTF.createFromDocument(pk, cellForKey, analyzer, queryTerms);
277+
}), Objects::nonNull);
278+
270279
return List.of(BM25Utils.computeScores(CloseableIterator.wrap(it),
271280
queryTerms,
272281
docStats,
@@ -331,7 +340,13 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(QueryContext quer
331340
var analyzer = indexContext.getAnalyzerFactory().create();
332341
var queryTerms = orderer.getQueryTerms();
333342
var docStats = computeDocumentFrequencies(queryContext, queryTerms);
334-
var it = keys.stream().map(pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms)).iterator();
343+
var it = keys.stream().map(pk -> {
344+
Cell<?> cellForKey = getCellForKey(pk);
345+
if (cellForKey == null)
346+
// skip deleted rows
347+
return null;
348+
return BM25Utils.DocTF.createFromDocument(pk, cellForKey, analyzer, queryTerms);
349+
}).filter(Objects::nonNull).iterator();
335350
return BM25Utils.computeScores(CloseableIterator.wrap(it),
336351
queryTerms,
337352
docStats,
@@ -350,8 +365,15 @@ private BM25Utils.DocStats computeDocumentFrequencies(QueryContext queryContext,
350365
{
351366
// KeyRangeIterator.getMaxKeys is not accurate enough, we have to count them
352367
long keys = 0;
353-
for (var it = termIterators.get(i); it.hasNext(); it.next())
368+
for (var it = termIterators.get(i); it.hasNext(); )
369+
{
370+
PrimaryKey pk = it.next();
371+
Cell<?> cellForKey = getCellForKey(pk);
372+
if (cellForKey == null)
373+
// skip deleted rows
374+
continue;
354375
keys++;
376+
}
355377
documentFrequencies.put(queryTerms.get(i), keys);
356378
}
357379
long docCount = 0;

src/java/org/apache/cassandra/index/sai/utils/BM25Utils.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,10 @@ public static DocTF createFromDocument(PrimaryKey pk,
8383
AbstractAnalyzer docAnalyzer,
8484
Collection<ByteBuffer> queryTerms)
8585
{
86+
assert cell != null : "Cannot find a cell for pk " + pk;
87+
8688
int count = 0;
8789
Map<ByteBuffer, Integer> frequencies = new HashMap<>();
88-
8990
docAnalyzer.reset(cell.buffer());
9091
try
9192
{

test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,20 +57,60 @@ public void testTwoIndexes()
5757
assertInvalidMessage("BM25 ordering on column v requires an analyzed index",
5858
"SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3");
5959

60-
// create analyzed index
61-
analyzeIndex();
60+
createAnalyzedIndex();
6261
// BM25 query should work now
6362
var result = execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3");
6463
assertRows(result, row(1));
6564
}
6665

66+
@Test
67+
public void testDeletedRow() throws Throwable
68+
{
69+
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
70+
createAnalyzedIndex();
71+
execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
72+
execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')");
73+
var result = execute("SELECT k FROM %s ORDER BY v BM25 OF 'apple' LIMIT 3");
74+
assertThat(result).hasSize(2);
75+
execute("DELETE FROM %s WHERE k=2");
76+
String select = "SELECT k FROM %s ORDER BY v BM25 OF 'apple' LIMIT 3";
77+
beforeAndAfterFlush(() -> assertRows(execute(select), row(1)));
78+
}
79+
80+
@Test
81+
public void testDeletedColumn() throws Throwable
82+
{
83+
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
84+
createAnalyzedIndex();
85+
execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
86+
execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')");
87+
String select = "SELECT k FROM %s ORDER BY v BM25 OF 'apple' LIMIT 3";
88+
assertRows(execute(select), row(1), row(2));
89+
execute("DELETE v FROM %s WHERE k = 2");
90+
beforeAndAfterFlush(() -> assertRows(execute(select), row(1)));
91+
}
92+
93+
@Test
94+
public void testDeletedRowWithPredicate() throws Throwable
95+
{
96+
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text, n int)");
97+
createIndex("CREATE CUSTOM INDEX ON %s(n) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
98+
createAnalyzedIndex();
99+
execute("INSERT INTO %s (k, v, n) VALUES (1, 'apple', 0)");
100+
execute("INSERT INTO %s (k, v, n) VALUES (2, 'apple juice', 0)");
101+
String select = "SELECT k FROM %s WHERE n = 0 ORDER BY v BM25 OF 'apple' LIMIT 3";
102+
assertRows(execute(select), row(1), row(2));
103+
execute("DELETE FROM %s WHERE k=2");
104+
beforeAndAfterFlush(() -> assertRows(execute(select), row(1)));
105+
}
106+
67107
@Test
68108
public void testTwoIndexesAmbiguousPredicate() throws Throwable
69109
{
70110
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
71111

72-
// Create analyzed and un-analyzed indexes
73-
analyzeIndex();
112+
createAnalyzedIndex();
113+
// Create un-analyzed indexes
74114
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
75115

76116
execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
@@ -368,10 +408,10 @@ public void testIrrelevantRowsWithCompaction()
368408
private void createSimpleTable()
369409
{
370410
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
371-
analyzeIndex();
411+
createAnalyzedIndex();
372412
}
373413

374-
private String analyzeIndex()
414+
private String createAnalyzedIndex()
375415
{
376416
return createIndex("CREATE CUSTOM INDEX ON %s(v) " +
377417
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
@@ -387,7 +427,7 @@ private String analyzeIndex()
387427
public void testWithPredicate() throws Throwable
388428
{
389429
createTable("CREATE TABLE %s (k int PRIMARY KEY, p int, v text)");
390-
analyzeIndex();
430+
createAnalyzedIndex();
391431
execute("CREATE CUSTOM INDEX ON %s(p) USING 'StorageAttachedIndex'");
392432

393433
// Insert documents with varying frequencies of the term "apple"
@@ -412,7 +452,7 @@ public void testWithPredicate() throws Throwable
412452
public void testWidePartition() throws Throwable
413453
{
414454
createTable("CREATE TABLE %s (k1 int, k2 int, v text, PRIMARY KEY (k1, k2))");
415-
analyzeIndex();
455+
createAnalyzedIndex();
416456

417457
// Insert documents with varying frequencies of the term "apple"
418458
execute("INSERT INTO %s (k1, k2, v) VALUES (0, 1, 'apple')");
@@ -434,7 +474,7 @@ public void testWidePartition() throws Throwable
434474
public void testWidePartitionWithPkPredicate() throws Throwable
435475
{
436476
createTable("CREATE TABLE %s (k1 int, k2 int, v text, PRIMARY KEY (k1, k2))");
437-
analyzeIndex();
477+
createAnalyzedIndex();
438478

439479
// Insert documents with varying frequencies of the term "apple"
440480
execute("INSERT INTO %s (k1, k2, v) VALUES (0, 1, 'apple')");
@@ -458,7 +498,7 @@ public void testWidePartitionWithPkPredicate() throws Throwable
458498
public void testWidePartitionWithPredicate() throws Throwable
459499
{
460500
createTable("CREATE TABLE %s (k1 int, k2 int, p int, v text, PRIMARY KEY (k1, k2))");
461-
analyzeIndex();
501+
createAnalyzedIndex();
462502
execute("CREATE CUSTOM INDEX ON %s(p) USING 'StorageAttachedIndex'");
463503

464504
// Insert documents with varying frequencies of the term "apple"
@@ -519,7 +559,7 @@ public void testQueryEmptyTable()
519559
public void testBM25RaceConditionConcurrentQueriesInInvertedIndexSearcher() throws Throwable
520560
{
521561
createTable("CREATE TABLE %s (pk int, v text, PRIMARY KEY (pk))");
522-
analyzeIndex();
562+
createAnalyzedIndex();
523563

524564
// Create 3 docs that have the same BM25 score and will be our top docs
525565
execute("INSERT INTO %s (pk, v) VALUES (1, 'apple apple apple')");
@@ -552,7 +592,7 @@ public void testBM25RaceConditionConcurrentQueriesInInvertedIndexSearcher() thro
552592
public void testWildcardSelection()
553593
{
554594
createTable("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY (k, c))");
555-
analyzeIndex();
595+
createAnalyzedIndex();
556596
execute("INSERT INTO %s (k, c, v) VALUES (1, 1, 'apple')");
557597

558598
var result = execute("SELECT * FROM %s ORDER BY v BM25 OF 'apple' LIMIT 3");

0 commit comments

Comments
 (0)