Skip to content

Commit 9cd33f3

Browse files
CNDB-13446: Make search-then-sort BM25 queries apply BM25 filter (#1659)
### What is the issue Fixes riptano/cndb#13446 ### What does this PR fix and why was it fixed This pr fixes an issue in the search-then-sort path where we never apply the query term filter. The change is quite simple because we were already scoring these terms, so we just need to filter out the ones that do not have frequency of at least 1 for each query term.
1 parent 2915032 commit 9cd33f3

File tree

4 files changed

+91
-21
lines changed

4 files changed

+91
-21
lines changed

src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcher.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.util.HashMap;
2626
import java.util.List;
2727
import java.util.Map;
28+
import java.util.Objects;
2829
import java.util.function.Function;
2930
import java.util.stream.Collectors;
3031

@@ -267,7 +268,10 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(SSTableReader rea
267268
documentFrequencies.put(term, matches);
268269
}
269270
var analyzer = indexContext.getAnalyzerFactory().create();
270-
var it = keys.stream().map(pk -> DocTF.createFromDocument(pk, readColumn(sstable, pk), analyzer, queryTerms)).iterator();
271+
var it = keys.stream()
272+
.map(pk -> DocTF.createFromDocument(pk, readColumn(sstable, pk), analyzer, queryTerms))
273+
.filter(Objects::nonNull)
274+
.iterator();
271275
return bm25Internal(CloseableIterator.wrap(it), queryTerms, documentFrequencies);
272276
}
273277

src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import com.google.common.annotations.VisibleForTesting;
3434
import com.google.common.base.Preconditions;
3535
import com.google.common.collect.Iterators;
36+
import com.google.common.collect.Streams;
3637
import com.google.common.util.concurrent.Runnables;
3738

3839
import org.apache.cassandra.cql3.Operator;
@@ -322,14 +323,10 @@ public List<CloseableIterator<PrimaryKeyWithSortKey>> orderBy(QueryContext query
322323
// Compute BM25 scores
323324
var docStats = computeDocumentFrequencies(queryContext, queryTerms);
324325
var analyzer = indexContext.getAnalyzerFactory().create();
325-
var it = Iterators.filter(Iterators.transform(intersectedIterator, pk ->
326-
{
327-
Cell<?> cellForKey = getCellForKey(pk);
328-
if (cellForKey == null)
329-
// skip deleted rows
330-
return null;
331-
return BM25Utils.DocTF.createFromDocument(pk, cellForKey, analyzer, queryTerms);
332-
}), Objects::nonNull);
326+
var it = Streams.stream(intersectedIterator)
327+
.map(pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
328+
.filter(Objects::nonNull)
329+
.iterator();
333330

334331
return List.of(BM25Utils.computeScores(CloseableIterator.wrap(it),
335332
queryTerms,
@@ -395,13 +392,10 @@ public CloseableIterator<PrimaryKeyWithSortKey> orderResultsBy(QueryContext quer
395392
var analyzer = indexContext.getAnalyzerFactory().create();
396393
var queryTerms = orderer.getQueryTerms();
397394
var docStats = computeDocumentFrequencies(queryContext, queryTerms);
398-
var it = keys.stream().map(pk -> {
399-
Cell<?> cellForKey = getCellForKey(pk);
400-
if (cellForKey == null)
401-
// skip deleted rows
402-
return null;
403-
return BM25Utils.DocTF.createFromDocument(pk, cellForKey, analyzer, queryTerms);
404-
}).filter(Objects::nonNull).iterator();
395+
var it = keys.stream()
396+
.map(pk -> BM25Utils.DocTF.createFromDocument(pk, getCellForKey(pk), analyzer, queryTerms))
397+
.filter(Objects::nonNull)
398+
.iterator();
405399
return BM25Utils.computeScores(CloseableIterator.wrap(it),
406400
queryTerms,
407401
docStats,

src/java/org/apache/cassandra/index/sai/utils/BM25Utils.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
import java.util.List;
2828
import java.util.Map;
2929

30+
import javax.annotation.Nullable;
31+
3032
import org.apache.cassandra.db.memtable.Memtable;
3133
import org.apache.cassandra.db.rows.Cell;
3234
import org.apache.cassandra.index.sai.IndexContext;
@@ -78,12 +80,14 @@ public int getTermFrequency(ByteBuffer term)
7880
return frequencies.getOrDefault(term, 0);
7981
}
8082

83+
@Nullable
8184
public static DocTF createFromDocument(PrimaryKey pk,
82-
Cell<?> cell,
83-
AbstractAnalyzer docAnalyzer,
84-
Collection<ByteBuffer> queryTerms)
85+
Cell<?> cell,
86+
AbstractAnalyzer docAnalyzer,
87+
Collection<ByteBuffer> queryTerms)
8588
{
86-
assert cell != null : "Cannot find a cell for pk " + pk;
89+
if (cell == null)
90+
return null;
8791

8892
int count = 0;
8993
Map<ByteBuffer, Integer> frequencies = new HashMap<>();
@@ -103,6 +107,10 @@ public static DocTF createFromDocument(PrimaryKey pk,
103107
docAnalyzer.end();
104108
}
105109

110+
// Every query term must be present in the document
111+
if (queryTerms.size() > frequencies.size())
112+
return null;
113+
106114
return new DocTF(pk, count, frequencies);
107115
}
108116
}

test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,12 @@ private void createSimpleTable()
416416

417417
private String createAnalyzedIndex()
418418
{
419-
return createIndex("CREATE CUSTOM INDEX ON %s(v) " +
419+
return createAnalyzedIndex("v");
420+
}
421+
422+
private String createAnalyzedIndex(String column)
423+
{
424+
return createIndex("CREATE CUSTOM INDEX ON %s(" + column + ") " +
420425
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
421426
"WITH OPTIONS = {" +
422427
"'index_analyzer': '{" +
@@ -628,4 +633,63 @@ public void cannotHaveAggregationOnBM25Query()
628633
.isInstanceOf(InvalidRequestException.class)
629634
.hasMessage(SelectStatement.TOPK_AGGREGATION_ERROR);
630635
}
636+
637+
@Test
638+
public void testBM25andFilterz() throws Throwable
639+
{
640+
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, title text, body text)");
641+
createAnalyzedIndex("body");
642+
createIndex("CREATE CUSTOM INDEX ON %s (score) USING 'StorageAttachedIndex'");
643+
insertArticle();
644+
beforeAndAfterFlush(
645+
() -> {
646+
// 10 docs have score 3 and 3 of those have "health"
647+
var result = execute("SELECT * FROM %s WHERE score = 3 ORDER BY body BM25 OF ? LIMIT 10",
648+
"health");
649+
assertThat(result).hasSize(3);
650+
651+
// 4 docs have score 2 and one of those has "discussed"
652+
result = execute("SELECT * FROM %s WHERE score = 2 ORDER BY body BM25 OF ? LIMIT 10",
653+
"discussed");
654+
assertThat(result).hasSize(1);
655+
});
656+
}
657+
658+
private void insertArticle()
659+
{
660+
Object[][] dataset = {
661+
{ 1, "Climate", 5, "Climate change is a pressing issue. Climate patterns are shifting globally. Scientists study climate data daily." },
662+
{ 2, "Technology", 3, "Technology is advancing. New technology in AI and robotics is groundbreaking." },
663+
{ 3, "Economy", 4, "The economy is recovering. Economy experts are optimistic. However, the global economy still faces risks." },
664+
{ 4, "Health", 3, "Health is wealth. Health policies need to be improved to ensure better public health outcomes." },
665+
{ 5, "Education", 2, "Education is the foundation of success. Online education is booming." },
666+
{ 6, "Climate", 4, "Climate and health are closely linked. Climate affects air quality and health outcomes." },
667+
{ 7, "Education", 3, "Technology and education go hand in hand. EdTech is revolutionizing education through technology." },
668+
{ 8, "Economy", 3, "The global economy is influenced by technology. Fintech is a key part of the economy today." },
669+
{ 9, "Health", 3, "Education and health programs must be prioritized. Health education is vital in schools." },
670+
{ 10, "Mixed", 3, "Technology, economy, and education are pillars of development." },
671+
{ 11, "Climate", 5, "Climate climate climate. It's everywhere. Climate drives political and economic decisions." },
672+
{ 12, "Health", 2, "Health concerns rise with climate issues. Health organizations are sounding the alarm." },
673+
{ 13, "Economy", 3, "The economy is fluctuating. Uncertainty looms over the economy." },
674+
{ 14, "Health", 3, "Cutting-edge technology is transforming healthcare. Healthtech merges health and technology." },
675+
{ 15, "Education", 2, "Education reforms are underway. Education experts suggest holistic changes." },
676+
{ 16, "Climate", 4, "Climate affects the economy and health. Climate events cost billions annually." },
677+
{ 17, "Technology", 3, "Technology is the backbone of the modern economy. Without technology, economic growth stagnates." },
678+
{ 18, "Health", 2, "Health is discussed less than economy or climate, but health matters deeply." },
679+
{ 19, "Climate", 5, "Climate change, climate policies, climate research—climate is the buzzword of our time." },
680+
{ 20, "Mixed", 3, "Investments in education and technology will shape the future of the global economy." }
681+
};
682+
683+
for (Object[] article : dataset)
684+
{
685+
execute(
686+
"INSERT INTO %s (id, category, score, body) VALUES (?, ?, ?, ?)",
687+
article[0],
688+
article[1],
689+
article[2],
690+
article[3]
691+
);
692+
}
693+
}
694+
631695
}

0 commit comments

Comments
 (0)