Skip to content

Commit cb3de82

Browse files
authored
CNDB-13408 add BM25 tests to compare with CNDB (#1663)
Adds CC's unit tests to help test BM25 in CNDB's integration tests, which are expected to produce the same results. Also adds tests that reproduce minor issues with BM25 error messages. Fixes a few warnings in the affected file.
1 parent 460a7c4 commit cb3de82

File tree

1 file changed

+174
-56
lines changed

1 file changed

+174
-56
lines changed

test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java

Lines changed: 174 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
/*
2-
* Licensed to the Apache Software Foundation (ASF) under one
3-
* or more contributor license agreements. See the NOTICE file
4-
* distributed with this work for additional information
5-
* regarding copyright ownership. The ASF licenses this file
6-
* to you under the Apache License, Version 2.0 (the
7-
* "License"); you may not use this file except in compliance
8-
* with the License. You may obtain a copy of the License at
2+
* Copyright DataStax, Inc.
93
*
10-
* http://www.apache.org/licenses/LICENSE-2.0
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
119
*
1210
* Unless required by applicable law or agreed to in writing, software
1311
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -19,10 +17,16 @@
1917
package org.apache.cassandra.index.sai.cql;
2018

2119
import java.util.ArrayList;
20+
import java.util.Arrays;
21+
import java.util.HashMap;
22+
import java.util.HashSet;
23+
import java.util.List;
2224
import java.util.concurrent.ExecutorService;
2325
import java.util.concurrent.Executors;
2426
import java.util.concurrent.Future;
27+
import java.util.stream.Collectors;
2528

29+
import org.assertj.core.api.Assertions;
2630
import org.junit.Before;
2731
import org.junit.Test;
2832

@@ -124,15 +128,15 @@ public void testTwoIndexesAmbiguousPredicate() throws Throwable
124128
// be rejected
125129
beforeAndAfterFlush(() -> {
126130
// Single predicate
127-
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
131+
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, 'v', getIndex(0), getIndex(1)),
128132
"SELECT k FROM %s WHERE v = 'apple'");
129133

130134
// AND
131-
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
135+
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, 'v', getIndex(0), getIndex(1)),
132136
"SELECT k FROM %s WHERE v = 'apple' AND v : 'juice'");
133137

134138
// OR
135-
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
139+
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, 'v', getIndex(0), getIndex(1)),
136140
"SELECT k FROM %s WHERE v = 'apple' OR v : 'juice'");
137141
});
138142
}
@@ -178,14 +182,7 @@ public void testComplexQueriesWithMultipleIndexes() throws Throwable
178182

179183
// Create mix of analyzed, unanalyzed, and non-text indexes
180184
createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
181-
createIndex("CREATE CUSTOM INDEX ON %s(v2) " +
182-
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
183-
"WITH OPTIONS = {" +
184-
"'index_analyzer': '{" +
185-
"\"tokenizer\" : {\"name\" : \"standard\"}, " +
186-
"\"filters\" : [{\"name\" : \"porterstem\"}]" +
187-
"}'" +
188-
"}");
185+
createAnalyzedIndex("v2");
189186
createIndex("CREATE CUSTOM INDEX ON %s(v3) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
190187

191188
execute("INSERT INTO %s (k, v1, v2, v3) VALUES (1, 'apple', 'orange juice', 5)");
@@ -263,10 +260,8 @@ public void testEmptyQuery() throws Throwable
263260
execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
264261

265262
beforeAndAfterFlush(() ->
266-
{
267-
assertInvalidMessage("BM25 query must contain at least one term (perhaps your analyzer is discarding tokens you didn't expect)",
268-
"SELECT k FROM %s ORDER BY v BM25 OF '+' LIMIT 1");
269-
});
263+
assertInvalidMessage("BM25 query must contain at least one term (perhaps your analyzer is discarding tokens you didn't expect)",
264+
"SELECT k FROM %s ORDER BY v BM25 OF '+' LIMIT 1"));
270265
}
271266

272267
@Test
@@ -420,14 +415,20 @@ private String createAnalyzedIndex()
420415
}
421416

422417
private String createAnalyzedIndex(String column)
418+
{
419+
return createAnalyzedIndex(column, false);
420+
}
421+
422+
private String createAnalyzedIndex(String column, boolean lowercase)
423423
{
424424
return createIndex("CREATE CUSTOM INDEX ON %s(" + column + ") " +
425425
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
426426
"WITH OPTIONS = {" +
427427
"'index_analyzer': '{" +
428428
"\"tokenizer\" : {\"name\" : \"standard\"}, " +
429-
"\"filters\" : [{\"name\" : \"porterstem\"}]" +
430-
"}'}"
429+
"\"filters\" : [{\"name\" : \"porterstem\"}" +
430+
(lowercase ? ", {\"name\" : \"lowercase\"}]" : "]")
431+
+ "}'}"
431432
);
432433
}
433434

@@ -640,7 +641,7 @@ public void testBM25andFilterz() throws Throwable
640641
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, title text, body text)");
641642
createAnalyzedIndex("body");
642643
createIndex("CREATE CUSTOM INDEX ON %s (score) USING 'StorageAttachedIndex'");
643-
insertArticle();
644+
insertPrimitiveData();
644645
beforeAndAfterFlush(
645646
() -> {
646647
// 10 docs have score 3 and 3 of those have "health"
@@ -655,41 +656,158 @@ public void testBM25andFilterz() throws Throwable
655656
});
656657
}
657658

658-
private void insertArticle()
659-
{
660-
Object[][] dataset = {
661-
{ 1, "Climate", 5, "Climate change is a pressing issue. Climate patterns are shifting globally. Scientists study climate data daily." },
662-
{ 2, "Technology", 3, "Technology is advancing. New technology in AI and robotics is groundbreaking." },
663-
{ 3, "Economy", 4, "The economy is recovering. Economy experts are optimistic. However, the global economy still faces risks." },
664-
{ 4, "Health", 3, "Health is wealth. Health policies need to be improved to ensure better public health outcomes." },
665-
{ 5, "Education", 2, "Education is the foundation of success. Online education is booming." },
666-
{ 6, "Climate", 4, "Climate and health are closely linked. Climate affects air quality and health outcomes." },
667-
{ 7, "Education", 3, "Technology and education go hand in hand. EdTech is revolutionizing education through technology." },
668-
{ 8, "Economy", 3, "The global economy is influenced by technology. Fintech is a key part of the economy today." },
669-
{ 9, "Health", 3, "Education and health programs must be prioritized. Health education is vital in schools." },
670-
{ 10, "Mixed", 3, "Technology, economy, and education are pillars of development." },
671-
{ 11, "Climate", 5, "Climate climate climate. It's everywhere. Climate drives political and economic decisions." },
672-
{ 12, "Health", 2, "Health concerns rise with climate issues. Health organizations are sounding the alarm." },
673-
{ 13, "Economy", 3, "The economy is fluctuating. Uncertainty looms over the economy." },
674-
{ 14, "Health", 3, "Cutting-edge technology is transforming healthcare. Healthtech merges health and technology." },
675-
{ 15, "Education", 2, "Education reforms are underway. Education experts suggest holistic changes." },
676-
{ 16, "Climate", 4, "Climate affects the economy and health. Climate events cost billions annually." },
677-
{ 17, "Technology", 3, "Technology is the backbone of the modern economy. Without technology, economic growth stagnates." },
678-
{ 18, "Health", 2, "Health is discussed less than economy or climate, but health matters deeply." },
679-
{ 19, "Climate", 5, "Climate change, climate policies, climate research—climate is the buzzword of our time." },
680-
{ 20, "Mixed", 3, "Investments in education and technology will shape the future of the global economy." }
681-
};
682-
683-
for (Object[] article : dataset)
659+
@Test
660+
public void testErrorMessages()
661+
{
662+
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, " +
663+
"title text, body text, bodyset set<text>, " +
664+
"map_category map<int, text>, map_body map<text, text>)");
665+
createAnalyzedIndex("body", true);
666+
createAnalyzedIndex("bodyset", true);
667+
createAnalyzedIndex("map_body", true);
668+
669+
// Improve message issue CNDB-13514
670+
assertInvalidMessage("BM25 ordering on column bodyset requires an analyzed index",
671+
"SELECT * FROM %s ORDER BY bodyset BM25 OF ? LIMIT 10");
672+
673+
// Discussion of message incosistency CNDB-13526
674+
assertInvalidMessage("Ordering on non-clustering column requires each restricted column to be indexed except for fully-specified partition keys",
675+
"SELECT * FROM %s WHERE map_body CONTAINS KEY 'Climate' ORDER BY body BM25 OF ? LIMIT 10");
676+
}
677+
678+
@Test
679+
public void testCollections() throws Throwable
680+
{
681+
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, tie int," +
682+
"title text, body text, bodyset set<text>, " +
683+
"map_category map<int, text>, map_body map<text, text>)");
684+
createAnalyzedIndex("body", true);
685+
createAnalyzedIndex("bodyset", true);
686+
createAnalyzedIndex("map_body", true);
687+
createIndex("CREATE CUSTOM INDEX ON %s (score) USING 'StorageAttachedIndex'");
688+
createIndex("CREATE CUSTOM INDEX ON %s (category) USING 'StorageAttachedIndex'");
689+
createIndex("CREATE CUSTOM INDEX ON %s (tie) USING 'StorageAttachedIndex'");
690+
createIndex("CREATE CUSTOM INDEX ON %s (map_category) USING 'StorageAttachedIndex'");
691+
createIndex("CREATE CUSTOM INDEX ON %s (KEYS(map_body)) USING 'StorageAttachedIndex'");
692+
insertCollectionData();
693+
694+
beforeAndAfterFlush(
695+
() -> {
696+
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE tie = 1 ORDER BY body BM25 OF ? LIMIT 10",
697+
"climate");
698+
executeQuery(Arrays.asList(11, 1), "SELECT * FROM %s WHERE score = 5 AND tie = 1 ORDER BY body BM25 OF ? LIMIT 10",
699+
"climate");
700+
executeQuery(Arrays.asList(6, 16), "SELECT * FROM %s WHERE score > 3 ORDER BY body BM25 OF ? LIMIT 10",
701+
"health");
702+
executeQuery(Arrays.asList(4, 18, 14), "SELECT * FROM %s WHERE category = 'Health' AND tie = 1 " +
703+
"ORDER BY body BM25 OF ? LIMIT 10",
704+
"Health");
705+
executeQuery(Arrays.asList(4, 18, 14), "SELECT * FROM %s WHERE score <= 3 AND tie = 1 AND category = 'Health' " +
706+
"ORDER BY body BM25 OF ? LIMIT 10",
707+
"health");
708+
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE bodyset CONTAINS 'climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
709+
"climate");
710+
executeQuery(Arrays.asList(6, 12), "SELECT * FROM %s WHERE bodyset CONTAINS 'health' AND tie > 1 ORDER BY body BM25 OF ? LIMIT 10",
711+
"climate");
712+
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE map_category CONTAINS 'Climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
713+
"climate");
714+
executeQuery(Arrays.asList(19, 6, 12), "SELECT * FROM %s WHERE map_category CONTAINS 'Health' AND tie > 1 ORDER BY body BM25 OF ? LIMIT 10",
715+
"climate");
716+
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'Climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
717+
"climate");
718+
executeQuery(Arrays.asList(11, 16, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'health' AND tie < 2 ORDER BY body BM25 OF ? LIMIT 10",
719+
"climate");
720+
executeQuery(Arrays.asList(19, 6, 12), "SELECT * FROM %s WHERE map_body CONTAINS KEY 'Health' AND tie >= 2 ORDER BY body BM25 OF ? LIMIT 10",
721+
"climate");
722+
});
723+
}
724+
725+
private final static Object[][] DATASET =
726+
{
727+
{ 1, "Climate", 5, "Climate change is a pressing issue. Climate patterns are shifting globally. Scientists study climate data daily.", 1 },
728+
{ 2, "Technology", 3, "Technology is advancing. New technology in AI and robotics is groundbreaking.", 1 },
729+
{ 3, "Economy", 4, "The economy is recovering. Economy experts are optimistic. However, the global economy still faces risks.", 1 },
730+
{ 4, "Health", 3, "Health is wealth. Health policies need to be improved to ensure better public health outcomes.", 1 },
731+
{ 5, "Education", 2, "Education is the foundation of success. Online education is booming.", 4 },
732+
{ 6, "Climate", 4, "Climate and health are closely linked. Climate affects air quality and health outcomes.", 2 },
733+
{ 7, "Education", 3, "Technology and education go hand in hand. EdTech is revolutionizing education through technology.", 3 },
734+
{ 8, "Economy", 3, "The global economy is influenced by technology. Fintech is a key part of the economy today.", 2 },
735+
{ 9, "Health", 3, "Education and health programs must be prioritized. Health education is vital in schools.", 2 },
736+
{ 10, "Mixed", 3, "Technology, economy, and education are pillars of development.", 2 },
737+
{ 11, "Climate", 5, "Climate climate climate. It's everywhere. Climate drives political and economic decisions.", 1 },
738+
{ 12, "Health", 2, "Health concerns rise with climate issues. Health organizations are sounding the alarm.", 2 },
739+
{ 13, "Economy", 3, "The economy is fluctuating. Uncertainty looms over the economy.", 1 },
740+
{ 14, "Health", 3, "Cutting-edge technology is transforming healthcare. Healthtech merges health and technology.", 1 },
741+
{ 15, "Education", 2, "Education reforms are underway. Education experts suggest holistic changes.", 1 },
742+
{ 16, "Climate", 4, "Climate affects the economy and health. Climate events cost billions annually.", 1 },
743+
{ 17, "Technology", 3, "Technology is the backbone of the modern economy. Without technology, economic growth stagnates.", 2 },
744+
{ 18, "Health", 2, "Health is discussed less than economy or climate, but health matters deeply.", 1 },
745+
{ 19, "Climate", 5, "Climate change, climate policies, climate research—climate is the buzzword of our time.", 2 },
746+
{ 20, "Mixed", 3, "Investments in education and technology will shape the future of the global economy.", 1 }
747+
};
748+
749+
private void insertPrimitiveData()
750+
{
751+
for (Object[] row : DATASET)
684752
{
685753
execute(
686754
"INSERT INTO %s (id, category, score, body) VALUES (?, ?, ?, ?)",
687-
article[0],
688-
article[1],
689-
article[2],
690-
article[3]
755+
row[0],
756+
row[1],
757+
row[2],
758+
row[3]
759+
);
760+
}
761+
}
762+
763+
private void insertCollectionData()
764+
{
765+
int setsize = 1;
766+
for (int row = 0; row < DATASET.length; row++)
767+
{
768+
var set = new HashSet<String>();
769+
for (int j = 0; j < setsize; j++)
770+
set.add((String) DATASET[row - j][3]);
771+
if (setsize >= 3)
772+
setsize -= 2;
773+
else
774+
setsize++;
775+
var map = new HashMap<Integer, String>();
776+
var map_text = new HashMap<String, String>();
777+
for (int j = 0; j <= row && j < 3; j++)
778+
{
779+
map.putIfAbsent((Integer) DATASET[row - j][2], (String) DATASET[row - j][1]);
780+
map_text.putIfAbsent((String) DATASET[row - j][1], (String) DATASET[row - j][3]);
781+
}
782+
783+
execute(
784+
"INSERT INTO %s (id, category, score, body, tie, bodyset, map_category, map_body) " +
785+
"VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
786+
DATASET[row][0],
787+
DATASET[row][1],
788+
DATASET[row][2],
789+
DATASET[row][3],
790+
DATASET[row][4],
791+
set,
792+
map,
793+
map_text
691794
);
692795
}
693796
}
694797

798+
private void executeQuery(List<Integer> expected, String query, Object... values) throws Throwable
799+
{
800+
assertResult(execute(query, values), expected);
801+
prepare(query);
802+
assertResult(execute(query, values), expected);
803+
}
804+
805+
private void assertResult(UntypedResultSet result, List<Integer> expected)
806+
{
807+
Assertions.assertThat(result).hasSize(expected.size());
808+
var ids = result.stream()
809+
.map(row -> row.getInt("id"))
810+
.collect(Collectors.toList());
811+
Assertions.assertThat(ids).isEqualTo(expected);
812+
}
695813
}

0 commit comments

Comments
 (0)