Skip to content

Commit fa52c61

Browse files
k-rusdjatnieks
authored and committed
CNDB-13671 simplify tests with collection condition (#1688)
Removes the unnecessary tiebreaker column, since the word counts give different scores for text with the same term count. One value is changed to remove the possibility of flakiness in the order. Adds comments with the term and word counts to help verify the expected results. A helper function to count them is added. Also adds a small test for the analyzer with lowercase.
1 parent dfb6768 commit fa52c61

File tree

1 file changed

+68
-23
lines changed

1 file changed

+68
-23
lines changed

test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.concurrent.ExecutorService;
2525
import java.util.concurrent.Executors;
2626
import java.util.concurrent.Future;
27+
import java.util.regex.Pattern;
2728
import java.util.stream.Collectors;
2829

2930
import org.assertj.core.api.Assertions;
@@ -675,50 +676,76 @@ public void testErrorMessages()
675676
"SELECT * FROM %s WHERE map_body CONTAINS KEY 'Climate' ORDER BY body BM25 OF ? LIMIT 10");
676677
}
677678

679+
@Test
680+
public void testWithLowercase() throws Throwable
681+
{
682+
createTable("CREATE TABLE %s (id int PRIMARY KEY, body text)");
683+
createAnalyzedIndex("body", true);
684+
execute("INSERT INTO %s (id, body) VALUES (?, ?)", 1, "Hi hi");
685+
execute("INSERT INTO %s (id, body) VALUES (?, ?)", 2, "hi hi longer");
686+
executeQuery(Arrays.asList(1, 2), "SELECT * FROM %s ORDER BY body BM25 OF 'hi' LIMIT 4");
687+
}
688+
678689
@Test
679690
public void testCollections() throws Throwable
680691
{
681-
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, tie int," +
692+
createTable("CREATE TABLE %s (id int PRIMARY KEY, category text, score int, " +
682693
"title text, body text, bodyset set<text>, " +
683694
"map_category map<int, text>, map_body map<text, text>)");
684695
createAnalyzedIndex("body", true);
685696
createAnalyzedIndex("bodyset", true);
686697
createAnalyzedIndex("map_body", true);
687698
createIndex("CREATE CUSTOM INDEX ON %s (score) USING 'StorageAttachedIndex'");
688699
createIndex("CREATE CUSTOM INDEX ON %s (category) USING 'StorageAttachedIndex'");
689-
createIndex("CREATE CUSTOM INDEX ON %s (tie) USING 'StorageAttachedIndex'");
690700
createIndex("CREATE CUSTOM INDEX ON %s (map_category) USING 'StorageAttachedIndex'");
691701
createIndex("CREATE CUSTOM INDEX ON %s (KEYS(map_body)) USING 'StorageAttachedIndex'");
692702
insertCollectionData();
703+
analyzeDataset("climate");
704+
analyzeDataset("health");
693705

694706
beforeAndAfterFlush(
695707
() -> {
696-
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE tie = 1 ORDER BY body BM25 OF ? LIMIT 10",
708+
// ID 11: total words = 12, climate occurrences = 4
709+
// ID 19: total words = 13, climate occurrences = 4
710+
// ID 1: total words = 16, climate occurrences = 3
711+
// ID 16: total words = 11, climate occurrences = 2
712+
// ID 6: total words = 13, climate occurrences = 2
713+
// ID 12: total words = 12, climate occurrences = 1
714+
// ID 18: total words = 14, climate occurrences = 1
715+
executeQuery(Arrays.asList(11, 19, 1, 16, 6, 12, 18), "SELECT * FROM %s ORDER BY body BM25 OF ? LIMIT 10",
697716
"climate");
698-
executeQuery(Arrays.asList(11, 1), "SELECT * FROM %s WHERE score = 5 AND tie = 1 ORDER BY body BM25 OF ? LIMIT 10",
717+
executeQuery(Arrays.asList(11, 19, 1), "SELECT * FROM %s WHERE score = 5 ORDER BY body BM25 OF ? LIMIT 10",
699718
"climate");
700-
executeQuery(Arrays.asList(6, 16), "SELECT * FROM %s WHERE score > 3 ORDER BY body BM25 OF ? LIMIT 10",
701-
"health");
702-
executeQuery(Arrays.asList(4, 18, 14), "SELECT * FROM %s WHERE category = 'Health' AND tie = 1 " +
703-
"ORDER BY body BM25 OF ? LIMIT 10",
704-
"Health");
705-
executeQuery(Arrays.asList(4, 18, 14), "SELECT * FROM %s WHERE score <= 3 AND tie = 1 AND category = 'Health' " +
706-
"ORDER BY body BM25 OF ? LIMIT 10",
707-
"health");
708-
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE bodyset CONTAINS 'climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
719+
executeQuery(Arrays.asList(11, 19, 1, 16, 6, 12, 18), "SELECT * FROM %s WHERE bodyset CONTAINS 'climate' ORDER BY body BM25 OF ? LIMIT 10",
709720
"climate");
710-
executeQuery(Arrays.asList(6, 12), "SELECT * FROM %s WHERE bodyset CONTAINS 'health' AND tie > 1 ORDER BY body BM25 OF ? LIMIT 10",
721+
executeQuery(Arrays.asList(16, 6, 12, 18), "SELECT * FROM %s WHERE bodyset CONTAINS 'health' ORDER BY body BM25 OF ? LIMIT 10",
711722
"climate");
712-
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE map_category CONTAINS 'Climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
723+
executeQuery(Arrays.asList(11, 19, 1, 16, 6, 12, 18), "SELECT * FROM %s WHERE map_category CONTAINS 'Climate' ORDER BY body BM25 OF ? LIMIT 10",
713724
"climate");
714-
executeQuery(Arrays.asList(19, 6, 12), "SELECT * FROM %s WHERE map_category CONTAINS 'Health' AND tie > 1 ORDER BY body BM25 OF ? LIMIT 10",
725+
executeQuery(Arrays.asList(19, 16, 6, 12, 18), "SELECT * FROM %s WHERE map_category CONTAINS 'Health' ORDER BY body BM25 OF ? LIMIT 10",
715726
"climate");
716-
executeQuery(Arrays.asList(11, 1, 16, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'Climate' AND tie <= 1 ORDER BY body BM25 OF ? LIMIT 10",
727+
executeQuery(Arrays.asList(11, 19, 1, 16, 6, 12, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'Climate' ORDER BY body BM25 OF ? LIMIT 10",
717728
"climate");
718-
executeQuery(Arrays.asList(11, 16, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'health' AND tie < 2 ORDER BY body BM25 OF ? LIMIT 10",
729+
executeQuery(Arrays.asList(11, 19, 16, 6, 12, 18), "SELECT * FROM %s WHERE map_body CONTAINS 'health' ORDER BY body BM25 OF ? LIMIT 10",
719730
"climate");
720-
executeQuery(Arrays.asList(19, 6, 12), "SELECT * FROM %s WHERE map_body CONTAINS KEY 'Health' AND tie >= 2 ORDER BY body BM25 OF ? LIMIT 10",
731+
executeQuery(Arrays.asList(11, 19, 16, 6, 12, 18), "SELECT * FROM %s WHERE map_body CONTAINS KEY 'Health' ORDER BY body BM25 OF ? LIMIT 10",
721732
"climate");
733+
734+
// ID 4: total words = 15, health occurrences = 3
735+
// ID 12: total words = 12, health occurrences = 2
736+
// ID 6: total words = 13, health occurrences = 2
737+
// ID 9: total words = 13, health occurrences = 2
738+
// ID 18: total words = 14, health occurrences = 2
739+
// ID 14: total words = 11, health occurrences = 1
740+
// ID 16: total words = 11, health occurrences = 1
741+
executeQuery(Arrays.asList(6, 16), "SELECT * FROM %s WHERE score > 3 ORDER BY body BM25 OF ? LIMIT 10",
742+
"health");
743+
executeQuery(Arrays.asList(4, 12, 9, 18, 14), "SELECT * FROM %s WHERE category = 'Health' " +
744+
"ORDER BY body BM25 OF ? LIMIT 10",
745+
"Health");
746+
executeQuery(Arrays.asList(4, 12, 9, 18, 14), "SELECT * FROM %s WHERE score <= 3 AND category = 'Health' " +
747+
"ORDER BY body BM25 OF ? LIMIT 10",
748+
"health");
722749
});
723750
}
724751

@@ -741,10 +768,29 @@ public void testCollections() throws Throwable
741768
{ 15, "Education", 2, "Education reforms are underway. Education experts suggest holistic changes.", 1 },
742769
{ 16, "Climate", 4, "Climate affects the economy and health. Climate events cost billions annually.", 1 },
743770
{ 17, "Technology", 3, "Technology is the backbone of the modern economy. Without technology, economic growth stagnates.", 2 },
744-
{ 18, "Health", 2, "Health is discussed less than economy or climate, but health matters deeply.", 1 },
771+
{ 18, "Health", 2, "Health is discussed less than economy or climate or technology, but health matters deeply.", 1 },
745772
{ 19, "Climate", 5, "Climate change, climate policies, climate research—climate is the buzzword of our time.", 2 },
746773
{ 20, "Mixed", 3, "Investments in education and technology will shape the future of the global economy.", 1 }
747774
};
775+
776+
private void analyzeDataset(String term)
777+
{
778+
final Pattern PATTERN = Pattern.compile("\\W+");
779+
for (Object[] row : DATASET)
780+
{
781+
String body = (String) row[3];
782+
String[] words = PATTERN.split(body.toLowerCase());
783+
784+
long totalWords = words.length;
785+
long termCount = Arrays.stream(words)
786+
.filter(word -> word.equals(term))
787+
.count();
788+
789+
if (termCount > 0)
790+
System.out.printf(" // ID %d: total words = %d, %s occurrences = %d%n",
791+
(Integer) row[0], totalWords, term, termCount);
792+
}
793+
}
748794

749795
private void insertPrimitiveData()
750796
{
@@ -781,13 +827,12 @@ private void insertCollectionData()
781827
}
782828

783829
execute(
784-
"INSERT INTO %s (id, category, score, body, tie, bodyset, map_category, map_body) " +
785-
"VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
830+
"INSERT INTO %s (id, category, score, body, bodyset, map_category, map_body) " +
831+
"VALUES (?, ?, ?, ?, ?, ?, ?)",
786832
DATASET[row][0],
787833
DATASET[row][1],
788834
DATASET[row][2],
789835
DATASET[row][3],
790-
DATASET[row][4],
791836
set,
792837
map,
793838
map_text

0 commit comments

Comments
 (0)