Skip to content

Commit d72d0f2

Browse files
michaeljmarshalldriftx
authored and committed
SAI: test that default stop words are the ENGLISH_STOP_WORDS_SET (#1697)
### What is the issue
No issue was created, as this change only adds a new test.

### What does this PR fix and why was it fixed
We implicitly rely on the default stop words being Lucene's `ENGLISH_STOP_WORDS_SET` and are exposing this as a feature to our end users, so we want to confirm that the stop words are the expected stop words.
1 parent 825ecaf commit d72d0f2

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

test/unit/org/apache/cassandra/index/sai/analyzer/LuceneAnalyzerTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@
2828

2929
import java.nio.ByteBuffer;
3030
import java.util.ArrayList;
31+
import java.util.Arrays;
3132
import java.util.HashMap;
3233
import java.util.List;
34+
import java.util.stream.Collectors;
3335

3436
import com.google.common.base.Charsets;
3537
import org.junit.Test;
@@ -39,8 +41,11 @@
3941
import org.apache.cassandra.utils.ByteBufferUtil;
4042
import org.apache.lucene.analysis.Analyzer;
4143
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
44+
import org.apache.lucene.analysis.en.EnglishAnalyzer;
4245

4346
import static org.junit.Assert.assertArrayEquals;
47+
import static org.junit.Assert.assertEquals;
48+
import static org.junit.Assert.assertFalse;
4449

4550
public class LuceneAnalyzerTest
4651
{
@@ -152,6 +157,37 @@ public void testStopwordWithSpace() throws Exception
152157
assertArrayEquals(new String[]{}, list.toArray(new String[0]));
153158
}
154159

160+
@Test
161+
public void testStopwordWithoutArgsDefaultsToEnglish() throws Exception
162+
{
163+
String json = "{\"tokenizer\":{\"name\" : \"whitespace\"}," +
164+
"\"filters\":[{\"name\":\"stop\"}]}";
165+
// Assert that when we do not pass any arguments to the stop filter, it defaults to english, and
166+
// show this by joining the stop words together and asserting it produces no tokens
167+
String testString = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.stream()
168+
.map(s -> ((char[]) s))
169+
.map(String::new)
170+
.reduce((a, b) -> a + " " + b)
171+
.get();
172+
assertFalse(testString.isEmpty());
173+
List<String> list = tokenize(testString, json);
174+
assertArrayEquals(new String[]{}, list.toArray(new String[0]));
175+
176+
// Let's also confirm the stop words are the expected ones. (We rely on this set for some indexes, so if
177+
// it were to change (it shouldn't), that would create inconsistencies in existing indexes which means
178+
// we want to know before any changes get released.)
179+
var expectedStopWords = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
180+
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
181+
"their", "then", "there", "these", "they", "this", "to", "was", "will",
182+
"with");
183+
var actualStopWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.stream()
184+
.map(s -> ((char[]) s))
185+
.map(String::new)
186+
.sorted()
187+
.collect(Collectors.toList());
188+
assertEquals(expectedStopWords, actualStopWords);
189+
}
190+
155191
@Test(expected = InvalidRequestException.class)
156192
public void testMissingSynonymArg() throws Exception
157193
{

0 commit comments

Comments
 (0)