|
28 | 28 |
|
29 | 29 | import java.nio.ByteBuffer;
|
30 | 30 | import java.util.ArrayList;
|
| 31 | +import java.util.Arrays; |
31 | 32 | import java.util.HashMap;
|
32 | 33 | import java.util.List;
|
| 34 | +import java.util.stream.Collectors; |
33 | 35 |
|
34 | 36 | import com.google.common.base.Charsets;
|
35 | 37 | import org.junit.Test;
|
|
39 | 41 | import org.apache.cassandra.utils.ByteBufferUtil;
|
40 | 42 | import org.apache.lucene.analysis.Analyzer;
|
41 | 43 | import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
| 44 | +import org.apache.lucene.analysis.en.EnglishAnalyzer; |
42 | 45 |
|
43 | 46 | import static org.junit.Assert.assertArrayEquals;
|
| 47 | +import static org.junit.Assert.assertEquals; |
| 48 | +import static org.junit.Assert.assertFalse; |
44 | 49 |
|
45 | 50 | public class LuceneAnalyzerTest
|
46 | 51 | {
|
@@ -152,6 +157,37 @@ public void testStopwordWithSpace() throws Exception
|
152 | 157 | assertArrayEquals(new String[]{}, list.toArray(new String[0]));
|
153 | 158 | }
|
154 | 159 |
|
| 160 | + @Test |
| 161 | + public void testStopwordWithoutArgsDefaultsToEnglish() throws Exception |
| 162 | + { |
| 163 | + String json = "{\"tokenizer\":{\"name\" : \"whitespace\"}," + |
| 164 | + "\"filters\":[{\"name\":\"stop\"}]}"; |
| 165 | + // Assert that when we do not pass any arguments to the stop filter, it defaults to english, and |
| 166 | + // show this by joining the stop words together and asserting it produces no tokens |
| 167 | + String testString = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.stream() |
| 168 | + .map(s -> ((char[]) s)) |
| 169 | + .map(String::new) |
| 170 | + .reduce((a, b) -> a + " " + b) |
| 171 | + .get(); |
| 172 | + assertFalse(testString.isEmpty()); |
| 173 | + List<String> list = tokenize(testString, json); |
| 174 | + assertArrayEquals(new String[]{}, list.toArray(new String[0])); |
| 175 | + |
| 176 | + // Let's also confirm the stop words are the expected ones. (We rely on this set for some indexes, so if |
| 177 | + // it were to change (it shouldn't), that would create inconsistencies in existing indexes which means |
| 178 | + // we want to know before any changes get released.) |
| 179 | + var expectedStopWords = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", |
| 180 | + "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", |
| 181 | + "their", "then", "there", "these", "they", "this", "to", "was", "will", |
| 182 | + "with"); |
| 183 | + var actualStopWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.stream() |
| 184 | + .map(s -> ((char[]) s)) |
| 185 | + .map(String::new) |
| 186 | + .sorted() |
| 187 | + .collect(Collectors.toList()); |
| 188 | + assertEquals(expectedStopWords, actualStopWords); |
| 189 | + } |
| 190 | + |
155 | 191 | @Test(expected = InvalidRequestException.class)
|
156 | 192 | public void testMissingSynonymArg() throws Exception
|
157 | 193 | {
|
|
0 commit comments