Skip to content

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,13 @@
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.tartarus.snowball.ext.ArmenianStemmer;
import org.tartarus.snowball.ext.BasqueStemmer;
Expand Down Expand Up @@ -82,10 +86,14 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {

private static final TokenStream EMPTY_TOKEN_STREAM = new EmptyTokenStream();

private String language;
private final String language;
private final IndexVersion version;

private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class);

StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
super(name, settings);
this.version = indexSettings.getIndexVersionCreated();
this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
// check that we have a valid language by trying to create a TokenStream
create(EMPTY_TOKEN_STREAM).close();
Expand Down Expand Up @@ -165,9 +173,19 @@ public TokenStream create(TokenStream tokenStream) {

// German stemmers
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
if (this.version.onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else {
// use the pre-Lucene 10 GermanStemmer, which doesn't normalize umlauts etc.
return new SnowballFilter(tokenStream, new LegacyGermanStemmer());
}
} else if ("german2".equalsIgnoreCase(language)) {
// TODO Lucene 10 upgrade: how about bw comp for users relying on german2 stemmer that is now folded into german stemmer?
DEPRECATION_LOGGER.warn(
DeprecationCategory.ANALYSIS,
"german2_stemmer_deprecation",
"The 'german2' stemmer has been deprecated and folged into the 'german' Stemmer. "
+ "Replace all usages of 'german2' with 'german'."
);
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

/**
* Base class to test {@link WordDelimiterTokenFilterFactory} and
* {@link WordDelimiterGraphTokenFilterFactory}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
private static final String RESOURCE = "/org/elasticsearch/analysis/common/cjk_analysis.json";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import java.io.StringReader;
import java.util.Arrays;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {

public void testParseTokenChars() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.nio.file.Files;
import java.nio.file.Path;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CommonGramsTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
Settings settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.io.StringReader;
import java.text.ParseException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.containsString;

public class ESSolrSynonymParserTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.io.StringReader;
import java.text.ParseException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.containsString;

public class ESWordnetSynonymParserTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class EdgeNGramTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testDefault() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import java.io.StringReader;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {

private static IndexAnalyzers buildAnalyzers(IndexVersion version, String tokenizer) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.test.ESTokenStreamTestCase;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {

public void testFingerprint() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class FlattenGraphTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testBasic() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.hamcrest.Matchers.instanceOf;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class LimitTokenCountFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
Settings settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertStreamHasNumberOfTokens;

public class MinHashFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
int default_hash_count = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.io.IOException;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {

public void testMultiplexingFilter() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class NGramTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testDefault() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import java.io.StringReader;
import java.util.Arrays;

import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
Expand Down Expand Up @@ -183,6 +183,9 @@ public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
}
}
assertWarnings(
"The [side] parameter is deprecated and will be removed. Use a [reverse] before and after the [edge_ngram] instead."
);
}

/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class PathHierarchyTokenizerFactoryTests extends ESTokenStreamTestCase {

public void testDefaults() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
import java.util.Arrays;
import java.util.regex.Pattern;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData;

/**
* Verifies the behavior of PatternAnalyzer.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.elasticsearch.test.ESTestCase.createTestAnalysis;
import static org.hamcrest.Matchers.containsString;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase {

public void testNormalizer() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.io.IOException;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class RemoveDuplicatesFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.elasticsearch.test.ESTokenStreamTestCase;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class SnowballAnalyzerTests extends ESTokenStreamTestCase {

public void testEnglish() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.util.List;
import java.util.Locale;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class StemmerOverrideTokenFilterFactoryTests extends ESTokenStreamTestCase {
@Rule
public ExpectedException expectedException = ExpectedException.none();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
Expand All @@ -16,6 +17,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand All @@ -27,7 +29,7 @@
import java.io.IOException;
import java.io.StringReader;

import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_VERSION_CREATED;
import static org.hamcrest.Matchers.instanceOf;

Expand Down Expand Up @@ -103,4 +105,58 @@ public void testMultipleLanguagesThrowsException() throws IOException {
);
assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage());
}

/**
 * Compares the {@code german} and {@code german2} stemmer variants on the input
 * {@code "Buecher Bücher"} for index versions before and on/after
 * {@link IndexVersions#UPGRADE_TO_LUCENE_10_0_0}, and asserts the {@code german2}
 * deprecation warning.
 */
public void testGermanVsGerman2Stemmer() throws IOException {
    final String text = "Buecher Bücher";

    // Index created before the Lucene 10 upgrade: "german" yields "Buech" for "Buecher",
    // while "german2" stems both spellings to "Buch".
    IndexVersion beforeUpgrade = IndexVersionUtils.randomVersionBetween(
        random(),
        IndexVersionUtils.getFirstVersion(),
        IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
    );
    Analyzer legacyGerman = createGermanStemmer("german", beforeUpgrade);
    assertAnalyzesTo(legacyGerman, text, new String[] { "Buech", "Buch" });
    Analyzer legacyGerman2 = createGermanStemmer("german2", beforeUpgrade);
    assertAnalyzesTo(legacyGerman2, text, new String[] { "Buch", "Buch" });

    // Index created on or after the Lucene 10 upgrade: both variants produce the same stems.
    IndexVersion sinceUpgrade = IndexVersionUtils.randomVersionBetween(
        random(),
        IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
        IndexVersion.current()
    );
    Analyzer currentGerman = createGermanStemmer("german", sinceUpgrade);
    assertAnalyzesTo(currentGerman, text, new String[] { "Buch", "Buch" });
    Analyzer currentGerman2 = createGermanStemmer("german2", sinceUpgrade);
    assertAnalyzesTo(currentGerman2, text, new String[] { "Buch", "Buch" });

    // Requesting the "german2" variant is expected to emit this deprecation warning.
    assertWarnings(
        "The 'german2' stemmer has been deprecated and folged into the 'german' Stemmer. "
            + "Replace all usages of 'german2' with 'german'."
    );
}

/**
 * Builds a test analysis containing a {@code stemmer} token filter named {@code my_german}
 * and a whitespace-tokenized analyzer of the same name that uses it.
 *
 * @param variant value for the filter's {@code language} setting
 * @param v       value for the index-created version setting
 * @return the {@code my_german} analyzer from the created test analysis
 * @throws IOException declared by the test-analysis setup
 */
public Analyzer createGermanStemmer(String variant, IndexVersion v) throws IOException {
    Settings stemmerSettings = Settings.builder()
        .put("index.analysis.filter.my_german.type", "stemmer")
        .put("index.analysis.filter.my_german.language", variant)
        .put("index.analysis.analyzer.my_german.tokenizer", "whitespace")
        .put("index.analysis.analyzer.my_german.filter", "my_german")
        .put(SETTING_VERSION_CREATED, v)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(stemmerSettings, PLUGIN);

    // Sanity checks: the registered factory is the stemmer factory and it produces a SnowballFilter.
    TokenFilterFactory stemmerFactory = testAnalysis.tokenFilter.get("my_german");
    assertThat(stemmerFactory, instanceOf(StemmerTokenFilterFactory.class));
    Tokenizer whitespace = new WhitespaceTokenizer();
    whitespace.setReader(new StringReader("Buecher oder Bücher"));
    TokenStream stemmed = stemmerFactory.create(whitespace);
    assertThat(stemmed, instanceOf(SnowballFilter.class));

    return testAnalysis.indexAnalyzers.get("my_german");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.elasticsearch.test.ESTestCase.createTestAnalysis;

public class StopAnalyzerTests extends ESTokenStreamTestCase {
Expand Down
Loading