Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,13 @@
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.tartarus.snowball.ext.ArmenianStemmer;
import org.tartarus.snowball.ext.BasqueStemmer;
Expand Down Expand Up @@ -82,10 +86,14 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {

/** Placeholder stream handed to {@code create(...)} by the constructor to validate the configured language. */
private static final TokenStream EMPTY_TOKEN_STREAM = new EmptyTokenStream();

/**
 * Stemmer language, resolved in the constructor from the {@code language} setting (falling back to
 * {@code name}, then to {@code "porter"}) and capitalized. Assigned exactly once, hence {@code final}.
 */
private final String language;

/** Version the owning index was created with, as reported by {@code IndexSettings#getIndexVersionCreated()}. */
private final IndexVersion version;

/** Logger used to emit deprecation warnings for stemmer names that should no longer be used (e.g. {@code german2}). */
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class);

StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
super(name, settings);
this.version = indexSettings.getIndexVersionCreated();
this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
// check that we have a valid language by trying to create a TokenStream
create(EMPTY_TOKEN_STREAM).close();
Expand Down Expand Up @@ -167,7 +175,12 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("german2".equalsIgnoreCase(language)) {
// Backwards compatibility: the 'german2' stemmer has been folded into the 'german' stemmer, so warn and use GermanStemmer.
DEPRECATION_LOGGER.critical(
DeprecationCategory.ANALYSIS,
"german2_stemmer_deprecation",
"The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. "
+ "Replace all usages of 'german2' with 'german'."
);
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
Expand All @@ -16,6 +17,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand Down Expand Up @@ -103,4 +105,40 @@ public void testMultipleLanguagesThrowsException() throws IOException {
);
assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage());
}

/**
 * Verifies that, for an index created at or after the Lucene 10 upgrade, both the {@code german} and the
 * {@code german2} stemmer variants reduce "Buecher" and "Bücher" to "Buch", and that using
 * {@code german2} produces the expected deprecation warning.
 */
public void testGermanAndGerman2Stemmer() throws IOException {
    // Any index version from the Lucene 10 upgrade up to the current one must behave the same way.
    final IndexVersion createdVersion = IndexVersionUtils.randomVersionBetween(
        random(),
        IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
        IndexVersion.current()
    );
    final String text = "Buecher Bücher";

    final Analyzer germanAnalyzer = createGermanStemmer("german", createdVersion);
    assertAnalyzesTo(germanAnalyzer, text, new String[] { "Buch", "Buch" });

    final Analyzer german2Analyzer = createGermanStemmer("german2", createdVersion);
    assertAnalyzesTo(german2Analyzer, text, new String[] { "Buch", "Buch" });

    // Only the 'german2' variant is deprecated, so exactly this warning is expected.
    assertWarnings(
        "The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. "
            + "Replace all usages of 'german2' with 'german'."
    );
}

/**
 * Builds a test analysis containing a {@code stemmer} token filter named {@code my_german} for the given
 * language variant, plus an analyzer of the same name that combines the whitespace tokenizer with that filter.
 * Also asserts that the registered filter factory is a {@link StemmerTokenFilterFactory} and that the
 * stream it creates is a {@link SnowballFilter}.
 *
 * @param variant value for the filter's {@code language} setting (e.g. {@code "german"} or {@code "german2"})
 * @param v       index version to record as the index's creation version
 * @return the {@code my_german} analyzer from the created index analyzers
 * @throws IOException if creating the test analysis fails
 */
public Analyzer createGermanStemmer(String variant, IndexVersion v) throws IOException {
    Settings.Builder builder = Settings.builder();
    builder.put("index.analysis.filter.my_german.type", "stemmer");
    builder.put("index.analysis.filter.my_german.language", variant);
    builder.put("index.analysis.analyzer.my_german.tokenizer", "whitespace");
    builder.put("index.analysis.analyzer.my_german.filter", "my_german");
    builder.put(SETTING_VERSION_CREATED, v);
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());

    ESTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build(), PLUGIN);

    // The configured filter must resolve to the stemmer factory under test.
    TokenFilterFactory stemmerFactory = testAnalysis.tokenFilter.get("my_german");
    assertThat(stemmerFactory, instanceOf(StemmerTokenFilterFactory.class));

    // Both supported German variants are expected to be backed by a Snowball filter.
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("Buecher oder Bücher"));
    assertThat(stemmerFactory.create(source), instanceOf(SnowballFilter.class));

    return testAnalysis.indexAnalyzers.get("my_german");
}
}