Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/changelog/113614.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
pr: 113614
summary: The 'german2' stemmer is now an alias for the 'german' snowball stemmer
area: Analysis
type: breaking
issues: []
breaking:
title: The "german2" snowball stemmer is now an alias for the "german" stemmer
area: Analysis
details: >-
Lucene 10 has merged the improved "german2" snowball language stemmer with the
"german" stemmer. For Elasticsearch, "german2" is now a deprecated alias for
"german". This may result in slightly different tokens being generated for
terms with umlaut substitution (like "ue" for "ü", etc.).
impact: >-
Replace usages of "german2" with "german" in analysis configuration. Old
indices that use the "german" stemmer should be reindexed if possible.
notable: false

Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ http://bvg.udc.es/recursos_lingua/stemming.jsp[`minimal_galician`] (Plural step
German::
https://dl.acm.org/citation.cfm?id=1141523[*`light_german`*],
https://snowballstem.org/algorithms/german/stemmer.html[`german`],
https://snowballstem.org/algorithms/german2/stemmer.html[`german2`],
http://members.unine.ch/jacques.savoy/clef/morpho.pdf[`minimal_german`]

Greek::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {

private final String language;

private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class);

StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
super(name, settings);
this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
Expand Down Expand Up @@ -190,7 +192,12 @@ public boolean incrementToken() {
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("german2".equalsIgnoreCase(language)) {
// TODO Lucene 10 upgrade: how about bw comp for users relying on german2 stemmer that is now folded into german stemmer?
DEPRECATION_LOGGER.critical(
DeprecationCategory.ANALYSIS,
"german2_stemmer_deprecation",
"The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. "
+ "Replace all usages of 'german2' with 'german'."
);
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
Expand All @@ -16,6 +17,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand Down Expand Up @@ -103,6 +105,42 @@ public void testMultipleLanguagesThrowsException() throws IOException {
assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage());
}

/**
 * Checks that both the "german" and the "german2" stemmer variants reduce
 * "Buecher" and "Bücher" to the same token, and that a deprecation warning
 * for "german2" has been recorded by the end of the test.
 */
public void testGermanAndGerman2Stemmer() throws IOException {
    final IndexVersion createdVersion = IndexVersionUtils.randomVersionBetween(
        random(),
        IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
        IndexVersion.current()
    );
    final String input = "Buecher Bücher";
    final String[] expectedTokens = new String[] { "Buch", "Buch" };

    // "german" is exercised first, then its alias "german2"; both must yield identical tokens.
    for (String variant : new String[] { "german", "german2" }) {
        Analyzer stemmingAnalyzer = createGermanStemmer(variant, createdVersion);
        assertAnalyzesTo(stemmingAnalyzer, input, expectedTokens);
    }

    final String expectedWarning = "The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. "
        + "Replace all usages of 'german2' with 'german'.";
    assertWarnings(expectedWarning);
}

/**
 * Builds a test analysis whose "my_german" analyzer combines a whitespace
 * tokenizer with a "stemmer" token filter configured for the given language.
 * Before returning, it asserts that the registered filter factory is a
 * {@link StemmerTokenFilterFactory} and that it produces a {@link SnowballFilter}.
 *
 * @param variant value for the stemmer filter's "language" setting
 * @param v index version written to the index-created setting
 * @return the analyzer registered under the name "my_german"
 * @throws IOException declared by the signature; presumably raised while setting up the test analysis
 */
private static Analyzer createGermanStemmer(String variant, IndexVersion v) throws IOException {
    Settings indexSettings = Settings.builder()
        .put("index.analysis.filter.my_german.type", "stemmer")
        .put("index.analysis.filter.my_german.language", variant)
        .put("index.analysis.analyzer.my_german.tokenizer", "whitespace")
        .put("index.analysis.analyzer.my_german.filter", "my_german")
        .put(SETTING_VERSION_CREATED, v)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(indexSettings, PLUGIN);

    // Sanity-check the filter wiring: right factory type, and a snowball-based stream comes out of it.
    TokenFilterFactory stemmerFactory = testAnalysis.tokenFilter.get("my_german");
    assertThat(stemmerFactory, instanceOf(StemmerTokenFilterFactory.class));

    Tokenizer whitespaceSource = new WhitespaceTokenizer();
    whitespaceSource.setReader(new StringReader("Buecher oder Bücher"));
    TokenStream stemmedStream = stemmerFactory.create(whitespaceSource);
    assertThat(stemmedStream, instanceOf(SnowballFilter.class));

    return testAnalysis.indexAnalyzers.get("my_german");
}

public void testKpDeprecation() throws IOException {
IndexVersion v = IndexVersionUtils.randomVersion(random());
Settings settings = Settings.builder()
Expand Down