diff --git a/docs/changelog/113614.yaml b/docs/changelog/113614.yaml new file mode 100644 index 0000000000000..bd9dcb3e38772 --- /dev/null +++ b/docs/changelog/113614.yaml @@ -0,0 +1,18 @@ +pr: 113614 +summary: The 'german2' stemmer is now an alias for the 'german' snowball stemmer +area: Analysis +type: breaking +issues: [] +breaking: + title: The "german2" snowball stemmer is now an alias for the "german" stemmer + area: Analysis + details: >- + Lucene 10 has merged the improved "german2" snowball language stemmer with the + "german" stemmer. For Elasticsearch, "german2" is now a deprecated alias for + "german". This may results in slightly different tokens being generated for + terms with umlaut substitution (like "ue" for "ü" etc...) + impact: >- + Replace usages of "german2" with "german" in analysis configuration. Old + indices that use the "german" stemmer should be reindexed if possible. + notable: false + diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index 4cd088935af19..d9e2120afe6d1 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -173,7 +173,6 @@ http://bvg.udc.es/recursos_lingua/stemming.jsp[`minimal_galician`] (Plural step German:: https://dl.acm.org/citation.cfm?id=1141523[*`light_german`*], https://snowballstem.org/algorithms/german/stemmer.html[`german`], -https://snowballstem.org/algorithms/german2/stemmer.html[`german2`], http://members.unine.ch/jacques.savoy/clef/morpho.pdf[`minimal_german`] Greek:: diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index 6b622444980d3..7548c8ad2b88b 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -89,6 +89,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { private final String language; + private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class); + StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { super(name, settings); this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter"))); @@ -190,7 +192,12 @@ public boolean incrementToken() { } else if ("german".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new GermanStemmer()); } else if ("german2".equalsIgnoreCase(language)) { - // TODO Lucene 10 upgrade: how about bw comp for users relying on german2 stemmer that is now folded into german stemmer? + DEPRECATION_LOGGER.critical( + DeprecationCategory.ANALYSIS, + "german2_stemmer_deprecation", + "The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. " + + "Replace all usages of 'german2' with 'german'." + ); return new SnowballFilter(tokenStream, new GermanStemmer()); } else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) { return new GermanLightStemFilter(tokenStream); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index 8f3d52f0174c6..bb06c221873b5 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -8,6 +8,7 @@ */ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; @@ -16,6 +17,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.AnalysisTestsHelper; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -103,6 +105,42 @@ public void testMultipleLanguagesThrowsException() throws IOException { assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage()); } + public void testGermanAndGerman2Stemmer() throws IOException { + IndexVersion v = IndexVersionUtils.randomVersionBetween(random(), IndexVersions.UPGRADE_TO_LUCENE_10_0_0, IndexVersion.current()); + Analyzer analyzer = createGermanStemmer("german", v); + assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch" }); + + analyzer = createGermanStemmer("german2", v); + assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch" }); + assertWarnings( + "The 'german2' stemmer has been deprecated and folded into the 'german' Stemmer. " + + "Replace all usages of 'german2' with 'german'." + ); + } + + private static Analyzer createGermanStemmer(String variant, IndexVersion v) throws IOException { + + Settings settings = Settings.builder() + .put("index.analysis.filter.my_german.type", "stemmer") + .put("index.analysis.filter.my_german.language", variant) + .put("index.analysis.analyzer.my_german.tokenizer", "whitespace") + .put("index.analysis.analyzer.my_german.filter", "my_german") + .put(SETTING_VERSION_CREATED, v) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_german"); + assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader("Buecher oder Bücher")); + TokenStream create = tokenFilter.create(tokenizer); + assertThat(create, instanceOf(SnowballFilter.class)); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; + NamedAnalyzer analyzer = indexAnalyzers.get("my_german"); + return analyzer; + } + public void testKpDeprecation() throws IOException { IndexVersion v = IndexVersionUtils.randomVersion(random()); Settings settings = Settings.builder()