diff --git a/docs/changelog/113482.yaml b/docs/changelog/113482.yaml new file mode 100644 index 0000000000000..f16f183ea05c6 --- /dev/null +++ b/docs/changelog/113482.yaml @@ -0,0 +1,27 @@ +pr: 113482 +summary: The 'persian' analyzer has stemmer by default +area: Analysis +type: breaking +issues: +- 113050 +breaking: + title: The 'persian' analyzer has stemmer by default + area: Analysis + details: >- + Lucene 10 has added a final stemming step to its PersianAnalyzer that we + expose as 'persian' analyzer. Existing indices will keep the old + non-stemming behaviour while new indices will see the updated behaviour with + added stemming. + Users that want to maintain the non-stemming behaviour need to define their + own analyzer as outlines in + https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer. + Users that want to use the new stemming behaviour for existing indices will + have to reindex their data. + impact: >- + Indexing with the 'persian' analyzer will produce slightly different token. + Users should check if this impacts their search results. If they want to + maintain the legacy non-stemming behaviour they can define their own + analyzer equivalent as explained in + https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer. + notable: false + diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java index 9ea3a9fa4eee9..917a45188123c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java @@ -9,24 +9,72 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.StopwordAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import org.apache.lucene.analysis.core.DecimalDigitFilter; import org.apache.lucene.analysis.fa.PersianAnalyzer; +import org.apache.lucene.analysis.fa.PersianCharFilter; +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; import org.elasticsearch.index.analysis.Analysis; -public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider { +import java.io.Reader; - private final PersianAnalyzer analyzer; +public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final StopwordAnalyzerBase analyzer; PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(name, settings); - analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())); + if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) { + // since Lucene 10 this analyzer contains stemming by default + analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())); + } else { + // for older index versions we need the old analyzer behaviour without stemming + analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) { + + protected Analyzer.TokenStreamComponents createComponents(String fieldName) { + final Tokenizer source = new StandardTokenizer(); + TokenStream result = new LowerCaseFilter(source); + result = new DecimalDigitFilter(result); + result = new ArabicNormalizationFilter(result); + /* additional persian-specific normalization */ + result = new PersianNormalizationFilter(result); + /* + * the order here is important: the stopword list is normalized with the + * above! + */ + return new TokenStreamComponents(source, new StopFilter(result, stopwords)); + } + + protected TokenStream normalize(String fieldName, TokenStream in) { + TokenStream result = new LowerCaseFilter(in); + result = new DecimalDigitFilter(result); + result = new ArabicNormalizationFilter(result); + /* additional persian-specific normalization */ + result = new PersianNormalizationFilter(result); + return result; + } + + protected Reader initReader(String fieldName, Reader reader) { + return new PersianCharFilter(reader); + } + }; + } } @Override - public PersianAnalyzer get() { + public StopwordAnalyzerBase get() { return this.analyzer; } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java new file mode 100644 index 0000000000000..7b962538c2a10 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java @@ -0,0 +1,78 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.Analyzer; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.IndexVersions; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.index.IndexVersionUtils; + +import java.io.IOException; + +import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo; + +/** + * Tests Persian Analyzer factory and behavioural changes with Lucene 10 + */ +public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase { + + public void testPersianAnalyzerPostLucene10() throws IOException { + IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersions.UPGRADE_TO_LUCENE_10_0_0, + IndexVersion.current() + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = persianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" }); + } + + public void testPersianAnalyzerPreLucene10() throws IOException { + IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersionUtils.getFirstVersion(), + IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0) + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = persianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" }); + } +} diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml index 7674b95af4851..8930e485aa249 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml @@ -901,6 +901,31 @@ - length: { tokens: 1 } - match: { tokens.0.token: خورد } +--- +"persian stemming": + - requires: + cluster_features: ["lucene_10_upgrade"] + reason: "test requires persian analyzer stemming capabilities that come with Lucene 10" + + - do: + indices.create: + index: test + body: + settings: + analysis: + analyzer: + my_analyzer: + type: persian + + - do: + indices.analyze: + index: test + body: + text: كتابها + analyzer: my_analyzer + - length: { tokens: 1 } + - match: { tokens.0.token: كتاب } + --- "portuguese": - do: diff --git a/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java b/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java index ee18f8fc2ec4b..43ed1755317f6 100644 --- a/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java +++ b/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java @@ -33,6 +33,7 @@ import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.DateFieldMapper; import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction; +import org.elasticsearch.search.SearchFeatures; import org.elasticsearch.test.NotEqualMessageBuilder; import org.elasticsearch.test.XContentTestUtils; import org.elasticsearch.test.cluster.ElasticsearchCluster; @@ -1726,6 +1727,106 @@ public void testSystemIndexMetadataIsUpgraded() throws Exception { } } + /** + * This test ensures that search results on old indices using "persian" analyzer don't change + * after we introduce Lucene 10 + */ + public void testPersianAnalyzerBWC() throws Exception { + var originalClusterLegacyPersianAnalyzer = oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE) == false; + assumeTrue("Don't run this test if both versions already support stemming", originalClusterLegacyPersianAnalyzer); + final String indexName = "test_persian_stemmer"; + Settings idxSettings = indexSettings(1, 1).build(); + String mapping = """ + { + "properties": { + "textfield" : { + "type": "text", + "analyzer": "persian" + } + } + } + """; + + String query = """ + { + "query": { + "match": { + "textfield": "كتابها" + } + } + } + """; + + if (isRunningAgainstOldCluster()) { + createIndex(client(), indexName, idxSettings, mapping); + ensureGreen(indexName); + + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + indexName + "/" + "_doc/1", + (builder, params) -> builder.field("textfield", "كتابها") + ) + ) + ); + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + indexName + "/" + "_doc/2", + (builder, params) -> builder.field("textfield", "كتاب") + ) + ) + ); + refresh(indexName); + + assertNumHits(indexName, 2, 1); + + Request searchRequest = new Request("POST", "/" + indexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(1, entityAsMap(client().performRequest(searchRequest))); + } else { + // old index should still only return one doc + Request searchRequest = new Request("POST", "/" + indexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(1, entityAsMap(client().performRequest(searchRequest))); + + String newIndexName = indexName + "_new"; + createIndex(client(), newIndexName, idxSettings, mapping); + ensureGreen(newIndexName); + + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + newIndexName + "/" + "_doc/1", + (builder, params) -> builder.field("textfield", "كتابها") + ) + ) + ); + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + newIndexName + "/" + "_doc/2", + (builder, params) -> builder.field("textfield", "كتاب") + ) + ) + ); + refresh(newIndexName); + + searchRequest = new Request("POST", "/" + newIndexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(2, entityAsMap(client().performRequest(searchRequest))); + + // searching both indices (old and new analysis version) we should get 1 hit from the old and 2 from the new index + searchRequest = new Request("POST", "/" + indexName + "," + newIndexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(3, entityAsMap(client().performRequest(searchRequest))); + } + } + /** * This test ensures that soft deletes are enabled a when upgrading a pre-8 cluster to 8.0+ */