diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc index 881970787f5a6..5273537389e3d 100644 --- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc @@ -1430,8 +1430,7 @@ PUT /persian_example "decimal_digit", "arabic_normalization", "persian_normalization", - "persian_stop", - "persian_stem" + "persian_stop" ] } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java index 9ea3a9fa4eee9..e9f5171180204 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianAnalyzerProvider.java @@ -9,24 +9,72 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.StopwordAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import org.apache.lucene.analysis.core.DecimalDigitFilter; import org.apache.lucene.analysis.fa.PersianAnalyzer; +import org.apache.lucene.analysis.fa.PersianCharFilter; +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; import org.elasticsearch.index.analysis.Analysis; +import java.io.Reader; + public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider { - private final PersianAnalyzer analyzer; + private final Analyzer analyzer; PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(name, settings); - analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())); + if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) { + // since Lucene 10 this analyzer contains stemming by default + analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())); + } else { + // for older index versions we need the old analyzer behaviour without stemming + analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) { + + protected Analyzer.TokenStreamComponents createComponents(String fieldName) { + final Tokenizer source = new StandardTokenizer(); + TokenStream result = new LowerCaseFilter(source); + result = new DecimalDigitFilter(result); + result = new ArabicNormalizationFilter(result); + /* additional persian-specific normalization */ + result = new PersianNormalizationFilter(result); + /* + * the order here is important: the stopword list is normalized with the + * above! + */ + return new TokenStreamComponents(source, new StopFilter(result, stopwords)); + } + + protected TokenStream normalize(String fieldName, TokenStream in) { + TokenStream result = new LowerCaseFilter(in); + result = new DecimalDigitFilter(result); + result = new ArabicNormalizationFilter(result); + /* additional persian-specific normalization */ + result = new PersianNormalizationFilter(result); + return result; + } + + protected Reader initReader(String fieldName, Reader reader) { + return new PersianCharFilter(reader); + } + }; + } } @Override - public PersianAnalyzer get() { + public Analyzer get() { return this.analyzer; } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java new file mode 100644 index 0000000000000..e9d68955b6a57 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PersianAnalyzerProviderTests.java @@ -0,0 +1,76 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.Analyzer; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.IndexVersions; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.index.IndexVersionUtils; + +import java.io.IOException; + +/** + * Tests Persian Analyzer factory and behavioural changes with Lucene 10 + */ +public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase { + + public void testPersianAnalyzerPostLucene10() throws IOException { + IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersions.UPGRADE_TO_LUCENE_10_0_0, + IndexVersion.current() + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = persianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" }); + } + + public void testPersianAnalyzerPreLucene10() throws IOException { + IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersionUtils.getFirstVersion(), + IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0) + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = persianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" }); + } +} diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml index 7674b95af4851..0fc9387226343 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml @@ -901,6 +901,31 @@ - length: { tokens: 1 } - match: { tokens.0.token: خورد } +--- +"persian stemming": + - requires: + cluster_features: ["persian_analyzer_with_stemmer"] + reason: "test requires persian analyzer stemming capabilities that come with Lucene 10" + + - do: + indices.create: + index: test + body: + settings: + analysis: + analyzer: + my_analyzer: + type: persian + + - do: + indices.analyze: + index: test + body: + text: كتابها + analyzer: my_analyzer + - length: { tokens: 1 } + - match: { tokens.0.token: كتاب } + --- "portuguese": - do: diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/search.query/80_persian_search.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/search.query/80_persian_search.yml new file mode 100644 index 0000000000000..0fc7d17bba6be --- /dev/null +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/search.query/80_persian_search.yml @@ -0,0 +1,44 @@ +# integration tests for persian analyzer changes from Lucene 9 to Lucene 10 +setup: + - do: + indices.create: + index: test + body: + mappings: + properties: + text: + type: text + analyzer: persian + +--- +"persian search": + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test", "_id": "1"}}' + - '{"text" : "كتابها"}' + - '{"index": {"_index": "test", "_id": "2"}}' + - '{"text" : "كتاب"}' + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + text: + query: كتابها + - match: { hits.total: 2 } + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + text: + query: كتاب + - match: { hits.total: 2 } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index 462490a7fceb7..2be590333034e 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -54,6 +54,7 @@ import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.synonyms.PagedResult; import org.elasticsearch.synonyms.SynonymRule; import org.elasticsearch.synonyms.SynonymsManagementAPIService; @@ -83,6 +84,8 @@ public class Analysis { private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(Analysis.class); private static final Logger logger = LogManager.getLogger(Analysis.class); + public static final NodeFeature PERSIAN_ANALYZER_WITH_STEMMER = new NodeFeature("persian_analyzer_with_stemmer"); + public static void checkForDeprecatedVersion(String name, Settings settings) { String sVersion = settings.get("version"); if (sVersion != null) { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerProvider.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerProvider.java index 757c22ca7d8b5..ad48fa96d5468 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerProvider.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalyzerProvider.java @@ -12,12 +12,12 @@ import org.apache.lucene.analysis.Analyzer; import org.elasticsearch.injection.guice.Provider; -public interface AnalyzerProvider extends Provider { +public interface AnalyzerProvider extends Provider { String name(); AnalyzerScope scope(); @Override - T get(); + Analyzer get(); } diff --git a/server/src/main/java/org/elasticsearch/rest/RestFeatures.java b/server/src/main/java/org/elasticsearch/rest/RestFeatures.java index 8d546f7aa43f8..4595214430952 100644 --- a/server/src/main/java/org/elasticsearch/rest/RestFeatures.java +++ b/server/src/main/java/org/elasticsearch/rest/RestFeatures.java @@ -18,6 +18,7 @@ import java.util.Map; import java.util.Set; +import static org.elasticsearch.index.analysis.Analysis.PERSIAN_ANALYZER_WITH_STEMMER; import static org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter.UNIFIED_HIGHLIGHTER_MATCHED_FIELDS; public class RestFeatures implements FeatureSpecification { @@ -26,7 +27,8 @@ public Set getFeatures() { return Set.of( RestNodesCapabilitiesAction.CAPABILITIES_ACTION, RestNodesCapabilitiesAction.LOCAL_ONLY_CAPABILITIES, - UNIFIED_HIGHLIGHTER_MATCHED_FIELDS + UNIFIED_HIGHLIGHTER_MATCHED_FIELDS, + PERSIAN_ANALYZER_WITH_STEMMER ); }