Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/reference/analysis/analyzers/lang-analyzer.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -1430,8 +1430,7 @@ PUT /persian_example
"decimal_digit",
"arabic_normalization",
"persian_normalization",
"persian_stop",
"persian_stem"
"persian_stop"
]
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,72 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

import java.io.Reader;

public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<PersianAnalyzer> {

private final PersianAnalyzer analyzer;
private final Analyzer analyzer;

PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(name, settings);
analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
// since Lucene 10 this analyzer contains stemming by default
analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
} else {
// for older index versions we need the old analyzer behaviour without stemming
analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) {

protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
result = new DecimalDigitFilter(result);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
/*
* the order here is important: the stopword list is normalized with the
* above!
*/
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}

protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new LowerCaseFilter(in);
result = new DecimalDigitFilter(result);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
return result;
}

protected Reader initReader(String fieldName, Reader reader) {
return new PersianCharFilter(reader);
}
};
}
}

@Override
public PersianAnalyzer get() {
public Analyzer get() {
return this.analyzer;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.index.IndexVersionUtils;

import java.io.IOException;

/**
* Tests Persian Analyzer factory and behavioural changes with Lucene 10
*/
/**
 * Tests Persian Analyzer factory and behavioural changes with Lucene 10
 */
public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase {

    // Builds a provider for an index created at the given version and returns its analyzer.
    private Analyzer analyzerForIndexVersion(IndexVersion indexVersion) throws IOException {
        Settings nodeSettings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, indexVersion)
            .build();
        IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", nodeSettings);
        Environment environment = new Environment(nodeSettings, null);
        PersianAnalyzerProvider provider = new PersianAnalyzerProvider(indexSettings, environment, "my-analyzer", Settings.EMPTY);
        return provider.get();
    }

    public void testPersianAnalyzerPostLucene10() throws IOException {
        IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
            IndexVersion.current()
        );
        // on or after the Lucene 10 upgrade the analyzer stems its tokens
        Analyzer analyzer = analyzerForIndexVersion(postLucene10Version);
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" });
    }

    public void testPersianAnalyzerPreLucene10() throws IOException {
        IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersionUtils.getFirstVersion(),
            IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
        );
        // before the Lucene 10 upgrade no stemming is applied, so the suffix is kept
        Analyzer analyzer = analyzerForIndexVersion(preLucene10Version);
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" });
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,31 @@
- length: { tokens: 1 }
- match: { tokens.0.token: خورد }

---
"persian stemming":
- requires:
cluster_features: ["persian_analyzer_with_stemmer"]
reason: "test requires persian analyzer stemming capabilities that come with Lucene 10"

- do:
indices.create:
index: test
body:
settings:
analysis:
analyzer:
my_analyzer:
type: persian

- do:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to test stemming?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then this test will need to be guarded with skip version or something. ES 9 will have mixed cluster tests with 8.last & 9.current and the 8.last won't have the stemming automatically correct?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer a new test that is guarded, that way the original test isn't always skipped.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All that would have made sense to me a few months ago. Now we live in a world where we merge Lucene 10 to "main" at some point which not necessarily is the point in which it becomes 9.0. So this is becoming a bit of a head-scratcher for me. I need to figure out if we can skip based on IndexVersion or not (since that is what we condition the behavioural change on), if we need some new sort of capability voodoo for that etc...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fwiw I'm afraid I might have to introduce a "cluster_feature" for this. Maybe it makes sense to have one for "Cluster runs with Lucene 10".

indices.analyze:
index: test
body:
text: كتابها
analyzer: my_analyzer
- length: { tokens: 1 }
- match: { tokens.0.token: كتاب }

---
"portuguese":
- do:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# integration tests for persian analyzer changes from Lucene 9 to Lucene 10
setup:
Comment on lines +1 to +2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to test with old data, then upgrade, then verify the query results don't change, a rolling upgrade test, or one of the full restart tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I was afraid I'd need that full-blown infra and was somehow hoping I could leverage some yaml test. This at least shows that stemming works in the new version of the analyzer, i.e. both search terms are matching both documents which means they are analyzed to the same root form.

- do:
indices.create:
index: test
body:
mappings:
properties:
text:
type: text
analyzer: persian

---
"persian search":
- do:
bulk:
refresh: true
body:
- '{"index": {"_index": "test", "_id": "1"}}'
- '{"text" : "كتابها"}'
- '{"index": {"_index": "test", "_id": "2"}}'
- '{"text" : "كتاب"}'

- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
text:
query: كتابها
- match: { hits.total: 2 }

- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
text:
query: كتاب
- match: { hits.total: 2 }
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.features.NodeFeature;
import org.elasticsearch.synonyms.PagedResult;
import org.elasticsearch.synonyms.SynonymRule;
import org.elasticsearch.synonyms.SynonymsManagementAPIService;
Expand Down Expand Up @@ -83,6 +84,8 @@ public class Analysis {
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(Analysis.class);
private static final Logger logger = LogManager.getLogger(Analysis.class);

public static final NodeFeature PERSIAN_ANALYZER_WITH_STEMMER = new NodeFeature("persian_analyzer_with_stemmer");

public static void checkForDeprecatedVersion(String name, Settings settings) {
String sVersion = settings.get("version");
if (sVersion != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.injection.guice.Provider;

public interface AnalyzerProvider<T extends Analyzer> extends Provider<T> {
public interface AnalyzerProvider<T extends Analyzer> extends Provider<Analyzer> {

String name();

AnalyzerScope scope();

@Override
T get();
Analyzer get();
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.Map;
import java.util.Set;

import static org.elasticsearch.index.analysis.Analysis.PERSIAN_ANALYZER_WITH_STEMMER;
import static org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter.UNIFIED_HIGHLIGHTER_MATCHED_FIELDS;

public class RestFeatures implements FeatureSpecification {
Expand All @@ -26,7 +27,8 @@ public Set<NodeFeature> getFeatures() {
return Set.of(
RestNodesCapabilitiesAction.CAPABILITIES_ACTION,
RestNodesCapabilitiesAction.LOCAL_ONLY_CAPABILITIES,
UNIFIED_HIGHLIGHTER_MATCHED_FIELDS
UNIFIED_HIGHLIGHTER_MATCHED_FIELDS,
PERSIAN_ANALYZER_WITH_STEMMER
);
}

Expand Down