Merged (changes from 7 commits)
docs/reference/analysis/analyzers/lang-analyzer.asciidoc (1 addition, 2 deletions)

@@ -1430,8 +1430,7 @@ PUT /persian_example
             "decimal_digit",
             "arabic_normalization",
             "persian_normalization",
-            "persian_stop",
-            "persian_stem"
+            "persian_stop"
           ]
         }
       }
PersianAnalyzerProvider.java

@@ -9,24 +9,72 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.fa.PersianCharFilter;
+import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
 import org.elasticsearch.index.analysis.Analysis;
 
+import java.io.Reader;
+
 public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<PersianAnalyzer> {
 
-    private final PersianAnalyzer analyzer;
+    private final Analyzer analyzer;
 
     PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(name, settings);
-        analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
+            // since Lucene 10 this analyzer contains stemming by default
+            analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        } else {
+            // for older index versions we need the old analyzer behaviour without stemming
+            analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) {
+
+                protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
+                    final Tokenizer source = new StandardTokenizer();
+                    TokenStream result = new LowerCaseFilter(source);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    /*
+                     * the order here is important: the stopword list is normalized with the
+                     * above!
+                     */
+                    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+                }
+
+                protected TokenStream normalize(String fieldName, TokenStream in) {
+                    TokenStream result = new LowerCaseFilter(in);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    return result;
+                }
+
+                protected Reader initReader(String fieldName, Reader reader) {
+                    return new PersianCharFilter(reader);
+                }
+            };
+        }
     }
 
     @Override
-    public PersianAnalyzer get() {
+    public Analyzer get() {
         return this.analyzer;
     }
 }
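Note (not part of the diff): the version check above pins indices created before UPGRADE_TO_LUCENE_10_0_0 to the legacy, non-stemming chain, while newer indices get Lucene's stock PersianAnalyzer, which now stems. A minimal standalone sketch of the new default behaviour, assuming Lucene 10 analysis classes on the classpath; the expected token mirrors the YAML test further down:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PersianStemDemo {
    public static void main(String[] args) throws Exception {
        try (Analyzer analyzer = new PersianAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "كتابها")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // on Lucene 10: "كتاب" (plural suffix stemmed away)
            }
            ts.end();
        }
    }
}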
PersianAnalyzerProviderTests.java (new file)

@@ -0,0 +1,76 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.index.IndexVersionUtils;

import java.io.IOException;

/**
 * Tests the Persian analyzer factory and its behavioural change with Lucene 10
 */
public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase {

    public void testPersianAnalyzerPostLucene10() throws IOException {
        IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
            IndexVersion.current()
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" });
    }

    public void testPersianAnalyzerPreLucene10() throws IOException {
        IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersionUtils.getFirstVersion(),
            IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" });
    }
}
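Note (not part of the diff): the expected tokens are asserted in their normalized forms, because PersianNormalizationFilter folds Farsi Yeh (ی) to Arabic Yeh (ي) and Keheh (ک) to Kaf (ك); that is why the pre-Lucene-10 test expects "زيادي" (normalized only) while the post-Lucene-10 test expects "زياد" (normalized and stemmed). The legacy chain can be reproduced outside the provider with Lucene's CustomAnalyzer; a sketch under that assumption, with the stop filter omitted so a single word can be inspected directly:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory;
import org.apache.lucene.analysis.core.DecimalDigitFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilterFactory;
import org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LegacyPersianChainDemo {
    public static void main(String[] args) throws Exception {
        // Same filter chain as the anonymous StopwordAnalyzerBase above, minus the stop filter.
        Analyzer legacy = CustomAnalyzer.builder()
            .addCharFilter(PersianCharFilterFactory.class)
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(DecimalDigitFilterFactory.class)
            .addTokenFilter(ArabicNormalizationFilterFactory.class)
            .addTokenFilter(PersianNormalizationFilterFactory.class)
            .build();
        try (TokenStream ts = legacy.tokenStream("field", "زیادی")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // "زيادي": normalized but not stemmed
            }
            ts.end();
        }
    }
}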
20_analyzers.yml (analysis-common YAML REST tests)

@@ -901,6 +901,31 @@
     - length: { tokens: 1 }
     - match: { tokens.0.token: خورد }
 
+---
+"persian stemming":
+    - requires:
+        cluster_features: ["lucene_10_upgrade"]
+        reason: "test requires persian analyzer stemming capabilities that come with Lucene 10"
+
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                analyzer:
+                  my_analyzer:
+                    type: persian
+
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: كتابها
+            analyzer: my_analyzer
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: كتاب }
+
 ---
 "portuguese":
     - do:
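Note (not part of the diff): the same check, driven from Java against a running cluster via the low-level REST client, might look like this sketch (the index and analyzer names are taken from the YAML test above; the helper name is hypothetical):

import java.io.IOException;

import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class AnalyzeSmokeCheck {
    // Mirrors the YAML test: on a Lucene 10 cluster the response should hold
    // exactly one token, "كتاب".
    static Response analyzeKetabha(RestClient client) throws IOException {
        Request analyze = new Request("GET", "/test/_analyze");
        analyze.setJsonEntity("{\"analyzer\": \"my_analyzer\", \"text\": \"كتابها\"}");
        return client.performRequest(analyze);
    }
}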
FullClusterRestartIT.java

@@ -34,6 +34,7 @@
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.mapper.DateFieldMapper;
 import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction;
+import org.elasticsearch.search.SearchFeatures;
 import org.elasticsearch.test.NotEqualMessageBuilder;
 import org.elasticsearch.test.XContentTestUtils;
 import org.elasticsearch.test.cluster.ElasticsearchCluster;

@@ -1726,6 +1727,106 @@ public void testSystemIndexMetadataIsUpgraded() throws Exception {
         }
     }
 
+    /**
+     * This test ensures that search results on old indices using the "persian" analyzer don't
+     * change after we introduce Lucene 10
+     */
+    public void testPersianAnalyzerBWC() throws Exception {
+        var originalClusterLegacyPersianAnalyzer = oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE) == false;
+        assumeTrue("Don't run this test if both versions already support stemming", originalClusterLegacyPersianAnalyzer);
+        final String indexName = "test_persian_stemmer";
+        Settings idxSettings = indexSettings(1, 1).build();
+        String mapping = """
+            {
+              "properties": {
+                "textfield" : {
+                  "type": "text",
+                  "analyzer": "persian"
+                }
+              }
+            }
+            """;
+
+        String query = """
+            {
+              "query": {
+                "match": {
+                  "textfield": "كتابها"
+                }
+              }
+            }
+            """;
+
+        if (isRunningAgainstOldCluster()) {
+            createIndex(client(), indexName, idxSettings, mapping);
+            ensureGreen(indexName);
+
+            assertOK(
+                client().performRequest(
+                    newXContentRequest(
+                        HttpMethod.POST,
+                        "/" + indexName + "/" + "_doc/1",
+                        (builder, params) -> builder.field("textfield", "كتابها")
+                    )
+                )
+            );
+            assertOK(
+                client().performRequest(
+                    newXContentRequest(
+                        HttpMethod.POST,
+                        "/" + indexName + "/" + "_doc/2",
+                        (builder, params) -> builder.field("textfield", "كتاب")
+                    )
+                )
+            );
+            refresh(indexName);
+
+            assertNumHits(indexName, 2, 1);
+
+            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
+            searchRequest.setJsonEntity(query);
+            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));
+        } else {
+            // old index should still only return one doc
+            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
+            searchRequest.setJsonEntity(query);
+            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));
+
+            String newIndexName = indexName + "_new";
+            createIndex(client(), newIndexName, idxSettings, mapping);
+            ensureGreen(newIndexName);
+
+            assertOK(
+                client().performRequest(
+                    newXContentRequest(
+                        HttpMethod.POST,
+                        "/" + newIndexName + "/" + "_doc/1",
+                        (builder, params) -> builder.field("textfield", "كتابها")
+                    )
+                )
+            );
+            assertOK(
+                client().performRequest(
+                    newXContentRequest(
+                        HttpMethod.POST,
+                        "/" + newIndexName + "/" + "_doc/2",
+                        (builder, params) -> builder.field("textfield", "كتاب")
+                    )
+                )
+            );
+            refresh(newIndexName);
+
+            searchRequest = new Request("POST", "/" + newIndexName + "/_search");
+            searchRequest.setJsonEntity(query);
+            assertTotalHits(2, entityAsMap(client().performRequest(searchRequest)));
+
+            // searching both indices (old and new analysis version) we should get 1 hit from the old and 2 from the new index
+            searchRequest = new Request("POST", "/" + indexName + "," + newIndexName + "/_search");
+            searchRequest.setJsonEntity(query);
+            assertTotalHits(3, entityAsMap(client().performRequest(searchRequest)));
+        }
+    }
+
     /**
      * This test ensures that soft deletes are enabled when upgrading a pre-8 cluster to 8.0+
      */
AnalyzerProvider.java

@@ -12,12 +12,12 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.elasticsearch.injection.guice.Provider;
 
-public interface AnalyzerProvider<T extends Analyzer> extends Provider<T> {
+public interface AnalyzerProvider<T extends Analyzer> extends Provider<Analyzer> {
 
     String name();
 
     AnalyzerScope scope();
 
     @Override
-    T get();
+    Analyzer get();
 }
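Note (not part of the diff): the generic bound is relaxed because, with get() returning T, a provider declared as AnalyzerProvider<PersianAnalyzer> could not hand back the anonymous legacy analyzer; returning the supertype Analyzer can. A toy sketch of the before/after contracts, in which KeywordAnalyzer stands in for the anonymous legacy analyzer:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;

class RelaxedProviderSketch {
    interface OldStyleProvider<T extends Analyzer> {
        T get(); // must return exactly T
    }

    interface NewStyleProvider<T extends Analyzer> {
        Analyzer get(); // may return any Analyzer
    }

    // Compiles only against the relaxed contract: the legacy branch does not
    // produce a PersianAnalyzer.
    static NewStyleProvider<PersianAnalyzer> bwcProvider(boolean legacy) {
        return () -> legacy ? new KeywordAnalyzer() : new PersianAnalyzer();
    }
}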
RestFeatures.java

@@ -21,6 +21,7 @@
 import static org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter.UNIFIED_HIGHLIGHTER_MATCHED_FIELDS;
 
 public class RestFeatures implements FeatureSpecification {
+
     @Override
     public Set<NodeFeature> getFeatures() {
         return Set.of(
SearchFeatures.java

@@ -16,8 +16,11 @@
 import java.util.Set;
 
 public final class SearchFeatures implements FeatureSpecification {
+
+    public static final NodeFeature LUCENE_10_0_0_UPGRADE = new NodeFeature("lucene_10_upgrade");
+
     @Override
     public Set<NodeFeature> getFeatures() {
-        return Set.of(KnnVectorQueryBuilder.K_PARAM_SUPPORTED);
+        return Set.of(KnnVectorQueryBuilder.K_PARAM_SUPPORTED, LUCENE_10_0_0_UPGRADE);
     }
 }
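Note (not part of the diff): the string id is the contract between the layers. The YAML test requires cluster_features: ["lucene_10_upgrade"], and the full-cluster-restart test checks oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE); both resolve to the NodeFeature id declared here. A trivial sketch, assuming NodeFeature is the simple record whose id accessor returns that string:

import org.elasticsearch.features.NodeFeature;

public class FeatureIdCheck {
    public static void main(String[] args) {
        NodeFeature lucene10 = new NodeFeature("lucene_10_upgrade");
        // must match the YAML `cluster_features` entry character for character
        System.out.println(lucene10.id());
    }
}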