Skip to content

Commit 9c29462

Browse files
Authored by laminelam (Lamine Idjeraoui) and andrross (Andrew Ross)

Handle dependencies between analyzers (#19248)
Fix "synonym_graph filter fails with word_delimiter_graph when using whitespace or classic tokenizer in synonym_analyzer" bug. Use automatic dependency detection via a topological sort based on Kahn's algorithm.

Signed-off-by: Lamine Idjeraoui <lidjeraoui@apple.com>
Signed-off-by: Andrew Ross <andrross@amazon.com>
Co-authored-by: Lamine Idjeraoui <lidjeraoui@apple.com>
Co-authored-by: Andrew Ross <andrross@amazon.com>
1 parent 47a7402 commit 9c29462

File tree

10 files changed

+388
-30
lines changed

10 files changed

+388
-30
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5858
- Delegate getMin/getMax methods for ExitableTerms ([#20775](https://github.com/opensearch-project/OpenSearch/pull/20775))
5959
- Fix terms lookup subquery fetch limit reading from non-existent index setting instead of cluster `max_clause_count` ([#20823](https://github.com/opensearch-project/OpenSearch/pull/20823))
6060
- Fix array_index_out_of_bounds_exception with wildcard and aggregations ([#20842](https://github.com/opensearch-project/OpenSearch/pull/20842))
61+
- Handle dependencies between analyzers ([#19248](https://github.com/opensearch-project/OpenSearch/pull/19248))
6162

6263
### Dependencies
6364
- Bump shadow-gradle-plugin from 8.3.9 to 9.3.1 ([#20569](https://github.com/opensearch-project/OpenSearch/pull/20569))

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
7272
List<TokenFilterFactory> previousTokenFilters,
7373
Function<String, TokenFilterFactory> allFilters
7474
) {
75-
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
75+
return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
76+
}
77+
78+
@Override
79+
public TokenFilterFactory getChainAwareTokenFilterFactory(
80+
TokenizerFactory tokenizer,
81+
List<CharFilterFactory> charFilters,
82+
List<TokenFilterFactory> previousTokenFilters,
83+
Function<String, TokenFilterFactory> allFilters,
84+
Function<String, Analyzer> analyzersBuiltSoFar
85+
) {
86+
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
7687
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
7788
final String name = name();
7889
return new TokenFilterFactory() {

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
114114
List<TokenFilterFactory> previousTokenFilters,
115115
Function<String, TokenFilterFactory> allFilters
116116
) {
117-
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
117+
return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
118+
}
119+
120+
@Override
121+
public TokenFilterFactory getChainAwareTokenFilterFactory(
122+
TokenizerFactory tokenizer,
123+
List<CharFilterFactory> charFilters,
124+
List<TokenFilterFactory> previousTokenFilters,
125+
Function<String, TokenFilterFactory> allFilters,
126+
Function<String, Analyzer> analyzersBuiltSoFar
127+
) {
128+
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
118129
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
119130
final String name = name();
120131
return new TokenFilterFactory() {
@@ -147,10 +158,15 @@ Analyzer buildSynonymAnalyzer(
147158
TokenizerFactory tokenizer,
148159
List<CharFilterFactory> charFilters,
149160
List<TokenFilterFactory> tokenFilters,
150-
Function<String, TokenFilterFactory> allFilters
161+
Function<String, TokenFilterFactory> allFilters,
162+
Function<String, Analyzer> analyzersBuiltSoFar
151163
) {
152164
if (synonymAnalyzerName != null) {
153-
Analyzer customSynonymAnalyzer;
165+
// first, check settings analyzers
166+
Analyzer customSynonymAnalyzer = analyzersBuiltSoFar.apply(synonymAnalyzerName);
167+
if (customSynonymAnalyzer != null) {
168+
return customSynonymAnalyzer;
169+
}
154170
try {
155171
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
156172
} catch (IOException e) {

modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,9 @@
4949

5050
import java.io.IOException;
5151
import java.io.StringReader;
52+
import java.util.Arrays;
5253
import java.util.Collections;
5354

54-
import static org.hamcrest.Matchers.containsString;
55-
import static org.hamcrest.Matchers.hasToString;
56-
5755
public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
5856

5957
private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
@@ -99,7 +97,14 @@ public void testPreConfiguredTokenizer() throws IOException {
9997
IllegalArgumentException.class,
10098
() -> buildAnalyzers(VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT), "edgeNGram")
10199
);
102-
assertThat(e, hasToString(containsString("The [edgeNGram] tokenizer name was deprecated pre 1.0.")));
100+
101+
assertTrue(
102+
"expected deprecation message in suppressed causes",
103+
Arrays.stream(e.getSuppressed())
104+
.map(org.opensearch.ExceptionsHelper::unwrapCause)
105+
.map(Throwable::getMessage)
106+
.anyMatch(msg -> msg.contains("The [edgeNGram] tokenizer name was deprecated pre 1.0."))
107+
);
103108
}
104109
}
105110

modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ public void testSynonymWordDeleteByAnalyzer() throws IOException {
120120
fail("fail! due to synonym word deleted by analyzer");
121121
} catch (Exception e) {
122122
assertThat(e, instanceOf(IllegalArgumentException.class));
123-
assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
123+
assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerWithStopSynonymBeforeSynonym]"));
124124
}
125125
}
126126

@@ -141,7 +141,7 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
141141
fail("fail! due to synonym word deleted by analyzer");
142142
} catch (Exception e) {
143143
assertThat(e, instanceOf(IllegalArgumentException.class));
144-
assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
144+
assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerExpandWithStopBeforeSynonym]"));
145145
}
146146
}
147147

@@ -269,7 +269,7 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
269269
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
270270
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
271271
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
272-
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
272+
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null);
273273

274274
try (TokenStream ts = analyzer.tokenStream("field", "text")) {
275275
assertThat(ts, instanceOf(KeywordTokenizer.class));
@@ -350,7 +350,7 @@ public void testDisallowedTokenFilters() throws IOException {
350350
IllegalArgumentException e = expectThrows(
351351
IllegalArgumentException.class,
352352
"Expected IllegalArgumentException for factory " + factory,
353-
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
353+
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null)
354354
);
355355

356356
assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
@@ -443,4 +443,91 @@ public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
443443
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
444444
}
445445
}
446+
447+
/**
448+
* Test the core dependency resolution issue from GitHub #18037:
449+
* synonym_graph with custom synonym_analyzer should work even when
450+
* the main analyzer contains word_delimiter_graph that would normally
451+
* cause "cannot be used to parse synonyms" error.
452+
*
453+
* This test intentionally declares the dependent analyzer before the
454+
* synonym analyzer to ensure the system resolves dependencies
455+
* automatically without requiring a manual "order" setting.
456+
*/
457+
public void testSynonymAnalyzerDependencyResolution() throws IOException {
458+
Settings settings = Settings.builder()
459+
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
460+
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
461+
462+
// Declare dependent analyzer FIRST intentionally
463+
.put("index.analysis.analyzer.main_analyzer.type", "custom")
464+
.put("index.analysis.analyzer.main_analyzer.tokenizer", "standard")
465+
.putList("index.analysis.analyzer.main_analyzer.filter", "lowercase", "test_word_delimiter", "test_synonyms")
466+
467+
// Problematic filter for synonym parsing
468+
.put("index.analysis.filter.test_word_delimiter.type", "word_delimiter_graph")
469+
.put("index.analysis.filter.test_word_delimiter.generate_word_parts", true)
470+
471+
// Synonym filter referencing another analyzer
472+
.put("index.analysis.filter.test_synonyms.type", "synonym_graph")
473+
.putList("index.analysis.filter.test_synonyms.synonyms", "laptop,notebook")
474+
.put("index.analysis.filter.test_synonyms.synonym_analyzer", "simple_synonym_analyzer")
475+
476+
// Define the synonym analyzer AFTER the main analyzer
477+
.put("index.analysis.analyzer.simple_synonym_analyzer.type", "custom")
478+
.put("index.analysis.analyzer.simple_synonym_analyzer.tokenizer", "standard")
479+
.build();
480+
481+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);
482+
483+
// Should succeed with the fix (would fail before due to registration order)
484+
IndexAnalyzers analyzers = new AnalysisModule(
485+
TestEnvironment.newEnvironment(settings),
486+
Collections.singletonList(new CommonAnalysisModulePlugin())
487+
).getAnalysisRegistry().build(idxSettings);
488+
489+
assertNotNull("main_analyzer should be created", analyzers.get("main_analyzer"));
490+
assertNotNull("simple_synonym_analyzer should be created", analyzers.get("simple_synonym_analyzer"));
491+
}
492+
493+
/**
494+
* Verifies that circular synonym_analyzer dependencies are detected
495+
* and rejected during analyzer construction.
496+
*/
497+
public void testCircularSynonymAnalyzerDependency() throws IOException {
498+
Settings settings = Settings.builder()
499+
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
500+
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
501+
502+
// Analyzer A depends on B
503+
.put("index.analysis.analyzer.analyzer_a.type", "custom")
504+
.put("index.analysis.analyzer.analyzer_a.tokenizer", "standard")
505+
.putList("index.analysis.analyzer.analyzer_a.filter", "syn_filter_a")
506+
507+
.put("index.analysis.filter.syn_filter_a.type", "synonym_graph")
508+
.putList("index.analysis.filter.syn_filter_a.synonyms", "foo,bar")
509+
.put("index.analysis.filter.syn_filter_a.synonym_analyzer", "analyzer_b")
510+
511+
// Analyzer B depends on A
512+
.put("index.analysis.analyzer.analyzer_b.type", "custom")
513+
.put("index.analysis.analyzer.analyzer_b.tokenizer", "standard")
514+
.putList("index.analysis.analyzer.analyzer_b.filter", "syn_filter_b")
515+
516+
.put("index.analysis.filter.syn_filter_b.type", "synonym_graph")
517+
.putList("index.analysis.filter.syn_filter_b.synonyms", "baz,qux")
518+
.put("index.analysis.filter.syn_filter_b.synonym_analyzer", "analyzer_a")
519+
520+
.build();
521+
522+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);
523+
524+
IllegalArgumentException e = expectThrows(
525+
IllegalArgumentException.class,
526+
() -> new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin()))
527+
.getAnalysisRegistry()
528+
.build(idxSettings)
529+
);
530+
531+
assertThat(e.getMessage(), startsWith("Circular analyzer dependency"));
532+
}
446533
}

0 commit comments

Comments (0)