Skip to content

Commit e241221

Browse files
john-wagsterpstrsrelasticmachine
authored
Provide access to new settings for HyphenationCompoundWordTokenFilter (elastic#115585) (elastic#116968)
Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter. Adds access to the two new flags no_sub_matches and no_overlapping_matches. Lucene issue: apache/lucene#9231 Co-authored-by: Peter Straßer <[email protected]> Co-authored-by: Elastic Machine <[email protected]>
1 parent 2511e69 commit e241221

File tree

7 files changed

+1295
-11
lines changed

7 files changed

+1295
-11
lines changed

docs/changelog/115585.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 115459
2+
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
3+
area: Search
4+
type: enhancement
5+
issues:
6+
- 97849

docs/reference/analysis/tokenfilters/hyphenation-decompounder-tokenfilter.asciidoc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,18 @@ output. Defaults to `5`.
111111
(Optional, Boolean)
112112
If `true`, only include the longest matching subword. Defaults to `false`.
113113

114+
`no_sub_matches`::
115+
(Optional, Boolean)
116+
If `true`, do not match sub tokens in tokens that are in the word list.
117+
Defaults to `false`.
118+
119+
`no_overlapping_matches`::
120+
(Optional, Boolean)
121+
If `true`, do not allow overlapping tokens.
122+
Defaults to `false`.
123+
124+
Typically, users will only want to enable one of the three flags, as `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When a more restrictive option is enabled, the state of the less restrictive options has no effect.
125+
114126
[[analysis-hyp-decomp-tokenfilter-customize]]
115127
==== Customize and add to an analyzer
116128

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
*/
2929
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
3030

31+
private final boolean noSubMatches;
32+
private final boolean noOverlappingMatches;
3133
private final HyphenationTree hyphenationTree;
3234

3335
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
@@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
4648
} catch (Exception e) {
4749
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
4850
}
51+
52+
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
53+
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
4954
}
5055

5156
@Override
@@ -57,7 +62,9 @@ public TokenStream create(TokenStream tokenStream) {
5762
minWordSize,
5863
minSubwordSize,
5964
maxSubwordSize,
60-
onlyLongestMatch
65+
onlyLongestMatch,
66+
noSubMatches,
67+
noOverlappingMatches
6168
);
6269
}
6370
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CompoundAnalysisTests.java

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
import org.hamcrest.MatcherAssert;
3232

3333
import java.io.IOException;
34+
import java.io.InputStream;
35+
import java.nio.file.Files;
36+
import java.nio.file.Path;
3437
import java.util.ArrayList;
3538
import java.util.Arrays;
3639
import java.util.List;
@@ -42,6 +45,7 @@
4245
import static org.hamcrest.Matchers.instanceOf;
4346

4447
public class CompoundAnalysisTests extends ESTestCase {
48+
4549
public void testDefaultsCompoundAnalysis() throws Exception {
4650
Settings settings = getJsonSettings();
4751
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
@@ -63,6 +67,44 @@ public void testDictionaryDecompounder() throws Exception {
6367
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
6468
}
6569

70+
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
71+
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
72+
for (Settings settings : settingsArr) {
73+
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
74+
MatcherAssert.assertThat(
75+
terms,
76+
hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
77+
);
78+
}
79+
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
80+
}
81+
82+
/**
83+
* For example given a word list of: ["kaffee", "fee", "maschine"]
84+
* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
85+
*/
86+
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
87+
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
88+
for (Settings settings : settingsArr) {
89+
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
90+
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
91+
}
92+
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
93+
}
94+
95+
/**
96+
* For example given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
97+
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe".
98+
*/
99+
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
100+
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
101+
for (Settings settings : settingsArr) {
102+
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
103+
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
104+
}
105+
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
106+
}
107+
66108
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
67109
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
68110
AnalysisModule analysisModule = createAnalysisModule(settings);
@@ -92,20 +134,25 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
92134
}
93135

94136
private Settings getJsonSettings() throws IOException {
95-
String json = "/org/elasticsearch/analysis/common/test1.json";
96-
return Settings.builder()
97-
.loadFromStream(json, getClass().getResourceAsStream(json), false)
98-
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
99-
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
100-
.build();
137+
return getSettings("/org/elasticsearch/analysis/common/test1.json");
101138
}
102139

103140
private Settings getYamlSettings() throws IOException {
104-
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
141+
return getSettings("/org/elasticsearch/analysis/common/test1.yml");
142+
}
143+
144+
private Settings getSettings(String filePath) throws IOException {
145+
String hypenationRulesFileName = "de_DR.xml";
146+
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
147+
Path home = createTempDir();
148+
Path config = home.resolve("config");
149+
Files.createDirectory(config);
150+
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));
151+
105152
return Settings.builder()
106-
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
153+
.loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
107154
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
108-
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
155+
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
109156
.build();
110157
}
111158
}

0 commit comments

Comments
 (0)