Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/115585.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 115459
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
area: Search
type: enhancement
issues:
- 97849
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,18 @@ output. Defaults to `5`.
(Optional, Boolean)
If `true`, only include the longest matching subword. Defaults to `false`.

`no_sub_matches`::
(Optional, Boolean)
If `true`, do not match sub tokens in tokens that are in the word list.
Defaults to `false`.

`no_overlapping_matches`::
(Optional, Boolean)
If `true`, do not allow overlapping tokens.
Defaults to `false`.

Typically, you only need to enable one of these three flags: `no_overlapping_matches` is the most restrictive, and `no_sub_matches` is more restrictive than `only_longest_match`. When a more restrictive flag is enabled, the setting of any less restrictive flag has no effect.

[[analysis-hyp-decomp-tokenfilter-customize]]
==== Customize and add to an analyzer

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
*/
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

private final boolean noSubMatches;
private final boolean noOverlappingMatches;
private final HyphenationTree hyphenationTree;

HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
Expand All @@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
} catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
}

noSubMatches = settings.getAsBoolean("no_sub_matches", false);
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
}

@Override
Expand All @@ -57,7 +62,9 @@ public TokenStream create(TokenStream tokenStream) {
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch
onlyLongestMatch,
noSubMatches,
noOverlappingMatches
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
import org.hamcrest.MatcherAssert;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand All @@ -42,6 +45,7 @@
import static org.hamcrest.Matchers.instanceOf;

public class CompoundAnalysisTests extends ESTestCase {

public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
Expand All @@ -63,6 +67,44 @@ public void testDictionaryDecompounder() throws Exception {
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

/**
 * Analyzes a fixed two-word input with the analyzer named
 * "hyphenationDecompoundingAnalyzerOnlyLongestMatch", once for the JSON settings and once for
 * the YAML settings, and checks that the expected tokens are produced.
 */
public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
    final String analyzerName = "hyphenationDecompoundingAnalyzerOnlyLongestMatch";
    final String text = "kaffeemaschine fussballpumpe";
    // Both settings variants are loaded up front, then the same check runs against each one.
    for (Settings candidate : Arrays.asList(getJsonSettings(), getYamlSettings())) {
        List<String> tokens = analyze(candidate, analyzerName, text);
        // hasItems only requires these tokens to be present; extra tokens do not fail the check.
        MatcherAssert.assertThat(
            tokens,
            hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
        );
    }
    // Verified once, after both settings variants have been analyzed.
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

/**
* For example, given a word list of: ["kaffee", "fee", "maschine"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is that the word list being used or is it this: [fuss, fussball, ballpumpe, ball, pumpe, kaffee, fee, maschine]. I was thrown off by the comment but had trouble tracking that through in my head. Same thing on the comment on the subsequent test. The test result makes sense to me and looks good.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, the wordlist contains ["fuss", "fussball", "ballpumpe", "ball", "pumpe", "kaffee", "fee", "maschine"], as defined in test1.json:43. The comment should highlight, that this parameter should solve this specific problem of preventing the match of "fee" (fairy) within "kaffee" (coffee).

I left in the same wordlist for all tests and input text to ensure that they are not any unintended side effect.

If it's clearer I could isolate the tests and only include the Kaffeemaschine related words in this test and only the Fussballpumpe in the other one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha I'm tracking now; this comment was a "for example". So I'll just nit (change it if you want). I'd just include before the comment something like "for example given a word list of: " ... that way it's clear that the test is validating more than just that word list.

* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
*/
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
    // Run the same expectations against the JSON and the YAML flavour of the test settings.
    Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
    for (Settings settings : settingsArr) {
        List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
        MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
        // hasItems only checks presence, so it would still pass if the sub match leaked through.
        // Assert explicitly that "fee" (contained in "kaffee") is not emitted.
        MatcherAssert.assertThat("no_sub_matches must not emit [fee] for [kaffeemaschine]", terms.contains("fee") == false);
    }
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

/**
* For example, given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe".
*/
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
    // Run the same expectations against the JSON and the YAML flavour of the test settings.
    Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
    for (Settings settings : settingsArr) {
        List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
        MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
        // hasItems only checks presence, so it would still pass if an overlapping token leaked through.
        // Assert explicitly that "ballpumpe" (overlapping "fussball") is not emitted.
        MatcherAssert.assertThat(
            "no_overlapping_matches must not emit [ballpumpe] for [fussballpumpe]",
            terms.contains("ballpumpe") == false
        );
    }
    assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
Expand Down Expand Up @@ -92,20 +134,25 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
}

/**
 * Loads the JSON flavour of the analysis test settings.
 *
 * @return settings built by {@code getSettings} from the {@code test1.json} classpath resource
 * @throws IOException if the shared loader fails to read or copy a resource
 */
private Settings getJsonSettings() throws IOException {
    // Delegate to the shared loader, exactly as getYamlSettings() does, instead of building the
    // settings inline; a second return after the inline builder would be unreachable.
    return getSettings("/org/elasticsearch/analysis/common/test1.json");
}

/**
 * Loads the YAML flavour of the analysis test settings.
 *
 * @return settings built by {@code getSettings} from the {@code test1.yml} classpath resource
 * @throws IOException if the shared loader fails to read or copy a resource
 */
private Settings getYamlSettings() throws IOException {
    // The resource path is passed straight to the shared loader; no local copy of it is needed.
    return getSettings("/org/elasticsearch/analysis/common/test1.yml");
}

/**
 * Builds test settings from a classpath resource and prepares a temporary home directory.
 *
 * <p>The {@code de_DR.xml} resource (looked up relative to this test class) is copied into a
 * freshly created {@code config} directory under the temporary home, and that home is set as
 * {@code path.home}. Presumably this lets the hyphenation_patterns_path entries in the test
 * settings resolve against the config directory; confirm against the settings resources.
 *
 * @param filePath classpath location of the JSON or YAML settings resource to load
 * @return the loaded settings plus the index-created version and the temporary {@code path.home}
 * @throws IOException if the hyphenation rules resource is missing or cannot be copied
 */
private Settings getSettings(String filePath) throws IOException {
    String hyphenationRulesFileName = "de_DR.xml";
    Path home = createTempDir();
    Path config = home.resolve("config");
    Files.createDirectory(config);

    // Close the classpath stream as soon as the copy is done instead of leaking it.
    try (InputStream hyphenationRules = getClass().getResourceAsStream(hyphenationRulesFileName)) {
        if (hyphenationRules == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear message
            // rather than a bare NullPointerException from Files.copy.
            throw new IOException("Missing test resource [" + hyphenationRulesFileName + "]");
        }
        Files.copy(hyphenationRules, config.resolve(hyphenationRulesFileName));
    }

    return Settings.builder()
        .loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
        .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
        // Must be the same directory the rules file was copied into, not a second temp dir.
        .put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
        .build();
}
}
Loading