5 changes: 5 additions & 0 deletions docs/changelog/112768.yaml
@@ -0,0 +1,5 @@
pr: 112768
summary: Deduplicate Kuromoji User Dictionary
area: Search
type: enhancement
issues: []
8 changes: 7 additions & 1 deletion docs/plugins/analysis-kuromoji.asciidoc
@@ -133,6 +133,11 @@ unknown words. It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated silently instead of rejected.
Defaults to `false`, in which case a duplicate entry causes an error. See the example below.

`user_dictionary`::
+
--
@@ -221,7 +226,8 @@ PUT kuromoji_sample
"type": "kuromoji_tokenizer",
"mode": "extended",
"discard_punctuation": "false",
"user_dictionary": "userdict_ja.txt"
"user_dictionary": "userdict_ja.txt",
"lenient": "true"
}
},
"analyzer": {
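For context, a minimal self-contained sketch of the new option (not part of this diff; the index name `kuromoji_lenient_sample` and the inline `user_dictionary_rules` are illustrative). With `lenient` set to `true`, the duplicated rule is logged as a warning and dropped instead of failing analyzer creation:

[source,console]
--------------------------------------------------
PUT kuromoji_lenient_sample
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "kuromoji_user_dict": {
            "type": "kuromoji_tokenizer",
            "user_dictionary_rules": [
              "東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞",
              "東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"
            ],
            "lenient": "true"
          }
        },
        "analyzer": {
          "my_analyzer": {
            "type": "custom",
            "tokenizer": "kuromoji_user_dict"
          }
        }
      }
    }
  }
}
--------------------------------------------------

With `lenient` left at its default of `false`, the same request fails with an error like `Found duplicate term [東京スカイツリー] in user dictionary at line [2]`.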
9 changes: 7 additions & 2 deletions docs/plugins/analysis-nori.asciidoc
@@ -58,6 +58,11 @@ It can be set to:

Whether punctuation should be discarded from the output. Defaults to `true`.

`lenient`::

Whether duplicate entries in the `user_dictionary` should be deduplicated silently instead of rejected.
Defaults to `false`, in which case a duplicate entry causes an error. See the example below.

`user_dictionary`::
+
--
@@ -104,7 +109,8 @@ PUT nori_sample
"type": "nori_tokenizer",
"decompound_mode": "mixed",
"discard_punctuation": "false",
"user_dictionary": "userdict_ko.txt"
"user_dictionary": "userdict_ko.txt",
"lenient": "true"
}
},
"analyzer": {
@@ -299,7 +305,6 @@ Which responds with:
}
--------------------------------------------------
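As with kuromoji above, a sketch of the `lenient` option for the `nori_tokenizer` (not part of this diff; the index name is illustrative and the inline rules mirror the new tests). The duplicated `세종` entry is ignored with a warning rather than raising an error:

[source,console]
--------------------------------------------------
PUT nori_lenient_sample
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "nori_user_dict": {
            "type": "nori_tokenizer",
            "user_dictionary_rules": ["c++", "C쁠쁠", "세종", "세종", "세종시 세종 시"],
            "lenient": "true"
          }
        },
        "analyzer": {
          "my_analyzer": {
            "type": "custom",
            "tokenizer": "nori_user_dict"
          }
        }
      }
    }
  }
}
--------------------------------------------------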


[[analysis-nori-speech]]
==== `nori_part_of_speech` token filter

@@ -33,6 +33,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
private static final String LENIENT = "lenient";

private final UserDictionary userDictionary;
private final Mode mode;
@@ -58,14 +59,23 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
"It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
);
}
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false, true);
List<String> ruleList = Analysis.getWordList(
env,
settings,
USER_DICT_PATH_OPTION,
USER_DICT_RULES_OPTION,
LENIENT,
false, // keep comment lines so duplicate warnings and errors report accurate line numbers
true
);
if (ruleList == null || ruleList.isEmpty()) {
return null;
}
StringBuilder sb = new StringBuilder();
for (String line : ruleList) {
sb.append(line).append(System.lineSeparator());
}

try (Reader rulesReader = new StringReader(sb.toString())) {
return UserDictionary.open(rulesReader);
} catch (IOException e) {
@@ -445,7 +445,26 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
)
.build();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [4]"));
}

public void testKuromojiAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
.putList(
"index.analysis.analyzer.my_analyzer.user_dictionary_rules",
"c++,c++,w,w",
"#comment",
"制限スピード,制限スピード,セイゲンスピード,テスト名詞",
"制限スピード,制限スピード,セイゲンスピード,テスト名詞"
)
.build();
TestAnalysis analysis = createTestAnalysis(settings);
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
assertTokenStreamContents(stream, new String[] { "制限スピード" });
}
}

public void testDiscardCompoundToken() throws Exception {
@@ -31,6 +31,7 @@
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
private static final String LENIENT = "lenient";

private final UserDictionary userDictionary;
private final KoreanTokenizer.DecompoundMode decompoundMode;
@@ -54,7 +55,8 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
settings,
USER_DICT_PATH_OPTION,
USER_DICT_RULES_OPTION,
true,
LENIENT,
false, // keep comment lines so duplicate warnings and errors report accurate line numbers
isSupportDuplicateCheck(indexSettings)
);
if (ruleList == null || ruleList.isEmpty()) {
@@ -127,7 +127,7 @@ public void testNoriAnalyzerDuplicateUserDictRule() throws Exception {
.build();

final IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [3]"));
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [4]"));
}

public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOException {
@@ -144,6 +144,20 @@ public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOEx
}
}

public void testNoriAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "nori")
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersions.NORI_DUPLICATES)
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C쁠쁠", "세종", "세종", "세종시 세종 시")
.build();
TestAnalysis analysis = createTestAnalysis(settings);
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
assertTokenStreamContents(stream, new String[] { "세종", "시" });
}
}

public void testNoriTokenizer() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
40 changes: 29 additions & 11 deletions server/src/main/java/org/elasticsearch/index/analysis/Analysis.java
@@ -9,6 +9,8 @@

package org.elasticsearch.index.analysis;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
@@ -67,6 +69,7 @@
import java.security.AccessControlException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
@@ -78,6 +81,7 @@
public class Analysis {

private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(Analysis.class);
private static final Logger logger = LogManager.getLogger(Analysis.class);

public static void checkForDeprecatedVersion(String name, Settings settings) {
String sVersion = settings.get("version");
@@ -267,12 +271,14 @@ public static List<String> getWordList(
Settings settings,
String settingPath,
String settingList,
String settingLenient,
boolean removeComments,
boolean checkDuplicate
) {
boolean deduplicateDictionary = settings.getAsBoolean(settingLenient, false);
final List<String> ruleList = getWordList(env, settings, settingPath, settingList, removeComments);
if (ruleList != null && ruleList.isEmpty() == false && checkDuplicate) {
checkDuplicateRules(ruleList);
return deDuplicateRules(ruleList, deduplicateDictionary == false);
}
return ruleList;
}
@@ -288,24 +294,36 @@ public static List<String> getWordList(
* If the addition to the HashSet returns false, it means that item was already present in the set, indicating a duplicate.
* In such a case, an IllegalArgumentException is thrown specifying the duplicate term and the line number in the original list.
* When failOnDuplicate is false, the duplicate is instead logged as a warning and dropped, and the deduplicated list is returned.
*
* @param ruleList The list of rules to check for duplicates.
* @param failOnDuplicate Whether a duplicate should fail with an IllegalArgumentException instead of being silently dropped.
* @throws IllegalArgumentException If a duplicate rule is found and failOnDuplicate is true.
*/
private static void checkDuplicateRules(List<String> ruleList) {
Set<String> dup = new HashSet<>();
int lineNum = 0;
for (String line : ruleList) {
// ignore comments
private static List<String> deDuplicateRules(List<String> ruleList, boolean failOnDuplicate) {
Set<String> duplicateKeys = new HashSet<>();
List<String> deduplicatedList = new ArrayList<>();
for (int lineNum = 0; lineNum < ruleList.size(); lineNum++) {
String line = ruleList.get(lineNum);
// ignore lines beginning with # as those are comments
if (line.startsWith("#") == false) {
String[] values = CSVUtil.parse(line);
if (dup.add(values[0]) == false) {
throw new IllegalArgumentException(
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
);
if (duplicateKeys.add(values[0]) == false) {
if (failOnDuplicate) {
throw new IllegalArgumentException(
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]"
);
} else {
logger.warn("Ignoring duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]");
}
} else {
deduplicatedList.add(line);
}
} else {
deduplicatedList.add(line);
}
++lineNum;
}

return Collections.unmodifiableList(deduplicatedList);
}

private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
@@ -28,6 +28,7 @@
import java.util.Arrays;
import java.util.List;

import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.Matchers.is;

public class AnalysisTests extends ESTestCase {
@@ -104,4 +105,92 @@ public void testParseWordList() throws IOException {
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.bar");
assertEquals(Arrays.asList("hello", "world"), wordList);
}

public void testParseDuplicates() throws IOException {
Path tempDir = createTempDir();
Path dict = tempDir.resolve("foo.dict");
Settings nodeSettings = Settings.builder()
.put("foo.path", tempDir.resolve(dict))
.put("bar.list", "")
.put("soup.lenient", "true")
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
.build();
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra stuff that gets discarded");
writer.write('\n');
}
Environment env = TestEnvironment.newEnvironment(nodeSettings);
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", true, true);
assertEquals(List.of("最終契約,最終契約,最終契約,カスタム名 詞"), wordList);
}

public void testFailOnDuplicates() throws IOException {
Path tempDir = createTempDir();
Path dict = tempDir.resolve("foo.dict");
Settings nodeSettings = Settings.builder()
.put("foo.path", tempDir.resolve(dict))
.put("bar.list", "")
.put("soup.lenient", "false")
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
.build();
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("最終契,最終契,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
writer.write('\n');
}
Environment env = TestEnvironment.newEnvironment(nodeSettings);
IllegalArgumentException exc = expectThrows(
IllegalArgumentException.class,
() -> Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true)
);
assertThat(exc.getMessage(), containsString("[最終契約] in user dictionary at line [5]"));
}

public void testParseDuplicatesWComments() throws IOException {
Path tempDir = createTempDir();
Path dict = tempDir.resolve("foo.dict");
Settings nodeSettings = Settings.builder()
.put("foo.path", tempDir.resolve(dict))
.put("bar.list", "")
.put("soup.lenient", "true")
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
.build();
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
writer.write('\n');
writer.write("# This is a test of the emergency broadcast system");
writer.write('\n');
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
writer.write('\n');
}
Environment env = TestEnvironment.newEnvironment(nodeSettings);
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true);
assertEquals(
List.of(
"# This is a test of the emergency broadcast system",
"最終契約,最終契約,最終契約,カスタム名 詞",
"# This is a test of the emergency broadcast system"
),
wordList
);
}
}