Skip to content

Commit 13f80b0

Browse files
authored
Deduplicate Nori and Kuromoji User Dictionary (#112768)
added the ability to deduplicate the user dictionary optionally
1 parent f5b979b commit 13f80b0

File tree

9 files changed

+186
-18
lines changed

9 files changed

+186
-18
lines changed

docs/changelog/112768.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 112768
2+
summary: Deduplicate Nori and Kuromoji User Dictionary
3+
area: Search
4+
type: enhancement
5+
issues: []

docs/plugins/analysis-kuromoji.asciidoc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ unknown words. It can be set to:
133133

134134
Whether punctuation should be discarded from the output. Defaults to `true`.
135135

136+
`lenient`::
137+
138+
Whether duplicate rules in the `user_dictionary` should be silently deduplicated instead of causing an error.
139+
Defaults to `false`, in which case duplicate rules cause an error.
140+
136141
`user_dictionary`::
137142
+
138143
--
@@ -221,7 +226,8 @@ PUT kuromoji_sample
221226
"type": "kuromoji_tokenizer",
222227
"mode": "extended",
223228
"discard_punctuation": "false",
224-
"user_dictionary": "userdict_ja.txt"
229+
"user_dictionary": "userdict_ja.txt",
230+
"lenient": "true"
225231
}
226232
},
227233
"analyzer": {

docs/plugins/analysis-nori.asciidoc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ It can be set to:
5858

5959
Whether punctuation should be discarded from the output. Defaults to `true`.
6060

61+
`lenient`::
62+
63+
Whether duplicate rules in the `user_dictionary` should be silently deduplicated instead of causing an error.
64+
Defaults to `false`, in which case duplicate rules cause an error.
65+
6166
`user_dictionary`::
6267
+
6368
--
@@ -104,7 +109,8 @@ PUT nori_sample
104109
"type": "nori_tokenizer",
105110
"decompound_mode": "mixed",
106111
"discard_punctuation": "false",
107-
"user_dictionary": "userdict_ko.txt"
112+
"user_dictionary": "userdict_ko.txt",
113+
"lenient": "true"
108114
}
109115
},
110116
"analyzer": {
@@ -299,7 +305,6 @@ Which responds with:
299305
}
300306
--------------------------------------------------
301307

302-
303308
[[analysis-nori-speech]]
304309
==== `nori_part_of_speech` token filter
305310

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiTokenizerFactory.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
3333
private static final String NBEST_COST = "nbest_cost";
3434
private static final String NBEST_EXAMPLES = "nbest_examples";
3535
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
36+
private static final String LENIENT = "lenient";
3637

3738
private final UserDictionary userDictionary;
3839
private final Mode mode;
@@ -58,14 +59,23 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5859
"It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
5960
);
6061
}
61-
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false, true);
62+
List<String> ruleList = Analysis.getWordList(
63+
env,
64+
settings,
65+
USER_DICT_PATH_OPTION,
66+
USER_DICT_RULES_OPTION,
67+
LENIENT,
68+
false, // typically don't want to remove comments as deduplication will provide better feedback
69+
true
70+
);
6271
if (ruleList == null || ruleList.isEmpty()) {
6372
return null;
6473
}
6574
StringBuilder sb = new StringBuilder();
6675
for (String line : ruleList) {
6776
sb.append(line).append(System.lineSeparator());
6877
}
78+
6979
try (Reader rulesReader = new StringReader(sb.toString())) {
7080
return UserDictionary.open(rulesReader);
7181
} catch (IOException e) {

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,26 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
445445
)
446446
.build();
447447
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
448-
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
448+
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [4]"));
449+
}
450+
451+
public void testKuromojiAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
452+
Settings settings = Settings.builder()
453+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
454+
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
455+
.putList(
456+
"index.analysis.analyzer.my_analyzer.user_dictionary_rules",
457+
"c++,c++,w,w",
458+
"#comment",
459+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞",
460+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞"
461+
)
462+
.build();
463+
TestAnalysis analysis = createTestAnalysis(settings);
464+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
465+
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
466+
assertTokenStreamContents(stream, new String[] { "制限スピード" });
467+
}
449468
}
450469

451470
public void testDiscardCompoundToken() throws Exception {

plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/NoriTokenizerFactory.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
3232
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
3333
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
34+
private static final String LENIENT = "lenient";
3435

3536
private final UserDictionary userDictionary;
3637
private final KoreanTokenizer.DecompoundMode decompoundMode;
@@ -54,7 +55,8 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5455
settings,
5556
USER_DICT_PATH_OPTION,
5657
USER_DICT_RULES_OPTION,
57-
true,
58+
LENIENT,
59+
false, // typically don't want to remove comments as deduplication will provide better feedback
5860
isSupportDuplicateCheck(indexSettings)
5961
);
6062
if (ruleList == null || ruleList.isEmpty()) {

plugins/analysis-nori/src/test/java/org/elasticsearch/plugin/analysis/nori/NoriAnalysisTests.java

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ public void testNoriAnalyzerDuplicateUserDictRule() throws Exception {
127127
.build();
128128

129129
final IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
130-
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [3]"));
130+
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [4]"));
131131
}
132132

133133
public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOException {
@@ -144,6 +144,20 @@ public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOEx
144144
}
145145
}
146146

147+
public void testNoriAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
148+
Settings settings = Settings.builder()
149+
.put("index.analysis.analyzer.my_analyzer.type", "nori")
150+
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
151+
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersions.NORI_DUPLICATES)
152+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C쁠쁠", "세종", "세종", "세종시 세종 시")
153+
.build();
154+
TestAnalysis analysis = createTestAnalysis(settings);
155+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
156+
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
157+
assertTokenStreamContents(stream, new String[] { "세종", "시" });
158+
}
159+
}
160+
147161
public void testNoriTokenizer() throws Exception {
148162
Settings settings = Settings.builder()
149163
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")

server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
package org.elasticsearch.index.analysis;
1111

12+
import org.apache.logging.log4j.LogManager;
13+
import org.apache.logging.log4j.Logger;
1214
import org.apache.lucene.analysis.CharArraySet;
1315
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
1416
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
@@ -67,6 +69,7 @@
6769
import java.security.AccessControlException;
6870
import java.util.ArrayList;
6971
import java.util.Collection;
72+
import java.util.Collections;
7073
import java.util.HashSet;
7174
import java.util.List;
7275
import java.util.Locale;
@@ -78,6 +81,7 @@
7881
public class Analysis {
7982

8083
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(Analysis.class);
84+
private static final Logger logger = LogManager.getLogger(Analysis.class);
8185

8286
public static void checkForDeprecatedVersion(String name, Settings settings) {
8387
String sVersion = settings.get("version");
@@ -267,12 +271,14 @@ public static List<String> getWordList(
267271
Settings settings,
268272
String settingPath,
269273
String settingList,
274+
String settingLenient,
270275
boolean removeComments,
271276
boolean checkDuplicate
272277
) {
278+
boolean deduplicateDictionary = settings.getAsBoolean(settingLenient, false);
273279
final List<String> ruleList = getWordList(env, settings, settingPath, settingList, removeComments);
274280
if (ruleList != null && ruleList.isEmpty() == false && checkDuplicate) {
275-
checkDuplicateRules(ruleList);
281+
return deDuplicateRules(ruleList, deduplicateDictionary == false);
276282
}
277283
return ruleList;
278284
}
@@ -288,24 +294,36 @@ public static List<String> getWordList(
288294
* If the addition to the HashSet returns false, it means that item was already present in the set, indicating a duplicate.
289295
* In such a case, and when failing on duplicates is requested, an IllegalArgumentException is thrown specifying the duplicate term and the line number in the original list.
290296
*
297+
* When lenient deduplication is enabled, duplicates are instead logged and dropped, and the deduplicated list is returned.
298+
*
291299
* @param ruleList The list of rules to check for duplicates.
* @param failOnDuplicate Whether a duplicate rule throws an exception (true) or is logged and dropped (false).
292300
* @throws IllegalArgumentException If a duplicate rule is found and failing on duplicates is requested.
293301
*/
294-
private static void checkDuplicateRules(List<String> ruleList) {
295-
Set<String> dup = new HashSet<>();
296-
int lineNum = 0;
297-
for (String line : ruleList) {
298-
// ignore comments
302+
private static List<String> deDuplicateRules(List<String> ruleList, boolean failOnDuplicate) {
303+
Set<String> duplicateKeys = new HashSet<>();
304+
List<String> deduplicatedList = new ArrayList<>();
305+
for (int lineNum = 0; lineNum < ruleList.size(); lineNum++) {
306+
String line = ruleList.get(lineNum);
307+
// ignore lines beginning with # as those are comments
299308
if (line.startsWith("#") == false) {
300309
String[] values = CSVUtil.parse(line);
301-
if (dup.add(values[0]) == false) {
302-
throw new IllegalArgumentException(
303-
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
304-
);
310+
if (duplicateKeys.add(values[0]) == false) {
311+
if (failOnDuplicate) {
312+
throw new IllegalArgumentException(
313+
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]"
314+
);
315+
} else {
316+
logger.warn("Ignoring duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]");
317+
}
318+
} else {
319+
deduplicatedList.add(line);
305320
}
321+
} else {
322+
deduplicatedList.add(line);
306323
}
307-
++lineNum;
308324
}
325+
326+
return Collections.unmodifiableList(deduplicatedList);
309327
}
310328

311329
private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {

server/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.Arrays;
2929
import java.util.List;
3030

31+
import static org.hamcrest.CoreMatchers.containsString;
3132
import static org.hamcrest.Matchers.is;
3233

3334
public class AnalysisTests extends ESTestCase {
@@ -104,4 +105,92 @@ public void testParseWordList() throws IOException {
104105
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.bar");
105106
assertEquals(Arrays.asList("hello", "world"), wordList);
106107
}
108+
109+
public void testParseDuplicates() throws IOException {
110+
Path tempDir = createTempDir();
111+
Path dict = tempDir.resolve("foo.dict");
112+
Settings nodeSettings = Settings.builder()
113+
.put("foo.path", tempDir.resolve(dict))
114+
.put("bar.list", "")
115+
.put("soup.lenient", "true")
116+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
117+
.build();
118+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
119+
writer.write("# This is a test of the emergency broadcast system");
120+
writer.write('\n');
121+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
122+
writer.write('\n');
123+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
124+
writer.write('\n');
125+
writer.write("# This is a test of the emergency broadcast system");
126+
writer.write('\n');
127+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra stuff that gets discarded");
128+
writer.write('\n');
129+
}
130+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
131+
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", true, true);
132+
assertEquals(List.of("最終契約,最終契約,最終契約,カスタム名 詞"), wordList);
133+
}
134+
135+
public void testFailOnDuplicates() throws IOException {
136+
Path tempDir = createTempDir();
137+
Path dict = tempDir.resolve("foo.dict");
138+
Settings nodeSettings = Settings.builder()
139+
.put("foo.path", tempDir.resolve(dict))
140+
.put("bar.list", "")
141+
.put("soup.lenient", "false")
142+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
143+
.build();
144+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
145+
writer.write("# This is a test of the emergency broadcast system");
146+
writer.write('\n');
147+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
148+
writer.write('\n');
149+
writer.write("最終契,最終契,最終契約,カスタム名 詞");
150+
writer.write('\n');
151+
writer.write("# This is a test of the emergency broadcast system");
152+
writer.write('\n');
153+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
154+
writer.write('\n');
155+
}
156+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
157+
IllegalArgumentException exc = expectThrows(
158+
IllegalArgumentException.class,
159+
() -> Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true)
160+
);
161+
assertThat(exc.getMessage(), containsString("[最終契約] in user dictionary at line [5]"));
162+
}
163+
164+
public void testParseDuplicatesWComments() throws IOException {
165+
Path tempDir = createTempDir();
166+
Path dict = tempDir.resolve("foo.dict");
167+
Settings nodeSettings = Settings.builder()
168+
.put("foo.path", tempDir.resolve(dict))
169+
.put("bar.list", "")
170+
.put("soup.lenient", "true")
171+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
172+
.build();
173+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
174+
writer.write("# This is a test of the emergency broadcast system");
175+
writer.write('\n');
176+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
177+
writer.write('\n');
178+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
179+
writer.write('\n');
180+
writer.write("# This is a test of the emergency broadcast system");
181+
writer.write('\n');
182+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
183+
writer.write('\n');
184+
}
185+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
186+
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true);
187+
assertEquals(
188+
List.of(
189+
"# This is a test of the emergency broadcast system",
190+
"最終契約,最終契約,最終契約,カスタム名 詞",
191+
"# This is a test of the emergency broadcast system"
192+
),
193+
wordList
194+
);
195+
}
107196
}

0 commit comments

Comments
 (0)