Skip to content

Commit 13f80b0

Browse files
authored
Deduplicate Nori and Kuromoji User Dictionary (#112768)
added the ability to deduplicate the user dictionary optionally
1 parent f5b979b commit 13f80b0

File tree

9 files changed

+186
-18
lines changed

9 files changed

+186
-18
lines changed

docs/changelog/112768.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 112768
2+
summary: Deduplicate Nori and Kuromoji User Dictionary
3+
area: Search
4+
type: enhancement
5+
issues: []

docs/plugins/analysis-kuromoji.asciidoc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ unknown words. It can be set to:
133133

134134
Whether punctuation should be discarded from the output. Defaults to `true`.
135135

136+
`lenient`::
137+
138+
Whether duplicate rules in the `user_dictionary` should be silently deduplicated instead of causing an error.
139+
Defaults to `false`, in which case duplicate rules cause an error.
140+
136141
`user_dictionary`::
137142
+
138143
--
@@ -221,7 +226,8 @@ PUT kuromoji_sample
221226
"type": "kuromoji_tokenizer",
222227
"mode": "extended",
223228
"discard_punctuation": "false",
224-
"user_dictionary": "userdict_ja.txt"
229+
"user_dictionary": "userdict_ja.txt",
230+
"lenient": "true"
225231
}
226232
},
227233
"analyzer": {

docs/plugins/analysis-nori.asciidoc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ It can be set to:
5858

5959
Whether punctuation should be discarded from the output. Defaults to `true`.
6060

61+
`lenient`::
62+
63+
Whether duplicate rules in the `user_dictionary` should be silently deduplicated instead of causing an error.
64+
Defaults to `false`, in which case duplicate rules cause an error.
65+
6166
`user_dictionary`::
6267
+
6368
--
@@ -104,7 +109,8 @@ PUT nori_sample
104109
"type": "nori_tokenizer",
105110
"decompound_mode": "mixed",
106111
"discard_punctuation": "false",
107-
"user_dictionary": "userdict_ko.txt"
112+
"user_dictionary": "userdict_ko.txt",
113+
"lenient": "true"
108114
}
109115
},
110116
"analyzer": {
@@ -299,7 +305,6 @@ Which responds with:
299305
}
300306
--------------------------------------------------
301307

302-
303308
[[analysis-nori-speech]]
304309
==== `nori_part_of_speech` token filter
305310

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiTokenizerFactory.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
3333
private static final String NBEST_COST = "nbest_cost";
3434
private static final String NBEST_EXAMPLES = "nbest_examples";
3535
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
36+
private static final String LENIENT = "lenient";
3637

3738
private final UserDictionary userDictionary;
3839
private final Mode mode;
@@ -58,14 +59,23 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5859
"It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
5960
);
6061
}
61-
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false, true);
62+
List<String> ruleList = Analysis.getWordList(
63+
env,
64+
settings,
65+
USER_DICT_PATH_OPTION,
66+
USER_DICT_RULES_OPTION,
67+
LENIENT,
68+
false, // typically don't want to remove comments as deduplication will provide better feedback
69+
true
70+
);
6271
if (ruleList == null || ruleList.isEmpty()) {
6372
return null;
6473
}
6574
StringBuilder sb = new StringBuilder();
6675
for (String line : ruleList) {
6776
sb.append(line).append(System.lineSeparator());
6877
}
78+
6979
try (Reader rulesReader = new StringReader(sb.toString())) {
7080
return UserDictionary.open(rulesReader);
7181
} catch (IOException e) {

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,26 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
445445
)
446446
.build();
447447
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
448-
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
448+
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [4]"));
449+
}
450+
451+
public void testKuromojiAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
452+
Settings settings = Settings.builder()
453+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
454+
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
455+
.putList(
456+
"index.analysis.analyzer.my_analyzer.user_dictionary_rules",
457+
"c++,c++,w,w",
458+
"#comment",
459+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞",
460+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞"
461+
)
462+
.build();
463+
TestAnalysis analysis = createTestAnalysis(settings);
464+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
465+
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
466+
assertTokenStreamContents(stream, new String[] { "制限スピード" });
467+
}
449468
}
450469

451470
public void testDiscardCompoundToken() throws Exception {

plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/NoriTokenizerFactory.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
3232
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
3333
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
34+
private static final String LENIENT = "lenient";
3435

3536
private final UserDictionary userDictionary;
3637
private final KoreanTokenizer.DecompoundMode decompoundMode;
@@ -54,7 +55,8 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5455
settings,
5556
USER_DICT_PATH_OPTION,
5657
USER_DICT_RULES_OPTION,
57-
true,
58+
LENIENT,
59+
false, // typically don't want to remove comments as deduplication will provide better feedback
5860
isSupportDuplicateCheck(indexSettings)
5961
);
6062
if (ruleList == null || ruleList.isEmpty()) {

plugins/analysis-nori/src/test/java/org/elasticsearch/plugin/analysis/nori/NoriAnalysisTests.java

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ public void testNoriAnalyzerDuplicateUserDictRule() throws Exception {
127127
.build();
128128

129129
final IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
130-
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [3]"));
130+
assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [4]"));
131131
}
132132

133133
public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOException {
@@ -144,6 +144,20 @@ public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOEx
144144
}
145145
}
146146

147+
public void testNoriAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
148+
Settings settings = Settings.builder()
149+
.put("index.analysis.analyzer.my_analyzer.type", "nori")
150+
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
151+
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersions.NORI_DUPLICATES)
152+
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C쁠쁠", "세종", "세종", "세종시 세종 시")
153+
.build();
154+
TestAnalysis analysis = createTestAnalysis(settings);
155+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
156+
try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
157+
assertTokenStreamContents(stream, new String[] { "세종", "시" });
158+
}
159+
}
160+
147161
public void testNoriTokenizer() throws Exception {
148162
Settings settings = Settings.builder()
149163
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")

server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
package org.elasticsearch.index.analysis;
1111

12+
import org.apache.logging.log4j.LogManager;
13+
import org.apache.logging.log4j.Logger;
1214
import org.apache.lucene.analysis.CharArraySet;
1315
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
1416
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
@@ -67,6 +69,7 @@
6769
import java.security.AccessControlException;
6870
import java.util.ArrayList;
6971
import java.util.Collection;
72+
import java.util.Collections;
7073
import java.util.HashSet;
7174
import java.util.List;
7275
import java.util.Locale;
@@ -78,6 +81,7 @@
7881
public class Analysis {
7982

8083
private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(Analysis.class);
84+
private static final Logger logger = LogManager.getLogger(Analysis.class);
8185

8286
public static void checkForDeprecatedVersion(String name, Settings settings) {
8387
String sVersion = settings.get("version");
@@ -267,12 +271,14 @@ public static List<String> getWordList(
267271
Settings settings,
268272
String settingPath,
269273
String settingList,
274+
String settingLenient,
270275
boolean removeComments,
271276
boolean checkDuplicate
272277
) {
278+
boolean deduplicateDictionary = settings.getAsBoolean(settingLenient, false);
273279
final List<String> ruleList = getWordList(env, settings, settingPath, settingList, removeComments);
274280
if (ruleList != null && ruleList.isEmpty() == false && checkDuplicate) {
275-
checkDuplicateRules(ruleList);
281+
return deDuplicateRules(ruleList, deduplicateDictionary == false);
276282
}
277283
return ruleList;
278284
}
@@ -288,24 +294,36 @@ public static List<String> getWordList(
288294
* If the addition to the HashSet returns false, it means that item was already present in the set, indicating a duplicate.
289295
* In such a case, and when failing on duplicates is requested, an IllegalArgumentException is thrown specifying the duplicate term and the line number in the original list.
290296
*
297+
* When lenient deduplication is enabled, duplicates are instead logged and dropped, and the deduplicated list is returned.
298+
*
291299
* @param ruleList The list of rules to check for duplicates.
* @param failOnDuplicate Whether a duplicate rule throws an exception (true) or is logged and dropped (false).
292300
* @throws IllegalArgumentException If a duplicate rule is found and failing on duplicates is requested.
293301
*/
294-
private static void checkDuplicateRules(List<String> ruleList) {
295-
Set<String> dup = new HashSet<>();
296-
int lineNum = 0;
297-
for (String line : ruleList) {
298-
// ignore comments
302+
private static List<String> deDuplicateRules(List<String> ruleList, boolean failOnDuplicate) {
303+
Set<String> duplicateKeys = new HashSet<>();
304+
List<String> deduplicatedList = new ArrayList<>();
305+
for (int lineNum = 0; lineNum < ruleList.size(); lineNum++) {
306+
String line = ruleList.get(lineNum);
307+
// ignore lines beginning with # as those are comments
299308
if (line.startsWith("#") == false) {
300309
String[] values = CSVUtil.parse(line);
301-
if (dup.add(values[0]) == false) {
302-
throw new IllegalArgumentException(
303-
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
304-
);
310+
if (duplicateKeys.add(values[0]) == false) {
311+
if (failOnDuplicate) {
312+
throw new IllegalArgumentException(
313+
"Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]"
314+
);
315+
} else {
316+
logger.warn("Ignoring duplicate term [" + values[0] + "] in user dictionary " + "at line [" + (lineNum + 1) + "]");
317+
}
318+
} else {
319+
deduplicatedList.add(line);
305320
}
321+
} else {
322+
deduplicatedList.add(line);
306323
}
307-
++lineNum;
308324
}
325+
326+
return Collections.unmodifiableList(deduplicatedList);
309327
}
310328

311329
private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {

server/src/test/java/org/elasticsearch/index/analysis/AnalysisTests.java

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.Arrays;
2929
import java.util.List;
3030

31+
import static org.hamcrest.CoreMatchers.containsString;
3132
import static org.hamcrest.Matchers.is;
3233

3334
public class AnalysisTests extends ESTestCase {
@@ -104,4 +105,92 @@ public void testParseWordList() throws IOException {
104105
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.bar");
105106
assertEquals(Arrays.asList("hello", "world"), wordList);
106107
}
108+
109+
public void testParseDuplicates() throws IOException {
110+
Path tempDir = createTempDir();
111+
Path dict = tempDir.resolve("foo.dict");
112+
Settings nodeSettings = Settings.builder()
113+
.put("foo.path", tempDir.resolve(dict))
114+
.put("bar.list", "")
115+
.put("soup.lenient", "true")
116+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
117+
.build();
118+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
119+
writer.write("# This is a test of the emergency broadcast system");
120+
writer.write('\n');
121+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
122+
writer.write('\n');
123+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
124+
writer.write('\n');
125+
writer.write("# This is a test of the emergency broadcast system");
126+
writer.write('\n');
127+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra stuff that gets discarded");
128+
writer.write('\n');
129+
}
130+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
131+
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", true, true);
132+
assertEquals(List.of("最終契約,最終契約,最終契約,カスタム名 詞"), wordList);
133+
}
134+
135+
public void testFailOnDuplicates() throws IOException {
136+
Path tempDir = createTempDir();
137+
Path dict = tempDir.resolve("foo.dict");
138+
Settings nodeSettings = Settings.builder()
139+
.put("foo.path", tempDir.resolve(dict))
140+
.put("bar.list", "")
141+
.put("soup.lenient", "false")
142+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
143+
.build();
144+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
145+
writer.write("# This is a test of the emergency broadcast system");
146+
writer.write('\n');
147+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
148+
writer.write('\n');
149+
writer.write("最終契,最終契,最終契約,カスタム名 詞");
150+
writer.write('\n');
151+
writer.write("# This is a test of the emergency broadcast system");
152+
writer.write('\n');
153+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
154+
writer.write('\n');
155+
}
156+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
157+
IllegalArgumentException exc = expectThrows(
158+
IllegalArgumentException.class,
159+
() -> Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true)
160+
);
161+
assertThat(exc.getMessage(), containsString("[最終契約] in user dictionary at line [5]"));
162+
}
163+
164+
public void testParseDuplicatesWComments() throws IOException {
165+
Path tempDir = createTempDir();
166+
Path dict = tempDir.resolve("foo.dict");
167+
Settings nodeSettings = Settings.builder()
168+
.put("foo.path", tempDir.resolve(dict))
169+
.put("bar.list", "")
170+
.put("soup.lenient", "true")
171+
.put(Environment.PATH_HOME_SETTING.getKey(), tempDir)
172+
.build();
173+
try (BufferedWriter writer = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
174+
writer.write("# This is a test of the emergency broadcast system");
175+
writer.write('\n');
176+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
177+
writer.write('\n');
178+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞");
179+
writer.write('\n');
180+
writer.write("# This is a test of the emergency broadcast system");
181+
writer.write('\n');
182+
writer.write("最終契約,最終契約,最終契約,カスタム名 詞,extra");
183+
writer.write('\n');
184+
}
185+
Environment env = TestEnvironment.newEnvironment(nodeSettings);
186+
List<String> wordList = Analysis.getWordList(env, nodeSettings, "foo.path", "bar.list", "soup.lenient", false, true);
187+
assertEquals(
188+
List.of(
189+
"# This is a test of the emergency broadcast system",
190+
"最終契約,最終契約,最終契約,カスタム名 詞",
191+
"# This is a test of the emergency broadcast system"
192+
),
193+
wordList
194+
);
195+
}
107196
}

0 commit comments

Comments
 (0)