LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)

romseygeek · web-flow · commit 90d363ece711 · 2021-04-28T13:51:23.000+01:00
The UkrainianMorfologikAnalyzer was reloading its dictionary every
time it created a new TokenStreamComponents, which meant that
while the analyzer was open it would hold onto one copy of the
dictionary per thread.

This commit loads the dictionary in a lazy static initializer, alongside
its stopword set. It also makes the normalizer charmap a singleton
so that we do not rebuild the same immutable object on every call
to initReader.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -243,6 +243,9 @@ Bug fixes
 * LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
   splitting. (Ignacio Vera)
 
+* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
+  TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
+
 Changes in Backwards Compatibility Policy
 
 * LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java
@@ -42,11 +42,31 @@
  * @since 6.2.0
  */
 public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
+
   private final CharArraySet stemExclusionSet;
 
   /** File containing default Ukrainian stopwords. */
   public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
+  private static final NormalizeCharMap NORMALIZER_MAP;
+
+  static {
+    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+    // different apostrophes
+    builder.add("\u2019", "'");
+    builder.add("\u2018", "'");
+    builder.add("\u02BC", "'");
+    builder.add("`", "'");
+    builder.add("´", "'");
+    // ignored characters (combining acute accent, soft hyphen); ґ/Ґ are folded to г/Г
+    builder.add("\u0301", "");
+    builder.add("\u00AD", "");
+    builder.add("ґ", "г");
+    builder.add("Ґ", "Г");
+
+    NORMALIZER_MAP = builder.build();
+  }
+
   /**
    * Returns an unmodifiable instance of the default stop words set.
    *
@@ -57,11 +77,12 @@ public static CharArraySet getDefaultStopSet() {
   }
 
   /**
-   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
-   * static final set the first time.;
+   * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
+   * accesses either of these static final fields for the first time.
    */
   private static class DefaultSetHolder {
     static final CharArraySet DEFAULT_STOP_SET;
+    static final Dictionary DICTIONARY;
 
     static {
       try {
@@ -71,10 +92,15 @@ private static class DefaultSetHolder {
                     UkrainianMorfologikAnalyzer.class,
                     DEFAULT_STOPWORD_FILE,
                     StandardCharsets.UTF_8));
+        DICTIONARY =
+            Dictionary.read(
+                UkrainianMorfologikAnalyzer.class
+                    .getClassLoader()
+                    .getResource("ua/net/nlp/ukrainian.dict"));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
-        throw new UncheckedIOException("Unable to load default stopword set", ex);
+        throw new UncheckedIOException("Unable to load analyzer resources", ex);
       }
     }
   }
@@ -107,22 +133,7 @@ public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExcl
 
   @Override
   protected Reader initReader(String fieldName, Reader reader) {
-    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
-    // different apostrophes
-    builder.add("\u2019", "'");
-    builder.add("\u2018", "'");
-    builder.add("\u02BC", "'");
-    builder.add("`", "'");
-    builder.add("´", "'");
-    // ignored characters
-    builder.add("\u0301", "");
-    builder.add("\u00AD", "");
-    builder.add("ґ", "г");
-    builder.add("Ґ", "Г");
-
-    NormalizeCharMap normMap = builder.build();
-    reader = new MappingCharFilter(normMap, reader);
-    return reader;
+    return new MappingCharFilter(NORMALIZER_MAP, reader);
   }
 
   /**
@@ -144,18 +155,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
 
-    result = new MorfologikFilter(result, getDictionary());
+    result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
     return new TokenStreamComponents(source, result);
   }
-
-  private static Dictionary getDictionary() {
-    try {
-      return Dictionary.read(
-          UkrainianMorfologikAnalyzer.class
-              .getClassLoader()
-              .getResource("ua/net/nlp/ukrainian.dict"));
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-  }
 }