Skip to content

Commit 90d363e

Browse files
authored
LUCENE-9930: Only load Ukrainian morfologik dictionary once per JVM (#109)
The UkrainianMorfologikAnalyzer was reloading its dictionary every time it created a new TokenStreamComponents, which meant that, while the analyzer was open, it would hold on to one copy of the dictionary per thread. This commit loads the dictionary in a lazy static initializer, alongside its stopword set. It also makes the normalizer charmap a singleton, so that we do not rebuild the same immutable object on every call to initReader.
1 parent 0c33e62 commit 90d363e

File tree

2 files changed

+34
-31
lines changed

2 files changed

+34
-31
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,9 @@ Bug fixes
243243
* LUCENE-9580: Fix bug in the polygon tessellator when introducing collinear edges during polygon
244244
splitting. (Ignacio Vera)
245245

246+
* LUCENE-9930: The Ukrainian analyzer was reloading its dictionary for every new
247+
TokenStreamComponents, which could lead to memory leaks. (Alan Woodward)
248+
246249
Changes in Backwards Compatibility Policy
247250

248251
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date top

lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,31 @@
4242
* @since 6.2.0
4343
*/
4444
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
45+
4546
private final CharArraySet stemExclusionSet;
4647

4748
/** File containing default Ukrainian stopwords. */
4849
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
4950

51+
private static final NormalizeCharMap NORMALIZER_MAP;
52+
53+
static {
54+
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
55+
// different apostrophes
56+
builder.add("\u2019", "'");
57+
builder.add("\u2018", "'");
58+
builder.add("\u02BC", "'");
59+
builder.add("`", "'");
60+
builder.add("´", "'");
61+
// ignored characters
62+
builder.add("\u0301", "");
63+
builder.add("\u00AD", "");
64+
builder.add("ґ", "г");
65+
builder.add("Ґ", "Г");
66+
67+
NORMALIZER_MAP = builder.build();
68+
}
69+
5070
/**
5171
* Returns an unmodifiable instance of the default stop words set.
5272
*
@@ -57,11 +77,12 @@ public static CharArraySet getDefaultStopSet() {
5777
}
5878

5979
/**
60-
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
61-
* static final set the first time.;
80+
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
81+
accesses these static final fields for the first time.
6282
*/
6383
private static class DefaultSetHolder {
6484
static final CharArraySet DEFAULT_STOP_SET;
85+
static final Dictionary DICTIONARY;
6586

6687
static {
6788
try {
@@ -71,10 +92,15 @@ private static class DefaultSetHolder {
7192
UkrainianMorfologikAnalyzer.class,
7293
DEFAULT_STOPWORD_FILE,
7394
StandardCharsets.UTF_8));
95+
DICTIONARY =
96+
Dictionary.read(
97+
UkrainianMorfologikAnalyzer.class
98+
.getClassLoader()
99+
.getResource("ua/net/nlp/ukrainian.dict"));
74100
} catch (IOException ex) {
75101
// default set should always be present as it is part of the
76102
// distribution (JAR)
77-
throw new UncheckedIOException("Unable to load default stopword set", ex);
103+
throw new UncheckedIOException("Unable to load analyzer resources", ex);
78104
}
79105
}
80106
}
@@ -107,22 +133,7 @@ public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExcl
107133

108134
@Override
109135
protected Reader initReader(String fieldName, Reader reader) {
110-
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
111-
// different apostrophes
112-
builder.add("\u2019", "'");
113-
builder.add("\u2018", "'");
114-
builder.add("\u02BC", "'");
115-
builder.add("`", "'");
116-
builder.add("´", "'");
117-
// ignored characters
118-
builder.add("\u0301", "");
119-
builder.add("\u00AD", "");
120-
builder.add("ґ", "г");
121-
builder.add("Ґ", "Г");
122-
123-
NormalizeCharMap normMap = builder.build();
124-
reader = new MappingCharFilter(normMap, reader);
125-
return reader;
136+
return new MappingCharFilter(NORMALIZER_MAP, reader);
126137
}
127138

128139
/**
@@ -144,18 +155,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
144155
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
145156
}
146157

147-
result = new MorfologikFilter(result, getDictionary());
158+
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
148159
return new TokenStreamComponents(source, result);
149160
}
150-
151-
private static Dictionary getDictionary() {
152-
try {
153-
return Dictionary.read(
154-
UkrainianMorfologikAnalyzer.class
155-
.getClassLoader()
156-
.getResource("ua/net/nlp/ukrainian.dict"));
157-
} catch (IOException e) {
158-
throw new RuntimeException(e);
159-
}
160-
}
161161
}

0 commit comments

Comments
 (0)