42
42
* @since 6.2.0
43
43
*/
44
44
public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
45
+
45
46
private final CharArraySet stemExclusionSet ;
46
47
47
48
/** File containing default Ukrainian stopwords. */
48
49
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt" ;
49
50
51
/**
 * Character normalization table handed to the {@code MappingCharFilter} created in
 * {@code initReader}. It is built once, when this class is initialized, and then reused.
 *
 * <p>Every key is exactly the character to be replaced, with no surrounding whitespace. A key
 * with a trailing space would only match when the character is followed by a space, and the
 * match would also consume that space.
 */
private static final NormalizeCharMap NORMALIZER_MAP;

static {
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  // different apostrophes: U+2019, U+2018, U+02BC, backtick and acute accent all become "'"
  builder.add("\u2019", "'");
  builder.add("\u2018", "'");
  builder.add("\u02BC", "'");
  builder.add("`", "'");
  builder.add("´", "'");
  // ignored characters: U+0301 (combining acute accent) and U+00AD (soft hyphen) are removed
  builder.add("\u0301", "");
  builder.add("\u00AD", "");
  // fold the letter ghe with upturn into plain ghe, keeping the case
  builder.add("ґ", "г");
  builder.add("Ґ", "Г");

  NORMALIZER_MAP = builder.build();
}
69
+
50
70
/**
51
71
* Returns an unmodifiable instance of the default stop words set.
52
72
*
@@ -57,11 +77,12 @@ public static CharArraySet getDefaultStopSet() {
57
77
}
58
78
59
79
/**
60
- * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
61
- * static final set the first time.;
80
+ * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
81
+ * accesses one of its static final fields the first time.
62
82
*/
63
83
private static class DefaultSetHolder {
64
84
static final CharArraySet DEFAULT_STOP_SET ;
85
+ static final Dictionary DICTIONARY ;
65
86
66
87
static {
67
88
try {
@@ -71,10 +92,15 @@ private static class DefaultSetHolder {
71
92
UkrainianMorfologikAnalyzer .class ,
72
93
DEFAULT_STOPWORD_FILE ,
73
94
StandardCharsets .UTF_8 ));
95
+ DICTIONARY =
96
+ Dictionary .read (
97
+ UkrainianMorfologikAnalyzer .class
98
+ .getClassLoader ()
99
+ .getResource ("ua/net/nlp/ukrainian.dict" ));
74
100
} catch (IOException ex ) {
75
101
// default set should always be present as it is part of the
76
102
// distribution (JAR)
77
- throw new UncheckedIOException ("Unable to load default stopword set " , ex );
103
+ throw new UncheckedIOException ("Unable to load analyzer resources " , ex );
78
104
}
79
105
}
80
106
}
@@ -107,22 +133,7 @@ public UkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExcl
107
133
108
134
@ Override
109
135
protected Reader initReader (String fieldName , Reader reader ) {
110
- NormalizeCharMap .Builder builder = new NormalizeCharMap .Builder ();
111
- // different apostrophes
112
- builder .add ("\u2019 " , "'" );
113
- builder .add ("\u2018 " , "'" );
114
- builder .add ("\u02BC " , "'" );
115
- builder .add ("`" , "'" );
116
- builder .add ("´" , "'" );
117
- // ignored characters
118
- builder .add ("\u0301 " , "" );
119
- builder .add ("\u00AD " , "" );
120
- builder .add ("ґ" , "г" );
121
- builder .add ("Ґ" , "Г" );
122
-
123
- NormalizeCharMap normMap = builder .build ();
124
- reader = new MappingCharFilter (normMap , reader );
125
- return reader ;
136
+ return new MappingCharFilter (NORMALIZER_MAP , reader );
126
137
}
127
138
128
139
/**
@@ -144,18 +155,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
144
155
result = new SetKeywordMarkerFilter (result , stemExclusionSet );
145
156
}
146
157
147
- result = new MorfologikFilter (result , getDictionary () );
158
+ result = new MorfologikFilter (result , DefaultSetHolder . DICTIONARY );
148
159
return new TokenStreamComponents (source , result );
149
160
}
150
-
151
- private static Dictionary getDictionary () {
152
- try {
153
- return Dictionary .read (
154
- UkrainianMorfologikAnalyzer .class
155
- .getClassLoader ()
156
- .getResource ("ua/net/nlp/ukrainian.dict" ));
157
- } catch (IOException e ) {
158
- throw new RuntimeException (e );
159
- }
160
- }
161
161
}
0 commit comments