@@ -20,87 +20,187 @@ import cats.effect.kernel.{Resource, Sync}
2020import org .apache .lucene .analysis .Analyzer .TokenStreamComponents
2121import org .apache .lucene .analysis .standard .StandardTokenizer
2222import org .apache .lucene .analysis .en .PorterStemFilter
23+ import org .apache .lucene .analysis .es .SpanishLightStemFilter
24+ import org .apache .lucene .analysis .fr .FrenchLightStemFilter
2325import org .apache .lucene .analysis .LowerCaseFilter
2426import org .apache .lucene .analysis .Analyzer
2527import org .apache .lucene .analysis .miscellaneous .ASCIIFoldingFilter
2628import org .apache .lucene .analysis .CharArraySet
2729import org .apache .lucene .analysis .StopFilter
30+ import org .apache .lucene .analysis .TokenStream
2831
/** Immutable configuration shared by every analyzer builder.
  *
  * Each `withX` method returns an updated copy; the instance itself is never
  * mutated.
  */
final case class Config(
    lowerCase: Boolean,
    foldASCII: Boolean,
    stopWords: Set[String],
) {

  /** Enable the lowercasing stage. */
  def withLowerCasing: Config =
    copy(lowerCase = true)

  /** Enable the ASCII-folding stage. */
  def withASCIIFolding: Config =
    copy(foldASCII = true)

  /** Replace the stop-word set. */
  def withStopWords(words: Set[String]): Config =
    copy(stopWords = words)
}

object Config {

  /** A configuration with every optional stage disabled and no stop words. */
  def empty: Config =
    Config(lowerCase = false, foldASCII = false, stopWords = Set.empty)
}
49+
/** Build an Analyzer or tokenizer function.
  *
  * The `Builder` type member lets every combinator return the most specific
  * subclass, so language-specific methods (e.g. stemmers) remain available
  * after calling the shared `withX` methods.
  */
sealed abstract class AnalyzerBuilder private[lucene] (config: Config) {
  type Builder <: AnalyzerBuilder

  /** Replace the entire configuration in one step. */
  def withConfig(config: Config): Builder

  /** Adds a lowercasing stage to the analyzer pipeline. */
  def withLowerCasing: Builder =
    withConfig(config.withLowerCasing)

  /** Adds an ASCII folding stage to the analyzer pipeline.
    * ASCII folding converts alphanumeric and symbolic Unicode characters into
    * their ASCII equivalents, if one exists.
    */
  def withASCIIFolding: Builder =
    withConfig(config.withASCIIFolding)

  /** Adds a stop filter stage to analyzer pipeline for non-empty sets. */
  def withStopWords(words: Set[String]): Builder =
    withConfig(config.withStopWords(words))

  /** Build the Analyzer wrapped inside a Resource. */
  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer]

  /** Directly construct a tokenizing function. */
  def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
    build.map(a => Tokenizer.vectorTokenizer(a))

  /** Assemble an Analyzer around a StandardTokenizer with the common stages
    * (lowercasing, ASCII folding, stop words) driven by `config`, then append
    * the subclass-supplied `extras` stage (e.g. a stemming filter) at the end.
    * The Analyzer is closed when the Resource is released.
    */
  private[lucene] def mkFromStandardTokenizer[F[_]](
      config: Config
  )(extras: TokenStream => TokenStream)(implicit F: Sync[F]): Resource[F, Analyzer] =
    Resource.make(F.delay(new Analyzer {
      protected def createComponents(fieldName: String): TokenStreamComponents = {
        val source = new StandardTokenizer()
        var tokens = if (config.lowerCase) new LowerCaseFilter(source) else source
        tokens = if (config.foldASCII) new ASCIIFoldingFilter(tokens) else tokens
        tokens =
          if (config.stopWords.isEmpty) tokens
          else {
            // CharArraySet(size, ignoreCase = true): case-insensitive matching.
            val stopSet = new CharArraySet(config.stopWords.size, true)
            config.stopWords.foreach(w => stopSet.add(w))
            new StopFilter(tokens, stopSet)
          }
        new TokenStreamComponents(source, extras(tokens))
      }
    }))(analyzer => F.delay(analyzer.close()))
}
/** Entry points: one starting builder per supported language. */
object AnalyzerBuilder {

  /** A language-agnostic builder with every stage disabled. */
  def default: DefaultAnalyzerBuilder =
    new DefaultAnalyzerBuilder(Config.empty)

  /** An English builder; stemming is off until `withPorterStemmer`. */
  def english: EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(Config.empty, stemmer = false)

  /** A French builder; stemming is off until `withFrenchLightStemmer`. */
  def french: FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(Config.empty, stemmer = false)

  /** A Spanish builder; stemming is off until `withSpanishLightStemmer`. */
  def spanish: SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(Config.empty, stemmer = false)
}
/** Builder for a language-agnostic analyzer: no stemming stage is appended. */
final class DefaultAnalyzerBuilder private[lucene] (config: Config)
    extends AnalyzerBuilder(config) {
  type Builder = DefaultAnalyzerBuilder

  def withConfig(newConfig: Config): DefaultAnalyzerBuilder =
    new DefaultAnalyzerBuilder(newConfig)

  /** Switch to the English builder, carrying the current configuration over. */
  def english: EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(config, stemmer = false)

  /** Switch to the French builder, carrying the current configuration over. */
  def french: FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(config, stemmer = false)

  /** Switch to the Spanish builder, carrying the current configuration over. */
  def spanish: SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(config, stemmer = false)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config)(identity)
}
/** Builder for an English analyzer with an optional Porter stemming stage. */
final class EnglishAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) {
  type Builder = EnglishAnalyzerBuilder

  def withConfig(newConfig: Config): EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(newConfig, stemmer)

  /** Adds the Porter Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required for the Lucene PorterStemFilter.
    */
  def withPorterStemmer: EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { tokens =>
      if (stemmer) new PorterStemFilter(tokens) else tokens
    }
}
/** Builder for a French analyzer with an optional light stemming stage. */
final class FrenchAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) {
  type Builder = FrenchAnalyzerBuilder

  def withConfig(newConfig: Config): FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(newConfig, stemmer)

  /** Adds the FrenchLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces inflected words to a common root form.
    * NOTE: Lowercasing is forced as it is required for the Lucene FrenchLightStemFilter.
    */
  def withFrenchLightStemmer: FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { tokens =>
      if (stemmer) new FrenchLightStemFilter(tokens) else tokens
    }
}
/** Builder for a Spanish analyzer with an optional light stemming stage. */
final class SpanishAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) {
  type Builder = SpanishAnalyzerBuilder

  def withConfig(newConfig: Config): SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(newConfig, stemmer)

  /** Adds the SpanishLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces inflected words to a common root form.
    * NOTE: Lowercasing is forced as it is required for the Lucene SpanishLightStemFilter.
    */
  def withSpanishLightStemmer: SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { tokens =>
      if (stemmer) new SpanishLightStemFilter(tokens) else tokens
    }
}
0 commit comments