@@ -20,12 +20,15 @@ import scala.jdk.CollectionConverters._
2020
2121import cats .effect .kernel .{Resource , Sync }
2222import org .apache .lucene .analysis .Analyzer .TokenStreamComponents
23+ import org .apache .lucene .analysis .snowball .SnowballFilter
2324import org .apache .lucene .analysis .standard .StandardTokenizer
2425import org .apache .lucene .analysis .en .PorterStemFilter
2526import org .apache .lucene .analysis .es .SpanishLightStemFilter
2627import org .apache .lucene .analysis .fr .FrenchLightStemFilter
2728import org .apache .lucene .analysis .it .ItalianLightStemFilter
2829import org .apache .lucene .analysis .de .GermanLightStemFilter
30+ import org .apache .lucene .analysis .pt .PortugueseLightStemFilter
31+ import org .apache .lucene .analysis .br .BrazilianStemFilter
2932import org .apache .lucene .analysis .LowerCaseFilter
3033import org .apache .lucene .analysis .Analyzer
3134import org .apache .lucene .analysis .miscellaneous .ASCIIFoldingFilter
@@ -37,6 +40,11 @@ import org.apache.lucene.analysis.fr.FrenchAnalyzer.{getDefaultStopSet => getFre
3740import org .apache .lucene .analysis .es .SpanishAnalyzer .{getDefaultStopSet => getSpanishStopSet }
3841import org .apache .lucene .analysis .it .ItalianAnalyzer .{getDefaultStopSet => getItalianStopSet }
3942import org .apache .lucene .analysis .de .GermanAnalyzer .{getDefaultStopSet => getGermanStopSet }
43+ import org .apache .lucene .analysis .nl .DutchAnalyzer .{getDefaultStopSet => getDutchStopSet }
44+ import org .apache .lucene .analysis .pt .PortugueseAnalyzer .{getDefaultStopSet => getPortugueseStopSet }
45+ import org .apache .lucene .analysis .br .BrazilianAnalyzer .{
46+ getDefaultStopSet => getBrazilianPortugueseStopSet
47+ }
4048
4149final case class Config (
4250 lowerCase : Boolean ,
@@ -122,6 +130,12 @@ object AnalyzerBuilder {
122130 new FrenchAnalyzerBuilder (Config .empty, false )
123131 def german : GermanAnalyzerBuilder =
124132 new GermanAnalyzerBuilder (Config .empty, false )
133+ def dutch : DutchAnalyzerBuilder =
134+ new DutchAnalyzerBuilder (Config .empty, false )
135+ def brazilianPortuguese : BrazilianPortugueseAnalyzerBuilder =
136+ new BrazilianPortugueseAnalyzerBuilder (Config .empty, false )
137+ def portuguese : PortugueseAnalyzerBuilder =
138+ new PortugueseAnalyzerBuilder (Config .empty, false )
125139 def italian : ItalianAnalyzerBuilder =
126140 new ItalianAnalyzerBuilder (Config .empty, false )
127141 def spanish : SpanishAnalyzerBuilder =
@@ -143,6 +157,21 @@ final class DefaultAnalyzerBuilder private[lucene] (config: Config)
143157 def french : FrenchAnalyzerBuilder =
144158 new FrenchAnalyzerBuilder (config, false )
145159
160+ def german : GermanAnalyzerBuilder =
161+ new GermanAnalyzerBuilder (config, false )
162+
163+ def dutch : DutchAnalyzerBuilder =
164+ new DutchAnalyzerBuilder (config, false )
165+
166+ def brazilianPortuguese : BrazilianPortugueseAnalyzerBuilder =
167+ new BrazilianPortugueseAnalyzerBuilder (config, false )
168+
169+ def portuguese : PortugueseAnalyzerBuilder =
170+ new PortugueseAnalyzerBuilder (config, false )
171+
172+ def italian : ItalianAnalyzerBuilder =
173+ new ItalianAnalyzerBuilder (config, false )
174+
146175 def spanish : SpanishAnalyzerBuilder =
147176 new SpanishAnalyzerBuilder (config, false )
148177
@@ -174,7 +203,7 @@ final class EnglishAnalyzerBuilder private[lucene] (
174203
175204 /** Adds the Porter Stemmer to the end of the analyzer pipeline and enables lowercasing.
176205 * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
177- * NOTE: Lowercasing is forced as it is required for the Lucene PorterStemFilter .
206+ * NOTE: Lowercasing is forced as it is required by most Lucene stemmers .
178207 */
179208 def withPorterStemmer : EnglishAnalyzerBuilder =
180209 copy(config.copy(lowerCase = true ), stemmer = true )
@@ -211,7 +240,7 @@ final class FrenchAnalyzerBuilder private[lucene] (
211240
212241 /** Adds the FrenchLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
213242 * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
214- * NOTE: Lowercasing is forced as it is required for the Lucene FrenchLightStemFilter .
243+ * NOTE: Lowercasing is forced as it is required by most Lucene stemmers .
215244 */
216245 def withFrenchLightStemmer : FrenchAnalyzerBuilder =
217246 copy(config.copy(lowerCase = true ), stemmer = true )
@@ -247,7 +276,7 @@ final class SpanishAnalyzerBuilder private[lucene] (
247276
248277 /** Adds the SpanishLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
249278 * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
250- * NOTE: Lowercasing is forced as it is required for the Lucene SpanishLightStemFilter .
279+ * NOTE: Lowercasing is forced as it is required by most Lucene stemmers .
251280 */
252281 def withSpanishLightStemmer : SpanishAnalyzerBuilder =
253282 copy(config.copy(lowerCase = true ), stemmer = true )
@@ -283,7 +312,7 @@ final class ItalianAnalyzerBuilder private[lucene] (
283312
284313 /** Adds the ItalianLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
285314 * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
286- * NOTE: Lowercasing is forced as it is required for the Lucene ItalianLightStemFilter .
315+ * NOTE: Lowercasing is forced as it is required by most Lucene stemmers .
287316 */
288317 def withItalianLightStemmer : ItalianAnalyzerBuilder =
289318 copy(config.copy(lowerCase = true ), stemmer = true )
@@ -319,7 +348,7 @@ final class GermanAnalyzerBuilder private[lucene] (
319348
320349 /** Adds the GermanLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
321350 * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
322- * NOTE: Lowercasing is forced as it is required for the Lucene GermanLightStemFilter .
351+ * NOTE: Lowercasing is forced as it is required by most Lucene stemmers .
323352 */
324353 def withGermanLightStemmer : GermanAnalyzerBuilder =
325354 copy(config.copy(lowerCase = true ), stemmer = true )
@@ -330,3 +359,118 @@ final class GermanAnalyzerBuilder private[lucene] (
330359 if (self.stemmer) new GermanLightStemFilter (tokens) else tokens
331360 }
332361}
362+
final class DutchAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = DutchAnalyzerBuilder

  // Internal copy helper: keeps the current stemmer flag unless explicitly overridden.
  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): DutchAnalyzerBuilder =
    new DutchAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): DutchAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] =
    getDutchStopSet().asScala.map(ca => String.valueOf(ca.asInstanceOf[Array[Char]])).toSet

  /** Adds the Dutch Snowball Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withDutchStemmer: DutchAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { ts =>
      val tokens = if (self.config.defaultStopWords) new StopFilter(ts, getDutchStopSet()) else ts
      // FIX: wrap `tokens` (post-StopFilter), not `ts`. The previous code wrapped the raw
      // tokenizer stream, which silently dropped the StopFilter whenever stemming was
      // enabled — unlike every other language builder in this file, which composes the
      // stemmer on top of `tokens`.
      if (self.stemmer) new SnowballFilter(tokens, new org.tartarus.snowball.ext.DutchStemmer())
      else tokens
    }
}
400+
final class PortugueseAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = PortugueseAnalyzerBuilder

  // Internal copy helper: keeps the current stemmer flag unless explicitly overridden.
  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): PortugueseAnalyzerBuilder =
    new PortugueseAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): PortugueseAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] = {
    val stopEntries = getPortugueseStopSet().asScala
    stopEntries.map(entry => String.valueOf(entry.asInstanceOf[Array[Char]])).toSet
  }

  /** Adds the PortugueseLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withPortugueseLightStemmer: PortugueseAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { source =>
      // Optionally filter stop words, then optionally stem the surviving tokens.
      val afterStops =
        if (self.config.defaultStopWords) new StopFilter(source, getPortugueseStopSet())
        else source
      if (self.stemmer) new PortugueseLightStemFilter(afterStops) else afterStops
    }
}
437+
final class BrazilianPortugueseAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = BrazilianPortugueseAnalyzerBuilder

  // Internal copy helper: keeps the current stemmer flag unless explicitly overridden.
  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): BrazilianPortugueseAnalyzerBuilder =
    new BrazilianPortugueseAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): BrazilianPortugueseAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] = {
    val stopEntries = getBrazilianPortugueseStopSet().asScala
    stopEntries.map(entry => String.valueOf(entry.asInstanceOf[Array[Char]])).toSet
  }

  /** Adds the Brazilian Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withBrazilianStemmer: BrazilianPortugueseAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { source =>
      // Optionally filter stop words, then optionally stem the surviving tokens.
      val afterStops =
        if (self.config.defaultStopWords) new StopFilter(source, getBrazilianPortugueseStopSet())
        else source
      if (self.stemmer) new BrazilianStemFilter(afterStops) else afterStops
    }
}
0 commit comments