Skip to content

Commit bd24dae

Browse files
authored
Merge pull request #45 from valencik/langs-pt2
Add yet more languages
2 parents 7fbfa32 + c518a35 commit bd24dae

File tree

2 files changed

+308
-5
lines changed

2 files changed

+308
-5
lines changed

lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala

Lines changed: 149 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@ import scala.jdk.CollectionConverters._
2020

2121
import cats.effect.kernel.{Resource, Sync}
2222
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
23+
import org.apache.lucene.analysis.snowball.SnowballFilter
2324
import org.apache.lucene.analysis.standard.StandardTokenizer
2425
import org.apache.lucene.analysis.en.PorterStemFilter
2526
import org.apache.lucene.analysis.es.SpanishLightStemFilter
2627
import org.apache.lucene.analysis.fr.FrenchLightStemFilter
2728
import org.apache.lucene.analysis.it.ItalianLightStemFilter
2829
import org.apache.lucene.analysis.de.GermanLightStemFilter
30+
import org.apache.lucene.analysis.pt.PortugueseLightStemFilter
31+
import org.apache.lucene.analysis.br.BrazilianStemFilter
2932
import org.apache.lucene.analysis.LowerCaseFilter
3033
import org.apache.lucene.analysis.Analyzer
3134
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
@@ -37,6 +40,11 @@ import org.apache.lucene.analysis.fr.FrenchAnalyzer.{getDefaultStopSet => getFre
3740
import org.apache.lucene.analysis.es.SpanishAnalyzer.{getDefaultStopSet => getSpanishStopSet}
3841
import org.apache.lucene.analysis.it.ItalianAnalyzer.{getDefaultStopSet => getItalianStopSet}
3942
import org.apache.lucene.analysis.de.GermanAnalyzer.{getDefaultStopSet => getGermanStopSet}
43+
import org.apache.lucene.analysis.nl.DutchAnalyzer.{getDefaultStopSet => getDutchStopSet}
44+
import org.apache.lucene.analysis.pt.PortugueseAnalyzer.{getDefaultStopSet => getPortugueseStopSet}
45+
import org.apache.lucene.analysis.br.BrazilianAnalyzer.{
46+
getDefaultStopSet => getBrazilianPortugueseStopSet
47+
}
4048

4149
final case class Config(
4250
lowerCase: Boolean,
@@ -122,6 +130,12 @@ object AnalyzerBuilder {
122130
new FrenchAnalyzerBuilder(Config.empty, false)
123131
def german: GermanAnalyzerBuilder =
124132
new GermanAnalyzerBuilder(Config.empty, false)
133+
def dutch: DutchAnalyzerBuilder =
134+
new DutchAnalyzerBuilder(Config.empty, false)
135+
def brazilianPortuguese: BrazilianPortugueseAnalyzerBuilder =
136+
new BrazilianPortugueseAnalyzerBuilder(Config.empty, false)
137+
def portuguese: PortugueseAnalyzerBuilder =
138+
new PortugueseAnalyzerBuilder(Config.empty, false)
125139
def italian: ItalianAnalyzerBuilder =
126140
new ItalianAnalyzerBuilder(Config.empty, false)
127141
def spanish: SpanishAnalyzerBuilder =
@@ -143,6 +157,21 @@ final class DefaultAnalyzerBuilder private[lucene] (config: Config)
143157
def french: FrenchAnalyzerBuilder =
144158
new FrenchAnalyzerBuilder(config, false)
145159

160+
def german: GermanAnalyzerBuilder =
161+
new GermanAnalyzerBuilder(config, false)
162+
163+
def dutch: DutchAnalyzerBuilder =
164+
new DutchAnalyzerBuilder(config, false)
165+
166+
def brazilianPortuguese: BrazilianPortugueseAnalyzerBuilder =
167+
new BrazilianPortugueseAnalyzerBuilder(config, false)
168+
169+
def portuguese: PortugueseAnalyzerBuilder =
170+
new PortugueseAnalyzerBuilder(config, false)
171+
172+
def italian: ItalianAnalyzerBuilder =
173+
new ItalianAnalyzerBuilder(config, false)
174+
146175
def spanish: SpanishAnalyzerBuilder =
147176
new SpanishAnalyzerBuilder(config, false)
148177

@@ -174,7 +203,7 @@ final class EnglishAnalyzerBuilder private[lucene] (
174203

175204
/** Adds the Porter Stemmer to the end of the analyzer pipeline and enables lowercasing.
176205
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
177-
* NOTE: Lowercasing is forced as it is required for the Lucene PorterStemFilter.
206+
* NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
178207
*/
179208
def withPorterStemmer: EnglishAnalyzerBuilder =
180209
copy(config.copy(lowerCase = true), stemmer = true)
@@ -211,7 +240,7 @@ final class FrenchAnalyzerBuilder private[lucene] (
211240

212241
/** Adds the FrenchLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
213242
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
214-
* NOTE: Lowercasing is forced as it is required for the Lucene FrenchLightStemFilter.
243+
* NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
215244
*/
216245
def withFrenchLightStemmer: FrenchAnalyzerBuilder =
217246
copy(config.copy(lowerCase = true), stemmer = true)
@@ -247,7 +276,7 @@ final class SpanishAnalyzerBuilder private[lucene] (
247276

248277
/** Adds the SpanishLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
249278
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
250-
* NOTE: Lowercasing is forced as it is required for the Lucene SpanishLightStemFilter.
279+
* NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
251280
*/
252281
def withSpanishLightStemmer: SpanishAnalyzerBuilder =
253282
copy(config.copy(lowerCase = true), stemmer = true)
@@ -283,7 +312,7 @@ final class ItalianAnalyzerBuilder private[lucene] (
283312

284313
/** Adds the ItalianLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
285314
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
286-
* NOTE: Lowercasing is forced as it is required for the Lucene ItalianLightStemFilter.
315+
* NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
287316
*/
288317
def withItalianLightStemmer: ItalianAnalyzerBuilder =
289318
copy(config.copy(lowerCase = true), stemmer = true)
@@ -319,7 +348,7 @@ final class GermanAnalyzerBuilder private[lucene] (
319348

320349
/** Adds the GermanLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
321350
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
322-
* NOTE: Lowercasing is forced as it is required for the Lucene GermanLightStemFilter.
351+
* NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
323352
*/
324353
def withGermanLightStemmer: GermanAnalyzerBuilder =
325354
copy(config.copy(lowerCase = true), stemmer = true)
@@ -330,3 +359,118 @@ final class GermanAnalyzerBuilder private[lucene] (
330359
if (self.stemmer) new GermanLightStemFilter(tokens) else tokens
331360
}
332361
}
362+
363+
final class DutchAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = DutchAnalyzerBuilder

  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): DutchAnalyzerBuilder =
    new DutchAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): DutchAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] =
    getDutchStopSet().asScala.map(ca => String.valueOf(ca.asInstanceOf[Array[Char]])).toSet

  /** Adds the Dutch Snowball Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withDutchStemmer: DutchAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { ts =>
      val tokens =
        if (self.config.defaultStopWords) new StopFilter(ts, getDutchStopSet()) else ts
      // BUGFIX: the stemmer must wrap `tokens` (the possibly stop-filtered stream), not
      // the raw `ts` — otherwise enabling the stemmer silently bypassed the StopFilter.
      // This matches the filter chaining of the other language builders in this file.
      if (self.stemmer)
        new SnowballFilter(tokens, new org.tartarus.snowball.ext.DutchStemmer())
      else tokens
    }
}
400+
401+
final class PortugueseAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = PortugueseAnalyzerBuilder

  // Derive a new builder, carrying the current stemmer flag forward unless overridden.
  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): PortugueseAnalyzerBuilder =
    new PortugueseAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): PortugueseAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] = {
    val charArrays = getPortugueseStopSet().asScala
    charArrays.map(chars => String.valueOf(chars.asInstanceOf[Array[Char]])).toSet
  }

  /** Adds the PortugueseLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withPortugueseLightStemmer: PortugueseAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { ts =>
      // Optionally strip default stop words, then optionally stem what remains.
      val afterStops =
        if (!self.config.defaultStopWords) ts
        else new StopFilter(ts, getPortugueseStopSet())
      if (!self.stemmer) afterStops
      else new PortugueseLightStemFilter(afterStops)
    }
}
437+
438+
final class BrazilianPortugueseAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = BrazilianPortugueseAnalyzerBuilder

  // Derive a new builder, carrying the current stemmer flag forward unless overridden.
  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): BrazilianPortugueseAnalyzerBuilder =
    new BrazilianPortugueseAnalyzerBuilder(newConfig, stemmer)

  def withConfig(newConfig: Config): BrazilianPortugueseAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** A convenience value for debugging or investigating, to inspect the Lucene default stop words.
    * This set is immutable, and unused; it is the underlying Lucene `CharArraySet` that we use to
    * build the default StopFilter
    */
  lazy val defaultStopWords: Set[String] = {
    val charArrays = getBrazilianPortugueseStopSet().asScala
    charArrays.map(chars => String.valueOf(chars.asInstanceOf[Array[Char]])).toSet
  }

  /** Adds the Brazilian Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required by most Lucene stemmers.
    */
  def withBrazilianStemmer: BrazilianPortugueseAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { ts =>
      // Optionally strip default stop words, then optionally stem what remains.
      val afterStops =
        if (!self.config.defaultStopWords) ts
        else new StopFilter(ts, getBrazilianPortugueseStopSet())
      if (!self.stemmer) afterStops
      else new BrazilianStemFilter(afterStops)
    }
}

lucene/src/test/scala/textmogrify/lucene/AnalyzerBuilderSuite.scala

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,3 +316,162 @@ class GermanAnalyzerBuilderSuite extends CatsEffectSuite {
316316
}
317317

318318
}
319+
320+
class DutchAnalyzerBuilderSuite extends CatsEffectSuite {

  val jalapenos = "Ik hou van Jalapeños"
  val jumping = "Neeko springt graag op balies"

  test("dutch analyzer default should tokenize without any transformations") {
    val analyzer = AnalyzerBuilder.dutch
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("Ik", "hou", "van", "Jalapeños"))
  }

  test("dutch analyzer withLowerCasing should lowercase all letters") {
    val analyzer = AnalyzerBuilder.dutch.withLowerCasing
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("ik", "hou", "van", "jalapeños"))
  }

  test("dutch analyzer withASCIIFolding should fold 'ñ' to 'n'") {
    val analyzer = AnalyzerBuilder.dutch.withASCIIFolding
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("Ik", "hou", "van", "Jalapenos"))
  }

  test("dutch analyzer withCustomStopWords should filter them out") {
    val analyzer = AnalyzerBuilder.dutch.withCustomStopWords(Set("Ik"))
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("hou", "van", "Jalapeños"))
  }

  test("dutch analyzer withDefaultStopWords should filter them out") {
    val analyzer = AnalyzerBuilder.dutch.withDefaultStopWords
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    assertIO(actual, Vector("Neeko", "springt", "graag", "balies"))
  }

  // FIX: test name previously said "withDutchLightStemmer" but the method under
  // test is `withDutchStemmer` (a Snowball stemmer, not a "light" stemmer).
  test("dutch analyzer withDutchStemmer should lowercase and stem words") {
    val analyzer = AnalyzerBuilder.dutch.withDutchStemmer
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    assertIO(actual, Vector("neeko", "springt", "grag", "op", "balies"))
  }

  test("dutch analyzer builder settings can be chained") {
    val analyzer = AnalyzerBuilder.dutch.withDutchStemmer
      .withCustomStopWords(Set("Neeko"))
      .withDefaultStopWords
      .withASCIIFolding
      .withLowerCasing
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    // FIX: the previous expectation kept "op" even though withDefaultStopWords is
    // enabled ("op" is a Dutch default stop word). That expectation encoded a bug in
    // DutchAnalyzerBuilder.build where the stemmer wrapped the raw tokenizer stream
    // and bypassed the StopFilter; with correct filter chaining "op" is removed,
    // matching the chained tests of every other language suite in this file.
    assertIO(actual, Vector("springt", "grag", "balies"))
  }

}
372+
373+
class PortugueseAnalyzerBuilderSuite extends CatsEffectSuite {

  // Sample sentences: one with a diacritic-bearing noun, one with stop words to strip.
  val jalapenos = "Eu gosto de jalapeños"
  val jumping = "Neeko gosta de saltar em balcões"

  test("portuguese analyzer default should tokenize without any transformations") {
    val builder = AnalyzerBuilder.portuguese
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jalapenos))
    assertIO(tokens, Vector("Eu", "gosto", "de", "jalapeños"))
  }

  test("portuguese analyzer withLowerCasing should lowercase all letters") {
    val builder = AnalyzerBuilder.portuguese.withLowerCasing
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jalapenos))
    assertIO(tokens, Vector("eu", "gosto", "de", "jalapeños"))
  }

  test("portuguese analyzer withASCIIFolding should fold 'ñ' to 'n'") {
    val builder = AnalyzerBuilder.portuguese.withASCIIFolding
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jalapenos))
    assertIO(tokens, Vector("Eu", "gosto", "de", "jalapenos"))
  }

  test("portuguese analyzer withCustomStopWords should filter them out") {
    val builder = AnalyzerBuilder.portuguese.withCustomStopWords(Set("eu", "de"))
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jalapenos))
    assertIO(tokens, Vector("gosto", "jalapeños"))
  }

  test("portuguese analyzer withDefaultStopWords should filter them out") {
    val builder = AnalyzerBuilder.portuguese.withDefaultStopWords
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jumping))
    assertIO(tokens, Vector("Neeko", "gosta", "saltar", "balcões"))
  }

  test("portuguese analyzer withPortugueseLightStemmer should lowercase and stem words") {
    val builder = AnalyzerBuilder.portuguese.withPortugueseLightStemmer
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jumping))
    assertIO(tokens, Vector("neek", "gost", "de", "saltar", "em", "balca"))
  }

  test("portuguese analyzer builder settings can be chained") {
    val builder = AnalyzerBuilder.portuguese.withPortugueseLightStemmer
      .withCustomStopWords(Set("Neeko"))
      .withDefaultStopWords
      .withASCIIFolding
      .withLowerCasing
    val tokens = builder.tokenizer[IO].use(tokenize => tokenize(jumping))
    assertIO(tokens, Vector("gost", "saltar", "balco"))
  }

}
425+
426+
class BrazilianPortugueseAnalyzerBuilderSuite extends CatsEffectSuite {

  val jalapenos = "Eu gosto de jalapeños"
  val jumping = "Neeko gosta de pular em balcões"

  test("brazilianPortuguese analyzer default should tokenize without any transformations") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("Eu", "gosto", "de", "jalapeños"))
  }

  test("brazilianPortuguese analyzer withLowerCasing should lowercase all letters") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withLowerCasing
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("eu", "gosto", "de", "jalapeños"))
  }

  test("brazilianPortuguese analyzer withASCIIFolding should fold 'ñ' to 'n'") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withASCIIFolding
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("Eu", "gosto", "de", "jalapenos"))
  }

  test("brazilianPortuguese analyzer withCustomStopWords should filter them out") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withCustomStopWords(Set("eu", "de"))
    val actual = analyzer.tokenizer[IO].use(f => f(jalapenos))
    assertIO(actual, Vector("gosto", "jalapeños"))
  }

  test("brazilianPortuguese analyzer withDefaultStopWords should filter them out") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withDefaultStopWords
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    assertIO(actual, Vector("Neeko", "gosta", "pular", "balcões"))
  }

  // FIX: test name previously said "withPortugueseLightStemmer" (copy-paste from the
  // Portuguese suite) but the method under test is `withBrazilianStemmer`.
  test("brazilianPortuguese analyzer withBrazilianStemmer should lowercase and stem words") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withBrazilianStemmer
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    assertIO(actual, Vector("neek", "gost", "de", "pul", "em", "balco"))
  }

  test("brazilianPortuguese analyzer builder settings can be chained") {
    val analyzer = AnalyzerBuilder.brazilianPortuguese.withBrazilianStemmer
      .withCustomStopWords(Set("Neeko"))
      .withDefaultStopWords
      .withASCIIFolding
      .withLowerCasing
    val actual = analyzer.tokenizer[IO].use(f => f(jumping))
    assertIO(actual, Vector("gost", "pul", "balco"))
  }

}

0 commit comments

Comments
 (0)