Skip to content

Commit 0dccf13

Browse files
authored
Merge pull request #25 from valencik/other-langs
Add Support For Multiple Languages
2 parents 98f7008 + b11ce2b commit 0dccf13

File tree

5 files changed

+406
-70
lines changed

5 files changed

+406
-70
lines changed

docs/index.md

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Textmogrify is a pre-alpha text manipulation library that hopefully works well w
44

55
## Usage
66

7-
This library is currently available for Scala binary versions 2.13 and 3.1.
7+
This library is currently available for Scala binary versions 2.13 and 3.2.
88

99
To use the latest version, include the following in your `build.sbt`:
1010

@@ -26,9 +26,10 @@ libraryDependencies ++= Seq(
2626

2727
The Lucene module lets you use a Lucene [Analyzer][analyzer] to modify text; additionally, it provides helpers to use `Analyzer`s with an fs2 [Stream][stream].
2828

29+
2930
### Basics
3031

31-
Typical usage is to use the `AnalyzerBuilder` to configure an `Analyzer` and call `.tokenizer` to get a `Resource[F, String => F[Vector[String]]]`:
32+
Typical usage is to use the `AnalyzerBuilder` to configure an `Analyzer` and call `.tokenizer[F]` to get a `Resource[F, String => F[Vector[String]]]`:
3233

3334
```scala mdoc:silent
3435
import textmogrify.lucene.AnalyzerBuilder
@@ -52,9 +53,26 @@ tokens.unsafeRunSync()
5253
We can see that our text was lowercased and the unicode `ñ` replaced with an ASCII `n`.
5354

5455

56+
### Languages
57+
58+
Textmogrify comes with support for multiple languages.
59+
When setting up an `AnalyzerBuilder` you'll have access to language-specific options once you call one of the helper language methods like `english` or `french`.
60+
Specifying a language preserves the configuration set beforehand.
61+
62+
```scala mdoc:silent
63+
val base = AnalyzerBuilder.default.withLowerCasing.withASCIIFolding
64+
65+
val en = base.english.withPorterStemmer.tokenizer[IO]
66+
val fr = base.french.withFrenchLightStemmer.tokenizer[IO]
67+
val es = base.spanish.withSpanishLightStemmer.tokenizer[IO]
68+
```
69+
70+
All of `en`, `fr`, and `es` will lowercase and ASCII-fold their inputs in addition to using their language-specific stemmers.
71+
72+
5573
### Pipelines
5674

57-
Another common use is to construct a `Pipe`, or `Stream` to `Stream` function.
75+
Another common use is to construct a `Pipe`, or `Stream` to `Stream` function using an `Analyzer`.
5876
Let's say we have some messages we want to analyze and index as part of some search component.
5977
Given a raw `Msg` type and an analyzed `Doc` type, we want to transform a `Stream[F, Msg]` into a `Stream[F, Doc]`.
6078

@@ -75,7 +93,7 @@ val input = Stream(
7593
import fs2.Pipe
7694

7795
val normalizeMsgs: Pipe[IO, Msg, Doc] = msgs => {
78-
val tokenizer = AnalyzerBuilder.default
96+
val tokenizer = AnalyzerBuilder.english
7997
.withLowerCasing
8098
.withStopWords(Set("how", "do", "i", "my"))
8199
.withPorterStemmer
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright 2022 Pig.io
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package textmogrify
18+
19+
import textmogrify.lucene.AnalyzerBuilder
20+
import cats.syntax.all._
21+
import cats.effect.{IO, IOApp, Resource}
22+
import fs2.{Pipe, Stream}
23+
24+
object MultiLingualPipeline extends IOApp.Simple {

  /** Language tag carried by each message. */
  sealed trait Lang extends Product with Serializable
  case object En extends Lang
  case object Fr extends Lang
  case object Es extends Lang

  /** Raw input: a message and the language it is written in. */
  case class Msg(lang: Lang, msg: String)

  /** Analyzed output: the tokens produced for a message. */
  case class Doc(lang: Lang, tokens: Vector[String])

  val input = Stream(
    Msg(En, "I Like Jalapeños"),
    Msg(En, "Neeko likes jumping on counters"),
    Msg(Fr, "J'aime Les Jalapeños"),
    Msg(Fr, "Neeko aime sauter sur les compteurs"),
    Msg(Es, "Me gustan los jalapeños"),
    Msg(Es, "A Neeko le gusta saltar sobre los mostradores"),
  )

  /** Builds one tokenizer per supported language — all sharing the same base
    * lowercasing/ASCII-folding configuration — and returns a function that
    * dispatches each message to the tokenizer matching its language tag.
    */
  def multiTokenizer: Resource[IO, Msg => IO[Vector[String]]] = {
    val shared = AnalyzerBuilder.default.withLowerCasing.withASCIIFolding

    val enTokens = shared.english.withPorterStemmer.tokenizer[IO]
    val frTokens = shared.french.withFrenchLightStemmer.tokenizer[IO]
    val esTokens = shared.spanish.withSpanishLightStemmer.tokenizer[IO]

    // Acquire all three analyzer resources in parallel.
    (enTokens, frTokens, esTokens).parMapN { (en, fr, es) => (msg: Msg) =>
      msg.lang match {
        case En => en(msg.msg)
        case Fr => fr(msg.msg)
        case Es => es(msg.msg)
      }
    }
  }

  /** Pipe that analyzes every incoming message into a [[Doc]] of tokens. */
  val tokenizeMsgs: Pipe[IO, Msg, Doc] = msgs =>
    Stream
      .resource(multiTokenizer)
      .flatMap { tokenize =>
        msgs.evalMap(m => tokenize(m).map(toks => Doc(m.lang, toks)))
      }

  val docs: Stream[IO, Doc] = input.through(tokenizeMsgs)
  val run = docs.compile.toList.flatMap(IO.println)

}

example/src/main/scala/textmogrify/Pipeline.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ object Pipeline extends IOApp.Simple {
3232
)
3333

3434
val tokenizeMsgs: Pipe[IO, Msg, Doc] = msgs => {
35-
val tokenizer = AnalyzerBuilder.default.withLowerCasing
35+
val tokenizer = AnalyzerBuilder.english.withLowerCasing
3636
.withStopWords(Set("how", "do", "i", "my"))
3737
.withPorterStemmer
3838
.tokenizer[IO]

lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala

Lines changed: 153 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -20,87 +20,187 @@ import cats.effect.kernel.{Resource, Sync}
2020
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
2121
import org.apache.lucene.analysis.standard.StandardTokenizer
2222
import org.apache.lucene.analysis.en.PorterStemFilter
23+
import org.apache.lucene.analysis.es.SpanishLightStemFilter
24+
import org.apache.lucene.analysis.fr.FrenchLightStemFilter
2325
import org.apache.lucene.analysis.LowerCaseFilter
2426
import org.apache.lucene.analysis.Analyzer
2527
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
2628
import org.apache.lucene.analysis.CharArraySet
2729
import org.apache.lucene.analysis.StopFilter
30+
import org.apache.lucene.analysis.TokenStream
2831

29-
/** Build an Analyzer or tokenizer function
30-
*/
31-
final class AnalyzerBuilder private (
32-
val lowerCase: Boolean,
33-
val foldASCII: Boolean,
34-
val stopWords: Set[String],
35-
val stemmer: Boolean,
36-
) { self =>
32+
final case class Config(
33+
lowerCase: Boolean,
34+
foldASCII: Boolean,
35+
stopWords: Set[String],
36+
) {
37+
def withLowerCasing: Config =
38+
copy(lowerCase = true)
3739

38-
private def copy(
39-
lowerCase: Boolean = self.lowerCase,
40-
foldASCII: Boolean = self.foldASCII,
41-
stemmer: Boolean = self.stemmer,
42-
stopWords: Set[String] = self.stopWords,
43-
): AnalyzerBuilder =
44-
new AnalyzerBuilder(
45-
lowerCase = lowerCase,
46-
foldASCII = foldASCII,
47-
stemmer = stemmer,
48-
stopWords = stopWords,
49-
)
40+
def withASCIIFolding: Config =
41+
copy(foldASCII = true)
42+
43+
def withStopWords(words: Set[String]): Config =
44+
copy(stopWords = words)
45+
}
46+
object Config {
47+
def empty: Config = Config(false, false, Set.empty)
48+
}
49+
50+
/** Build an Analyzer or tokenizer function */
51+
sealed abstract class AnalyzerBuilder private[lucene] (config: Config) {
52+
type Builder <: AnalyzerBuilder
53+
54+
def withConfig(config: Config): Builder
5055

5156
/** Adds a lowercasing stage to the analyzer pipeline */
52-
def withLowerCasing: AnalyzerBuilder =
53-
copy(lowerCase = true)
57+
def withLowerCasing: Builder =
58+
withConfig(config.withLowerCasing)
5459

5560
/** Adds an ASCII folding stage to the analyzer pipeline
5661
* ASCII folding converts alphanumeric and symbolic Unicode characters into
5762
* their ASCII equivalents, if one exists.
5863
*/
59-
def withASCIIFolding: AnalyzerBuilder =
60-
copy(foldASCII = true)
64+
def withASCIIFolding: Builder =
65+
withConfig(config.withASCIIFolding)
6166

62-
/** Adds the Porter Stemmer to the end of the analyzer pipeline and enables lowercasing.
63-
* Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
64-
* NOTE: Lowercasing is forced as it is required for the Lucene PorterStemFilter.
65-
*/
66-
def withPorterStemmer: AnalyzerBuilder =
67-
copy(stemmer = true, lowerCase = true)
67+
/** Adds a stop filter stage to analyzer pipeline for non-empty sets. */
68+
def withStopWords(words: Set[String]): Builder =
69+
withConfig(config.withStopWords(words))
6870

69-
/** Adds a stop filter stage to analyzer pipeline for non-empty sets.
70-
*/
71-
def withStopWords(words: Set[String]): AnalyzerBuilder =
72-
copy(stopWords = words)
71+
/** Build the Analyzer wrapped inside a Resource. */
72+
def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer]
7373

74-
/** Build the Analyzer wrapped inside a Resource.
74+
/** Directly construct a tokenizing function
7575
*/
76-
def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
76+
def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
77+
build.map(a => Tokenizer.vectorTokenizer(a))
78+
79+
private[lucene] def mkFromStandardTokenizer[F[_]](
80+
config: Config
81+
)(extras: TokenStream => TokenStream)(implicit F: Sync[F]): Resource[F, Analyzer] =
7782
Resource.make(F.delay(new Analyzer {
7883
protected def createComponents(fieldName: String): TokenStreamComponents = {
7984
val source = new StandardTokenizer()
80-
var tokens = if (self.lowerCase) new LowerCaseFilter(source) else source
81-
tokens = if (self.foldASCII) new ASCIIFoldingFilter(tokens) else tokens
85+
var tokens = if (config.lowerCase) new LowerCaseFilter(source) else source
86+
tokens = if (config.foldASCII) new ASCIIFoldingFilter(tokens) else tokens
8287
tokens =
83-
if (self.stopWords.isEmpty) tokens
88+
if (config.stopWords.isEmpty) tokens
8489
else {
85-
val stopSet = new CharArraySet(self.stopWords.size, true)
86-
stopWords.foreach(w => stopSet.add(w))
90+
val stopSet = new CharArraySet(config.stopWords.size, true)
91+
config.stopWords.foreach(w => stopSet.add(w))
8792
new StopFilter(tokens, stopSet)
8893
}
89-
tokens = if (self.stemmer) new PorterStemFilter(tokens) else tokens
90-
new TokenStreamComponents(source, tokens)
94+
new TokenStreamComponents(source, extras(tokens))
9195
}
9296
}))(analyzer => F.delay(analyzer.close()))
9397

94-
/** Directly construct a tokenizing function
95-
*/
96-
def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
97-
self.build.map(a => Tokenizer.vectorTokenizer(a))
9898
}
9999
object AnalyzerBuilder {
100-
def default: AnalyzerBuilder = new AnalyzerBuilder(
101-
lowerCase = false,
102-
foldASCII = false,
103-
stemmer = false,
104-
stopWords = Set.empty,
105-
)
100+
def default: DefaultAnalyzerBuilder =
101+
new DefaultAnalyzerBuilder(Config.empty)
102+
def english: EnglishAnalyzerBuilder =
103+
new EnglishAnalyzerBuilder(Config.empty, false)
104+
def french: FrenchAnalyzerBuilder =
105+
new FrenchAnalyzerBuilder(Config.empty, false)
106+
def spanish: SpanishAnalyzerBuilder =
107+
new SpanishAnalyzerBuilder(Config.empty, false)
108+
}
109+
110+
/** Language-neutral builder: applies only the shared [[Config]] stages and can
  * be upgraded to a language-specific builder without losing configuration.
  */
final class DefaultAnalyzerBuilder private[lucene] (config: Config)
    extends AnalyzerBuilder(config) {
  type Builder = DefaultAnalyzerBuilder

  /** Returns a new builder carrying `newConfig`. */
  def withConfig(newConfig: Config): DefaultAnalyzerBuilder =
    new DefaultAnalyzerBuilder(newConfig)

  /** Switches to the English builder, preserving the current configuration. */
  def english: EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(config, false)

  /** Switches to the French builder, preserving the current configuration. */
  def french: FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(config, false)

  /** Switches to the Spanish builder, preserving the current configuration. */
  def spanish: SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(config, false)

  /** Builds the analyzer with no language-specific extra stages. */
  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config)(identity)
}
129+
130+
/** English-specific builder adding the optional Porter stemming stage. */
final class EnglishAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) {
  type Builder = EnglishAnalyzerBuilder

  /** Returns a new builder carrying `newConfig` and the current stemmer flag. */
  def withConfig(newConfig: Config): EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(newConfig, stemmer)

  /** Adds the Porter Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Stemming reduces words like `jumping` and `jumps` to their root word `jump`.
    * NOTE: Lowercasing is forced as it is required for the Lucene PorterStemFilter.
    */
  def withPorterStemmer: EnglishAnalyzerBuilder =
    new EnglishAnalyzerBuilder(config.copy(lowerCase = true), true)

  /** Builds the analyzer, appending the Porter stemmer when enabled. */
  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config) { tokens =>
      if (stemmer) new PorterStemFilter(tokens) else tokens
    }
}
155+
156+
/** French-specific builder adding the optional light-stemming stage. */
final class FrenchAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = FrenchAnalyzerBuilder

  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): FrenchAnalyzerBuilder =
    new FrenchAnalyzerBuilder(newConfig, stemmer)

  /** Returns a new builder carrying `newConfig` and the current stemmer flag. */
  def withConfig(newConfig: Config): FrenchAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** Adds the FrenchLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Light stemming reduces inflected French word forms (plurals, feminine
    * endings, and similar suffixes) toward a shared root form.
    * NOTE: Lowercasing is forced as it is required for the Lucene FrenchLightStemFilter.
    */
  def withFrenchLightStemmer: FrenchAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  /** Builds the analyzer, appending the French light stemmer when enabled. */
  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config)(ts => if (self.stemmer) new FrenchLightStemFilter(ts) else ts)
}
181+
182+
/** Spanish-specific builder adding the optional light-stemming stage. */
final class SpanishAnalyzerBuilder private[lucene] (
    config: Config,
    stemmer: Boolean,
) extends AnalyzerBuilder(config) { self =>
  type Builder = SpanishAnalyzerBuilder

  private def copy(
      newConfig: Config,
      stemmer: Boolean = self.stemmer,
  ): SpanishAnalyzerBuilder =
    new SpanishAnalyzerBuilder(newConfig, stemmer)

  /** Returns a new builder carrying `newConfig` and the current stemmer flag. */
  def withConfig(newConfig: Config): SpanishAnalyzerBuilder =
    copy(newConfig = newConfig)

  /** Adds the SpanishLight Stemmer to the end of the analyzer pipeline and enables lowercasing.
    * Light stemming reduces inflected Spanish word forms (plurals, gendered
    * endings, and similar suffixes) toward a shared root form.
    * NOTE: Lowercasing is forced as it is required for the Lucene SpanishLightStemFilter.
    */
  def withSpanishLightStemmer: SpanishAnalyzerBuilder =
    copy(config.copy(lowerCase = true), stemmer = true)

  /** Builds the analyzer, appending the Spanish light stemmer when enabled. */
  def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer] =
    mkFromStandardTokenizer(config)(ts => if (self.stemmer) new SpanishLightStemFilter(ts) else ts)
}

0 commit comments

Comments
 (0)