Skip to content

Commit d6a1158

Browse files
authored
Merge pull request #51 from valencik/rework-tokenizer
Change Tokenizer to take Resource[F, Analyzer]
2 parents e016823 + 431e015 commit d6a1158

File tree

4 files changed

+17
-19
lines changed

4 files changed

+17
-19
lines changed

lucene/src/main/scala/textmogrify/lucene/AnalyzerBuilder.scala

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,9 @@ sealed abstract class AnalyzerBuilder private[lucene] (config: Config) {
104104
/** Build the Analyzer wrapped inside a Resource. */
105105
def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer]
106106

107-
/** Directly construct a tokenizing function
108-
*/
107+
/** Build a tokenizing function that uses the Analyzer and collects tokens in a vector */
109108
def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
110-
build.map(a => Tokenizer.vectorTokenizer(a))
109+
Tokenizer.vectorTokenizer(build)
111110

112111
private[lucene] def mkFromStandardTokenizer[F[_]](
113112
config: Config

lucene/src/main/scala/textmogrify/lucene/AnalyzerResource.scala

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,4 @@ object AnalyzerResource {
2626
*/
2727
def fromAnalyzer[F[_]](analyzer: => Analyzer)(implicit F: Sync[F]): Resource[F, Analyzer] =
2828
Resource.make(F.delay(analyzer))(analyzer => F.delay(analyzer.close()))
29-
30-
/** Construct a tokenizing function directly from an Analyzer
31-
*/
32-
def tokenizer[F[_]](
33-
analyzer: => Analyzer
34-
)(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
35-
fromAnalyzer(analyzer)
36-
.map(a => Tokenizer.vectorTokenizer(a))
3729
}

lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
package textmogrify.lucene
1818

19+
import cats.effect.Resource
1920
import cats.effect.kernel.Sync
2021
import scala.collection.mutable.ArrayBuffer
2122
import java.io.StringReader
@@ -27,8 +28,10 @@ object Tokenizer {
2728
/** Build a tokenizing function that runs its input through the Analyzer and collects
2829
* all tokens into a `Vector`
2930
*/
30-
def vectorTokenizer[F[_]](analyzer: Analyzer)(implicit F: Sync[F]): String => F[Vector[String]] =
31-
(s: String) =>
31+
def vectorTokenizer[F[_]](
32+
analyzer: Resource[F, Analyzer]
33+
)(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
34+
analyzer.map { analyzer => (s: String) =>
3235
F.delay {
3336
val ts = analyzer.tokenStream("textmogrify-field", new StringReader(s))
3437
val termAtt = ts.addAttribute(classOf[CharTermAttribute])
@@ -42,4 +45,5 @@ object Tokenizer {
4245
ts.close()
4346
arr.toVector
4447
}
48+
}
4549
}

lucene/src/test/scala/textmogrify/lucene/AnalyzerResourceSuite.scala

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,18 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer
2424
class AnalyzerResourceSuite extends CatsEffectSuite {
2525

2626
test("tokenizer should work") {
27-
val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
28-
val actual = analyzer.use { f =>
27+
val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
28+
val tokenizer = Tokenizer.vectorTokenizer(analyzer)
29+
val actual = tokenizer.use { f =>
2930
f("Hello my name is Neeko")
3031
}
3132
assertIO(actual, Vector("hello", "my", "name", "neeko"))
3233
}
3334

3435
test("tokenizer should yield a func that can be used multiple times") {
35-
val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
36-
val actual = analyzer.use { f =>
36+
val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
37+
val tokenizer = Tokenizer.vectorTokenizer(analyzer)
38+
val actual = tokenizer.use { f =>
3739
for {
3840
v1 <- f("Hello my name is Neeko")
3941
v2 <- f("I enjoy jumping on counters")
@@ -50,14 +52,15 @@ class AnalyzerResourceSuite extends CatsEffectSuite {
5052
import org.apache.lucene.analysis.LowerCaseFilter
5153
import org.apache.lucene.analysis.Analyzer
5254

53-
val stemmer = AnalyzerResource.tokenizer[IO](new Analyzer {
55+
val analyzer = AnalyzerResource.fromAnalyzer[IO](new Analyzer {
5456
protected def createComponents(fieldName: String): TokenStreamComponents = {
5557
val source = new StandardTokenizer()
5658
val tokens = new LowerCaseFilter(source)
5759
new TokenStreamComponents(source, new PorterStemFilter(tokens))
5860
}
5961
})
60-
val actual = stemmer.use { f =>
62+
val tokenizer = Tokenizer.vectorTokenizer(analyzer)
63+
val actual = tokenizer.use { f =>
6164
for {
6265
v1 <- f("Hello my name is Neeko")
6366
v2 <- f("I enjoy jumping on counters")

0 commit comments

Comments (0)