Skip to content

Commit 3febb9e

Browse files
committed
refactor tests to minimize code duplication
1 parent 09f37e1 commit 3febb9e

File tree

5 files changed

+140
-242
lines changed

5 files changed

+140
-242
lines changed

test/abstract_text_transformer.jl

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
using MLJBase
2+
using TextAnalysis
3+
4+
@testset "basic use" begin
5+
# add some test docs
6+
docs = ["Hi my name is Sam.", "How are you today?"]
7+
8+
# convert to ngrams
9+
ngram_vec = ngrams.(documents(Corpus(NGramDocument.(docs))))
10+
11+
# train tfidf transformer
12+
tfidf_transformer = MLJText.TfidfTransformer()
13+
test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec)
14+
MLJBase.fit!(test_tfidf_machine)
15+
16+
# train bag_of_words transformer
17+
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
18+
test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec)
19+
MLJBase.fit!(test_bow_machine)
20+
21+
# train bm25 transformer
22+
bm25_transformer = MLJText.BM25Transformer()
23+
test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec)
24+
MLJBase.fit!(test_bm25_machine)
25+
26+
test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine]
27+
28+
# test single doc
29+
test_doc1 = ngrams(NGramDocument("Another sentence ok"))
30+
for mach = test_machines
31+
test_doc_transform = transform(mach, [test_doc1])  # wrap the single doc built above (`test_doc1`) in a vector
32+
@test sum(test_doc_transform, dims=2)[1] == 0.0
33+
@test size(test_doc_transform) == (1, 11)
34+
end
35+
36+
# test another single doc
37+
test_doc2 = ngrams(NGramDocument("Listen Sam, today is not the day."))
38+
for mach = test_machines
39+
test_doc_transform = transform(mach, [test_doc2])
40+
@test sum(test_doc_transform, dims=2)[1] > 0.0
41+
@test size(test_doc_transform) == (1, 11)
42+
end
43+
44+
# test two docs
45+
test_doc3 = ngrams.(
46+
Corpus([NGramDocument("Another sentence ok"), NGramDocument("Listen Sam, today is not the day.")])
47+
)
48+
for mach = test_machines
49+
test_doc_transform = transform(mach, test_doc3)
50+
@test sum(test_doc_transform, dims=2)[1] == 0.0
51+
@test sum(test_doc_transform, dims=2)[2] > 0.0
52+
@test size(test_doc_transform) == (2, 11)
53+
end
54+
55+
# test tokenized docs
56+
test_doc4 = [["Another", "sentence", "ok"], ["Listen", "Sam", ",", "today", "is", "not", "the", "day", "."]]
57+
for mach = test_machines
58+
test_doc_transform = transform(mach, test_doc4)
59+
@test sum(test_doc_transform, dims=2)[1] == 0.0
60+
@test sum(test_doc_transform, dims=2)[2] > 0.0
61+
@test size(test_doc_transform) == (2, 11)
62+
end
63+
end
64+
65+
@testset "bag of words use" begin
66+
# test with bag of words
67+
bag_of_words = Dict(
68+
"cat in" => 1,
69+
"the hat" => 1,
70+
"the" => 2,
71+
"cat" => 1,
72+
"hat" => 1,
73+
"in the" => 1,
74+
"in" => 1,
75+
"the cat" => 1
76+
)
77+
bag = Dict(Tuple(String.(split(k))) => v for (k, v) in bag_of_words)
78+
79+
# train tfidf transformer
80+
tfidf_transformer = MLJText.TfidfTransformer()
81+
test_tfidf_machine2 = @test_logs machine(tfidf_transformer, [bag])
82+
MLJBase.fit!(test_tfidf_machine2)
83+
84+
# train bag_of_words transformer
85+
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
86+
test_bow_machine2 = @test_logs machine(bagofwords_vectorizer, [bag])
87+
MLJBase.fit!(test_bow_machine2)
88+
89+
# train bm25 transformer
90+
bm25_transformer = MLJText.BM25Transformer()
91+
test_bm25_machine2 = @test_logs machine(bm25_transformer, [bag])
92+
MLJBase.fit!(test_bm25_machine2)
93+
94+
test_doc5 = ["How about a cat in a hat"]
95+
for mach = [test_tfidf_machine2, test_bow_machine2, test_bm25_machine2]
96+
test_doc_transform = transform(mach, test_doc5)
97+
@test sum(test_doc_transform, dims=2)[1] > 0.0
98+
@test size(test_doc_transform) == (1, 8)
99+
end
100+
end
101+
102+
@testset "min max features use" begin
103+
# test min/max features
104+
docs = [
105+
"the BIL opens the door to new possibilities and should raise our collective expectations",
106+
"about what we can achieve in the near term.",
107+
"the following projects are not yet at a stage where they could be competitive",
108+
"for construction dollars over the next five years,",
109+
"but with some attention and preliminary work, our transportation leaders could turn",
110+
"these stretch projects into shovel-worthy ones."
111+
]
112+
ngram_vec = ngrams.(documents(Corpus(NGramDocument.(docs))))
113+
114+
# train tfidf transformer
115+
tfidf_transformer = MLJText.TfidfTransformer(max_doc_freq=0.8, min_doc_freq=0.2)
116+
test_tfidf_machine3 = @test_logs machine(tfidf_transformer, ngram_vec)
117+
MLJBase.fit!(test_tfidf_machine3)
118+
119+
# train bag_of_words transformer
120+
bagofwords_vectorizer = MLJText.BagOfWordsTransformer(max_doc_freq=0.8)
121+
test_bow_machine3 = @test_logs machine(bagofwords_vectorizer, ngram_vec)
122+
MLJBase.fit!(test_bow_machine3)
123+
124+
# train bm25 transformer
125+
bm25_transformer = MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2)
126+
test_bm25_machine3 = @test_logs machine(bm25_transformer, ngram_vec)
127+
MLJBase.fit!(test_bm25_machine3)
128+
129+
# test all three machines
130+
test_doc_transform = transform(test_tfidf_machine3, ngram_vec)
131+
@test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.2) == Bool[1, 1, 1, 1, 1, 1]
132+
133+
test_doc_transform = transform(test_bow_machine3, ngram_vec)
134+
@test Vector(vec(sum(test_doc_transform, dims=2))) == [14, 10, 14, 9, 13, 7]
135+
136+
test_doc_transform = transform(test_bm25_machine3, ngram_vec)
137+
@test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.8) == Bool[1, 1, 1, 1, 1, 1]
138+
end

test/bagofwords_transformer.jl

Lines changed: 0 additions & 78 deletions
This file was deleted.

test/bm25_transformer.jl

Lines changed: 0 additions & 76 deletions
This file was deleted.

test/runtests.jl

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,8 @@
11
using Test
22
using MLJText
33

4-
@testset "tfidf_transformer" begin
5-
include("tfidf_transformer.jl")
6-
end
7-
8-
@testset "bm25_transformer" begin
9-
include("bm25_transformer.jl")
10-
end
11-
12-
@testset "bagofwords_transformer" begin
13-
include("bagofwords_transformer.jl")
4+
@testset "abstract text transformer" begin
5+
include("abstract_text_transformer.jl")
146
end
157

168
@testset "scitypes" begin

test/tfidf_transformer.jl

Lines changed: 0 additions & 78 deletions
This file was deleted.

0 commit comments

Comments
 (0)