1
+ using MLJBase
2
+ using TextAnalysis
3
+
4
# Fit the TF-IDF, bag-of-words and BM25 transformers on a two-document n-gram
# corpus, then check the shape and row sums of their output for four kinds of
# input: one n-gram document, another n-gram document, a two-document corpus,
# and pre-tokenized documents.
@testset " basic use" begin
    # add some test docs
    docs = [" Hi my name is Sam.", " How are you today?"]

    # convert to ngrams
    ngram_vec = ngrams.(documents(Corpus(NGramDocument.(docs))))

    # train tfidf transformer
    # (`@test_logs` without patterns also asserts that constructing the machine
    # emits no log messages; it returns the machine.)
    tfidf_transformer = MLJText.TfidfTransformer()
    test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec)
    MLJBase.fit!(test_tfidf_machine)

    # train bag_of_words transformer
    bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
    test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec)
    MLJBase.fit!(test_bow_machine)

    # train bm25 transformer
    bm25_transformer = MLJText.BM25Transformer()
    test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec)
    MLJBase.fit!(test_bm25_machine)

    test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine]

    # test single doc: its row is expected to sum to exactly zero
    test_doc1 = ngrams(NGramDocument(" Another sentence ok"))
    for mach in test_machines
        # Must be `test_doc1`: the name `test_doc` is never defined, so using
        # it here raised UndefVarError before any assertion could run.
        test_doc_transform = transform(mach, [test_doc1])
        @test sum(test_doc_transform, dims=2)[1] == 0.0
        @test size(test_doc_transform) == (1, 11)
    end

    # test another single doc: its row is expected to have a positive sum
    test_doc2 = ngrams(NGramDocument(" Listen Sam, today is not the day."))
    for mach in test_machines
        test_doc_transform = transform(mach, [test_doc2])
        @test sum(test_doc_transform, dims=2)[1] > 0.0
        @test size(test_doc_transform) == (1, 11)
    end

    # test two docs
    test_doc3 = ngrams.(
        Corpus([
            NGramDocument(" Another sentence ok"),
            NGramDocument(" Listen Sam, today is not the day."),
        ])
    )
    for mach in test_machines
        test_doc_transform = transform(mach, test_doc3)
        @test sum(test_doc_transform, dims=2)[1] == 0.0
        @test sum(test_doc_transform, dims=2)[2] > 0.0
        @test size(test_doc_transform) == (2, 11)
    end

    # test tokenized docs
    # Tokens are written without padding whitespace: they are supplied already
    # split, so a leading space would be part of the token itself and would
    # presumably keep it from matching the fitted vocabulary (NOTE(review):
    # confirm against MLJText's handling of tokenized input).
    test_doc4 = [
        ["Another", "sentence", "ok"],
        ["Listen", "Sam", ",", "today", "is", "not", "the", "day", "."],
    ]
    for mach in test_machines
        test_doc_transform = transform(mach, test_doc4)
        @test sum(test_doc_transform, dims=2)[1] == 0.0
        @test sum(test_doc_transform, dims=2)[2] > 0.0
        @test size(test_doc_transform) == (2, 11)
    end
end
64
+
65
# Fit the three transformers on a single pre-counted bag of n-grams, then
# transform one raw sentence and check the output's row sum and shape.
@testset " bag of words use" begin
    # n-gram text => count for one document
    counts_by_text = Dict(
        " cat in" => 1,
        " the hat" => 1,
        " the" => 2,
        " cat" => 1,
        " hat" => 1,
        " in the" => 1,
        " in" => 1,
        " the cat" => 1,
    )
    # Re-key every entry by the tuple of its whitespace-separated tokens.
    bag = Dict(Tuple(String.(split(text))) => n for (text, n) in counts_by_text)

    # Bind `model` to the one-bag training set (asserting that doing so emits
    # no log messages), fit the resulting machine, and hand it back.
    function fitted_machine(model)
        mach = @test_logs machine(model, [bag])
        MLJBase.fit!(mach)
        return mach
    end

    # Built in the order: tfidf, bag of words, bm25.
    fitted = [
        fitted_machine(MLJText.TfidfTransformer()),
        fitted_machine(MLJText.BagOfWordsTransformer()),
        fitted_machine(MLJText.BM25Transformer()),
    ]

    test_doc5 = [" How about a cat in a hat"]
    for mach in fitted
        out = transform(mach, test_doc5)
        @test sum(out, dims=2)[1] > 0.0
        @test size(out) == (1, 8)
    end
end
101
+
102
# Fit the three transformers with document-frequency limits on a six-document
# corpus, transform that same corpus, and check the per-document row sums.
@testset " min max features use" begin
    docs = [
        " the BIL opens the door to new possibilities and should raise our collective expectations",
        " about what we can achieve in the near term.",
        " the following projects are not yet at a stage where they could be competitive",
        " for construction dollars over the next five years,",
        " but with some attention and preliminary work, our transportation leaders could turn",
        " these stretch projects into shovel-worthy ones.",
    ]
    ngram_vec = ngrams.(documents(Corpus(NGramDocument.(docs))))

    # Bind `model` to the corpus (asserting that doing so emits no log
    # messages), fit the resulting machine, and hand it back.
    function fitted_machine(model)
        mach = @test_logs machine(model, ngram_vec)
        MLJBase.fit!(mach)
        return mach
    end

    # Sum of each row of the machine's transform of the training corpus,
    # as a plain Vector.
    row_sums(mach) = Vector(vec(sum(transform(mach, ngram_vec), dims=2)))

    # All three machines are fitted before any of them is used to transform.
    tfidf_mach = fitted_machine(
        MLJText.TfidfTransformer(max_doc_freq=0.8, min_doc_freq=0.2)
    )
    bow_mach = fitted_machine(MLJText.BagOfWordsTransformer(max_doc_freq=0.8))
    bm25_mach = fitted_machine(
        MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2)
    )

    # tfidf: each of the six row sums exceeds 0.2
    @test (row_sums(tfidf_mach) .> 0.2) == fill(true, 6)

    # bag of words: exact row sums
    @test row_sums(bow_mach) == [14, 10, 14, 9, 13, 7]

    # bm25: each of the six row sums exceeds 0.8
    @test (row_sums(bm25_mach) .> 0.8) == fill(true, 6)
end
0 commit comments