Skip to content

Commit 43d9d6d

Browse files
authored
Merge pull request #20 from JuliaAI/update_classifier_name
change BagOfWordsTransformer to CountTransformer
2 parents 2b6cb31 + 62e04d3 commit 43d9d6d

File tree

4 files changed

+42
-42
lines changed

4 files changed

+42
-42
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,19 +89,19 @@ BM25Transformer(
8989
```
9090
Please see [http://ethen8181.github.io/machine-learning/search/bm25_intro.html](http://ethen8181.github.io/machine-learning/search/bm25_intro.html) for more details about how these parameters affect the matrix that is generated.
9191

92-
## Bag-of-Words Transformer
92+
## Count Transformer
9393
The `MLJText` package also offers a way to represent documents using the simpler bag-of-words representation. This returns a document-term matrix (as you would get in `TextAnalysis`) that consists of the count for every word in the corpus for each document in the corpus.
9494

9595
### Usage
9696
```julia
9797
using MLJ, MLJText, TextAnalysis
9898

9999
docs = ["Hi my name is Sam.", "How are you today?"]
100-
bagofwords_transformer = BagOfWordsTransformer()
101-
mach = machine(bagofwords_transformer, tokenize.(docs))
100+
count_transformer = CountTransformer()
101+
mach = machine(count_transformer, tokenize.(docs))
102102
MLJ.fit!(mach)
103103

104-
bagofwords_mat = transform(mach, tokenize.(docs))
104+
count_mat = transform(mach, tokenize.(docs))
105105
```
106106

107107
The resulting matrix looks like:

src/MLJText.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ include("scitypes.jl")
2121
include("utils.jl")
2222
include("abstract_text_transformer.jl")
2323
include("tfidf_transformer.jl")
24-
include("bagofwords_transformer.jl")
24+
include("count_transformer.jl")
2525
include("bm25_transformer.jl")
2626

27-
export TfidfTransformer, BM25Transformer, BagOfWordsTransformer
27+
export TfidfTransformer, BM25Transformer, CountTransformer
2828

2929
end # module
Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""
2-
BagOfWordsTransformer()
2+
CountTransformer()
33
4-
Convert a collection of raw documents to matrix representing a bag-of-words structure.
5-
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
6-
a count of every word in the document corpus/collection for every document. This is a simple
7-
but often quite powerful way of representing documents as vectors. The resulting representation is
8-
a matrix with rows representing every document in the corpus and columns representing every word
9-
in the corpus. The value for each cell is the raw count of a particular word in a particular
10-
document.
4+
Convert a collection of raw documents to a matrix representing a bag-of-words structure from
5+
word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
6+
comprised of a count of every word in the document corpus/collection for every document.
7+
This is a simple but often quite powerful way of representing documents as vectors. The
8+
resulting representation is a matrix with rows representing every document in the corpus
9+
and columns representing every word in the corpus. The value for each cell is the raw count
10+
of a particular word in a particular document.
1111
1212
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
1313
to words occurring in a maximum or minimum portion of documents.
@@ -19,23 +19,23 @@ will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
1919
other direction. A value of 0.01 means that only terms that are at least in 1% of
2020
documents will be included.
2121
"""
22-
mutable struct BagOfWordsTransformer <: AbstractTextTransformer
22+
mutable struct CountTransformer <: AbstractTextTransformer
2323
max_doc_freq::Float64
2424
min_doc_freq::Float64
2525
end
2626

27-
function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
28-
transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
27+
function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
28+
transformer = CountTransformer(max_doc_freq, min_doc_freq)
2929
message = MMI.clean!(transformer)
3030
isempty(message) || @warn message
3131
return transformer
3232
end
3333

34-
struct BagOfWordsTransformerResult
34+
struct CountTransformerResult
3535
vocab::Vector{String}
3636
end
3737

38-
function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
38+
function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
3939
# process corpus vocab
4040
update_lexicon!(X)
4141

@@ -52,14 +52,14 @@ function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
5252
end
5353

5454
# prepare result
55-
fitresult = BagOfWordsTransformerResult(vocab)
55+
fitresult = CountTransformerResult(vocab)
5656
cache = nothing
5757

5858
return fitresult, cache, NamedTuple()
5959
end
6060

61-
function _transform(::BagOfWordsTransformer,
62-
result::BagOfWordsTransformerResult,
61+
function _transform(::CountTransformer,
62+
result::CountTransformerResult,
6363
v::Corpus)
6464
dtm_matrix = build_dtm(v, result.vocab)
6565

@@ -69,14 +69,14 @@ function _transform(::BagOfWordsTransformer,
6969
end
7070

7171
# for returning user-friendly form of the learned parameters:
72-
function MMI.fitted_params(::BagOfWordsTransformer, fitresult::BagOfWordsTransformerResult)
72+
function MMI.fitted_params(::CountTransformer, fitresult::CountTransformerResult)
7373
vocab = fitresult.vocab
7474
return (vocab = vocab,)
7575
end
7676

7777
## META DATA
7878

79-
MMI.metadata_pkg(BagOfWordsTransformer,
79+
MMI.metadata_pkg(CountTransformer,
8080
name="$PKG",
8181
uuid="7876af07-990d-54b4-ab0e-23690620f79a",
8282
url="https://github.com/JuliaAI/MLJText.jl",
@@ -85,13 +85,13 @@ MMI.metadata_pkg(BagOfWordsTransformer,
8585
is_wrapper=false
8686
)
8787

88-
MMI.metadata_model(BagOfWordsTransformer,
88+
MMI.metadata_model(CountTransformer,
8989
input_scitype = Union{
9090
AbstractVector{<:AbstractVector{STB.Textual}},
9191
AbstractVector{<:STB.Multiset{<:ScientificNGram}},
9292
AbstractVector{<:STB.Multiset{STB.Textual}}
9393
},
9494
output_scitype = AbstractMatrix{STB.Continuous},
95-
docstring = "Build Bag-of-Words matrix for corpus of documents",
96-
path = "MLJText.BagOfWordsTransformer"
95+
docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
96+
path = "MLJText.CountTransformer"
9797
)

test/abstract_text_transformer.jl

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ using TextAnalysis
1313
test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec)
1414
MLJBase.fit!(test_tfidf_machine)
1515

16-
# train bag_of_words transformer
17-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
18-
test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec)
19-
MLJBase.fit!(test_bow_machine)
16+
# train count transformer
17+
count_transformer = MLJText.CountTransformer()
18+
test_count_machine = @test_logs machine(count_transformer, ngram_vec)
19+
MLJBase.fit!(test_count_machine)
2020

2121
# train bm25 transformer
2222
bm25_transformer = MLJText.BM25Transformer()
2323
test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec)
2424
MLJBase.fit!(test_bm25_machine)
2525

26-
test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine]
26+
test_machines = [test_tfidf_machine, test_count_machine, test_bm25_machine]
2727

2828
# test single doc
2929
test_doc1 = ngrams(NGramDocument("Another sentence ok"))
@@ -91,18 +91,18 @@ end
9191
test_tfidf_machine2 = @test_logs machine(tfidf_transformer, [bag])
9292
MLJBase.fit!(test_tfidf_machine2)
9393

94-
# train bag_of_words transformer
95-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
96-
test_bow_machine2 = @test_logs machine(bagofwords_vectorizer, [bag])
97-
MLJBase.fit!(test_bow_machine2)
94+
# train count transformer
95+
count_transformer = MLJText.CountTransformer()
96+
test_count_machine2 = @test_logs machine(count_transformer, [bag])
97+
MLJBase.fit!(test_count_machine2)
9898

9999
# train bm25 transformer
100100
bm25_transformer = MLJText.BM25Transformer()
101101
test_bm25_machine2 = @test_logs machine(bm25_transformer, [bag])
102102
MLJBase.fit!(test_bm25_machine2)
103103

104104
test_doc5 = ["How about a cat in a hat"]
105-
for mach = [test_tfidf_machine2, test_bow_machine2, test_bm25_machine2]
105+
for mach = [test_tfidf_machine2, test_count_machine2, test_bm25_machine2]
106106
test_doc_transform = transform(mach, test_doc5)
107107
@test sum(test_doc_transform, dims=2)[1] > 0.0
108108
@test size(test_doc_transform) == (1, 8)
@@ -126,10 +126,10 @@ end
126126
test_tfidf_machine3 = @test_logs machine(tfidf_transformer, ngram_vec)
127127
MLJBase.fit!(test_tfidf_machine3)
128128

129-
# train bag_of_words transformer
130-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer(max_doc_freq=0.8)
131-
test_bow_machine3 = @test_logs machine(bagofwords_vectorizer, ngram_vec)
132-
MLJBase.fit!(test_bow_machine3)
129+
# train count transformer
130+
count_transformer = MLJText.CountTransformer(max_doc_freq=0.8)
131+
test_count_machine3 = @test_logs machine(count_transformer, ngram_vec)
132+
MLJBase.fit!(test_count_machine3)
133133

134134
# train bm25 transformer
135135
bm25_transformer = MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2)
@@ -140,7 +140,7 @@ end
140140
test_doc_transform = transform(test_tfidf_machine3, ngram_vec)
141141
@test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.2) == Bool[1, 1, 1, 1, 1, 1]
142142

143-
test_doc_transform = transform(test_bow_machine3, ngram_vec)
143+
test_doc_transform = transform(test_count_machine3, ngram_vec)
144144
@test Vector(vec(sum(test_doc_transform, dims=2))) == [14, 10, 14, 9, 13, 7]
145145

146146
test_doc_transform = transform(test_bm25_machine3, ngram_vec)

0 commit comments

Comments
 (0)