Skip to content

Commit 43d9d6d

Browse files
authored
Merge pull request #20 from JuliaAI/update_classifier_name
change BagOfWordsTransformer to CountTransformer
2 parents 2b6cb31 + 62e04d3 commit 43d9d6d

File tree

4 files changed

+42
-42
lines changed

4 files changed

+42
-42
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,19 +89,19 @@ BM25Transformer(
8989
```
9090
Please see [http://ethen8181.github.io/machine-learning/search/bm25_intro.html](http://ethen8181.github.io/machine-learning/search/bm25_intro.html) for more details about how these parameters affect the matrix that is generated.
9191

92-
## Bag-of-Words Transformer
92+
## Count Transformer
9393
The `MLJText` package also offers a way to represent documents using the simpler bag-of-words representation. This returns a document-term matrix (as you would get in `TextAnalysis`) that consists of the count for every word in the corpus for each document in the corpus.
9494

9595
### Usage
9696
```julia
9797
using MLJ, MLJText, TextAnalysis
9898

9999
docs = ["Hi my name is Sam.", "How are you today?"]
100-
bagofwords_transformer = BagOfWordsTransformer()
101-
mach = machine(bagofwords_transformer, tokenize.(docs))
100+
count_transformer = CountTransformer()
101+
mach = machine(count_transformer, tokenize.(docs))
102102
MLJ.fit!(mach)
103103

104-
bagofwords_mat = transform(mach, tokenize.(docs))
104+
count_mat = transform(mach, tokenize.(docs))
105105
```
106106

107107
The resulting matrix looks like:

src/MLJText.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ include("scitypes.jl")
2121
include("utils.jl")
2222
include("abstract_text_transformer.jl")
2323
include("tfidf_transformer.jl")
24-
include("bagofwords_transformer.jl")
24+
include("count_transformer.jl")
2525
include("bm25_transformer.jl")
2626

27-
export TfidfTransformer, BM25Transformer, BagOfWordsTransformer
27+
export TfidfTransformer, BM25Transformer, CountTransformer
2828

2929
end # module
Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""
2-
BagOfWordsTransformer()
2+
CountTransformer()
33
4-
Convert a collection of raw documents to matrix representing a bag-of-words structure.
5-
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
6-
a count of every word in the document corpus/collection for every document. This is a simple
7-
but often quite powerful way of representing documents as vectors. The resulting representation is
8-
a matrix with rows representing every document in the corpus and columns representing every word
9-
in the corpus. The value for each cell is the raw count of a particular word in a particular
10-
document.
4+
Convert a collection of raw documents to a matrix representing a bag-of-words structure from
5+
word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
6+
comprised of a count of every word in the document corpus/collection for every document.
7+
This is a simple but often quite powerful way of representing documents as vectors. The
8+
resulting representation is a matrix with rows representing every document in the corpus
9+
and columns representing every word in the corpus. The value for each cell is the raw count
10+
of a particular word in a particular document.
1111
1212
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
1313
to words occurring in a maximum or minimum portion of documents.
@@ -19,23 +19,23 @@ will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
1919
other direction. A value of 0.01 means that only terms that are at least in 1% of
2020
documents will be included.
2121
"""
22-
mutable struct BagOfWordsTransformer <: AbstractTextTransformer
22+
mutable struct CountTransformer <: AbstractTextTransformer
2323
max_doc_freq::Float64
2424
min_doc_freq::Float64
2525
end
2626

27-
function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
28-
transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
27+
function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
28+
transformer = CountTransformer(max_doc_freq, min_doc_freq)
2929
message = MMI.clean!(transformer)
3030
isempty(message) || @warn message
3131
return transformer
3232
end
3333

34-
struct BagOfWordsTransformerResult
34+
struct CountTransformerResult
3535
vocab::Vector{String}
3636
end
3737

38-
function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
38+
function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
3939
# process corpus vocab
4040
update_lexicon!(X)
4141

@@ -52,14 +52,14 @@ function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
5252
end
5353

5454
# prepare result
55-
fitresult = BagOfWordsTransformerResult(vocab)
55+
fitresult = CountTransformerResult(vocab)
5656
cache = nothing
5757

5858
return fitresult, cache, NamedTuple()
5959
end
6060

61-
function _transform(::BagOfWordsTransformer,
62-
result::BagOfWordsTransformerResult,
61+
function _transform(::CountTransformer,
62+
result::CountTransformerResult,
6363
v::Corpus)
6464
dtm_matrix = build_dtm(v, result.vocab)
6565

@@ -69,14 +69,14 @@ function _transform(::BagOfWordsTransformer,
6969
end
7070

7171
# for returning user-friendly form of the learned parameters:
72-
function MMI.fitted_params(::BagOfWordsTransformer, fitresult::BagOfWordsTransformerResult)
72+
function MMI.fitted_params(::CountTransformer, fitresult::CountTransformerResult)
7373
vocab = fitresult.vocab
7474
return (vocab = vocab,)
7575
end
7676

7777
## META DATA
7878

79-
MMI.metadata_pkg(BagOfWordsTransformer,
79+
MMI.metadata_pkg(CountTransformer,
8080
name="$PKG",
8181
uuid="7876af07-990d-54b4-ab0e-23690620f79a",
8282
url="https://github.com/JuliaAI/MLJText.jl",
@@ -85,13 +85,13 @@ MMI.metadata_pkg(BagOfWordsTransformer,
8585
is_wrapper=false
8686
)
8787

88-
MMI.metadata_model(BagOfWordsTransformer,
88+
MMI.metadata_model(CountTransformer,
8989
input_scitype = Union{
9090
AbstractVector{<:AbstractVector{STB.Textual}},
9191
AbstractVector{<:STB.Multiset{<:ScientificNGram}},
9292
AbstractVector{<:STB.Multiset{STB.Textual}}
9393
},
9494
output_scitype = AbstractMatrix{STB.Continuous},
95-
docstring = "Build Bag-of-Words matrix for corpus of documents",
96-
path = "MLJText.BagOfWordsTransformer"
95+
docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
96+
path = "MLJText.CountTransformer"
9797
)

test/abstract_text_transformer.jl

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ using TextAnalysis
1313
test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec)
1414
MLJBase.fit!(test_tfidf_machine)
1515

16-
# train bag_of_words transformer
17-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
18-
test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec)
19-
MLJBase.fit!(test_bow_machine)
16+
# train count transformer
17+
count_transformer = MLJText.CountTransformer()
18+
test_count_machine = @test_logs machine(count_transformer, ngram_vec)
19+
MLJBase.fit!(test_count_machine)
2020

2121
# train bm25 transformer
2222
bm25_transformer = MLJText.BM25Transformer()
2323
test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec)
2424
MLJBase.fit!(test_bm25_machine)
2525

26-
test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine]
26+
test_machines = [test_tfidf_machine, test_count_machine, test_bm25_machine]
2727

2828
# test single doc
2929
test_doc1 = ngrams(NGramDocument("Another sentence ok"))
@@ -91,18 +91,18 @@ end
9191
test_tfidf_machine2 = @test_logs machine(tfidf_transformer, [bag])
9292
MLJBase.fit!(test_tfidf_machine2)
9393

94-
# train bag_of_words transformer
95-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
96-
test_bow_machine2 = @test_logs machine(bagofwords_vectorizer, [bag])
97-
MLJBase.fit!(test_bow_machine2)
94+
# train count transformer
95+
count_transformer = MLJText.CountTransformer()
96+
test_count_machine2 = @test_logs machine(count_transformer, [bag])
97+
MLJBase.fit!(test_count_machine2)
9898

9999
# train bm25 transformer
100100
bm25_transformer = MLJText.BM25Transformer()
101101
test_bm25_machine2 = @test_logs machine(bm25_transformer, [bag])
102102
MLJBase.fit!(test_bm25_machine2)
103103

104104
test_doc5 = ["How about a cat in a hat"]
105-
for mach = [test_tfidf_machine2, test_bow_machine2, test_bm25_machine2]
105+
for mach = [test_tfidf_machine2, test_count_machine2, test_bm25_machine2]
106106
test_doc_transform = transform(mach, test_doc5)
107107
@test sum(test_doc_transform, dims=2)[1] > 0.0
108108
@test size(test_doc_transform) == (1, 8)
@@ -126,10 +126,10 @@ end
126126
test_tfidf_machine3 = @test_logs machine(tfidf_transformer, ngram_vec)
127127
MLJBase.fit!(test_tfidf_machine3)
128128

129-
# train bag_of_words transformer
130-
bagofwords_vectorizer = MLJText.BagOfWordsTransformer(max_doc_freq=0.8)
131-
test_bow_machine3 = @test_logs machine(bagofwords_vectorizer, ngram_vec)
132-
MLJBase.fit!(test_bow_machine3)
129+
# train count transformer
130+
count_transformer = MLJText.CountTransformer(max_doc_freq=0.8)
131+
test_count_machine3 = @test_logs machine(count_transformer, ngram_vec)
132+
MLJBase.fit!(test_count_machine3)
133133

134134
# train bm25 transformer
135135
bm25_transformer = MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2)
@@ -140,7 +140,7 @@ end
140140
test_doc_transform = transform(test_tfidf_machine3, ngram_vec)
141141
@test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.2) == Bool[1, 1, 1, 1, 1, 1]
142142

143-
test_doc_transform = transform(test_bow_machine3, ngram_vec)
143+
test_doc_transform = transform(test_count_machine3, ngram_vec)
144144
@test Vector(vec(sum(test_doc_transform, dims=2))) == [14, 10, 14, 9, 13, 7]
145145

146146
test_doc_transform = transform(test_bm25_machine3, ngram_vec)

0 commit comments

Comments
 (0)