@@ -14,26 +14,33 @@ const STB = ScientificTypesBase
Convert a collection of raw documents to a matrix of TF-IDF features.
- Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
- This is a common term weighting scheme in information retrieval, that has also found good use
- in document classification.
-
- The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given
- document is to scale down the impact of tokens that occur very frequently in a given corpus
- and that are hence empirically less informative than features that occur in a small fraction of
- the training corpus.
-
- The formula that is used to compute the tf-idf for a term t of a document d in a document set is
- tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if `smooth_idf=false`),
- where n is the total number of documents in the document set and df(t) is the document frequency of t; the
- document frequency is the number of documents in the document set that contain the term t. The effect of adding “1”
- to the idf in the equation above is that terms with zero idf, i.e., terms that occur in all documents in a training
- set, will not be entirely ignored. (Note that the idf formula above differs from the standard textbook notation
- that defines the idf as idf(t) = log [ n / (df(t) + 1) ]).
-
- If `smooth_idf=true` (the default), the constant “1” is added to the numerator and denominator of the idf as if an extra
- document was seen containing every term in the collection exactly once, which prevents zero divisions:
- idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.
+ "Tf" means term-frequency while "tf-idf" means term-frequency times
18
+ inverse document-frequency. This is a common term weighting scheme in
19
+ information retrieval, that has also found good use in document
20
+ classification.
21
+
22
+ The goal of using tf-idf instead of the raw frequencies of occurrence
23
+ of a token in a given document is to scale down the impact of tokens
24
+ that occur very frequently in a given corpus and that are hence
25
+ empirically less informative than features that occur in a small
26
+ fraction of the training corpus.
27
+
28
+ The formula that is used to compute the tf-idf for a term `t` of a
29
+ document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
30
+ idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
31
+ where `n` is the total number of documents in the document set and
32
+ `df(t)` is the document frequency of `t`. The document frequency is
33
+ the number of documents in the document set that contain the term
34
+ `t`. The effect of adding “1” to the idf in the equation above is that
35
+ terms with zero idf, i.e., terms that occur in all documents in a
36
+ training set, will not be entirely ignored. (Note that the idf formula
37
+ above differs from that appearing in standard texts, `idf(t) = log [ n
38
+ / (df(t) + 1) ])`.
39
+
40
+ If `smooth_idf=true` (the default), the constant “1” is added to the
41
+ numerator and denominator of the idf as if an extra document was seen
42
+ containing every term in the collection exactly once, which prevents
43
+ zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
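+
+ As a standalone sketch (the helper name `smooth_idf` below is
+ hypothetical, not part of this package), the smoothed idf for a single
+ term can be computed from the corpus size `n` and the term's document
+ frequency `df`:
+
+     smooth_idf(n, df) = log((1 + n) / (1 + df)) + 1  # hypothetical helper
+     smooth_idf(4, 4)  # == 1.0: a term in every document keeps nonzero weight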
"""
MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
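
# Hypothetical construction sketch (the hyperparameters `max_doc_freq`,
# `min_doc_freq`, and `smooth_idf` appear elsewhere in this file; the
# values shown are illustrative, not the package defaults):
#
#     transformer = TfidfTransformer(max_doc_freq=0.9, min_doc_freq=0.01, smooth_idf=true)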
@@ -49,7 +56,9 @@ struct TfidfTransformerResult
    idf_vector::Vector{Float64}
end
- function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int)
+ function limit_features(doc_term_matrix::DocumentTermMatrix,
+                         high::Int,
+                         low::Int)
    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=1))

    # build mask to restrict terms
@@ -66,16 +75,21 @@ function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int
    return (doc_term_matrix.dtm[:, mask], new_terms)
end
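
# Illustrative only (not the verbatim elided body): `limit_features` keeps
# terms whose document frequency lies between `low` and `high`, roughly:
#
#     mask = (doc_freqs .>= low) .& (doc_freqs .<= high)
#     new_terms = doc_term_matrix.terms[mask]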
- _convert_bag_of_words(X::Dict{NGram, Int}) = Dict(join(k, " ") => v for (k, v) in X)
+ _convert_bag_of_words(X::Dict{NGram, Int}) =
+     Dict(join(k, " ") => v for (k, v) in X)
- build_corpus(X::Vector{Dict{NGram, Int}}) = build_corpus(_convert_bag_of_words.(X))
- build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} = Corpus(NGramDocument.(X))
+ build_corpus(X::Vector{Dict{NGram, Int}}) =
+     build_corpus(_convert_bag_of_words.(X))
+ build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} =
+     Corpus(NGramDocument.(X))
build_corpus(X) = Corpus(TokenDocument.(X))
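
# For illustration (assumed inputs, not from the diff): `build_corpus`
# accepts tokenized documents or bag-of-words dictionaries, e.g.
#
#     build_corpus([["the", "cat", "sat"], ["a", "dog", "ran"]])   # token vectors
#     build_corpus([Dict("the cat" => 2, "sat" => 1)])             # string => count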
- MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) = _fit(transformer, verbosity, build_corpus(X))
+ MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
+     _fit(transformer, verbosity, build_corpus(X))
function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
-     transformer.max_doc_freq < transformer.min_doc_freq && error("Max doc frequency cannot be less than Min doc frequency!")
+     transformer.max_doc_freq < transformer.min_doc_freq &&
+         error("Max doc frequency cannot be less than Min doc frequency!")

    # process corpus vocab
    update_lexicon!(X)
@@ -104,7 +118,10 @@ function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
    return fitresult, cache, NamedTuple()
end
- function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
+ function build_tfidf!(dtm::SparseMatrixCSC{T},
+                       tfidf::SparseMatrixCSC{F},
+                       idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
+
    rows = rowvals(dtm)
    dtmvals = nonzeros(dtm)
    tfidfvals = nonzeros(tfidf)
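
    # Sketch of the elided update (illustrative, not the verbatim loop):
    # each stored entry in column t (a term) is weighted by that term's
    # idf, roughly:
    #
    #     for t in 1:size(dtm, 2), k in nzrange(dtm, t)
    #         tfidfvals[k] = dtmvals[k] * idf_vector[t]
    #     end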
@@ -126,9 +143,13 @@ function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_ve
    return tfidf
end
- MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v) = _transform(transformer, result, build_corpus(v))
+ MMI.transform(transformer::TfidfTransformer,
+               result::TfidfTransformerResult, v) =
+     _transform(transformer, result, build_corpus(v))
- function _transform(::TfidfTransformer, result::TfidfTransformerResult, v::Corpus)
+ function _transform(::TfidfTransformer,
+                     result::TfidfTransformerResult,
+                     v::Corpus)
    m = DocumentTermMatrix(v, result.vocab)
    tfidf = similar(m.dtm, eltype(result.idf_vector))
    build_tfidf!(m.dtm, tfidf, result.idf_vector)
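
# End-to-end usage sketch (the document set is made up; the MLJ calls are
# the standard machine workflow):
#
#     docs = [["the", "cat", "sat"], ["the", "dog", "barked"]]
#     mach = MLJ.machine(TfidfTransformer(), docs)
#     MLJ.fit!(mach)
#     X = MLJ.transform(mach, docs)   # sparse matrix of tf-idf features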