
Commit 1057d6a

Merge pull request #1 from JuliaAI/tfidftransformer
initial commit of tfidf transformer
2 parents: 7349f5c + 108b732

File tree: 3 files changed, +256 -101 lines


Project.toml

Lines changed: 7 additions & 6 deletions
@@ -1,24 +1,25 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
-authors = ["Chris Alexander, Anthony D. Blaom <[email protected]>"]
+authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
 version = "0.1.0"
 
 [deps]
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
+ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 
 [compat]
-MLJModelInterface = "1.1.1"
-ScientificTypesBase = "1"
+MLJModelInterface = "1.3"
+ScientificTypesBase = "2.2.0"
+ScientificTypes = "2.2.2"
 TextAnalysis = "0.7.3"
 julia = "1.3"
 
 [extras]
-Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
-StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Distributions", "MLJBase", "StableRNGs", "Test"]
+test = ["MLJBase", "Test"]

src/MLJText.jl

Lines changed: 189 additions & 74 deletions
@@ -1,115 +1,230 @@
 module MLJText
 
-# The following is just boostrap code to get a working template. You
-# will remove this and replace "import .TextAnalysis" with "import
-# TextAnalysis" and any other deps you need.
-
-module TextAnalysis
+import TextAnalysis # substitute model-providing package name here (no dot)
+import MLJModelInterface
+import ScientificTypesBase
+using SparseArrays, TextAnalysis
 
-function fit(Xmatrix::Matrix, yint::AbstractVector{<:Integer})
-    classes = sort(unique(yint))
-    counts = [count(==(c), yint) for c in classes]
-    Θ = counts / sum(counts)
-end
+const PKG = "MLJText" # substitute model-providing package name
+const MMI = MLJModelInterface
+const STB = ScientificTypesBase
 
-predict(Xnew::Matrix, Θ) = vcat(fill(Θ', size(Xnew, 1))...)
+"""
+    TfidfTransformer()
+
+The following is taken largely from scikit-learn's documentation:
+https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/feature_extraction/text.py
+
+Convert a collection of raw documents to a matrix of TF-IDF features.
+
+"TF" means term-frequency while "TF-IDF" means term-frequency times
+inverse document-frequency. This is a common term weighting scheme in
+information retrieval, that has also found good use in document
+classification.
+
+The goal of using TF-IDF instead of the raw frequencies of occurrence
+of a token in a given document is to scale down the impact of tokens
+that occur very frequently in a given corpus and that are hence
+empirically less informative than features that occur in a small
+fraction of the training corpus.
+
+The formula that is used to compute the TF-IDF for a term `t` of a
+document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
+idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`,
+where `n` is the total number of documents in the document set and
+`df(t)` is the document frequency of `t`. The document frequency is
+the number of documents in the document set that contain the term
+`t`. The effect of adding "1" to the idf in the equation above is that
+terms with zero idf, i.e., terms that occur in all documents in a
+training set, will not be entirely ignored. (Note that the idf formula
+above differs from that appearing in standard texts, `idf(t) = log [ n
+/ (df(t) + 1) ]`.)
+
+If `smooth_idf=true` (the default), the constant "1" is added to the
+numerator and denominator of the idf as if an extra document were seen
+containing every term in the collection exactly once, which prevents
+zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
 
-# julia> yint = rand([1,3,4], 100);
+"""
+MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
+    max_doc_freq::Float64 = 1.0
+    min_doc_freq::Float64 = 0.0
+    smooth_idf::Bool = true
+end
 
-# julia> Θ = fit(rand(100, 3), yint)
-# 3-element Vector{Float64}:
-#  0.35
-#  0.23
-#  0.42
+const NGram{N} = NTuple{<:Any,<:AbstractString}
 
-# julia> predict(rand(5, 3), Θ)
-# 5×3 Matrix{Float64}:
-#  0.35  0.23  0.42
-#  0.35  0.23  0.42
-#  0.35  0.23  0.42
-#  0.35  0.23  0.42
-#  0.35  0.23  0.42
+struct TfidfTransformerResult
+    vocab::Vector{String}
+    idf_vector::Vector{Float64}
+end
 
-end # of module
+function limit_features(doc_term_matrix::DocumentTermMatrix,
+                        high::Int,
+                        low::Int)
+    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
 
+    # build mask to restrict terms
+    mask = trues(length(doc_freqs))
+    if high < 1
+        mask .&= (doc_freqs .<= high)
+    end
+    if low > 0
+        mask .&= (doc_freqs .>= low)
+    end
 
-### CONTINUATION OF TEMPLATE
+    new_terms = doc_term_matrix.terms[mask]
 
-import .TextAnalysis # substitute model-providing package name here (no dot)
-import MLJModelInterface
-import ScientificTypesBase
-
-const PKG = "TextAnalysis" # substitute model-providing package name
-const MMI = MLJModelInterface
-const STB = ScientificTypesBase
+    return (doc_term_matrix.dtm[mask, :], new_terms)
+end
 
-"""
-    CoolProbabilisticClassifier()
+_convert_bag_of_words(X::Dict{<:NGram, <:Integer}) =
+    Dict(join(k, " ") => v for (k, v) in X)
+
+build_corpus(X::Vector{<:Dict{<:NGram, <:Integer}}) =
+    build_corpus(_convert_bag_of_words.(X))
+build_corpus(X::Vector{<:Dict{S, <:Integer}}) where {S <: AbstractString} =
+    Corpus(NGramDocument.(X))
+build_corpus(X) = Corpus(TokenDocument.(X))
+
+# based on https://github.com/zgornel/StringAnalysis.jl/blob/master/src/dtm.jl
+# and https://github.com/JuliaText/TextAnalysis.jl/blob/master/src/dtm.jl
+build_dtm(docs::Corpus) = build_dtm(docs, sort(collect(keys(lexicon(docs)))))
+function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
+    # we are flipping the orientation of this matrix
+    # so we get the `columnindices` from the TextAnalysis API
+    row_indices = TextAnalysis.columnindices(terms)
+
+    m = length(terms) # terms are rows
+    n = length(docs)  # docs are columns
+
+    rows = Vector{Int}(undef, 0)    # terms
+    columns = Vector{Int}(undef, 0) # docs
+    values = Vector{Int}(undef, 0)
+    for i in eachindex(docs.documents)
+        doc = docs.documents[i]
+        ngs = ngrams(doc)
+        for ngram in keys(ngs)
+            j = get(row_indices, ngram, 0)
+            v = ngs[ngram]
+            if j != 0
+                push!(columns, i)
+                push!(rows, j)
+                push!(values, v)
+            end
+        end
+    end
+    if length(rows) > 0
+        dtm = sparse(rows, columns, values, m, n)
+    else
+        dtm = spzeros(Int, m, n)
+    end
+    DocumentTermMatrix(dtm, terms, row_indices)
+end
 
-A cool classifier that predicts `UnivariateFinite` probability
-distributions. These are distributions for a finite sample space whose
-elements are *labeled*.
+MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
+    _fit(transformer, verbosity, build_corpus(X))
+
+function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
+    transformer.max_doc_freq < transformer.min_doc_freq &&
+        error("Max doc frequency cannot be less than Min doc frequency!")
+
+    # process corpus vocab
+    update_lexicon!(X)
+    dtm_matrix = build_dtm(X)
+    n = size(dtm_matrix.dtm, 2) # docs are columns
+
+    # calculate min and max doc freq limits
+    if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
+        high = round(Int, transformer.max_doc_freq * n)
+        low = round(Int, transformer.min_doc_freq * n)
+        new_dtm, vocab = limit_features(dtm_matrix, high, low)
+    else
+        new_dtm = dtm_matrix.dtm
+        vocab = dtm_matrix.terms
+    end
+
+    # calculate IDF
+    smooth_idf = Int(transformer.smooth_idf)
+    documents_containing_term = vec(sum(new_dtm .> 0, dims=2)) .+ smooth_idf
+    idf = log.((n + smooth_idf) ./ documents_containing_term) .+ 1
+
+    # prepare result
+    fitresult = TfidfTransformerResult(vocab, idf)
+    cache = nothing
 
-"""
-MMI.@mlj_model mutable struct CoolProbabilisticClassifier <: MMI.Probabilistic
-    dummy_hyperparameter1::Float64 = 1.0::(_ ≥ 0)
-    dummy_hyperparameter2::Int = 1::(0 < _ ≤ 1)
-    dummy_hyperparameter3
+    return fitresult, cache, NamedTuple()
 end
 
-function MMI.fit(::CoolProbabilisticClassifier, verbosity, X, y)
+function build_tfidf!(dtm::SparseMatrixCSC{T},
+                      tfidf::SparseMatrixCSC{F},
+                      idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
+    rows = rowvals(dtm)
+    dtmvals = nonzeros(dtm)
+    tfidfvals = nonzeros(tfidf)
+    @assert size(dtmvals) == size(tfidfvals)
 
-    Xmatrix = MMI.matrix(X)
+    p, n = size(dtm)
 
-    yint = MMI.int(y)
-    decode = MMI.decoder(y[1]) # for decoding int repr.
-    classes_seen = decode(sort(unique(yint))) # ordered by int repr.
+    # TF tells us what proportion of a document is defined by a term
+    words_in_documents = F.(sum(dtm, dims=1))
+    oneval = one(F)
 
-    Θ = TextAnalysis.fit(Xmatrix, yint) # probability vector
-    fitresult = (Θ, classes_seen)
-    report = (n_classes_seen = length(classes_seen),)
-    cache = nothing
-
-    return fitresult, cache, report
+    for i = 1:n
+        for j in nzrange(dtm, i)
+            row = rows[j]
+            tfidfvals[j] = dtmvals[j] / max(words_in_documents[i], oneval) * idf_vector[row]
+        end
+    end
 
+    return tfidf
 end
 
-function MMI.predict(::CoolProbabilisticClassifier, fitresult, Xnew)
-    Xmatrix = MMI.matrix(Xnew)
-
-    Θ, classes_seen = fitresult
-    prob_matrix = TextAnalysis.predict(Xmatrix, Θ)
+MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v) =
+    _transform(transformer, result, build_corpus(v))
 
-    # `classes_seen` is a categorical vector whose pool actually
-    # includes *all* classes. The `UnivariateFinite` constructor
-    # automatically assigns zero probability to the unseen classes.
+function _transform(::TfidfTransformer,
+                    result::TfidfTransformerResult,
+                    v::Corpus)
+    dtm_matrix = build_dtm(v, result.vocab)
+    tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
+    build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
 
-    return MMI.UnivariateFinite(classes_seen, prob_matrix)
+    # here we return the `adjoint` of our sparse matrix to conform to
+    # the `n x p` dimensions throughout MLJ
+    return adjoint(tfidf)
 end
 
 # for returning user-friendly form of the learned parameters:
-function MMI.fitted_params(::CoolProbabilisticClassifier, fitresult)
-    Θ, classes_seen = fitresult
-    return (raw_probabilities = Θ, classes_seen_in_training = classes_seen)
+function MMI.fitted_params(::TfidfTransformer, fitresult)
+    vocab = fitresult.vocab
+    idf_vector = fitresult.idf_vector
+    return (vocab = vocab, idf_vector = idf_vector)
 end
 
 
 ## META DATA
 
-MMI.metadata_pkg(CoolProbabilisticClassifier,
+MMI.metadata_pkg(TfidfTransformer,
     name="$PKG",
     uuid="7876af07-990d-54b4-ab0e-23690620f79a",
-    url="https://github.com/JuliaLang/TextAnalysis.jl",
+    url="https://github.com/JuliaAI/MLJText.jl",
     is_pure_julia=true,
     license="MIT",
    is_wrapper=false
 )
 
-MMI.metadata_model(CoolProbabilisticClassifier,
-    input_scitype = MMI.Table(STB.Continuous),
-    target_scitype = AbstractVector{<:STB.Finite}, # ie, a classifier
-    docstring = "Really cool classifier", # brief description
-    path = "$PKG.CoolProbabilisiticClassifier"
+const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
+
+MMI.metadata_model(TfidfTransformer,
+    input_scitype = Union{
+        AbstractVector{<:AbstractVector{STB.Textual}},
+        AbstractVector{<:STB.Multiset{<:ScientificNGram}},
+        AbstractVector{<:STB.Multiset{STB.Textual}}
+    },
+    output_scitype = AbstractMatrix{STB.Continuous},
+    docstring = "Build TF-IDF matrix from raw documents",
+    path = "MLJText.TfidfTransformer"
 )
 
-end # module
+end # module
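To make the smoothed-IDF arithmetic in `_fit` and the weighting in `build_tfidf!` concrete, here is a small hand computation on a toy corpus (an illustrative sketch, not part of this commit), keeping the same terms-as-rows, docs-as-columns orientation as `build_dtm`:

using SparseArrays

# toy document-term matrix: 3 terms (rows) × 2 documents (columns)
dtm = sparse([1 2;    # term "a" appears in both documents
              1 0;    # term "b" appears only in document 1
              0 3])   # term "c" appears only in document 2

n = size(dtm, 2)                           # number of documents
smooth = 1                                 # smooth_idf = true
df = vec(sum(dtm .> 0, dims=2)) .+ smooth  # smoothed document frequencies
idf = log.((n + smooth) ./ df) .+ 1        # idf(t) = log((1 + n)/(1 + df(t))) + 1

# term "a" occurs in every document, so its idf is the floor value 1.0;
# terms "b" and "c" each get log(3/2) + 1 ≈ 1.41
tf_doc1 = dtm[:, 1] ./ max(sum(dtm[:, 1]), 1)  # counts normalised by document length
tfidf_doc1 = tf_doc1 .* idf                    # ≈ [0.5, 0.70, 0.0]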

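And a minimal end-to-end usage sketch of the merged transformer through MLJ's machine interface (the corpus is made up for illustration, and MLJ plus this package are assumed installed):

using MLJ, MLJText

docs = [
    ["the", "cat", "sat"],               # one document = one vector of tokens
    ["the", "dog", "sat"],
    ["the", "cat", "and", "the", "dog"],
]

tfidf = MLJText.TfidfTransformer()       # defaults: max_doc_freq=1.0, min_doc_freq=0.0, smooth_idf=true
mach = machine(tfidf, docs)
fit!(mach)

fitted_params(mach).vocab                # the learned vocabulary
X = transform(mach, docs)                # documents × terms sparse TF-IDF matrix

Per the `input_scitype` declared in the metadata, bag-of-words dictionaries (token => count) and ngram dictionaries (tuple-of-tokens => count) are also accepted in place of token vectors.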