
Commit e73694b

Merge pull request #24 from JuliaAI/dev
For a 0.2.1 release
2 parents 45a2eea + fa15988

File tree

6 files changed (+310, -120 lines)


Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
 CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 
 [compat]
 CorpusLoaders = "0.3"
-MLJModelInterface = "1.3"
+MLJModelInterface = "1.4"
 ScientificTypes = "2.2.2, 3"
 ScientificTypesBase = "2.2.0, 3"
 TextAnalysis = "0.7.3"
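
The only substantive changes here are the release version bump and the tightened lower bound on `MLJModelInterface` (in Julia's compat notation, `"1.4"` means `[1.4.0, 2.0.0)`). A quick way to verify what resolves, sketched in a throwaway environment (this snippet is illustrative and not part of the commit):

```julia
using Pkg

Pkg.activate(temp=true)                   # fresh, disposable environment
Pkg.add(name="MLJText", version="0.2.1")  # pulls in MLJModelInterface ≥ 1.4
Pkg.status("MLJModelInterface")           # confirm the resolved version
```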

src/MLJText.jl

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
 const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
 const NGram{N} = NTuple{<:Any,<:AbstractString}
 
+include("docstring_helpers.jl")
 include("scitypes.jl")
 include("utils.jl")
 include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")
 
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+
 end # module
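
The new `docstring_helpers.jl` is one of the six changed files, but its diff is not reproduced on this page. Since the docstrings below interpolate `$DOC_IDF`, `$DOC_TRANSFORMER_INPUTS`, and `doc_examples(...)`, here is a minimal sketch of what such a helper file could plausibly define. The names are taken from the interpolations below, but the bodies are assumptions, not the commit's actual contents:

```julia
# docstring_helpers.jl -- hypothetical sketch only; the real file is not shown here

const DOC_IDF =
    """
    With `smooth_idf=true`, a term's IDF is computed as log((1 + n)/(1 + df)) + 1,
    where `n` is the number of documents and `df` is the number of documents
    containing the term; the unsmoothed variant drops the two 1s inside the log.
    """

const DOC_TRANSFORMER_INPUTS =
    """
    Here `X` is any vector whose elements are either tokenized documents
    (vectors of strings) or bags of words/ngrams (dictionaries mapping each
    word or ngram tuple to its count).
    """

# Return a shared "# Examples" section for the named transformer's docstring.
doc_examples(model::Symbol) = """
    # Examples

    See the MLJText.jl documentation for a worked example of `$model`.
    """
```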

src/bm25_transformer.jl

Lines changed: 81 additions & 40 deletions
@@ -1,38 +1,3 @@
-"""
-    BM25Transformer()
-
-Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
-
-BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
-space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
-(IDF) so that, for each term in a document, its relative concentration in the document is
-scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
-the probability that a user will consider a search result relevant based on the terms in the search query
-and those in each document.
-
-The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
-`TfidfTransformer`. BM25 introduces two additional parameters:
-
-`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
-we mean by saturation is the degree to which a term occuring extra times adds to the overall score. This defaults
-to 2.
-
-`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
-The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
-
-For more explanations, please see:
-- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
-- https://en.wikipedia.org/wiki/Okapi_BM25
-- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are at least in 1% of
-documents will be included.
-"""
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
@@ -41,13 +6,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
     smooth_idf::Bool
 end
 
-function BM25Transformer(;
+function BM25Transformer(;
     max_doc_freq::Float64 = 1.0,
     min_doc_freq::Float64 = 0.0,
    κ::Int=2,
    β::Float64=0.75,
    smooth_idf::Bool = true
-)
+)
    transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
    message = MMI.clean!(transformer)
    isempty(message) || @warn message
@@ -103,14 +68,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
     return bm25
 end
 
-function _transform(transformer::BM25Transformer,
+function _transform(transformer::BM25Transformer,
                     result::BM25TransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
-    # here we return the `adjoint` of our sparse matrix to conform to
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(bm25)
 end
@@ -142,6 +107,82 @@ MMI.metadata_model(BM25Transformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build BM-25 matrix from raw documents",
     path = "MLJText.BM25Transformer"
 )
+
+# # DOC STRING
+
+"""
+$(MMI.doc_header(BM25Transformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [Okapi BM25 document-word
+statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
+term frequency (TF) and inverse document frequency (IDF, defined below), as in
+[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
+consider a search result relevant, based on the terms in the search query and those in
+each document.
+
+$DOC_IDF
+
+References:
+
+- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
+- https://en.wikipedia.org/wiki/Okapi_BM25
+- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. β is
+  restricted to values between 0 and 1.
+
+- `smooth_idf=true`: Controls which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
+  training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
+  above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` is the size of
+  the vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+- `mean_words_in_docs`: The mean number of words in each document.
+
+$(doc_examples(:BM25Transformer))
+
+See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
+
+"""
+BM25Transformer
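
Not part of the diff: the new docstring describes `κ` and `β` but never states the scoring function itself. For orientation, the textbook Okapi BM25 weight of a term `t` in a document `D` is as follows (the package's exact variant lives in `build_bm25!`, which this page shows only in part):

```latex
\mathrm{bm25}(t, D) = \mathrm{idf}(t) \cdot
    \frac{f(t, D)\,(\kappa + 1)}
         {f(t, D) + \kappa \left(1 - \beta + \beta \, \frac{|D|}{\mathrm{avgdl}}\right)}
```

where `f(t, D)` is the count of `t` in `D`, `|D|` is the document length, and `avgdl` is the average document length (compare the `mean_words_in_docs` fitted parameter). And a minimal end-to-end sketch of the workflow the docstring documents, using an invented toy corpus (everything below is illustrative, not taken from the package's tests):

```julia
using MLJ, MLJText

# toy corpus: documents pre-tokenized into vectors of tokens
docs = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "chased", "the", "cat"],
    ["dogs", "and", "cats", "make", "good", "pets"],
]

bm25 = BM25Transformer(max_doc_freq=0.9, κ=2, β=0.75)
mach = machine(bm25, docs)
fit!(mach)

X = transform(mach, docs)        # 3 × p matrix of BM25 scores
fitted_params(mach).vocab        # the learned vocabulary
fitted_params(mach).idf_vector   # one IDF value per vocabulary term
```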

src/count_transformer.jl

Lines changed: 55 additions & 27 deletions
@@ -1,30 +1,9 @@
-"""
-    CountTransformer()
-
-Convert a collection of raw documents to matrix representing a bag-of-words structure from
-word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
-comprised of a count of every word in the document corpus/collection for every document.
-This is a simple but often quite powerful way of representing documents as vectors. The
-resulting representation is a matrix with rows representing every document in the corpus
-and columns representing every word in the corpus. The value for each cell is the raw count
-of a particular word in a particular document.
-
-Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
-to words occuring in a maximum or minimum portion of documents.
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are at least in 1% of
-documents will be included.
-"""
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 end
 
-function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
+function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
     transformer = CountTransformer(max_doc_freq, min_doc_freq)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -37,7 +16,7 @@ end
 
 function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     # process corpus vocab
-    update_lexicon!(X)
+    update_lexicon!(X)
 
     # calculate min and max doc freq limits
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +37,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     return fitresult, cache, NamedTuple()
 end
 
-function _transform(::CountTransformer,
+function _transform(::CountTransformer,
                     result::CountTransformerResult,
                     v::Corpus)
     dtm_matrix = build_dtm(v, result.vocab)
 
-    # here we return the `adjoint` of our sparse matrix to conform to
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(dtm_matrix.dtm)
 end
@@ -92,6 +71,55 @@ MMI.metadata_model(CountTransformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
     path = "MLJText.CountTransformer"
-)
+)
+
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(CountTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of term counts.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
+  of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size
+  `(n, p)`, where `n = length(Xnew)` and `p` is the size of the vocabulary. Tokens/ngrams
+  not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+$(doc_examples(:CountTransformer))
+
+See also [`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
+"""
+CountTransformer
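
And the matching sketch for `CountTransformer`, this time passing documents as bags of words. It assumes, per the `AbstractVector{<:Multiset{Textual}}` input scitype above, that a bag can be represented as a `Dict` from token to count (again illustrative only, not from the commit):

```julia
using MLJ, MLJText

# documents as bags of words: token => count
docs = [
    Dict("cat" => 2, "sat" => 1, "mat" => 1),
    Dict("dog" => 1, "chased" => 1, "cat" => 1),
]

count_transformer = CountTransformer()   # defaults: keep the full vocabulary
mach = machine(count_transformer, docs)
fit!(mach)

X = transform(mach, docs)   # 2 × p matrix of raw term counts
```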
