Skip to content

Commit 09f37e1

Browse files
committed
change version num; update doc strings; add clean methods and slight refactoring
1 parent 553906b commit 09f37e1

File tree

6 files changed

+116
-52
lines changed

6 files changed

+116
-52
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJText"
22
uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
33
authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
4-
version = "0.2.0"
4+
version = "0.1.1"
55

66
[deps]
77
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"

src/MLJText.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,6 @@ include("tfidf_transformer.jl")
2424
include("bagofwords_transformer.jl")
2525
include("bm25_transformer.jl")
2626

27-
export TfidfTransformer
27+
export TfidfTransformer, BM25Transformer, BagOfWordsTransformer
2828

2929
end # module

src/abstract_text_transformer.jl

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,30 @@
11
abstract type AbstractTextTransformer <: MMI.Unsupervised end
22

3+
function MMI.clean!(transformer::AbstractTextTransformer)
4+
warning = ""
5+
if transformer.min_doc_freq < 0.0
6+
warning *= "Need min_doc_freq ≥ 0. Resetting min_doc_freq=0. "
7+
transformer.min_doc_freq = 0.0
8+
end
9+
10+
if transformer.max_doc_freq > 1.0
11+
warning *= "Need max_doc_freq ≤ 1. Resetting max_doc_freq=1. "
12+
transformer.max_doc_freq = 1.0
13+
end
14+
15+
if transformer.max_doc_freq < transformer.min_doc_freq
16+
warning *= "max_doc_freq cannot be less than min_doc_freq, resetting to defaults. "
17+
transformer.min_doc_freq = 0.0
18+
transformer.max_doc_freq = 1.0
19+
end
20+
return warning
21+
end
22+
323
## General method to fit text transformer models ##
424
MMI.fit(transformer::AbstractTextTransformer, verbosity::Int, X) =
525
_fit(transformer, verbosity, build_corpus(X))
626

727
function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
8-
transformer.max_doc_freq < transformer.min_doc_freq &&
9-
error("Max doc frequency cannot be less than Min doc frequency!")
10-
1128
# process corpus vocab
1229
update_lexicon!(X)
1330
dtm_matrix = build_dtm(X)

src/bagofwords_transformer.jl

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,43 @@
11
"""
22
BagOfWordsTransformer()
33
4-
Convert a collection of raw documents to matrix representing a bag-of-words structure.
4+
Convert a collection of raw documents to matrix representing a bag-of-words structure.
55
6-
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
7-
a count of every word in the document corpus/collection for every document. This is a simple
8-
but often quite powerful way of representing documents as vectors. The end representation is
9-
a matrix with rows representing every document in the corpus and columns representing every word
10-
in the corpus. The value for each cell is the raw count of a particular word in a particular
11-
document.
6+
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
7+
a count of every word in the document corpus/collection for every document. This is a simple
8+
but often quite powerful way of representing documents as vectors. The end representation is
9+
a matrix with rows representing every document in the corpus and columns representing every word
10+
in the corpus. The value for each cell is the raw count of a particular word in a particular
11+
document.
1212
13-
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
14-
to words occuring in a maximum or minimum portion of documents.
13+
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
14+
to words occurring in a maximum or minimum portion of documents.
15+
16+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
17+
that the transformer will consider. `max_doc_freq` indicates that terms in only
18+
up to the specified percentage of documents will be considered. For example, if
19+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
20+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
21+
other direction. A value of 0.01 means that only terms that are in at least 1% of
22+
documents will be included.
1523
"""
16-
MMI.@mlj_model mutable struct BagOfWordsTransformer <: AbstractTextTransformer
17-
max_doc_freq::Float64 = 1.0
18-
min_doc_freq::Float64 = 0.0
24+
mutable struct BagOfWordsTransformer <: AbstractTextTransformer
25+
max_doc_freq::Float64
26+
min_doc_freq::Float64
27+
end
28+
29+
function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
30+
transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
31+
message = MMI.clean!(transformer)
32+
isempty(message) || @warn message
33+
return transformer
1934
end
2035

2136
struct BagOfWordsTransformerResult
2237
vocab::Vector{String}
2338
end
2439

2540
function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
26-
transformer.max_doc_freq < transformer.min_doc_freq &&
27-
error("Max doc frequency cannot be less than Min doc frequency!")
28-
2941
# process corpus vocab
3042
update_lexicon!(X)
3143

src/bm25_transformer.jl

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,57 @@
11
"""
22
BM25Transformer()
33
4-
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
5-
6-
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
7-
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
8-
(IDF) so that, for each term in a document, its relative concentration in the document is
9-
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
10-
the probability that a user will consider a search result relevant based on the terms in the search query
11-
and those in each document.
12-
13-
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
14-
`TfidfTransformer`. BM25 introduces two additional parameters:
15-
16-
`κ` is the term frequency saturation characteristic. Higher values represent slower satuartion. What
17-
we mean by saturation is the degree to which a term occuring extra times adds to the overall score. This defaults
18-
to 2.
19-
20-
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
21-
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
22-
23-
For more explanations, please see:
24-
http://ethen8181.github.io/machine-learning/search/bm25_intro.html
25-
https://en.wikipedia.org/wiki/Okapi_BM25
26-
https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
4+
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
5+
6+
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
7+
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
8+
(IDF) so that, for each term in a document, its relative concentration in the document is
9+
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
10+
the probability that a user will consider a search result relevant based on the terms in the search query
11+
and those in each document.
12+
13+
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
14+
`TfidfTransformer`. BM25 introduces two additional parameters:
15+
16+
`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
17+
we mean by saturation is the degree to which a term occurring extra times adds to the overall score. This defaults
18+
to 2.
19+
20+
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
21+
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
22+
23+
For more explanations, please see:
24+
http://ethen8181.github.io/machine-learning/search/bm25_intro.html
25+
https://en.wikipedia.org/wiki/Okapi_BM25
26+
https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
27+
28+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
29+
that the transformer will consider. `max_doc_freq` indicates that terms in only
30+
up to the specified percentage of documents will be considered. For example, if
31+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
32+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
33+
other direction. A value of 0.01 means that only terms that are at least in 1% of
34+
documents will be included.
2735
"""
28-
MMI.@mlj_model mutable struct BM25Transformer <: AbstractTextTransformer
29-
max_doc_freq::Float64 = 1.0
30-
min_doc_freq::Float64 = 0.0
31-
κ::Int=2
32-
β::Float64=0.75
36+
mutable struct BM25Transformer <: AbstractTextTransformer
37+
max_doc_freq::Float64
38+
min_doc_freq::Float64
39+
κ::Int
40+
β::Float64
41+
smooth_idf::Bool
42+
end
43+
44+
function BM25Transformer(;
45+
max_doc_freq::Float64 = 1.0,
46+
min_doc_freq::Float64 = 0.0,
47+
κ::Int=2,
48+
β::Float64=0.75,
3349
smooth_idf::Bool = true
50+
)
51+
transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
52+
message = MMI.clean!(transformer)
53+
isempty(message) || @warn message
54+
return transformer
3455
end
3556

3657
struct BMI25TransformerResult

src/tfidf_transformer.jl

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,25 @@ numerator and denominator of the idf as if an extra document was seen
3434
containing every term in the collection exactly once, which prevents
3535
zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
3636
37+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
38+
that the transformer will consider. `max_doc_freq` indicates that terms in only
39+
up to the specified percentage of documents will be considered. For example, if
40+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
41+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
42+
other direction. A value of 0.01 means that only terms that are at least in 1% of
43+
documents will be included.
3744
"""
38-
MMI.@mlj_model mutable struct TfidfTransformer <: AbstractTextTransformer
39-
max_doc_freq::Float64 = 1.0
40-
min_doc_freq::Float64 = 0.0
41-
smooth_idf::Bool = true
45+
mutable struct TfidfTransformer <: AbstractTextTransformer
46+
max_doc_freq::Float64
47+
min_doc_freq::Float64
48+
smooth_idf::Bool
49+
end
50+
51+
function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true)
52+
transformer = TfidfTransformer(max_doc_freq, min_doc_freq, smooth_idf)
53+
message = MMI.clean!(transformer)
54+
isempty(message) || @warn message
55+
return transformer
4256
end
4357

4458
struct TfidfTransformerResult

0 commit comments

Comments
 (0)