
Commit d772365

Merge pull request #12 from JuliaAI/add_cv_and_bmi25
add BM25 and BagOfWords transformers, update tests, update readme, re…
2 parents 000a451 + 30b470a commit d772365

11 files changed: +606 −183 lines changed

Project.toml

Lines changed: 2 additions & 1 deletion

@@ -1,14 +1,15 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
-version = "0.1.0"
+version = "0.1.1"

 [deps]
 CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"

 [compat]

README.md

Lines changed: 62 additions & 5 deletions

@@ -10,14 +10,14 @@ extension providing tools and models for text analysis.

 The goal of this package is to provide an interface to various Natural Language Processing (NLP) resources for `MLJ` via such existing packages like [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl)

-Currently, we have TF-IDF Transformer which converts a collection of raw documents into a TF-IDF matrix.
+Currently, we have a TF-IDF Transformer which converts a collection of raw documents into a TF-IDF matrix. We also have a similar way of representing documents using the Okapi Best Match 25 algorithm - this works in a similar fashion to TF-IDF but introduces the probability that a term is relevant in a particular document. See [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25). Finally, there is also a simple Bag-of-Words representation available.

 ## TF-IDF Transformer
-"TF" means term-frequency while "TF-IDF" means term-frequency times inverse document-frequency. This is a common term weighting scheme in information retrieval, that has also found good use in document classification.
+"TF" means term-frequency while "TF-IDF" means term-frequency times inverse document-frequency. This is a common term-weighting scheme in information retrieval that has also found good use in document classification.

 The goal of using TF-IDF instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

-### Uses
+### Usage
 The TF-IDF Transformer accepts a variety of inputs for the raw documents that one wishes to convert into a TF-IDF matrix.

 Raw documents can simply be provided as tokenized documents.
@@ -38,9 +38,9 @@ The resulting matrix looks like:
 2×11 adjoint(::SparseArrays.SparseMatrixCSC{Float64, Int64}) with eltype Float64:
  0.234244  0.0       0.234244  0.0       0.234244  0.0       0.234244  0.234244  0.234244  0.0       0.0
  0.0       0.281093  0.0       0.281093  0.0       0.281093  0.0       0.0       0.0       0.281093  0.281093
-```
+```

-Functionality similar to Scikit-Learn's implementation with N-Grams can easily be implemented using features from `TextAnalysis`. Then the N-Grams themselves (either as a dictionary of Strings or dictionary of Tuples) can be passed into the transformer. We will likely introduce an additional transformer to handle these types of conversions in a future update to `MLJText`.
+Functionality similar to Scikit-Learn's implementation with N-Grams can easily be implemented using features from `TextAnalysis`. Then the N-Grams themselves (either as a dictionary of Strings or a dictionary of Tuples) can be passed into the transformer. We will likely introduce an additional transformer to handle these types of conversions in a future update to `MLJText`.
 ```julia

 # this will create unigrams and bigrams
@@ -53,3 +53,60 @@ MLJ.fit!(mach)

 tfidf_mat = transform(mach, ngram_docs)
 ```
+
+## BM25 Transformer
+BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency (IDF) so that, for each term in a document, its relative concentration in the document is scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly, the probability that a user will consider a search result relevant based on the terms in the search query and those in each document.
+
+### Usage
+This transformer is used in much the same way as the `TfidfTransformer`.
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+bm25_transformer = BM25Transformer()
+mach = machine(bm25_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+bm25_mat = transform(mach, tokenize.(docs))
+```
+
+The resulting matrix looks like:
+```
+2×11 adjoint(::SparseArrays.SparseMatrixCSC{Float64, Int64}) with eltype Float64:
+ 0.676463  0.0      0.676463  0.0      0.676463  0.0      0.676463  0.676463  0.676463  0.0      0.0
+ 0.0       0.81599  0.0       0.81599  0.0       0.81599  0.0       0.0       0.0       0.81599  0.81599
+```
+
+You will note that this transformer has some additional parameters compared to the `TfidfTransformer`:
+```
+BM25Transformer(
+    max_doc_freq = 1.0,
+    min_doc_freq = 0.0,
+    κ = 2,
+    β = 0.75,
+    smooth_idf = true)
+```
+Please see [http://ethen8181.github.io/machine-learning/search/bm25_intro.html](http://ethen8181.github.io/machine-learning/search/bm25_intro.html) for more details about how these parameters affect the matrix that is generated.
+
+## Bag-of-Words Transformer
+The `MLJText` package also offers a way to represent documents using the simpler bag-of-words representation. This returns a document-term matrix (as you would get in `TextAnalysis`) that consists of the count for every word in the corpus for each document in the corpus.
+
+### Usage
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+bagofwords_transformer = BagOfWordsTransformer()
+mach = machine(bagofwords_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+bagofwords_mat = transform(mach, tokenize.(docs))
+```
+
+The resulting matrix looks like:
+```
+2×11 adjoint(::SparseArrays.SparseMatrixCSC{Int64, Int64}) with eltype Int64:
+ 1  0  1  0  1  0  1  1  1  0  0
+ 0  1  0  1  0  1  0  0  0  1  1
+```
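The new README points to the link above for what the BM25 hyperparameters do but only shows the defaults. As a minimal sketch, not part of the commit, using the keyword names from the parameter list shown above, a `BM25Transformer` with a restricted vocabulary might be configured like this:

```julia
using MLJ, MLJText, TextAnalysis

docs = ["Hi my name is Sam.", "How are you today?"]

# hypothetical settings: drop terms appearing in more than 90% of
# documents; κ and β are the term-saturation and length-normalization
# knobs from the parameter list above
bm25_transformer = BM25Transformer(
    max_doc_freq = 0.9,
    min_doc_freq = 0.0,
    κ = 2,
    β = 0.75,
    smooth_idf = true)

mach = machine(bm25_transformer, tokenize.(docs))
MLJ.fit!(mach)
bm25_mat = transform(mach, tokenize.(docs))
```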

src/MLJText.jl

Lines changed: 9 additions & 1 deletion

@@ -6,16 +6,24 @@ import ScientificTypes: DefaultConvention
 import CorpusLoaders
 using SparseArrays
 using TextAnalysis
+using Statistics

 const MMI = MLJModelInterface
 const STB = ScientificTypesBase
 const CL = CorpusLoaders

 const PKG = "MLJText" # substitute model-providing package name

+const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
+const NGram{N} = NTuple{<:Any,<:AbstractString}
+
 include("scitypes.jl")
+include("utils.jl")
+include("abstract_text_transformer.jl")
 include("tfidf_transformer.jl")
+include("bagofwords_transformer.jl")
+include("bm25_transformer.jl")

-export TfidfTransformer
+export TfidfTransformer, BM25Transformer, BagOfWordsTransformer

 end # module
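The new `NGram`/`ScientificNGram` aliases let the transformers declare n-gram inputs in their scitype signatures. As a rough illustration, not part of the commit and mirroring the README's N-gram snippet, such input can be produced with `TextAnalysis`:

```julia
using MLJ, MLJText, TextAnalysis

docs = ["Hi my name is Sam.", "How are you today?"]

# unigrams and bigrams, as in the README's N-gram example; each
# document becomes a dictionary counting its n-grams
corpus = Corpus(NGramDocument.(docs, 1, 2))
ngram_docs = ngrams.(corpus)

tfidf_transformer = TfidfTransformer()
mach = machine(tfidf_transformer, ngram_docs)
MLJ.fit!(mach)
tfidf_mat = transform(mach, ngram_docs)
```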

src/abstract_text_transformer.jl

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+abstract type AbstractTextTransformer <: MMI.Unsupervised end
+
+function MMI.clean!(transformer::AbstractTextTransformer)
+    warning = ""
+    if transformer.min_doc_freq < 0.0
+        warning *= "Need min_doc_freq ≥ 0. Resetting min_doc_freq=0. "
+        transformer.min_doc_freq = 0.0
+    end
+
+    if transformer.max_doc_freq > 1.0
+        warning *= "Need max_doc_freq ≤ 1. Resetting max_doc_freq=1. "
+        transformer.max_doc_freq = 1.0
+    end
+
+    if transformer.max_doc_freq < transformer.min_doc_freq
+        warning *= "max_doc_freq cannot be less than min_doc_freq, resetting to defaults. "
+        transformer.min_doc_freq = 0.0
+        transformer.max_doc_freq = 1.0
+    end
+    return warning
+end
+
+## General method to fit text transformer models ##
+MMI.fit(transformer::AbstractTextTransformer, verbosity::Int, X) =
+    _fit(transformer, verbosity, build_corpus(X))
+
+function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
+    # process corpus vocab
+    update_lexicon!(X)
+    dtm_matrix = build_dtm(X)
+    n = size(dtm_matrix.dtm, 2) # docs are columns
+
+    # calculate min and max doc freq limits
+    if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
+        high = round(Int, transformer.max_doc_freq * n)
+        low = round(Int, transformer.min_doc_freq * n)
+        new_dtm, vocab = limit_features(dtm_matrix, high, low)
+    else
+        new_dtm = dtm_matrix.dtm
+        vocab = dtm_matrix.terms
+    end
+
+    # calculate IDF
+    idf = compute_idf(transformer.smooth_idf, new_dtm)
+
+    # prepare result
+    fitresult = get_result(transformer, idf, vocab)
+    cache = nothing
+
+    return fitresult, cache, NamedTuple()
+end
+
+## General method to transform using text transformer models ##
+MMI.transform(transformer::AbstractTextTransformer, result, v) =
+    _transform(transformer, result, build_corpus(v))
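The `MMI.clean!` method above resets out-of-range hyperparameters with a warning rather than throwing. A minimal sketch of that behavior, not part of the commit, using the `BagOfWordsTransformer` keyword constructor introduced later in this diff (which calls `MMI.clean!`):

```julia
using MLJText

# 0.2 < 0.5 trips the third check in MMI.clean!, so the constructor
# emits a warning and resets both fields to their defaults
transformer = BagOfWordsTransformer(max_doc_freq = 0.2, min_doc_freq = 0.5)

@assert transformer.max_doc_freq == 1.0
@assert transformer.min_doc_freq == 0.0
```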

src/bagofwords_transformer.jl

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+"""
+    BagOfWordsTransformer()
+
+Convert a collection of raw documents to a matrix representing a bag-of-words structure.
+
+Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
+a count of every word in the document corpus/collection for every document. This is a simple
+but often quite powerful way of representing documents as vectors. The resulting representation is
+a matrix with rows representing every document in the corpus and columns representing every word
+in the corpus. The value for each cell is the raw count of a particular word in a particular
+document.
+
+Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
+to words occurring in a maximum or minimum portion of documents.
+
+The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
+that the transformer will consider. `max_doc_freq` indicates that terms in only
+up to the specified percentage of documents will be considered. For example, if
+`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
+will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
+other direction. A value of 0.01 means that only terms that are in at least 1% of
+documents will be included.
+"""
+mutable struct BagOfWordsTransformer <: AbstractTextTransformer
+    max_doc_freq::Float64
+    min_doc_freq::Float64
+end
+
+function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
+    transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
+    message = MMI.clean!(transformer)
+    isempty(message) || @warn message
+    return transformer
+end
+
+struct BagOfWordsTransformerResult
+    vocab::Vector{String}
+end
+
+function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
+    # process corpus vocab
+    update_lexicon!(X)
+
+    # calculate min and max doc freq limits
+    if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
+        # we need to build out the DTM
+        dtm_matrix = build_dtm(X)
+        n = size(dtm_matrix.dtm, 2) # docs are columns
+        high = round(Int, transformer.max_doc_freq * n)
+        low = round(Int, transformer.min_doc_freq * n)
+        _, vocab = limit_features(dtm_matrix, high, low)
+    else
+        vocab = sort(collect(keys(lexicon(X))))
+    end
+
+    # prepare result
+    fitresult = BagOfWordsTransformerResult(vocab)
+    cache = nothing
+
+    return fitresult, cache, NamedTuple()
+end
+
+function _transform(::BagOfWordsTransformer,
+                    result::BagOfWordsTransformerResult,
+                    v::Corpus)
+    dtm_matrix = build_dtm(v, result.vocab)
+
+    # here we return the `adjoint` of our sparse matrix to conform to
+    # the `n x p` dimensions throughout MLJ
+    return adjoint(dtm_matrix.dtm)
+end
+
+# for returning user-friendly form of the learned parameters:
+function MMI.fitted_params(::BagOfWordsTransformer, fitresult::BagOfWordsTransformerResult)
+    vocab = fitresult.vocab
+    return (vocab = vocab,)
+end
+
+## META DATA
+
+MMI.metadata_pkg(BagOfWordsTransformer,
+    name="$PKG",
+    uuid="7876af07-990d-54b4-ab0e-23690620f79a",
+    url="https://github.com/JuliaAI/MLJText.jl",
+    is_pure_julia=true,
+    license="MIT",
+    is_wrapper=false
+)
+
+MMI.metadata_model(BagOfWordsTransformer,
+    input_scitype = Union{
+        AbstractVector{<:AbstractVector{STB.Textual}},
+        AbstractVector{<:STB.Multiset{<:ScientificNGram}},
+        AbstractVector{<:STB.Multiset{STB.Textual}}
+    },
+    output_scitype = AbstractMatrix{STB.Continuous},
+    docstring = "Build Bag-of-Words matrix for corpus of documents",
+    path = "MLJText.BagOfWordsTransformer"
+)
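Tying the new pieces together, a hedged end-to-end sketch, not part of the commit, mirroring the README example and retrieving the learned vocabulary via the `fitted_params` method defined above:

```julia
using MLJ, MLJText, TextAnalysis

docs = ["Hi my name is Sam.", "How are you today?"]

# restrict the vocabulary via the doc-frequency limit; on a corpus this
# small the limit rounds to a no-op, but the plumbing is the same
bagofwords_transformer = BagOfWordsTransformer(max_doc_freq = 0.9)
mach = machine(bagofwords_transformer, tokenize.(docs))
MLJ.fit!(mach)

vocab = fitted_params(mach).vocab   # learned vocabulary as fitted
bagofwords_mat = transform(mach, tokenize.(docs))
```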
