Skip to content

Commit 09f37e1

Browse files
committed
change version num; update doc strings; add clean methods and slight refactoring
1 parent 553906b commit 09f37e1

File tree

6 files changed

+116
-52
lines changed

6 files changed

+116
-52
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJText"
22
uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
33
authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
4-
version = "0.2.0"
4+
version = "0.1.1"
55

66
[deps]
77
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"

src/MLJText.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,6 @@ include("tfidf_transformer.jl")
2424
include("bagofwords_transformer.jl")
2525
include("bm25_transformer.jl")
2626

27-
export TfidfTransformer
27+
export TfidfTransformer, BM25Transformer, BagOfWordsTransformer
2828

2929
end # module

src/abstract_text_transformer.jl

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,30 @@
11
abstract type AbstractTextTransformer <: MMI.Unsupervised end
22

3+
function MMI.clean!(transformer::AbstractTextTransformer)
4+
warning = ""
5+
if transformer.min_doc_freq < 0.0
6+
warning *= "Need min_doc_freq ≥ 0. Resetting min_doc_freq=0. "
7+
transformer.min_doc_freq = 0.0
8+
end
9+
10+
if transformer.max_doc_freq > 1.0
11+
warning *= "Need max_doc_freq ≤ 1. Resetting max_doc_freq=1. "
12+
transformer.max_doc_freq = 1.0
13+
end
14+
15+
if transformer.max_doc_freq < transformer.min_doc_freq
16+
warning *= "max_doc_freq cannot be less than min_doc_freq, resetting to defaults. "
17+
transformer.min_doc_freq = 0.0
18+
transformer.max_doc_freq = 1.0
19+
end
20+
return warning
21+
end
22+
323
## General method to fit text transformer models ##
424
MMI.fit(transformer::AbstractTextTransformer, verbosity::Int, X) =
525
_fit(transformer, verbosity, build_corpus(X))
626

727
function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
8-
transformer.max_doc_freq < transformer.min_doc_freq &&
9-
error("Max doc frequency cannot be less than Min doc frequency!")
10-
1128
# process corpus vocab
1229
update_lexicon!(X)
1330
dtm_matrix = build_dtm(X)

src/bagofwords_transformer.jl

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,43 @@
11
"""
22
BagOfWordsTransformer()
33
4-
Convert a collection of raw documents to matrix representing a bag-of-words structure.
4+
Convert a collection of raw documents to matrix representing a bag-of-words structure.
55
6-
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
7-
a count of every word in the document corpus/collection for every document. This is a simple
8-
but often quite powerful way of representing documents as vectors. The end representation is
9-
a matrix with rows representing every document in the corpus and columns representing every word
10-
in the corpus. The value for each cell is the raw count of a particular word in a particular
11-
document.
6+
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
7+
a count of every word in the document corpus/collection for every document. This is a simple
8+
but often quite powerful way of representing documents as vectors. The end representation is
9+
a matrix with rows representing every document in the corpus and columns representing every word
10+
in the corpus. The value for each cell is the raw count of a particular word in a particular
11+
document.
1212
13-
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
14-
to words occuring in a maximum or minimum portion of documents.
13+
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
14+
to words occurring in a maximum or minimum portion of documents.
15+
16+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
17+
that the transformer will consider. `max_doc_freq` indicates that terms in only
18+
up to the specified percentage of documents will be considered. For example, if
19+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
20+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
21+
other direction. A value of 0.01 means that only terms that are in at least 1% of
22+
documents will be included.
1523
"""
16-
MMI.@mlj_model mutable struct BagOfWordsTransformer <: AbstractTextTransformer
17-
max_doc_freq::Float64 = 1.0
18-
min_doc_freq::Float64 = 0.0
24+
mutable struct BagOfWordsTransformer <: AbstractTextTransformer
25+
max_doc_freq::Float64
26+
min_doc_freq::Float64
27+
end
28+
29+
function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
30+
transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
31+
message = MMI.clean!(transformer)
32+
isempty(message) || @warn message
33+
return transformer
1934
end
2035

2136
struct BagOfWordsTransformerResult
2237
vocab::Vector{String}
2338
end
2439

2540
function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
26-
transformer.max_doc_freq < transformer.min_doc_freq &&
27-
error("Max doc frequency cannot be less than Min doc frequency!")
28-
2941
# process corpus vocab
3042
update_lexicon!(X)
3143

src/bm25_transformer.jl

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,57 @@
11
"""
22
BM25Transformer()
33
4-
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
5-
6-
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
7-
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
8-
(IDF) so that, for each term in a document, its relative concentration in the document is
9-
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
10-
the probability that a user will consider a search result relevant based on the terms in the search query
11-
and those in each document.
12-
13-
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
14-
`TfidfTransformer`. BM25 introduces two additional parameters:
15-
16-
`κ` is the term frequency saturation characteristic. Higher values represent slower satuartion. What
17-
we mean by saturation is the degree to which a term occuring extra times adds to the overall score. This defaults
18-
to 2.
19-
20-
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
21-
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
22-
23-
For more explanations, please see:
24-
http://ethen8181.github.io/machine-learning/search/bm25_intro.html
25-
https://en.wikipedia.org/wiki/Okapi_BM25
26-
https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
4+
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
5+
6+
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
7+
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
8+
(IDF) so that, for each term in a document, its relative concentration in the document is
9+
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
10+
the probability that a user will consider a search result relevant based on the terms in the search query
11+
and those in each document.
12+
13+
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
14+
`TfidfTransformer`. BM25 introduces two additional parameters:
15+
16+
`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
17+
we mean by saturation is the degree to which a term occurring extra times adds to the overall score. This defaults
18+
to 2.
19+
20+
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
21+
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
22+
23+
For more explanations, please see:
24+
http://ethen8181.github.io/machine-learning/search/bm25_intro.html
25+
https://en.wikipedia.org/wiki/Okapi_BM25
26+
https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
27+
28+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
29+
that the transformer will consider. `max_doc_freq` indicates that terms in only
30+
up to the specified percentage of documents will be considered. For example, if
31+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
32+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
33+
other direction. A value of 0.01 means that only terms that are at least in 1% of
34+
documents will be included.
2735
"""
28-
MMI.@mlj_model mutable struct BM25Transformer <: AbstractTextTransformer
29-
max_doc_freq::Float64 = 1.0
30-
min_doc_freq::Float64 = 0.0
31-
κ::Int=2
32-
β::Float64=0.75
36+
mutable struct BM25Transformer <: AbstractTextTransformer
37+
max_doc_freq::Float64
38+
min_doc_freq::Float64
39+
κ::Int
40+
β::Float64
41+
smooth_idf::Bool
42+
end
43+
44+
function BM25Transformer(;
45+
max_doc_freq::Float64 = 1.0,
46+
min_doc_freq::Float64 = 0.0,
47+
κ::Int=2,
48+
β::Float64=0.75,
3349
smooth_idf::Bool = true
50+
)
51+
transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
52+
message = MMI.clean!(transformer)
53+
isempty(message) || @warn message
54+
return transformer
3455
end
3556

3657
struct BMI25TransformerResult

src/tfidf_transformer.jl

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,25 @@ numerator and denominator of the idf as if an extra document was seen
3434
containing every term in the collection exactly once, which prevents
3535
zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
3636
37+
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
38+
that the transformer will consider. `max_doc_freq` indicates that terms in only
39+
up to the specified percentage of documents will be considered. For example, if
40+
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
41+
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
42+
other direction. A value of 0.01 means that only terms that are at least in 1% of
43+
documents will be included.
3744
"""
38-
MMI.@mlj_model mutable struct TfidfTransformer <: AbstractTextTransformer
39-
max_doc_freq::Float64 = 1.0
40-
min_doc_freq::Float64 = 0.0
41-
smooth_idf::Bool = true
45+
mutable struct TfidfTransformer <: AbstractTextTransformer
46+
max_doc_freq::Float64
47+
min_doc_freq::Float64
48+
smooth_idf::Bool
49+
end
50+
51+
function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true)
52+
transformer = TfidfTransformer(max_doc_freq, min_doc_freq, smooth_idf)
53+
message = MMI.clean!(transformer)
54+
isempty(message) || @warn message
55+
return transformer
4256
end
4357

4458
struct TfidfTransformerResult

0 commit comments

Comments
 (0)