
Commit e73694b

Merge pull request #24 from JuliaAI/dev
For a 0.2.1 release
2 parents 45a2eea + fa15988

File tree

6 files changed (+310, -120 lines)


Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
 CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 
 [compat]
 CorpusLoaders = "0.3"
-MLJModelInterface = "1.3"
+MLJModelInterface = "1.4"
 ScientificTypes = "2.2.2, 3"
 ScientificTypesBase = "2.2.0, 3"
 TextAnalysis = "0.7.3"
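
The only substantive changes here are the release version bump and the tightened lower bound on `MLJModelInterface` (in Julia's compat notation, `"1.4"` means `[1.4.0, 2.0.0)`). A quick way to verify what resolves, sketched in a throwaway environment (this snippet is illustrative and not part of the commit):

```julia
using Pkg

Pkg.activate(temp=true)                   # fresh, disposable environment
Pkg.add(name="MLJText", version="0.2.1")  # pulls in MLJModelInterface ≥ 1.4
Pkg.status("MLJModelInterface")           # confirm the resolved version
```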

src/MLJText.jl

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
 const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
 const NGram{N} = NTuple{<:Any,<:AbstractString}
 
+include("docstring_helpers.jl")
 include("scitypes.jl")
 include("utils.jl")
 include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")
 
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+
 end # module
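
The new `docstring_helpers.jl` is one of the six changed files, but its diff is not reproduced on this page. Since the docstrings below interpolate `$DOC_IDF`, `$DOC_TRANSFORMER_INPUTS`, and `doc_examples(...)`, here is a minimal sketch of what such a helper file could plausibly define. The names are taken from the interpolations below, but the bodies are assumptions, not the commit's actual contents:

```julia
# docstring_helpers.jl -- hypothetical sketch only; the real file is not shown here

const DOC_IDF =
    """
    With `smooth_idf=true`, a term's IDF is computed as log((1 + n)/(1 + df)) + 1,
    where `n` is the number of documents and `df` is the number of documents
    containing the term; the unsmoothed variant drops the two 1s inside the log.
    """

const DOC_TRANSFORMER_INPUTS =
    """
    Here `X` is any vector whose elements are either tokenized documents
    (vectors of strings) or bags of words/ngrams (dictionaries mapping each
    word or ngram tuple to its count).
    """

# Return a shared "# Examples" section for the named transformer's docstring.
doc_examples(model::Symbol) = """
    # Examples

    See the MLJText.jl documentation for a worked example of `$model`.
    """
```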

src/bm25_transformer.jl

Lines changed: 81 additions & 40 deletions
@@ -1,38 +1,3 @@
-"""
-    BM25Transformer()
-
-Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
-
-BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
-space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
-(IDF) so that, for each term in a document, its relative concentration in the document is
-scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability - particularly,
-the probability that a user will consider a search result relevant based on the terms in the search query
-and those in each document.
-
-The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
-`TfidfTransformer`. BM25 introduces two additional parameters:
-
-`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
-we mean by saturation is the degree to which a term occuring extra times adds to the overall score. This defaults
-to 2.
-
-`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
-The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
-
-For more explanations, please see:
-- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
-- https://en.wikipedia.org/wiki/Okapi_BM25
-- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are at least in 1% of
-documents will be included.
-"""
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
@@ -41,13 +6,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
     smooth_idf::Bool
 end
 
-function BM25Transformer(;
+function BM25Transformer(;
     max_doc_freq::Float64 = 1.0,
     min_doc_freq::Float64 = 0.0,
    κ::Int=2,
    β::Float64=0.75,
    smooth_idf::Bool = true
-)
+)
    transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
    message = MMI.clean!(transformer)
    isempty(message) || @warn message
@@ -103,14 +68,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
     return bm25
 end
 
-function _transform(transformer::BM25Transformer,
+function _transform(transformer::BM25Transformer,
                     result::BM25TransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
-    # here we return the `adjoint` of our sparse matrix to conform to
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(bm25)
 end
@@ -142,6 +107,82 @@ MMI.metadata_model(BM25Transformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build BM-25 matrix from raw documents",
     path = "MLJText.BM25Transformer"
 )
+
+# # DOC STRING
+
+"""
+$(MMI.doc_header(BM25Transformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [Okapi BM25 document-word
+statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
+term frequency (TF) and inverse document frequency (IDF, defined below), as in
+[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
+consider a search result relevant, based on the terms in the search query and those in
+each document.
+
+$DOC_IDF
+
+References:
+
+- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
+- https://en.wikipedia.org/wiki/Okapi_BM25
+- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. β is
+  restricted to values between 0 and 1.
+
+- `smooth_idf=true`: Controls which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
+  training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
+  above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` is the size of
+  the vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+- `mean_words_in_docs`: The mean number of words in each document.
+
+$(doc_examples(:BM25Transformer))
+
+See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
+
+"""
+BM25Transformer
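
Not part of the diff: the new docstring describes `κ` and `β` but never states the scoring function itself. For orientation, the textbook Okapi BM25 weight of a term `t` in a document `D` is as follows (the package's exact variant lives in `build_bm25!`, which this page shows only in part):

```latex
\mathrm{bm25}(t, D) = \mathrm{idf}(t) \cdot
    \frac{f(t, D)\,(\kappa + 1)}
         {f(t, D) + \kappa \left(1 - \beta + \beta \, \frac{|D|}{\mathrm{avgdl}}\right)}
```

where `f(t, D)` is the count of `t` in `D`, `|D|` is the document length, and `avgdl` is the average document length (compare the `mean_words_in_docs` fitted parameter). And a minimal end-to-end sketch of the workflow the docstring documents, using an invented toy corpus (everything below is illustrative, not taken from the package's tests):

```julia
using MLJ, MLJText

# toy corpus: documents pre-tokenized into vectors of tokens
docs = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "chased", "the", "cat"],
    ["dogs", "and", "cats", "make", "good", "pets"],
]

bm25 = BM25Transformer(max_doc_freq=0.9, κ=2, β=0.75)
mach = machine(bm25, docs)
fit!(mach)

X = transform(mach, docs)        # 3 × p matrix of BM25 scores
fitted_params(mach).vocab        # the learned vocabulary
fitted_params(mach).idf_vector   # one IDF value per vocabulary term
```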

src/count_transformer.jl

Lines changed: 55 additions & 27 deletions
@@ -1,30 +1,9 @@
-"""
-    CountTransformer()
-
-Convert a collection of raw documents to matrix representing a bag-of-words structure from
-word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
-comprised of a count of every word in the document corpus/collection for every document.
-This is a simple but often quite powerful way of representing documents as vectors. The
-resulting representation is a matrix with rows representing every document in the corpus
-and columns representing every word in the corpus. The value for each cell is the raw count
-of a particular word in a particular document.
-
-Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
-to words occuring in a maximum or minimum portion of documents.
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are at least in 1% of
-documents will be included.
-"""
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 end
 
-function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
+function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
     transformer = CountTransformer(max_doc_freq, min_doc_freq)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -37,7 +16,7 @@ end
 
 function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     # process corpus vocab
-    update_lexicon!(X)
+    update_lexicon!(X)
 
     # calculate min and max doc freq limits
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +37,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     return fitresult, cache, NamedTuple()
 end
 
-function _transform(::CountTransformer,
+function _transform(::CountTransformer,
                     result::CountTransformerResult,
                     v::Corpus)
     dtm_matrix = build_dtm(v, result.vocab)
 
-    # here we return the `adjoint` of our sparse matrix to conform to
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(dtm_matrix.dtm)
 end
@@ -92,6 +71,55 @@ MMI.metadata_model(CountTransformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
     path = "MLJText.CountTransformer"
-)
+)
+
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(CountTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of term counts.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
+  of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size
+  `(n, p)`, where `n = length(Xnew)` and `p` is the size of the vocabulary. Tokens/ngrams
+  not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+$(doc_examples(:CountTransformer))
+
+See also [`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
+"""
+CountTransformer
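
And the matching sketch for `CountTransformer`, this time passing documents as bags of words. It assumes, per the `AbstractVector{<:Multiset{Textual}}` input scitype above, that a bag can be represented as a `Dict` from token to count (again illustrative only, not from the commit):

```julia
using MLJ, MLJText

# documents as bags of words: token => count
docs = [
    Dict("cat" => 2, "sat" => 1, "mat" => 1),
    Dict("dog" => 1, "chased" => 1, "cat" => 1),
]

count_transformer = CountTransformer()   # defaults: keep the full vocabulary
mach = machine(count_transformer, docs)
fit!(mach)

X = transform(mach, docs)   # 2 × p matrix of raw term counts
```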
