Skip to content

Commit c61d3aa

Browse files
committed
Whitespace changes and light editing of doc-string for readability
1 parent 014a52b commit c61d3aa

File tree

1 file changed

+50
-29
lines changed

1 file changed

+50
-29
lines changed

src/MLJText.jl

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,33 @@ const STB = ScientificTypesBase
1414
1515
Convert a collection of raw documents to a matrix of TF-IDF features.
1616
17-
Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
18-
This is a common term weighting scheme in information retrieval, that has also found good use
19-
in document classification.
20-
21-
The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given
22-
document is to scale down the impact of tokens that occur very frequently in a given corpus
23-
and that are hence empirically less informative than features that occur in a small fraction of
24-
the training corpus.
25-
26-
The formula that is used to compute the tf-idf for a term t of a document d in a document set is
27-
tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if `smooth_idf=false`),
28-
where n is the total number of documents in the document set and df(t) is the document frequency of t; the
29-
document frequency is the number of documents in the document set that contain the term t. The effect of adding “1”
30-
to the idf in the equation above is that terms with zero idf, i.e., terms that occur in all documents in a training
31-
set, will not be entirely ignored. (Note that the idf formula above differs from the standard textbook notation
32-
that defines the idf as idf(t) = log [ n / (df(t) + 1) ]).
33-
34-
If `smooth_idf=true` (the default), the constant “1” is added to the numerator and denominator of the idf as if an extra
35-
document was seen containing every term in the collection exactly once, which prevents zero divisions:
36-
idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.
17+
"Tf" means term-frequency while "tf-idf" means term-frequency times
18+
inverse document-frequency. This is a common term weighting scheme in
19+
information retrieval, which has also found good use in document
20+
classification.
21+
22+
The goal of using tf-idf instead of the raw frequencies of occurrence
23+
of a token in a given document is to scale down the impact of tokens
24+
that occur very frequently in a given corpus and that are hence
25+
empirically less informative than features that occur in a small
26+
fraction of the training corpus.
27+
28+
The formula that is used to compute the tf-idf for a term `t` of a
29+
document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
30+
idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
31+
where `n` is the total number of documents in the document set and
32+
`df(t)` is the document frequency of `t`. The document frequency is
33+
the number of documents in the document set that contain the term
34+
`t`. The effect of adding “1” to the idf in the equation above is that
35+
terms with zero idf, i.e., terms that occur in all documents in a
36+
training set, will not be entirely ignored. (Note that the idf formula
37+
above differs from that appearing in standard texts, `idf(t) = log [ n
38+
/ (df(t) + 1) ]`).
39+
40+
If `smooth_idf=true` (the default), the constant “1” is added to the
41+
numerator and denominator of the idf as if an extra document was seen
42+
containing every term in the collection exactly once, which prevents
43+
zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
3744
3845
"""
3946
MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
@@ -49,7 +56,9 @@ struct TfidfTransformerResult
4956
idf_vector::Vector{Float64}
5057
end
5158

52-
function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int)
59+
function limit_features(doc_term_matrix::DocumentTermMatrix,
60+
high::Int,
61+
low::Int)
5362
doc_freqs = vec(sum(doc_term_matrix.dtm, dims=1))
5463

5564
# build mask to restrict terms
@@ -66,16 +75,21 @@ function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int
6675
return (doc_term_matrix.dtm[:, mask], new_terms)
6776
end
6877

69-
_convert_bag_of_words(X::Dict{NGram, Int}) = Dict(join(k, " ") => v for (k, v) in X)
78+
_convert_bag_of_words(X::Dict{NGram, Int}) =
79+
Dict(join(k, " ") => v for (k, v) in X)
7080

71-
build_corpus(X::Vector{Dict{NGram, Int}}) = build_corpus(_convert_bag_of_words.(X))
72-
build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} = Corpus(NGramDocument.(X))
81+
build_corpus(X::Vector{Dict{NGram, Int}}) =
82+
build_corpus(_convert_bag_of_words.(X))
83+
build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} =
84+
Corpus(NGramDocument.(X))
7385
build_corpus(X) = Corpus(TokenDocument.(X))
7486

75-
MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) = _fit(transformer, verbosity, build_corpus(X))
87+
MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
88+
_fit(transformer, verbosity, build_corpus(X))
7689

7790
function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
78-
transformer.max_doc_freq < transformer.min_doc_freq && error("Max doc frequency cannot be less than Min doc frequency!")
91+
transformer.max_doc_freq < transformer.min_doc_freq &&
92+
error("Max doc frequency cannot be less than Min doc frequency!")
7993

8094
# process corpus vocab
8195
update_lexicon!(X)
@@ -104,7 +118,10 @@ function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
104118
return fitresult, cache, NamedTuple()
105119
end
106120

107-
function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
121+
function build_tfidf!(dtm::SparseMatrixCSC{T},
122+
tfidf::SparseMatrixCSC{F},
123+
idf_vector::Vector{F}) where {T<:Real,F<:AbstractFloat}
124+
108125
rows = rowvals(dtm)
109126
dtmvals = nonzeros(dtm)
110127
tfidfvals = nonzeros(tfidf)
@@ -126,9 +143,13 @@ function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_ve
126143
return tfidf
127144
end
128145

129-
MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v) = _transform(transformer, result, build_corpus(v))
146+
MMI.transform(transformer::TfidfTransformer,
147+
result::TfidfTransformerResult, v) =
148+
_transform(transformer, result, build_corpus(v))
130149

131-
function _transform(::TfidfTransformer, result::TfidfTransformerResult, v::Corpus)
150+
function _transform(::TfidfTransformer,
151+
result::TfidfTransformerResult,
152+
v::Corpus)
132153
m = DocumentTermMatrix(v, result.vocab)
133154
tfidf = similar(m.dtm, eltype(result.idf_vector))
134155
build_tfidf!(m.dtm, tfidf, result.idf_vector)

0 commit comments

Comments
 (0)