@@ -14,26 +14,33 @@ const STB = ScientificTypesBase
Convert a collection of raw documents to a matrix of TF-IDF features.
- Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
- This is a common term weighting scheme in information retrieval, that has also found good use
- in document classification.
-
- The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given
- document is to scale down the impact of tokens that occur very frequently in a given corpus
- and that are hence empirically less informative than features that occur in a small fraction of
- the training corpus.
-
- The formula that is used to compute the tf-idf for a term t of a document d in a document set is
- tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if `smooth_idf=false`),
- where n is the total number of documents in the document set and df(t) is the document frequency of t; the
- document frequency is the number of documents in the document set that contain the term t. The effect of adding “1”
- to the idf in the equation above is that terms with zero idf, i.e., terms that occur in all documents in a training
- set, will not be entirely ignored. (Note that the idf formula above differs from the standard textbook notation
- that defines the idf as idf(t) = log [ n / (df(t) + 1) ]).
-
- If `smooth_idf=true` (the default), the constant “1” is added to the numerator and denominator of the idf as if an extra
- document was seen containing every term in the collection exactly once, which prevents zero divisions:
- idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.
+ "Tf" means term-frequency while "tf-idf" means term-frequency times
18
+ inverse document-frequency. This is a common term weighting scheme in
19
+ information retrieval, that has also found good use in document
20
+ classification.
21
+
22
+ The goal of using tf-idf instead of the raw frequencies of occurrence
23
+ of a token in a given document is to scale down the impact of tokens
24
+ that occur very frequently in a given corpus and that are hence
25
+ empirically less informative than features that occur in a small
26
+ fraction of the training corpus.
27
+
28
+ The formula that is used to compute the tf-idf for a term `t` of a
29
+ document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
30
+ idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
31
+ where `n` is the total number of documents in the document set and
32
+ `df(t)` is the document frequency of `t`. The document frequency is
33
+ the number of documents in the document set that contain the term
34
+ `t`. The effect of adding “1” to the idf in the equation above is that
35
+ terms with zero idf, i.e., terms that occur in all documents in a
36
+ training set, will not be entirely ignored. (Note that the idf formula
37
+ above differs from that appearing in standard texts, `idf(t) = log [ n
38
+ / (df(t) + 1) ])`.
39
+
40
+ If `smooth_idf=true` (the default), the constant “1” is added to the
41
+ numerator and denominator of the idf as if an extra document was seen
42
+ containing every term in the collection exactly once, which prevents
43
+ zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
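+
+ As a standalone sketch (the helper name `smooth_idf` below is
+ hypothetical, not part of this package), the smoothed idf for a single
+ term can be computed from the corpus size `n` and the term's document
+ frequency `df`:
+
+     smooth_idf(n, df) = log((1 + n) / (1 + df)) + 1  # hypothetical helper
+     smooth_idf(4, 4)  # == 1.0: a term in every document keeps nonzero weight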
"""
MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
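
# Hypothetical construction sketch (the hyperparameters `max_doc_freq`,
# `min_doc_freq`, and `smooth_idf` appear elsewhere in this file; the
# values shown are illustrative, not the package defaults):
#
#     transformer = TfidfTransformer(max_doc_freq=0.9, min_doc_freq=0.01, smooth_idf=true)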
@@ -49,7 +56,9 @@ struct TfidfTransformerResult
    idf_vector::Vector{Float64}
end
- function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int)
+ function limit_features(doc_term_matrix::DocumentTermMatrix,
+                         high::Int,
+                         low::Int)
    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=1))

    # build mask to restrict terms
@@ -66,16 +75,21 @@ function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int
    return (doc_term_matrix.dtm[:, mask], new_terms)
end
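
# Illustrative only (not the verbatim elided body): `limit_features` keeps
# terms whose document frequency lies between `low` and `high`, roughly:
#
#     mask = (doc_freqs .>= low) .& (doc_freqs .<= high)
#     new_terms = doc_term_matrix.terms[mask]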
- _convert_bag_of_words(X::Dict{NGram, Int}) = Dict(join(k, " ") => v for (k, v) in X)
+ _convert_bag_of_words(X::Dict{NGram, Int}) =
+     Dict(join(k, " ") => v for (k, v) in X)
- build_corpus(X::Vector{Dict{NGram, Int}}) = build_corpus(_convert_bag_of_words.(X))
- build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} = Corpus(NGramDocument.(X))
+ build_corpus(X::Vector{Dict{NGram, Int}}) =
+     build_corpus(_convert_bag_of_words.(X))
+ build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} =
+     Corpus(NGramDocument.(X))
build_corpus(X) = Corpus(TokenDocument.(X))
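
# For illustration (assumed inputs, not from the diff): `build_corpus`
# accepts tokenized documents or bag-of-words dictionaries, e.g.
#
#     build_corpus([["the", "cat", "sat"], ["a", "dog", "ran"]])   # token vectors
#     build_corpus([Dict("the cat" => 2, "sat" => 1)])             # string => count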
- MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) = _fit(transformer, verbosity, build_corpus(X))
+ MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
+     _fit(transformer, verbosity, build_corpus(X))
function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
-     transformer.max_doc_freq < transformer.min_doc_freq && error("Max doc frequency cannot be less than Min doc frequency!")
+     transformer.max_doc_freq < transformer.min_doc_freq &&
+         error("Max doc frequency cannot be less than Min doc frequency!")

    # process corpus vocab
    update_lexicon!(X)
@@ -104,7 +118,10 @@ function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
    return fitresult, cache, NamedTuple()
end
- function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
+ function build_tfidf!(dtm::SparseMatrixCSC{T},
+                       tfidf::SparseMatrixCSC{F},
+                       idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
+
    rows = rowvals(dtm)
    dtmvals = nonzeros(dtm)
    tfidfvals = nonzeros(tfidf)
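
    # Sketch of the elided update (illustrative, not the verbatim loop):
    # each stored entry in column t (a term) is weighted by that term's
    # idf, roughly:
    #
    #     for t in 1:size(dtm, 2), k in nzrange(dtm, t)
    #         tfidfvals[k] = dtmvals[k] * idf_vector[t]
    #     end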
@@ -126,9 +143,13 @@ function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_ve
    return tfidf
end
- MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v) = _transform(transformer, result, build_corpus(v))
+ MMI.transform(transformer::TfidfTransformer,
+               result::TfidfTransformerResult, v) =
+     _transform(transformer, result, build_corpus(v))
- function _transform(::TfidfTransformer, result::TfidfTransformerResult, v::Corpus)
+ function _transform(::TfidfTransformer,
+                     result::TfidfTransformerResult,
+                     v::Corpus)
    m = DocumentTermMatrix(v, result.vocab)
    tfidf = similar(m.dtm, eltype(result.idf_vector))
    build_tfidf!(m.dtm, tfidf, result.idf_vector)
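
# End-to-end usage sketch (the document set is made up; the MLJ calls are
# the standard machine workflow):
#
#     docs = [["the", "cat", "sat"], ["the", "dog", "barked"]]
#     mach = MLJ.machine(TfidfTransformer(), docs)
#     MLJ.fit!(mach)
#     X = MLJ.transform(mach, docs)   # sparse matrix of tf-idf features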