@@ -12,20 +12,23 @@ const STB = ScientificTypesBase
"""
    TfidfTransformer()

+The following is taken largely from scikit-learn's documentation:
+https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/feature_extraction/text.py
+
Convert a collection of raw documents to a matrix of TF-IDF features.

-"Tf" means term-frequency while "tf-idf" means term-frequency times
+"TF" means term-frequency while "TF-IDF" means term-frequency times
inverse document-frequency. This is a common term weighting scheme in
information retrieval, that has also found good use in document
classification.

-The goal of using tf-idf instead of the raw frequencies of occurrence
+The goal of using TF-IDF instead of the raw frequencies of occurrence
of a token in a given document is to scale down the impact of tokens
that occur very frequently in a given corpus and that are hence
empirically less informative than features that occur in a small
fraction of the training corpus.

-The formula that is used to compute the tf-idf for a term `t` of a
+The formula that is used to compute the TF-IDF for a term `t` of a
document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
where `n` is the total number of documents in the document set and
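As a quick sanity check on these formulas (toy values, plain Julia, not part of the diff; the smoothed variant matches the `idf` computation in `_fit` below):

```julia
# idf(t) = log(n / df(t)) + 1          with smooth_idf = false
# idf(t) = log((1 + n) / (1 + df(t))) + 1  with smooth_idf = true
n  = 4          # total number of documents (made-up value)
df = [4, 1]     # df(t) for two terms: one ubiquitous, one rare

idf_raw    = log.(n ./ df) .+ 1               # ≈ [1.0, 2.39]
idf_smooth = log.((1 + n) ./ (1 .+ df)) .+ 1  # ≈ [1.0, 1.92]
# the ubiquitous term is down-weighted relative to the rare one
```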
@@ -59,7 +62,7 @@
function limit_features(doc_term_matrix::DocumentTermMatrix,
                        high::Int,
                        low::Int)
-    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=1))
+    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))

    # build mask to restrict terms
    mask = trues(length(doc_freqs))
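The `dims=1` to `dims=2` change reflects the flipped matrix orientation this PR adopts (terms as rows, documents as columns), so a term's document frequency is now a row-wise reduction. A minimal sketch of the idea with made-up 0/1 data (the real function builds `mask` from `trues(...)` incrementally):

```julia
using SparseArrays

# 3 terms x 4 documents, mirroring the flipped orientation
dtm = sparse([1 0 1 1;    # term 1: in 3 documents
              0 1 0 0;    # term 2: in 1 document
              1 1 1 1])   # term 3: in all 4 documents

doc_freqs = vec(sum(dtm, dims=2))   # row sums: [3, 1, 4]

# keep terms whose document frequency lies in [low, high]
high, low = 3, 1
mask = (doc_freqs .<= high) .& (doc_freqs .>= low)
kept = dtm[mask, :]   # rows (terms) are restricted, columns (docs) kept
```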
@@ -72,43 +75,78 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,

    new_terms = doc_term_matrix.terms[mask]

-    return (doc_term_matrix.dtm[:, mask], new_terms)
+    return (doc_term_matrix.dtm[mask, :], new_terms)
end

-_convert_bag_of_words(X::Dict{NGram, Int}) =
+_convert_bag_of_words(X::Dict{NGram, Int}) =
    Dict(join(k, " ") => v for (k, v) in X)

-build_corpus(X::Vector{Dict{NGram, Int}}) =
+build_corpus(X::Vector{Dict{NGram, Int}}) =
    build_corpus(_convert_bag_of_words.(X))
-build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} =
+build_corpus(X::Vector{Dict{S, Int}}) where {S <: AbstractString} =
    Corpus(NGramDocument.(X))
build_corpus(X) = Corpus(TokenDocument.(X))

-MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
+# based on https://github.com/zgornel/StringAnalysis.jl/blob/master/src/dtm.jl
+# and https://github.com/JuliaText/TextAnalysis.jl/blob/master/src/dtm.jl
+build_dtm(docs::Corpus) = build_dtm(docs, sort(collect(keys(lexicon(docs)))))
+function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
+    # we are flipping the orientation of this matrix,
+    # so we get the `columnindices` from the TextAnalysis API
+    row_indices = TextAnalysis.columnindices(terms)
+
+    m = length(terms) # terms are rows
+    n = length(docs)  # docs are columns
+
+    rows = Vector{Int}(undef, 0)    # terms
+    columns = Vector{Int}(undef, 0) # docs
+    values = Vector{Int}(undef, 0)
+    for i in eachindex(docs.documents)
+        doc = docs.documents[i]
+        ngs = ngrams(doc)
+        for ngram in keys(ngs)
+            j = get(row_indices, ngram, 0)
+            v = ngs[ngram]
+            if j != 0
+                push!(columns, i)
+                push!(rows, j)
+                push!(values, v)
+            end
+        end
+    end
+    if length(rows) > 0
+        dtm = sparse(rows, columns, values, m, n)
+    else
+        dtm = spzeros(Int, m, n)
+    end
+    DocumentTermMatrix(dtm, terms, row_indices)
+end
+
+MMI.fit(transformer::TfidfTransformer, verbosity::Int, X) =
    _fit(transformer, verbosity, build_corpus(X))

function _fit(transformer::TfidfTransformer, verbosity::Int, X::Corpus)
-    transformer.max_doc_freq < transformer.min_doc_freq &&
+    transformer.max_doc_freq < transformer.min_doc_freq &&
        error("Max doc frequency cannot be less than Min doc frequency!")

    # process corpus vocab
    update_lexicon!(X)
-    m = DocumentTermMatrix(X)
-    n = size(m.dtm, 1)
+    dtm_matrix = build_dtm(X)
+    n = size(dtm_matrix.dtm, 2) # docs are columns

    # calculate min and max doc freq limits
    if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
        high = round(Int, transformer.max_doc_freq * n)
        low = round(Int, transformer.min_doc_freq * n)
-        new_dtm, vocab = limit_features(m, high, low)
+        new_dtm, vocab = limit_features(dtm_matrix, high, low)
    else
-        new_dtm = m.dtm
-        vocab = m.terms
+        new_dtm = dtm_matrix.dtm
+        vocab = dtm_matrix.terms
    end

    # calculate IDF
    smooth_idf = Int(transformer.smooth_idf)
-    documents_containing_term = vec(sum(new_dtm .> 0, dims=1)) .+ smooth_idf
+    documents_containing_term = vec(sum(new_dtm .> 0, dims=2)) .+ smooth_idf
    idf = log.((n + smooth_idf) ./ documents_containing_term) .+ 1

    # prepare result
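The COO-triplet pattern in `build_dtm` is standard: collect `(row, column, value)` triplets, then let `sparse(rows, columns, values, m, n)` assemble the matrix in one shot. A self-contained toy version of the same pattern, without the TextAnalysis types (all data here is hypothetical):

```julia
using SparseArrays

terms = ["cat", "dog"]
docs  = [Dict("cat" => 2), Dict("cat" => 1, "dog" => 3)]
row_index = Dict(t => i for (i, t) in enumerate(terms))

rows, cols, vals = Int[], Int[], Int[]
for (j, doc) in enumerate(docs), (term, count) in doc
    i = get(row_index, term, 0)
    i == 0 && continue          # skip out-of-vocabulary terms
    push!(rows, i); push!(cols, j); push!(vals, count)
end

dtm = sparse(rows, cols, vals, length(terms), length(docs))
# 2x2 terms-by-docs matrix:
#   [2  1
#    0  3]
```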
@@ -120,41 +158,41 @@ end

function build_tfidf!(dtm::SparseMatrixCSC{T},
                      tfidf::SparseMatrixCSC{F},
-                      idf_vector::Vector{F}) where {T<:Real,F<:AbstractFloat}
-
+                      idf_vector::Vector{F}) where {T <: Real, F <: AbstractFloat}
    rows = rowvals(dtm)
    dtmvals = nonzeros(dtm)
    tfidfvals = nonzeros(tfidf)
    @assert size(dtmvals) == size(tfidfvals)

-    p = size(dtm, 2)
+    p, n = size(dtm)

    # TF tells us what proportion of a document is defined by a term
-    words_in_documents = F.(sum(dtm, dims=2))
+    words_in_documents = F.(sum(dtm, dims=1))
    oneval = one(F)

-    for i = 1:p
+    for i = 1:n
        for j in nzrange(dtm, i)
            row = rows[j]
-            tfidfvals[j] = dtmvals[j] / max(words_in_documents[row], oneval) * idf_vector[i]
+            tfidfvals[j] = dtmvals[j] / max(words_in_documents[i], oneval) * idf_vector[row]
        end
    end

    return tfidf
end

-MMI.transform(transformer::TfidfTransformer,
-              result::TfidfTransformerResult, v) =
-    _transform(transformer, result, build_corpus(v))
+MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v) =
+    _transform(transformer, result, build_corpus(v))

-function _transform(::TfidfTransformer,
+function _transform(::TfidfTransformer,
                    result::TfidfTransformerResult,
                    v::Corpus)
-    m = DocumentTermMatrix(v, result.vocab)
-    tfidf = similar(m.dtm, eltype(result.idf_vector))
-    build_tfidf!(m.dtm, tfidf, result.idf_vector)
+    dtm_matrix = build_dtm(v, result.vocab)
+    tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
+    build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)

-    return tfidf
+    # here we return the `adjoint` of our sparse matrix to conform to
+    # the `n x p` dimensions throughout MLJ
+    return adjoint(tfidf)
end

# for returning user-friendly form of the learned parameters:
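A note on the traversal in `build_tfidf!`: because `SparseMatrixCSC` is compressed by column and documents are now columns, `nzrange(dtm, i)` walks exactly the stored term counts of document `i`; `words_in_documents[i]` (a column sum) normalizes the TF, and `idf_vector[row]` looks up the term's IDF. A standalone sketch of that pattern with toy values:

```julia
using SparseArrays

dtm = sparse([2 1; 0 3])           # 2 terms x 2 docs
idf = [1.0, 2.0]                   # one IDF value per term (row)
doc_totals = vec(sum(dtm, dims=1)) # words per document: [2, 4]

tfidf = similar(dtm, Float64)      # same sparsity pattern, Float64 values
rows = rowvals(dtm)
vals = nonzeros(dtm)
out  = nonzeros(tfidf)
for i in 1:size(dtm, 2)            # i = document (column) index
    for j in nzrange(dtm, i)       # stored entries of column i
        term = rows[j]             # row index = term index
        out[j] = vals[j] / max(doc_totals[i], 1.0) * idf[term]
    end
end
# Matrix(tfidf) == [1.0 0.25; 0.0 1.5]
```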
@@ -189,4 +227,4 @@ MMI.metadata_model(TfidfTransformer,
    path = "MLJText.TfidfTransformer"
)

-end # module
+end # module
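Finally, a hypothetical end-to-end usage sketch (not part of this PR; it assumes the transformer accepts pre-tokenized documents, per the `build_corpus(X) = Corpus(TokenDocument.(X))` fallback, and that the standard MLJ machine workflow applies):

```julia
using MLJ, MLJText

docs = [["the", "cat", "sat", "on", "the", "mat"],
        ["the", "dog", "barked"]]

mach = machine(TfidfTransformer(), docs)
fit!(mach)

X = MLJ.transform(mach, docs)
# thanks to the `adjoint` in `_transform`, X is n_documents x n_terms
@assert size(X, 1) == length(docs)
```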