@@ -40,26 +40,13 @@ MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
40
40
max_doc_freq:: Float64 = 0.98
41
41
min_doc_freq:: Float64 = 0.02
42
42
smooth_idf:: Bool = true
43
- min_ngram_range:: Int = 1
44
- max_ngram_range:: Int = 1
45
43
end
46
44
47
45
"""
    TfidfTransformerResult

Immutable container for the state learned when fitting a `TfidfTransformer`.

# Fields
- `vocab::Vector{String}`: the retained vocabulary terms; `_transform` passes this
  to `DocumentTermMatrix` so that new documents are projected onto the same terms.
- `idf_vector::Vector{Float64}`: one value per vocabulary entry
  (presumably the inverse-document-frequency weight of each term in `vocab`,
  in the same order — confirm against `_fit`).
"""
struct TfidfTransformerResult
    vocab::Vector{String}
    idf_vector::Vector{Float64}
end
51
49
52
- _build_corpus (transformer:: TfidfTransformer , docs:: Vector{String} ) = _build_corpus (transformer, StringDocument .(docs))
53
-
54
- function _build_corpus (transformer:: TfidfTransformer , docs:: Vector{StringDocument{String}} )
55
- corpus = Corpus (
56
- NGramDocument .(
57
- ngrams .(docs, transformer. min_ngram_range, transformer. max_ngram_range)
58
- )
59
- )
60
- return corpus
61
- end
62
-
63
50
function limit_features (doc_term_matrix:: DocumentTermMatrix , high:: Int , low:: Int )
64
51
doc_freqs = vec (sum (doc_term_matrix. dtm, dims= 1 ))
65
52
@@ -77,7 +64,7 @@ function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int
77
64
return (doc_term_matrix. dtm[:, mask], new_terms)
78
65
end
79
66
80
- MMI. fit (transformer:: TfidfTransformer , verbosity:: Int , X) = _fit (transformer, verbosity, _build_corpus (transformer, X ))
67
"""
    MMI.fit(transformer::TfidfTransformer, verbosity::Int, X)

Entry point for fitting a `TfidfTransformer`.

Each element of `X` is wrapped in an `NGramDocument`, the documents are gathered
into a `Corpus`, and the work is delegated to `_fit`, whose result is returned
unchanged.

# Arguments
- `transformer`: the model whose hyper-parameters are forwarded to `_fit`.
- `verbosity`: forwarded to `_fit` as-is.
- `X`: a collection whose elements `NGramDocument` can be constructed from
  (the declared input scitype is a vector of `Multiset{Textual}`).
"""
function MMI.fit(transformer::TfidfTransformer, verbosity::Int, X)
    corpus = Corpus(NGramDocument.(X))
    return _fit(transformer, verbosity, corpus)
end
81
68
82
69
function _fit (transformer:: TfidfTransformer , verbosity:: Int , X:: Corpus )
83
70
transformer. max_doc_freq < transformer. min_doc_freq && error (" Max doc frequency cannot be less than Min doc frequency!" )
@@ -131,7 +118,7 @@ function build_tfidf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}, idf_ve
131
118
return tfidf
132
119
end
133
120
134
- MMI. transform (transformer:: TfidfTransformer , result:: TfidfTransformerResult , v) = _transform (transformer, result, _build_corpus (transformer, v ))
121
"""
    MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v)

Entry point for transforming new documents with a fitted `TfidfTransformer`.

Each element of `v` is wrapped in an `NGramDocument`, the documents are gathered
into a `Corpus`, and the work is delegated to `_transform`, whose result is
returned unchanged.

# Arguments
- `transformer`: the model, forwarded to `_transform`.
- `result`: the fitted state (vocabulary and idf vector) produced by fitting.
- `v`: a collection whose elements `NGramDocument` can be constructed from.
"""
function MMI.transform(transformer::TfidfTransformer, result::TfidfTransformerResult, v)
    corpus = Corpus(NGramDocument.(v))
    return _transform(transformer, result, corpus)
end
135
122
136
123
function _transform (:: TfidfTransformer , result:: TfidfTransformerResult , v:: Corpus )
137
124
m = DocumentTermMatrix (v, result. vocab)
@@ -161,7 +148,7 @@ MMI.metadata_pkg(TfidfTransformer,
161
148
)
162
149
163
150
MMI. metadata_model (TfidfTransformer,
164
- input_scitype = AbstractVector{STB. Textual},
151
+ input_scitype = AbstractVector{STB. Multiset{STB . Textual} },
165
152
output_scitype = AbstractMatrix{STB. Continuous},# ie, a classifier
166
153
docstring = " Build TF-IDF matrix from raw documents" , # brief description
167
154
path = " MLJText.TfidfTransformer"
0 commit comments