@@ -42,6 +42,8 @@ MMI.@mlj_model mutable struct TfidfTransformer <: MLJModelInterface.Unsupervised
42
42
smooth_idf:: Bool = true
43
43
end
44
44
45
# An n-gram is a tuple of `N` strings. Binding `N` on the right-hand side makes
# `NGram{N}` explicitly select n-grams of length `N`, while a bare `NGram`
# still covers n-grams of every length.
const NGram{N} = NTuple{N,<:AbstractString}
46
+
45
47
struct TfidfTransformerResult
46
48
vocab:: Vector{String}
47
49
idf_vector:: Vector{Float64}
@@ -64,7 +66,10 @@ function limit_features(doc_term_matrix::DocumentTermMatrix, high::Int, low::Int
64
66
return (doc_term_matrix. dtm[:, mask], new_terms)
65
67
end
66
68
67
- build_corpus (X:: Vector{Dict{String, Int64}} ) = Corpus (NGramDocument .(X))
69
# Flatten one n-gram bag of words into a string-keyed dictionary: each tuple
# key is joined into a single space-separated string; the counts are unchanged.
# `Dict` is invariant in its key type, so the signature uses `<:NGram` in order
# to accept concretely typed bags such as `Dict{Tuple{String,String},Int}`.
function _convert_bag_of_words(X::Dict{<:NGram,Int})
    return Dict(join(k, " ") => v for (k, v) in X)
end

# Corpus from n-gram bags of words: convert every bag to string keys, then call
# `build_corpus` again on the converted vector. `<:Dict{<:NGram,Int}` is needed
# for the same invariance reason as above; the count type stays `Int` so the
# converted bags are still `Dict{String,Int}`.
function build_corpus(X::Vector{<:Dict{<:NGram,Int}})
    return build_corpus(_convert_bag_of_words.(X))
end
72
# Corpus from string-keyed bags of words: every dictionary in `X` is wrapped in
# an `NGramDocument`, and the resulting documents are gathered into a `Corpus`.
function build_corpus(X::Vector{Dict{S,Int}}) where {S<:AbstractString}
    documents = NGramDocument.(X)
    return Corpus(documents)
end
68
73
# Fallback for any other input: broadcast `TokenDocument` over `X` and gather
# the results into a `Corpus`.
function build_corpus(X)
    documents = TokenDocument.(X)
    return Corpus(documents)
end
69
74
70
75
# `MMI.fit` method for `TfidfTransformer`: build a corpus from the input `X`
# and pass it, together with the model and the verbosity level, to `_fit`.
function MMI.fit(transformer::TfidfTransformer, verbosity::Int, X)
    corpus = build_corpus(X)
    return _fit(transformer, verbosity, corpus)
end
@@ -151,7 +156,9 @@ MMI.metadata_pkg(TfidfTransformer,
151
156
)
152
157
153
158
MMI. metadata_model (TfidfTransformer,
154
- input_scitype = Union{AbstractVector{STB. Multiset{STB. Textual}}, AbstractVector{AbstractVector{STB. Textual}}},
159
+ input_scitype = Union{
160
+ AbstractVector{<: AbstractVector{STB.Textual} }, AbstractVector{<: STB.Multiset{<:NGram} }, AbstractVector{<: STB.Multiset{STB.Textual} }
161
+ },
155
162
output_scitype = AbstractMatrix{STB. Continuous},# ie, a classifier
156
163
docstring = " Build TF-IDF matrix from raw documents" , # brief description
157
164
path = " MLJText.TfidfTransformer"
0 commit comments