Skip to content

Commit 4f07e38

Browse files
committed
immigration of scitype() overloading from ScientificTypes 2.3.1
1 parent 65ccc6b commit 4f07e38

File tree

4 files changed

+97
-10
lines changed

4 files changed

+97
-10
lines changed

src/MLJText.jl

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
11
module MLJText
22

3+
import MLJModelInterface
4+
using ScientificTypesBase
5+
import ScientificTypes: DefaultConvention
6+
import CorpusLoaders
7+
using SparseArrays
8+
using TextAnalysis
9+
10+
const MMI = MLJModelInterface
11+
const STB = ScientificTypesBase
12+
const CL = CorpusLoaders
13+
14+
const PKG = "MLJText" # substitute model-providing package name
15+
16+
include("scitypes.jl")
317
include("tfidf_transformer.jl")
418

519
export TfidfTransformer
620

7-
end # module
21+
end # module

src/scitypes.jl

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# NOTE(review): `STB` is also bound to `ScientificTypesBase` in src/MLJText.jl,
# before this file is included; this second definition looks redundant —
# confirm before removing.
const STB = ScientificTypesBase
2+
3+
# aliases not exported:
4+
const PlainNGram{N} = NTuple{N,<:AbstractString}
5+
const TaggedNGram{N} = NTuple{N,<:CL.TaggedWord}
6+
7+
# HACK: the `type2scitype` methods below map element types to scitypes by hand;
# simplify once ScientificTypes #155 is resolved.
8+
9+
# Map a Julia *type* (not a value) to a scitype.
# Fallback: defer to `STB.Scitype` under the default convention.
type2scitype(T::Type) = STB.Scitype(T, DefaultConvention())
10+
# Vector types: recurse on the element type `T`, so vectors of vectors resolve
# through this same method.
type2scitype(::Type{<:AbstractVector{T}}) where T =
11+
AbstractVector{type2scitype(T)}
12+
# Tuple types: an `N`-tuple type with element type `T` maps to an `N`-tuple of
# `type2scitype(T)`. Dispatches on the tuple *type*, like the other methods.
type2scitype(::Type{<:NTuple{N,T}}) where {N,T} = NTuple{N,type2scitype(T)}
13+
14+
# Scitype rules under the default convention for CorpusLoaders objects and
# bag-of-words dictionaries.
# A single tagged word is annotated text.
STB.scitype(::CL.TaggedWord, ::DefaultConvention) = Annotated{Textual}
15+
# A `CL.Document` whose type parameter is a vector type with element type `T`:
# the element scitype is derived from the type `T` alone, via `type2scitype`.
STB.scitype(::CL.Document{<:AbstractVector{T}}, ::DefaultConvention) where T =
16+
Annotated{AbstractVector{type2scitype(T)}}
17+
# Dictionaries with integer values, dispatched on the key type: string keys,
# tagged-word keys, and keys typed as a union of the two.
STB.scitype(::AbstractDict{<:AbstractString,<:Integer},
18+
::DefaultConvention) = Multiset{Textual}
19+
STB.scitype(::AbstractDict{<:CL.TaggedWord,<:Integer},
20+
::DefaultConvention) = Multiset{Annotated{Textual}}
21+
STB.scitype(::AbstractDict{<:Union{CL.TaggedWord,AbstractString},<:Integer},
22+
::DefaultConvention) =
23+
Multiset{Union{Textual,Annotated{Textual}}}
24+
# Dictionaries keyed on n-grams (tuples of strings or of tagged words). The
# `{N}` methods carry a fixed tuple length `N` into the result; the methods
# without `N` report `NTuple{<:Any,...}` instead.
# NOTE(review): unlike the other `STB.scitype` methods in this file, these four
# take no `::DefaultConvention` argument and place no constraint on the value
# type — confirm this is intentional.
STB.scitype(::AbstractDict{<:PlainNGram{N}}) where N =
25+
Multiset{NTuple{N,Textual}}
26+
STB.scitype(::AbstractDict{<:TaggedNGram{N}}) where N =
27+
Multiset{NTuple{N,Annotated{Textual}}}
28+
STB.scitype(::AbstractDict{<:PlainNGram}) =
29+
Multiset{NTuple{<:Any,Textual}}
30+
STB.scitype(::AbstractDict{<:TaggedNGram}) =
31+
Multiset{NTuple{<:Any,Annotated{Textual}}}
32+
33+
# Type-level rule: any subtype of `CL.TaggedWord` has scitype
# `Annotated{Textual}` under the default convention. The `type2scitype`
# fallback calls `STB.Scitype`, so it picks this rule up for tagged-word types.
STB.Scitype(::Type{<:CL.TaggedWord}, ::DefaultConvention) =
34+
Annotated{Textual}

src/tfidf_transformer.jl

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
import MLJModelInterface
2-
import ScientificTypesBase
3-
using SparseArrays, TextAnalysis
4-
5-
const PKG = "MLJText" # substitute model-providing package name
6-
const MMI = MLJModelInterface
7-
const STB = ScientificTypesBase
8-
91
"""
102
TfidfTransformer()
113
@@ -222,4 +214,4 @@ MMI.metadata_model(TfidfTransformer,
222214
output_scitype = AbstractMatrix{STB.Continuous},
223215
docstring = "Build TF-IDF matrix from raw documents",
224216
path = "MLJText.TfidfTransformer"
225-
)
217+
)

test/scitypes.jl

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
@testset "text analysis" begin
2+
tagged_word = CorpusLoaders.PosTaggedWord("NN", "wheelbarrow")
3+
tagged_word2 = CorpusLoaders.PosTaggedWord("NN", "soil")
4+
@test scitype(tagged_word) == Annotated{Textual}
5+
bag_of_words = Dict("cat"=>1, "dog"=>3)
6+
@test scitype(bag_of_words) == Multiset{Textual}
7+
bag_of_tagged_words = Dict(tagged_word => 5)
8+
@test scitype(bag_of_tagged_words) == Multiset{Annotated{Textual}}
9+
@test scitype(Document("My Document", "kadsfkj")) == Unknown
10+
@test scitype(Document([tagged_word, tagged_word2])) ==
11+
Annotated{AbstractVector{Annotated{Textual}}}
12+
@test scitype(Document("My Other Doc", [tagged_word, tagged_word2])) ==
13+
Annotated{AbstractVector{Annotated{Textual}}}
14+
nested_tokens = [["dog", "cat"], ["bird", "cat"]]
15+
@test scitype(Document("Essay Number 1", nested_tokens)) ==
16+
Annotated{AbstractVector{AbstractVector{Textual}}}
17+
18+
@test scitype(Dict(("cat", "in") => 3)) == Multiset{Tuple{Textual,Textual}}
19+
bag_of_words = Dict("cat in" => 1,
20+
"the hat" => 1,
21+
"the" => 2,
22+
"cat" => 1,
23+
"hat" => 1,
24+
"in the" => 1,
25+
"in" => 1,
26+
"the cat" => 1)
27+
bag_of_ngrams =
28+
Dict(Tuple(String.(split(k))) => v for (k, v) in bag_of_words)
29+
# Dict{Tuple{String, Vararg{String, N} where N}, Int64} with 8 entries:
30+
# ("cat",) => 1
31+
# ("cat", "in") => 1
32+
# ("in",) => 1
33+
# ("the", "hat") => 1
34+
# ("the",) => 2
35+
# ("hat",) => 1
36+
# ("in", "the") => 1
37+
# ("the", "cat") => 1
38+
@test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Textual}}
39+
40+
@test scitype(Dict((tagged_word, tagged_word2) => 3)) ==
41+
Multiset{Tuple{Annotated{Textual},Annotated{Textual}}}
42+
bag_of_ngrams = Dict((tagged_word, tagged_word2) => 3,
43+
(tagged_word,) => 7)
44+
@test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Annotated{Textual}}}
45+
46+
end
47+

0 commit comments

Comments
 (0)