Migrate scitype() overloads from ScientificTypes 2.3.1

ablaom · ablaom · commit 4f07e38c27b9 · 2021-10-11T15:18:41.000+13:00
diff --git a/src/MLJText.jl b/src/MLJText.jl
@@ -1,7 +1,21 @@
 module MLJText
 
+import MLJModelInterface
+using ScientificTypesBase
+import ScientificTypes: DefaultConvention
+import CorpusLoaders
+using SparseArrays
+using TextAnalysis
+
+const MMI = MLJModelInterface    # alias; see `MMI.metadata_model` in tfidf_transformer.jl
+const STB = ScientificTypesBase  # alias; see `STB.scitype`, `STB.Scitype`, `STB.Continuous`
+const CL = CorpusLoaders         # alias; see `CL.TaggedWord`, `CL.Document` in scitypes.jl
+
+const PKG = "MLJText"          # name of this model-providing package
+
+include("scitypes.jl")         # scitype overloads; relies on the aliases defined above
 include("tfidf_transformer.jl")
 
 export TfidfTransformer
 
-end # module
+end # module
diff --git a/src/scitypes.jl b/src/scitypes.jl
@@ -0,0 +1,34 @@
+const STB = ScientificTypesBase  # same alias MLJText.jl defines before including this file
+
+# N-gram key types for the dictionary overloads in this file (aliases not exported):
+const PlainNGram{N}  = NTuple{N,<:AbstractString}  # N-tuple of strings
+const TaggedNGram{N} = NTuple{N,<:CL.TaggedWord}   # N-tuple of tagged words
+
+# type2scitype: type -> scitype under the default convention, recursing into vector
+# eltypes and homogeneous tuples. Less of a hack once ScientificTypes #155 is sorted.
+type2scitype(T::Type) = STB.Scitype(T, DefaultConvention())
+type2scitype(::Type{<:AbstractVector{T}}) where T = AbstractVector{type2scitype(T)}
+type2scitype(::Type{Tuple{T,Vararg{T,N}}}) where {T,N} =  # N+1 elements; skips Tuple{}
+    NTuple{N+1,type2scitype(T)}
+# Default-convention scitypes for tagged words, documents, and bags of words:
+STB.scitype(::CL.TaggedWord, ::DefaultConvention) = Annotated{Textual}
+STB.scitype(::CL.Document{<:AbstractVector{T}}, ::DefaultConvention) where T =
+    Annotated{AbstractVector{type2scitype(T)}}  # element scitype derived from eltype `T`
+STB.scitype(::AbstractDict{<:AbstractString,<:Integer},
+           ::DefaultConvention) = Multiset{Textual}
+STB.scitype(::AbstractDict{<:CL.TaggedWord,<:Integer},
+           ::DefaultConvention) = Multiset{Annotated{Textual}}
+STB.scitype(::AbstractDict{<:Union{CL.TaggedWord,AbstractString},<:Integer},
+           ::DefaultConvention) = Multiset{Union{Textual,Annotated{Textual}}}
+# N-gram bags also dispatch on the convention; keys of mixed length give `<:Any`:
+STB.scitype(::AbstractDict{<:PlainNGram{N}}, ::DefaultConvention) where N =
+    Multiset{NTuple{N,Textual}}
+STB.scitype(::AbstractDict{<:TaggedNGram{N}}, ::DefaultConvention) where N =
+    Multiset{NTuple{N,Annotated{Textual}}}
+STB.scitype(::AbstractDict{<:PlainNGram}, ::DefaultConvention) =
+    Multiset{NTuple{<:Any,Textual}}
+STB.scitype(::AbstractDict{<:TaggedNGram}, ::DefaultConvention) =
+    Multiset{NTuple{<:Any,Annotated{Textual}}}
+
+# Type-level overload: any `CL.TaggedWord` subtype maps to `Annotated{Textual}`.
+STB.Scitype(::Type{<:CL.TaggedWord}, ::DefaultConvention) = Annotated{Textual}
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
@@ -1,11 +1,3 @@
-import MLJModelInterface
-import ScientificTypesBase
-using SparseArrays, TextAnalysis
-
-const PKG = "MLJText"          # substitute model-providing package name
-const MMI = MLJModelInterface
-const STB = ScientificTypesBase
-
 """
     TfidfTransformer()
 
@@ -222,4 +214,4 @@ MMI.metadata_model(TfidfTransformer,
                output_scitype = AbstractMatrix{STB.Continuous},
                docstring = "Build TF-IDF matrix from raw documents",
                path = "MLJText.TfidfTransformer"
-               )
+               )
diff --git a/test/scitypes.jl b/test/scitypes.jl
@@ -0,0 +1,47 @@
+@testset "text analysis" begin
+    # Fixtures: two POS-tagged nouns shared by all tagged-word cases.
+    noun_a = CorpusLoaders.PosTaggedWord("NN", "wheelbarrow")
+    noun_b = CorpusLoaders.PosTaggedWord("NN", "soil")
+    tagged_doc = Annotated{AbstractVector{Annotated{Textual}}}
+
+    # A single tagged word, then bags keyed by plain and by tagged words.
+    @test scitype(noun_a) == Annotated{Textual}
+    @test scitype(Dict("cat"=>1, "dog"=>3)) == Multiset{Textual}
+    @test scitype(Dict(noun_a => 5)) == Multiset{Annotated{Textual}}
+
+    # Documents: two plain strings give Unknown; token vectors give Annotated, with
+    # or without the leading string argument; nested token vectors keep their nesting.
+    @test scitype(Document("My Document", "kadsfkj")) == Unknown
+    @test scitype(Document([noun_a, noun_b])) == tagged_doc
+    @test scitype(Document("My Other Doc", [noun_a, noun_b])) == tagged_doc
+    essay = Document("Essay Number 1", [["dog", "cat"], ["bird", "cat"]])
+    @test scitype(essay) == Annotated{AbstractVector{AbstractVector{Textual}}}
+
+    # Plain n-grams: a fixed-length key type keeps its length in the scitype.
+    @test scitype(Dict(("cat", "in") => 3)) == Multiset{Tuple{Textual,Textual}}
+
+    # Splitting each phrase on whitespace yields keys of length 1 and 2, so the
+    # key type of `ngram_counts` is a tuple of strings of unspecified length.
+    phrase_counts = Dict(
+        "cat in" => 1,
+        "the hat" => 1,
+        "the" => 2,
+        "cat" => 1,
+        "hat" => 1,
+        "in the" => 1,
+        "in" => 1,
+        "the cat" => 1,
+    )
+    ngram_counts = Dict(
+        Tuple(String.(split(phrase))) => n for (phrase, n) in phrase_counts
+    )
+    @test scitype(ngram_counts) == Multiset{NTuple{<:Any,Textual}}
+
+    # Tagged n-grams: the same two cases with tagged words as tuple elements.
+    @test scitype(Dict((noun_a, noun_b) => 3)) ==
+        Multiset{Tuple{Annotated{Textual},Annotated{Textual}}}
+    tagged_counts = Dict((noun_a, noun_b) => 3, (noun_a,) => 7)
+    @test scitype(tagged_counts) == Multiset{NTuple{<:Any,Annotated{Textual}}}
+
+end
+