|
# Checks `scitype` on text-analysis objects: POS-tagged words, bags of words,
# bags of n-grams, and `Document`s built from strings or token collections.
@testset "text analysis" begin
    # Local shorthand for the scitype expected of a single POS-tagged word.
    TaggedText = Annotated{Textual}

    # Two tagged words reused by the document and n-gram checks below.
    noun1 = CorpusLoaders.PosTaggedWord("NN", "wheelbarrow")
    noun2 = CorpusLoaders.PosTaggedWord("NN", "soil")
    @test scitype(noun1) == TaggedText

    # Dictionaries mapping items to counts are treated as multisets of their keys.
    word_counts = Dict("cat" => 1, "dog" => 3)
    @test scitype(word_counts) == Multiset{Textual}
    tagged_word_counts = Dict(noun1 => 5)
    @test scitype(tagged_word_counts) == Multiset{TaggedText}

    # A document built from two plain strings is expected to be `Unknown`.
    @test scitype(Document("My Document", "kadsfkj")) == Unknown

    # Documents over a vector of tagged words, without and with a leading string.
    @test scitype(Document([noun1, noun2])) == Annotated{AbstractVector{TaggedText}}
    @test scitype(Document("My Other Doc", [noun1, noun2])) ==
        Annotated{AbstractVector{TaggedText}}

    # Document over a vector of vectors of strings.
    token_lists = [["dog", "cat"], ["bird", "cat"]]
    @test scitype(Document("Essay Number 1", token_lists)) ==
        Annotated{AbstractVector{AbstractVector{Textual}}}

    # Fixed-length n-grams: every key is a 2-tuple of strings.
    @test scitype(Dict(("cat", "in") => 3)) == Multiset{Tuple{Textual,Textual}}

    # Mixed-length n-grams: split each phrase on whitespace to obtain tuple keys
    # of length one or two.
    phrase_counts = Dict(
        "cat in" => 1,
        "the hat" => 1,
        "the" => 2,
        "cat" => 1,
        "hat" => 1,
        "in the" => 1,
        "in" => 1,
        "the cat" => 1,
    )
    ngram_counts = Dict(
        Tuple(String.(split(phrase))) => count for (phrase, count) in phrase_counts
    )
    # Resulting dictionary, as displayed at the REPL:
    # Dict{Tuple{String, Vararg{String, N} where N}, Int64} with 8 entries:
    #   ("cat",)       => 1
    #   ("cat", "in")  => 1
    #   ("in",)        => 1
    #   ("the", "hat") => 1
    #   ("the",)       => 2
    #   ("hat",)       => 1
    #   ("in", "the")  => 1
    #   ("the", "cat") => 1
    @test scitype(ngram_counts) == Multiset{NTuple{<:Any,Textual}}

    # The same two n-gram cases, with tagged words as the tuple elements.
    @test scitype(Dict((noun1, noun2) => 3)) == Multiset{Tuple{TaggedText,TaggedText}}
    tagged_ngram_counts = Dict((noun1, noun2) => 3, (noun1,) => 7)
    @test scitype(tagged_ngram_counts) == Multiset{NTuple{<:Any,TaggedText}}
end
| 47 | + |
0 commit comments