diff --git a/.gitignore b/.gitignore
index 8c960ec..3f02ca7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+Manifest.toml
diff --git a/Project.toml b/Project.toml
index a81d92f..526c3ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,14 +5,18 @@ version = "0.5.6"
 [deps]
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [compat]
 DataDeps = "0.6.5, 0.7"
-julia = "1"
-HTML_Entities= "1"
+HTML_Entities = "1"
 StrTables = "1"
+julia = "1, 1.1"
+JSON = "0.21.1"
+InternedStrings = "0.7.0"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/README.md b/README.md
index 148f666..b394a81 100644
--- a/README.md
+++ b/README.md
@@ -294,42 +294,50 @@ julia> tokenize("hi__hello")
  "__"
  "hihello"
 ```
-# Statistical Tokenizer 
+# Statistical Tokenizer
 
-**Sentencepiece Unigram Encoder** is basically the Sentencepiece processor's re-implementation in julia. It can used vocab file generated by sentencepiece library containing both vocab and log probability.
+ - **Sentencepiece Unigram Encoder** is essentially a re-implementation of the SentencePiece processor in Julia. It can use the vocab file generated by the sentencepiece library, which contains both the vocabulary and the log probabilities.
+ For more details about the implementation, refer to the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
 
-For more detail about implementation refer the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
+ - **GPT2 Tokenizer** is a subword tokenizer that uses byte-level Byte Pair Encoding to split unknown words into known subwords present in its pretrained vocabulary.
 
 **Note** :
 - SentencePiece escapes the whitespace with a meta symbol "▁" (U+2581).
+- GPT2Tokenizer treats the whitespace before a word as part of the word and escapes it with the meta symbol "Ġ" (U+0120).
 
-### Pretrained 
+### Pretrained
 
-Wordtokenizer provides pretrained vocab file of Albert (both version-1 and version-2)
+WordTokenizers provides pretrained vocab files for ALBERT (both version 1 and version 2) and GPT2. You can initialize the tokenizers with the `load` function.
 
 ```julia
 julia> subtypes(PretrainedTokenizer)
 2-element Array{Any,1}:
  ALBERT_V1
  ALBERT_V2
+ GPT2
 
-julia> tokenizerfiles(ALBERT_V1)
+julia> tokenizer_files(ALBERT_V1)
 4-element Array{String,1}:
  "albert_base_v1_30k-clean.vocab"
  "albert_large_v1_30k-clean.vocab"
- "albert_xlarge_v1_30k-clean.vocab" 
+ "albert_xlarge_v1_30k-clean.vocab"
  "albert_xxlarge_v1_30k-clean.vocab"
+
+julia> tokenizer_files(GPT2)
+2-element Array{String,1}:
+ "GPT2/encoder.json"
+ "GPT2/vocab.bpe"
 ```
 `DataDeps` will handle all the downloading part for us.
 You can also create an issue or PR for other pretrained models or directly load by providing path in `load` function
 
 ```julia
-julia> spm = load(Albert_Version1) #loading Default Albert-base vocab in Sentencepiece
+julia> spm = load(ALBERT_V1) #loading Default Albert-base vocab in Sentencepiece
 WordTokenizers.SentencePieceModel(Dict("▁shots"=>(-11.2373, 7281),"▁ordered"=>(-9.84973, 1906),"dev"=>(-12.0915, 14439),"▁silv"=>(-12.6564, 21065),"▁doubtful"=>(-12.7799, 22569),"▁without"=>(-8.34227, 367),"▁pol"=>(-10.7694, 4828),"chem"=>(-12.3713, 17661),"▁1947,"=>(-11.7544, 11199),"▁disrespect"=>(-13.13, 26682)…), 2)
 
-julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the julia language")
+julia> tk = tokenize(spm, "i love the julia language") #or tk = spm("i love the julia language")
 4-element Array{String,1}:
  "▁i"
  "▁love"
@@ -337,7 +345,7 @@ julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the
  "▁julia"
  "▁language"
 
-julia> subword = tokenizer(spm, "unfriendly")
+julia> subword = tokenize(spm, "unfriendly")
 2-element Array{String,1}:
  "▁un"
  "friendly"
@@ -359,8 +367,8 @@ julia> para = spm("Julia is a high-level, high-performance dynamic language for
  "▁dynamic"
  "▁language"
  "▁for"
- "▁technical" 
- "▁computing" 
+ "▁technical"
+ "▁computing"
 ```
 Indices is usually used for deep learning models.
@@ -382,13 +390,13 @@ julia> ids_from_tokens(spm, tk)
   5424
    817
 
 #we can also get sentences back from tokens
-julia> sentence_from_tokens(tk)
+julia> sentence_from_tokens(spm, tk)
 "i love the julia language"
 
-julia> sentence_from_token(subword)
+julia> sentence_from_tokens(spm, subword)
 "unfriendly"
 
-julia> sentence_from_tokens(para)
+julia> sentence_from_tokens(spm, para)
 "Julia is a high-level, high-performance dynamic language for technical computing"
 ```
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index e25eb73..70b46aa 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -4,7 +4,7 @@ module WordTokenizers
 using HTML_Entities
 using StrTables
 using Unicode
-using DataDeps
+using DataDeps, JSON, InternedStrings
 
 abstract type PretrainedTokenizer end
 
@@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize,
        set_tokenizer, set_sentence_splitter,
        rev_tokenize, rev_detokenize,
        toktok_tokenize
-export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens
+
+export ALBERT_V1, ALBERT_V2, GPT2
+export load, tokenize, sentence_from_tokens, ids_from_tokens
 export PretrainedTokenizer, tokenizer_files
 
 include("words/fast.jl")
@@ -33,6 +35,7 @@ include("set_method_api.jl")
 include("split_api.jl")
 
 include("statistical/unigram.jl")
+include("statistical/gpt2tokenizer.jl")
 
 const pretrained = Dict{DataType, Vector{String}}()
 function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer
@@ -47,4 +50,11 @@ function __init__()
     init_vocab_datadeps()
 end
 
+load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1)
+load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2)
+load(::Val{:GPT2}) = load_gpt2(GPT2)
+
+load(::Type{T}) where T<:PretrainedTokenizer = load(Val(Symbol(T)))
+
+
 end # module
diff --git a/src/statistical/Vocab_DataDeps.jl b/src/statistical/Vocab_DataDeps.jl
index d935ba7..ad09dd2 100644
--- a/src/statistical/Vocab_DataDeps.jl
+++ b/src/statistical/Vocab_DataDeps.jl
@@ -1,5 +1,6 @@
 abstract type ALBERT_V1 <: PretrainedTokenizer end
 abstract type ALBERT_V2 <: PretrainedTokenizer end
+abstract type GPT2 <: PretrainedTokenizer end
 
 const vectors_albertversion1 = [
     ("albert_base_v1_30k-clean.vocab",
@@ -40,6 +41,8 @@ const vectors_albertversion2 = [
     "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab")
 ]
 
+const vectors_gpt2 = ["encoder.json", "vocab.bpe"]
+
 function init_vocab_datadeps()
     for (depname, description, sha, link) in vectors_albertversion1
         register(DataDep(depname,
@@ -70,5 +73,17 @@ function init_vocab_datadeps()
                         ))
         append!(tokenizer_files(ALBERT_V2), ["$depname"])
     end
-end
+
+    register(DataDep("GPT2",
+             """
+             Pretrained GPT2 vocabulary and merges files by OpenAI.
+             Website: https://openai.com/blog/better-language-models/
+             Author: Radford et al.
+             Licence: MIT
+             All GPT2 models are trained on the same size vocabulary.
+             """,
+             ["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
+             "05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46"))
+
+    append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
+end
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
new file mode 100644
index 0000000..6732311
--- /dev/null
+++ b/src/statistical/gpt2tokenizer.jl
@@ -0,0 +1,218 @@
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+end
+Structure to hold the pretrained vocabulary map and merge rules for GPT2.
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+
+    function GPT2Tokenizer(::Type{T}; pat=r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") where T<:PretrainedTokenizer
+
+        vocab_file = @datadep_str tokenizer_files(T)[1]
+        bfile = @datadep_str tokenizer_files(T)[2]
+
+        vocab = Dict{String, Any}()
+        rank = Dict{Pair{String, String}, Int}()
+        cache = Dict{String, Tuple}()
+
+        vocab = JSON.parsefile(vocab_file)
+
+        open(bfile) do f
+            for (i, line) ∈ enumerate(eachline(f))
+                if i==1
+                    identity
+                else
+                    pair = Pair(split(line," ")...)
+                    rank[pair] = i-1
+                end
+            end
+        end
+        new(vocab, rank, cache, pat)
+    end
+end
+
+"""
+load_gpt2(ty::Type{T}) where T<:PretrainedTokenizer
+Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps`.
+# Example
+```julia-repl
+julia> tokenizer = load(GPT2)
+
+```
+"""
+function load_gpt2(::Type{T}) where T<:PretrainedTokenizer
+    GPT2Tokenizer(T)
+end
+
+"""
+Returns a dictionary mapping UTF-8 byte values to the corresponding Unicode strings used for byte-level BPE.
+"""
+function bytes_to_unicode()
+    bs = [33:255...]
+    cs = bs[:]
+    n=0
+    for b in 0:255
+        if b ∉ bs
+            append!(bs, b)
+            append!(cs, 256+n)
+            n+=1
+        end
+    end
+    cs = [Char(n) for n in cs]
+    Dict(zip(bs,cs))
+end
+
+toStrTuple(x::Vector{String})=toStrTuple(join(x))
+function toStrTuple(x::AbstractString)
+    fs = intern.(split(chop(x), ""))
+    push!(fs, intern(x[end]*""))
+    filter!((x)-> x != "", fs)
+    Tuple(fs)
+end
+
+"""
+get_pairs(word::NTuple{})
+Returns the set of adjacent symbol pairs in a word, where the word is given as a tuple of strings.
+""" +function get_pairs(word::NTuple{}) + pairs = Set{Pair{}}() + prev_char = word[1] + for char in word[2:end] + push!(pairs, Pair(prev_char, char)) + prev_char = char + end + pairs +end + +lowestpair(pairs::Set{Pair{}},tokenizer::GPT2Tokenizer) = lowestpair(collect(pairs), tokenizer::GPT2Tokenizer) +lowestpair(pairs::Vector{Pair{}}, tokenizer::GPT2Tokenizer) = argmin( + sizehint!(Dict( + map(pairs) do p + p=>get(tokenizer.rank, p, typemax(Int)) + end), + length(pairs)) + ) + + +function bpe(token::String, tokenizer::GPT2Tokenizer) + + haskey(tokenizer.cache, token) && return tokenizer.cache[token] + word = toStrTuple(token) + pairs = get_pairs(word) + isempty(pairs) && return token + + while true + pair = lowestpair(pairs, tokenizer) + !haskey(tokenizer.rank, pair) && break + first, second = pair + new_word=Vector{String}() + i=1 + + while i <= length(word) + + try + j = findnext(isequal(first), word, i) + append!(new_word, word[i:j-1]) + i=j + catch + append!(new_word,word[i:end]) + break + end + + if word[i]==first && i<=length(word)-1 && word[i+1]==second + push!(new_word, first*second) + i+=2 + else + push!(new_word, word[i]) + i+=1 + end + end + new_word = Tuple(new_word) + word = new_word + + if length(word)==1 + break + else + pairs = get_pairs(word) + end + end + tokenizer.cache[token] = word + word +end + +""" +tokenize(tokenizer::GPT2Tokenizer, text::String) +Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because +of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be +manually added after the tokenization. +GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a word. + +# Example +```julia-repl +julia> tokens = tokenize(tokenizer, "Hi! How you doin") +6-element Array{String,1}: + "Hi" + "!" + "ĠHow" + "Ġyou" + "Ġdo" + "in" +``` +""" +function tokenize(tokenizer::GPT2Tokenizer, text::String) + mapping = bytes_to_unicode() + tokens=Vector{String}() + matches = map(eachmatch(tokenizer.pat, text)) do m + m.match + end + for token in matches + token = join([mapping[Int(b)] for b in token]) + append!(tokens, [string(bpe_token) for bpe_token in bpe(token, tokenizer)]) + end + tokens +end + +""" +ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) +Returns respective ids of tokens from pretrained vocabulary map + +# Example +```julia-repl +julia> tokens = tokenize("Hi! How you doin", tokenizer) +6-element Array{String,1}: + "Hi" + "!" 
+ "ĠHow" + "Ġyou" + "Ġdo" + "in" + +julia> ids_from_tokens(tokenizer, tokens) +6-element Array{Int64,1}: + 17250 + 0 + 1374 + 345 + 466 + 259 +``` +""" +function ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) + map(tokens) do x + last(get(tokenizer.vocab, x, 0)) + end +end + +function sentence_from_tokens(tokenizer::GPT2Tokenizer, tokens::Array{String,1}) + sen = join(tokens) + sen = replace(sen, "Ġ" => " ") + sen = strip(sen) + return sen +end diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl index 2901b1d..dd4df0d 100644 --- a/src/statistical/unigram.jl +++ b/src/statistical/unigram.jl @@ -3,7 +3,7 @@ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} unk_id::Int end -structure, To hold unknown token index and map of vocabulary to log probability and index +structure, To hold unknown token index and map of vocabulary to log probability and index """ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} @@ -11,25 +11,25 @@ struct SentencePieceModel end """ - load(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer + load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer use to initialize the `SentencePieceModel` by loading the file from `DataDeps` # Example ```julia-repl julia> spm = load(ALBERT_V1) ``` """ -function load(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer +function load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer filepath = @datadep_str tokenizer_files(ty)[filenum] name = tokenizer_files(ty)[filenum] filepath = "$filepath/$name" - load(filepath, unk_token=unk_token) + load_sp(filepath, unk_token=unk_token) end """ - load(path; unk_token="") + load_sp(path; unk_token="") use to initialize the SentencePieceModel by providing `vocab filepath` -""" -function load(path; unk_token="") +""" +function load_sp(path; unk_token="") vocab_path = readlines(path) vocabnlogp = split.(vocab_path, "\t") vocab_map = Dict(tok=>(parse(Float64, logp), index) for (index, (tok, logp)) in enumerate(vocabnlogp)) @@ -37,13 +37,13 @@ function load(path; unk_token="") unk_id = vocab_map[unk_token][2] else throw(DomainError(unk_token, "Unknown token is not in the vocabulary")) - end + end spm = SentencePieceModel(vocab_map, unk_id) return spm end """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -51,9 +51,9 @@ struct Nodes en::Int end Utility structure, To hold the results of the `forward pass` (the forward Viterbi lattice) -hold the token token string, score, vocabulary index, start and end character position +hold the token token string, score, vocabulary index, start and end character position """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -90,10 +90,10 @@ julia> node = WordTokenizers.decode_forward(spm, "I love julia language") WordTokenizers.Nodes("gua", -23.776f0, 15259, 17, 19) WordTokenizers.Nodes("ag", -34.1531f0, 3303, 19, 20) WordTokenizers.Nodes("language", -11.1965f0, 7021, 14, 21) -``` +``` """ function decode_forward(sp::SentencePieceModel, text::String) - results = Array{Nodes, 1}(undef, lastindex(text)) + results = Array{Nodes, 1}(undef, lastindex(text)) scores = fill(-Inf, lastindex(text)) scores[1] = 0 for char_end in eachindex(text) @@ -103,7 +103,7 @@ function decode_forward(sp::SentencePieceModel, text::String) if haskey(sp.vocab_map, subtoken) subtokenid = sp.vocab_map[subtoken][2] local_score = scores[char_start] + sp.vocab_map[subtoken][1] - if 
+                if local_score > scores[char_end]
                     results[char_end] = Nodes(SubString(text, char_start:char_end), local_score, subtokenid, char_start, char_end)
                     scores[char_end] = local_score
                 end
@@ -141,7 +141,7 @@ julia> WordTokenizers.decode_backward(spm, node, text)
 function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::AbstractString)
     next_nodes = nodes[end]
     best_seq = Nodes[]
-    
+
     while next_nodes.start > 1
         node_value = next_nodes
         next_nodes = nodes[prevind(text, node_value.start)]
@@ -152,11 +152,11 @@ function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::Ab
 end
 
 """
-    tokenizer(sp::SentencePieceModel,text::AbstractString)
+    tokenize(sp::SentencePieceModel,text::AbstractString)
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`
 ouput tokenize tokens as Array{String,1}
 """
-function tokenizer(sp::SentencePieceModel, text::AbstractString)
+function tokenize(sp::SentencePieceModel, text::AbstractString)
     text = replace(text, " " => "▁")
     if text[1] != '▁'
         text = "▁" * text
@@ -166,7 +166,7 @@ function tokenize(sp::SentencePieceModel, text::AbstractString)
     tokens = reverse(tokens)
     tks = [node.text for node in tokens]
     return tks
-    
+
 end
 
 """
@@ -174,26 +174,26 @@ end
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`.
 """
 function (sp::SentencePieceModel)(text::AbstractString)
-    tokenizer(sp, text)
+    tokenize(sp, text)
 end
 
 """
     ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its indices
-""" 
-function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) 
+"""
+function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     map(tk) do x
         last(get(spm.vocab_map, x, spm.unk_id))
     end
 end
 
 """
-    sentence_from_tokens(tk::Array{String,1})
+    sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its sentences
 """
-function sentence_from_tokens(tk::Array{String,1})
+function sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     sen = join(tk)
     sen = replace(sen, "▁" => " ")
     sen = strip(sen)
-    return sen 
+    return sen
 end
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
new file mode 100644
index 0000000..195daae
--- /dev/null
+++ b/test/gpt2_tokenizer.jl
@@ -0,0 +1,42 @@
+using WordTokenizers
+using Test
+
+gpt2_tokenizer = load(GPT2)
+
+@testset "Pretrained" begin
+    @test typeof(gpt2_tokenizer) == WordTokenizers.GPT2Tokenizer
+    @test typeof(gpt2_tokenizer.vocab) == Dict{String, Any}
+    @test typeof(gpt2_tokenizer.rank) == Dict{Pair{String,String}, Int}
+    @test typeof(gpt2_tokenizer.cache) == Dict{String, Tuple}
+    @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}}
+    @test length(WordTokenizers.pretrained[GPT2]) == 2
+end
+
+@testset "Tokenizer and helper function" begin
+    @test gpt2_tokenizer.vocab["Hi"] == 17250
+    @test tokenize(gpt2_tokenizer, "I love julia language") == ["I",
+                                                                "Ġlove",
+                                                                "Ġj",
+                                                                "ulia",
+                                                                "Ġlanguage"]
+    tokens = tokenize(gpt2_tokenizer, "I love julia language")
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [40, 1842, 474, 43640, 3303]
+    @test sentence_from_tokens(gpt2_tokenizer, tokens) == "I love julia language"
+
+    tokens = tokenize(gpt2_tokenizer, "A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.")
+    @test tokens == ["A", "Ġcensus", "Ġt", "aker", "Ġonce",
+                     "Ġtried", "Ġto", "Ġtest", "Ġme", ".",
+                     "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith",
+                     "Ġsome", "Ġfav", "a", "Ġbeans", "Ġand",
+                     "Ġa", "Ġnice", "ĠCh", "iant", "i", "."]
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063,
+                                                      465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014,
+                                                      72, 13]
+
+    text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!"
+    tokens = tokenize(gpt2_tokenizer, text)
+    @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe",
+                     "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave",
+                     "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"]
+    @test sentence_from_tokens(gpt2_tokenizer, tokens) == text
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 10bb818..588dfad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,7 +8,8 @@ files = ["simple",
          "tweet_tokenize",
          "reversible_tok",
          "toktok",
-         "sp_unigram"
+         "sp_unigram",
+         "gpt2_tokenizer"
          ]
 
 @testset "$file" for file in files
diff --git a/test/sp_unigram.jl b/test/sp_unigram.jl
index f85686e..416ca1e 100644
--- a/test/sp_unigram.jl
+++ b/test/sp_unigram.jl
@@ -16,12 +16,12 @@ end
 end
 @testset "Tokenizers and helper function" begin
     @test spm.vocab_map["now"][2] == 1388
-    @test tokenizer(spm, "I love julia language") == ["▁", 
-                                                    "I", 
-                                                    "▁love", 
-                                                    "▁julia", 
+    @test tokenize(spm, "I love julia language") == ["▁",
+                                                    "I",
+                                                    "▁love",
+                                                    "▁julia",
                                                     "▁language"]
-    tks = tokenizer(spm, "i love julia language")
+    tks = tokenize(spm, "i love julia language")
     @test ids_from_tokens(spm, tks) == [32, 340, 5424, 817]
-    @test sentence_from_tokens(tks) == "i love julia language"
+    @test sentence_from_tokens(spm, tks) == "i love julia language"
 end
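
---

For anyone trying out this branch, the sketch below strings the new public API together end to end. It only uses calls introduced or renamed in this diff (`load`, `tokenize`, `ids_from_tokens`, `sentence_from_tokens`); the outputs in the comments are copied from the test expectations in `test/gpt2_tokenizer.jl` and the README examples above, so they are illustrative rather than a spec.

```julia
using WordTokenizers

# Byte-level BPE path added by this PR: DataDeps fetches GPT2/encoder.json
# and GPT2/vocab.bpe on first use, then everything runs locally.
gpt2   = load(GPT2)
tokens = tokenize(gpt2, "I love julia language")   # ["I", "Ġlove", "Ġj", "ulia", "Ġlanguage"]
ids    = ids_from_tokens(gpt2, tokens)             # [40, 1842, 474, 43640, 3303]
sentence_from_tokens(gpt2, tokens)                 # "I love julia language"

# The SentencePiece unigram path keeps the same surface after the rename
# from `tokenizer` to `tokenize`:
spm       = load(ALBERT_V1)
sp_tokens = tokenize(spm, "i love the julia language")
sp_ids    = ids_from_tokens(spm, sp_tokens)
sentence_from_tokens(spm, sp_tokens)               # "i love the julia language"
```

Both tokenizer types sit behind the same `PretrainedTokenizer` abstraction, so downstream code can switch between the unigram and BPE models by changing only the argument to `load`.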