diff --git a/.gitignore b/.gitignore
index 8c960ec..3f02ca7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+Manifest.toml
diff --git a/Project.toml b/Project.toml
index a81d92f..526c3ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,14 +5,18 @@ version = "0.5.6"
 [deps]
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [compat]
 DataDeps = "0.6.5, 0.7"
-julia = "1"
-HTML_Entities= "1"
+HTML_Entities = "1"
 StrTables = "1"
+julia = "1, 1.1"
+JSON = "0.21.1"
+InternedStrings = "0.7.0"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/README.md b/README.md
index 148f666..b394a81 100644
--- a/README.md
+++ b/README.md
@@ -294,42 +294,50 @@ julia> tokenize("hi__hello")
  "__"
  "hihello"
 ```
-# Statistical Tokenizer 
+# Statistical Tokenizer
 
-**Sentencepiece Unigram Encoder** is basically the Sentencepiece processor's re-implementation in julia. It can used vocab file generated by sentencepiece library containing both vocab and log probability.
+ - **Sentencepiece Unigram Encoder** is essentially a re-implementation of the SentencePiece processor in Julia. It can use the vocab file generated by the sentencepiece library, which contains both the vocabulary and the log probabilities.
+ For more details about the implementation, refer to the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
 
-For more detail about implementation refer the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
+ - **GPT2 Tokenizer** is a subword tokenizer that uses byte-level Byte Pair Encoding to split unknown words into known subwords present in its pretrained vocabulary.
 
 **Note** :
 - SentencePiece escapes the whitespace with a meta symbol "▁" (U+2581).
+- GPT2Tokenizer treats the whitespace before a word as part of the word and escapes it with the meta symbol "Ġ" (U+0120).
 
-### Pretrained 
+### Pretrained
 
-Wordtokenizer provides pretrained vocab file of Albert (both version-1 and version-2)
+WordTokenizers provides pretrained vocab files for ALBERT (both version 1 and version 2) and GPT2. You can initialize the tokenizers with the `load` function.
 
 ```julia
 julia> subtypes(PretrainedTokenizer)
 2-element Array{Any,1}:
  ALBERT_V1
  ALBERT_V2
+ GPT2
 
-julia> tokenizerfiles(ALBERT_V1)
+julia> tokenizer_files(ALBERT_V1)
 4-element Array{String,1}:
  "albert_base_v1_30k-clean.vocab"
  "albert_large_v1_30k-clean.vocab"
- "albert_xlarge_v1_30k-clean.vocab" 
+ "albert_xlarge_v1_30k-clean.vocab"
  "albert_xxlarge_v1_30k-clean.vocab"
+
+julia> tokenizer_files(GPT2)
+2-element Array{String,1}:
+ "GPT2/encoder.json"
+ "GPT2/vocab.bpe"
 ```
 `DataDeps` will handle all the downloading part for us.
 You can also create an issue or PR for other pretrained models or directly load by providing path in `load` function
 
 ```julia
-julia> spm = load(Albert_Version1) #loading Default Albert-base vocab in Sentencepiece
+julia> spm = load(ALBERT_V1) #loading Default Albert-base vocab in Sentencepiece
 WordTokenizers.SentencePieceModel(Dict("▁shots"=>(-11.2373, 7281),"▁ordered"=>(-9.84973, 1906),"dev"=>(-12.0915, 14439),"▁silv"=>(-12.6564, 21065),"▁doubtful"=>(-12.7799, 22569),"▁without"=>(-8.34227, 367),"▁pol"=>(-10.7694, 4828),"chem"=>(-12.3713, 17661),"▁1947,"=>(-11.7544, 11199),"▁disrespect"=>(-13.13, 26682)…), 2)
 
-julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the julia language")
+julia> tk = tokenize(spm, "i love the julia language") #or tk = spm("i love the julia language")
 4-element Array{String,1}:
  "▁i"
  "▁love"
@@ -337,7 +345,7 @@ julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the
  "▁julia"
  "▁language"
 
-julia> subword = tokenizer(spm, "unfriendly")
+julia> subword = tokenize(spm, "unfriendly")
 2-element Array{String,1}:
  "▁un"
  "friendly"
@@ -359,8 +367,8 @@ julia> para = spm("Julia is a high-level, high-performance dynamic language for
  "▁dynamic"
  "▁language"
  "▁for"
- "▁technical" 
- "▁computing" 
+ "▁technical"
+ "▁computing"
 ```
 Indices is usually used for deep learning models.
@@ -382,13 +390,13 @@ julia> ids_from_tokens(spm, tk)
   5424
    817
 
 #we can also get sentences back from tokens
-julia> sentence_from_tokens(tk)
+julia> sentence_from_tokens(spm, tk)
 "i love the julia language"
 
-julia> sentence_from_token(subword)
+julia> sentence_from_tokens(spm, subword)
 "unfriendly"
 
-julia> sentence_from_tokens(para)
+julia> sentence_from_tokens(spm, para)
 "Julia is a high-level, high-performance dynamic language for technical computing"
 ```
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index e25eb73..70b46aa 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -4,7 +4,7 @@ module WordTokenizers
 using HTML_Entities
 using StrTables
 using Unicode
-using DataDeps
+using DataDeps, JSON, InternedStrings
 
 abstract type PretrainedTokenizer end
 
@@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize,
        set_tokenizer, set_sentence_splitter,
        rev_tokenize, rev_detokenize,
        toktok_tokenize
-export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens
+
+export ALBERT_V1, ALBERT_V2, GPT2
+export load, tokenize, sentence_from_tokens, ids_from_tokens
 export PretrainedTokenizer, tokenizer_files
 
 include("words/fast.jl")
@@ -33,6 +35,7 @@ include("set_method_api.jl")
 include("split_api.jl")
 
 include("statistical/unigram.jl")
+include("statistical/gpt2tokenizer.jl")
 
 const pretrained = Dict{DataType, Vector{String}}()
 function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer
@@ -47,4 +50,11 @@ function __init__()
     init_vocab_datadeps()
 end
 
+load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1)
+load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2)
+load(::Val{:GPT2}) = load_gpt2(GPT2)
+
+load(::Type{T}) where T<:PretrainedTokenizer = load(Val(Symbol(T)))
+
+
 end # module
diff --git a/src/statistical/Vocab_DataDeps.jl b/src/statistical/Vocab_DataDeps.jl
index d935ba7..ad09dd2 100644
--- a/src/statistical/Vocab_DataDeps.jl
+++ b/src/statistical/Vocab_DataDeps.jl
@@ -1,5 +1,6 @@
 abstract type ALBERT_V1 <: PretrainedTokenizer end
 abstract type ALBERT_V2 <: PretrainedTokenizer end
+abstract type GPT2 <: PretrainedTokenizer end
 
 const vectors_albertversion1 = [
     ("albert_base_v1_30k-clean.vocab",
@@ -40,6 +41,8 @@ const vectors_albertversion2 = [
     "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab")
 ]
 
+const vectors_gpt2 = ["encoder.json", "vocab.bpe"]
+
 function init_vocab_datadeps()
     for (depname, description, sha, link) in vectors_albertversion1
         register(DataDep(depname,
@@ -70,5 +73,17 @@ function init_vocab_datadeps()
                         ))
         append!(tokenizer_files(ALBERT_V2), ["$depname"])
     end
-end
+
+    register(DataDep("GPT2",
+             """
+             Pretrained GPT2 vocabulary and merges files by OpenAI.
+             Website: https://openai.com/blog/better-language-models/
+             Author: Radford et al.
+             Licence: MIT
+             All GPT2 models are trained on the same size vocabulary.
+             """,
+             ["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
+             "05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46"))
+
+    append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
+end
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
new file mode 100644
index 0000000..6732311
--- /dev/null
+++ b/src/statistical/gpt2tokenizer.jl
@@ -0,0 +1,218 @@
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+end
+Structure to hold the pretrained vocabulary map and merge rules for GPT2.
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+
+    function GPT2Tokenizer(::Type{T}; pat=r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") where T<:PretrainedTokenizer
+
+        vocab_file = @datadep_str tokenizer_files(T)[1]
+        bfile = @datadep_str tokenizer_files(T)[2]
+
+        vocab = Dict{String, Any}()
+        rank = Dict{Pair{String, String}, Int}()
+        cache = Dict{String, Tuple}()
+
+        vocab = JSON.parsefile(vocab_file)
+
+        open(bfile) do f
+            for (i, line) ∈ enumerate(eachline(f))
+                if i==1
+                    identity
+                else
+                    pair = Pair(split(line," ")...)
+                    rank[pair] = i-1
+                end
+            end
+        end
+        new(vocab, rank, cache, pat)
+    end
+end
+
+"""
+load_gpt2(ty::Type{T}) where T<:PretrainedTokenizer
+Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps`.
+# Example
+```julia-repl
+julia> tokenizer = load(GPT2)
+
+```
+"""
+function load_gpt2(::Type{T}) where T<:PretrainedTokenizer
+    GPT2Tokenizer(T)
+end
+
+"""
+Returns a dictionary mapping UTF-8 byte values to the corresponding Unicode strings used for byte-level BPE.
+"""
+function bytes_to_unicode()
+    bs = [33:255...]
+    cs = bs[:]
+    n=0
+    for b in 0:255
+        if b ∉ bs
+            append!(bs, b)
+            append!(cs, 256+n)
+            n+=1
+        end
+    end
+    cs = [Char(n) for n in cs]
+    Dict(zip(bs,cs))
+end
+
+toStrTuple(x::Vector{String})=toStrTuple(join(x))
+function toStrTuple(x::AbstractString)
+    fs = intern.(split(chop(x), ""))
+    push!(fs, intern(x[end]*""))
+    filter!((x)-> x != "", fs)
+    Tuple(fs)
+end
+
+"""
+get_pairs(word::NTuple{})
+Returns the set of adjacent symbol pairs in a word, where the word is given as a tuple of strings.
+""" +function get_pairs(word::NTuple{}) + pairs = Set{Pair{}}() + prev_char = word[1] + for char in word[2:end] + push!(pairs, Pair(prev_char, char)) + prev_char = char + end + pairs +end + +lowestpair(pairs::Set{Pair{}},tokenizer::GPT2Tokenizer) = lowestpair(collect(pairs), tokenizer::GPT2Tokenizer) +lowestpair(pairs::Vector{Pair{}}, tokenizer::GPT2Tokenizer) = argmin( + sizehint!(Dict( + map(pairs) do p + p=>get(tokenizer.rank, p, typemax(Int)) + end), + length(pairs)) + ) + + +function bpe(token::String, tokenizer::GPT2Tokenizer) + + haskey(tokenizer.cache, token) && return tokenizer.cache[token] + word = toStrTuple(token) + pairs = get_pairs(word) + isempty(pairs) && return token + + while true + pair = lowestpair(pairs, tokenizer) + !haskey(tokenizer.rank, pair) && break + first, second = pair + new_word=Vector{String}() + i=1 + + while i <= length(word) + + try + j = findnext(isequal(first), word, i) + append!(new_word, word[i:j-1]) + i=j + catch + append!(new_word,word[i:end]) + break + end + + if word[i]==first && i<=length(word)-1 && word[i+1]==second + push!(new_word, first*second) + i+=2 + else + push!(new_word, word[i]) + i+=1 + end + end + new_word = Tuple(new_word) + word = new_word + + if length(word)==1 + break + else + pairs = get_pairs(word) + end + end + tokenizer.cache[token] = word + word +end + +""" +tokenize(tokenizer::GPT2Tokenizer, text::String) +Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because +of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be +manually added after the tokenization. +GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a word. + +# Example +```julia-repl +julia> tokens = tokenize(tokenizer, "Hi! How you doin") +6-element Array{String,1}: + "Hi" + "!" + "ĠHow" + "Ġyou" + "Ġdo" + "in" +``` +""" +function tokenize(tokenizer::GPT2Tokenizer, text::String) + mapping = bytes_to_unicode() + tokens=Vector{String}() + matches = map(eachmatch(tokenizer.pat, text)) do m + m.match + end + for token in matches + token = join([mapping[Int(b)] for b in token]) + append!(tokens, [string(bpe_token) for bpe_token in bpe(token, tokenizer)]) + end + tokens +end + +""" +ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) +Returns respective ids of tokens from pretrained vocabulary map + +# Example +```julia-repl +julia> tokens = tokenize("Hi! How you doin", tokenizer) +6-element Array{String,1}: + "Hi" + "!" 
+ "ĠHow" + "Ġyou" + "Ġdo" + "in" + +julia> ids_from_tokens(tokenizer, tokens) +6-element Array{Int64,1}: + 17250 + 0 + 1374 + 345 + 466 + 259 +``` +""" +function ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) + map(tokens) do x + last(get(tokenizer.vocab, x, 0)) + end +end + +function sentence_from_tokens(tokenizer::GPT2Tokenizer, tokens::Array{String,1}) + sen = join(tokens) + sen = replace(sen, "Ġ" => " ") + sen = strip(sen) + return sen +end diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl index 2901b1d..dd4df0d 100644 --- a/src/statistical/unigram.jl +++ b/src/statistical/unigram.jl @@ -3,7 +3,7 @@ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} unk_id::Int end -structure, To hold unknown token index and map of vocabulary to log probability and index +structure, To hold unknown token index and map of vocabulary to log probability and index """ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} @@ -11,25 +11,25 @@ struct SentencePieceModel end """ - load(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer + load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer use to initialize the `SentencePieceModel` by loading the file from `DataDeps` # Example ```julia-repl julia> spm = load(ALBERT_V1) ``` """ -function load(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer +function load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer filepath = @datadep_str tokenizer_files(ty)[filenum] name = tokenizer_files(ty)[filenum] filepath = "$filepath/$name" - load(filepath, unk_token=unk_token) + load_sp(filepath, unk_token=unk_token) end """ - load(path; unk_token="") + load_sp(path; unk_token="") use to initialize the SentencePieceModel by providing `vocab filepath` -""" -function load(path; unk_token="") +""" +function load_sp(path; unk_token="") vocab_path = readlines(path) vocabnlogp = split.(vocab_path, "\t") vocab_map = Dict(tok=>(parse(Float64, logp), index) for (index, (tok, logp)) in enumerate(vocabnlogp)) @@ -37,13 +37,13 @@ function load(path; unk_token="") unk_id = vocab_map[unk_token][2] else throw(DomainError(unk_token, "Unknown token is not in the vocabulary")) - end + end spm = SentencePieceModel(vocab_map, unk_id) return spm end """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -51,9 +51,9 @@ struct Nodes en::Int end Utility structure, To hold the results of the `forward pass` (the forward Viterbi lattice) -hold the token token string, score, vocabulary index, start and end character position +hold the token token string, score, vocabulary index, start and end character position """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -90,10 +90,10 @@ julia> node = WordTokenizers.decode_forward(spm, "I love julia language") WordTokenizers.Nodes("gua", -23.776f0, 15259, 17, 19) WordTokenizers.Nodes("ag", -34.1531f0, 3303, 19, 20) WordTokenizers.Nodes("language", -11.1965f0, 7021, 14, 21) -``` +``` """ function decode_forward(sp::SentencePieceModel, text::String) - results = Array{Nodes, 1}(undef, lastindex(text)) + results = Array{Nodes, 1}(undef, lastindex(text)) scores = fill(-Inf, lastindex(text)) scores[1] = 0 for char_end in eachindex(text) @@ -103,7 +103,7 @@ function decode_forward(sp::SentencePieceModel, text::String) if haskey(sp.vocab_map, subtoken) subtokenid = sp.vocab_map[subtoken][2] local_score = scores[char_start] + sp.vocab_map[subtoken][1] - if 
+                if local_score > scores[char_end]
                     results[char_end] = Nodes(SubString(text, char_start:char_end), local_score, subtokenid, char_start, char_end)
                     scores[char_end] = local_score
                 end
@@ -141,7 +141,7 @@ julia> WordTokenizers.decode_backward(spm, node, text)
 function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::AbstractString)
     next_nodes = nodes[end]
     best_seq = Nodes[]
-    
+
     while next_nodes.start > 1
         node_value = next_nodes
         next_nodes = nodes[prevind(text, node_value.start)]
@@ -152,11 +152,11 @@ function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::Ab
 end
 
 """
-    tokenizer(sp::SentencePieceModel,text::AbstractString)
+    tokenize(sp::SentencePieceModel,text::AbstractString)
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`
 ouput tokenize tokens as Array{String,1}
 """
-function tokenizer(sp::SentencePieceModel, text::AbstractString)
+function tokenize(sp::SentencePieceModel, text::AbstractString)
     text = replace(text, " " => "▁")
     if text[1] != '▁'
         text = "▁" * text
@@ -166,7 +166,7 @@ function tokenize(sp::SentencePieceModel, text::AbstractString)
     tokens = reverse(tokens)
     tks = [node.text for node in tokens]
     return tks
-    
+
 end
 
 """
@@ -174,26 +174,26 @@ end
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`.
 """
 function (sp::SentencePieceModel)(text::AbstractString)
-    tokenizer(sp, text)
+    tokenize(sp, text)
 end
 
 """
     ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its indices
-""" 
-function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) 
+"""
+function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     map(tk) do x
         last(get(spm.vocab_map, x, spm.unk_id))
     end
 end
 
 """
-    sentence_from_tokens(tk::Array{String,1})
+    sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its sentences
 """
-function sentence_from_tokens(tk::Array{String,1})
+function sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     sen = join(tk)
     sen = replace(sen, "▁" => " ")
     sen = strip(sen)
-    return sen 
+    return sen
 end
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
new file mode 100644
index 0000000..195daae
--- /dev/null
+++ b/test/gpt2_tokenizer.jl
@@ -0,0 +1,42 @@
+using WordTokenizers
+using Test
+
+gpt2_tokenizer = load(GPT2)
+
+@testset "Pretrained" begin
+    @test typeof(gpt2_tokenizer) == WordTokenizers.GPT2Tokenizer
+    @test typeof(gpt2_tokenizer.vocab) == Dict{String, Any}
+    @test typeof(gpt2_tokenizer.rank) == Dict{Pair{String,String}, Int}
+    @test typeof(gpt2_tokenizer.cache) == Dict{String, Tuple}
+    @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}}
+    @test length(WordTokenizers.pretrained[GPT2]) == 2
+end
+
+@testset "Tokenizer and helper function" begin
+    @test gpt2_tokenizer.vocab["Hi"] == 17250
+    @test tokenize(gpt2_tokenizer, "I love julia language") == ["I",
+                                                                "Ġlove",
+                                                                "Ġj",
+                                                                "ulia",
+                                                                "Ġlanguage"]
+    tokens = tokenize(gpt2_tokenizer, "I love julia language")
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [40, 1842, 474, 43640, 3303]
+    @test sentence_from_tokens(gpt2_tokenizer, tokens) == "I love julia language"
+
+    tokens = tokenize(gpt2_tokenizer, "A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.")
+    @test tokens == ["A", "Ġcensus", "Ġt", "aker", "Ġonce",
+                     "Ġtried", "Ġto", "Ġtest", "Ġme", ".",
+                     "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith",
+                     "Ġsome", "Ġfav", "a", "Ġbeans", "Ġand",
+                     "Ġa", "Ġnice", "ĠCh", "iant", "i", "."]
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063,
+                                                      465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014,
+                                                      72, 13]
+
+    text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!"
+    tokens = tokenize(gpt2_tokenizer, text)
+    @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe",
+                     "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave",
+                     "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"]
+    @test sentence_from_tokens(gpt2_tokenizer, tokens) == text
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 10bb818..588dfad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,7 +8,8 @@ files = ["simple",
          "tweet_tokenize",
          "reversible_tok",
          "toktok",
-         "sp_unigram"
+         "sp_unigram",
+         "gpt2_tokenizer"
          ]
 
 @testset "$file" for file in files
diff --git a/test/sp_unigram.jl b/test/sp_unigram.jl
index f85686e..416ca1e 100644
--- a/test/sp_unigram.jl
+++ b/test/sp_unigram.jl
@@ -16,12 +16,12 @@ end
 end
 @testset "Tokenizers and helper function" begin
     @test spm.vocab_map["now"][2] == 1388
-    @test tokenizer(spm, "I love julia language") == ["▁", 
-                                                    "I", 
-                                                    "▁love", 
-                                                    "▁julia", 
+    @test tokenize(spm, "I love julia language") == ["▁",
+                                                    "I",
+                                                    "▁love",
+                                                    "▁julia",
                                                     "▁language"]
-    tks = tokenizer(spm, "i love julia language")
+    tks = tokenize(spm, "i love julia language")
     @test ids_from_tokens(spm, tks) == [32, 340, 5424, 817]
-    @test sentence_from_tokens(tks) == "i love julia language"
+    @test sentence_from_tokens(spm, tks) == "i love julia language"
 end
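
---

For anyone trying out this branch, the sketch below strings the new public API together end to end. It only uses calls introduced or renamed in this diff (`load`, `tokenize`, `ids_from_tokens`, `sentence_from_tokens`); the outputs in the comments are copied from the test expectations in `test/gpt2_tokenizer.jl` and the README examples above, so they are illustrative rather than a spec.

```julia
using WordTokenizers

# Byte-level BPE path added by this PR: DataDeps fetches GPT2/encoder.json
# and GPT2/vocab.bpe on first use, then everything runs locally.
gpt2   = load(GPT2)
tokens = tokenize(gpt2, "I love julia language")   # ["I", "Ġlove", "Ġj", "ulia", "Ġlanguage"]
ids    = ids_from_tokens(gpt2, tokens)             # [40, 1842, 474, 43640, 3303]
sentence_from_tokens(gpt2, tokens)                 # "I love julia language"

# The SentencePiece unigram path keeps the same surface after the rename
# from `tokenizer` to `tokenize`:
spm       = load(ALBERT_V1)
sp_tokens = tokenize(spm, "i love the julia language")
sp_ids    = ids_from_tokens(spm, sp_tokens)
sentence_from_tokens(spm, sp_tokens)               # "i love the julia language"
```

Both tokenizer types sit behind the same `PretrainedTokenizer` abstraction, so downstream code can switch between the unigram and BPE models by changing only the argument to `load`.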