-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathVocab_DataDeps.jl
More file actions
89 lines (81 loc) · 4 KB
/
Vocab_DataDeps.jl
File metadata and controls
89 lines (81 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Marker types naming the pretrained tokenizer families handled in this file.
# `init_vocab_datadeps` passes each of them to `tokenizer_files` to pick the list that
# the family's vocabulary file names are appended to.
# The supertype `PretrainedTokenizer` is not defined in this part of the file.
abstract type ALBERT_V1 <: PretrainedTokenizer end
abstract type ALBERT_V2 <: PretrainedTokenizer end
abstract type GPT2 <: PretrainedTokenizer end
# Vocabulary files for the ALBERT version 1 models. Every entry is a
# `(file name, description, checksum, download URL)` tuple, in the order in which
# `init_vocab_datadeps` destructures it.
const vectors_albertversion1 = let
    # All four entries carry the same checksum string and live under the same URL root.
    checksum = "1de4ad94a1b98f5f5f2c75af0f52bc85714d67b8578aa8f7650521bb123335c0"
    url_root = "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/"
    # Descriptions are kept verbatim (spacing and punctuation included) because they are
    # interpolated into the message of the registered data dependency.
    variants = [
        "base" => "albert base version1 of size ~800kb download.",
        "large" => " albert large version1 of size ~800kb download.",
        "xlarge" => "albert xlarge version1 of size ~800kb download",
        "xxlarge" => "albert xxlarge version1 of size ~800kb download",
    ]
    map(variants) do (variant, description)
        filename = "albert_$(variant)_v1_30k-clean.vocab"
        return (filename, description, checksum, url_root * filename)
    end
end
# Vocabulary files for the ALBERT version 2 models. Every entry is a
# `(file name, description, checksum, download URL)` tuple, in the order in which
# `init_vocab_datadeps` destructures it.
const vectors_albertversion2 = let
    # All four entries carry the same checksum string and live under the same URL root.
    checksum = "1de4ad94a1b98f5f5f2c75af0f52bc85714d67b8578aa8f7650521bb123335c0"
    url_root = "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/"
    # Descriptions are kept verbatim (spacing included) because they are interpolated
    # into the message of the registered data dependency.
    variants = [
        "base" => "albert base version2 of size ~800kb download.",
        "large" => " albert large version2 of size ~800kb download.",
        "xlarge" => "albert xlarge version2 of size ~800kb download.",
        "xxlarge" => "albert xxlarge version2 of size ~800kb download.",
    ]
    map(variants) do (variant, description)
        filename = "albert_$(variant)_v2_30k-clean.vocab"
        return (filename, description, checksum, url_root * filename)
    end
end
# File names belonging to the "GPT2" data dependency. `init_vocab_datadeps` appends each
# name to a fixed base URL to build the download links and records it as "GPT2/<name>".
const vectors_gpt2 = String[
    "encoder.json",
    "vocab.bpe",
]
"""
    init_vocab_datadeps()

Register the pretrained vocabulary files as data dependencies and record their names in
the per-tokenizer file lists.

For every entry of `vectors_albertversion1` and `vectors_albertversion2` a `DataDep` is
registered under the entry's file name, and that name is added to
`tokenizer_files(ALBERT_V1)` or `tokenizer_files(ALBERT_V2)` respectively. One `DataDep`
named `"GPT2"` is registered for all files in `vectors_gpt2`, and `"GPT2/<file>"` is added
to `tokenizer_files(GPT2)` for each of them.

Returns whatever `append!` returns for the `tokenizer_files(GPT2)` collection.

NOTE(review): each call registers and appends again, so this is presumably meant to run
once (e.g. from the module's `__init__`) — confirm against the caller.
"""
function init_vocab_datadeps()
    # Both ALBERT versions are registered the same way; only the tokenizer type and the
    # table of files differ.
    albert_families = (
        (ALBERT_V1, vectors_albertversion1),
        (ALBERT_V2, vectors_albertversion2),
    )
    for (tokenizer, entries) in albert_families
        for (depname, description, sha, link) in entries
            register(DataDep(
                depname,
                """
                sentencepiece albert vocabulary file by google research .
                Website: https://github.com/google-research/albert
                Author: Google Research
                Licence: Apache License 2.0
                $description
                """,
                link,
                sha,
            ))
            # A single name is added, so `push!` avoids building a throwaway array.
            push!(tokenizer_files(tokenizer), depname)
        end
    end

    # NOTE(review): one checksum string is supplied for a list of URLs — confirm against
    # the DataDeps checksum rules that it is meant to cover all listed files together.
    register(DataDep(
        "GPT2",
        """
        Pretrained gpt2 vocabulary and merges file by Open AI.
        Website: https://openai.com/blog/better-language-models/
        Author: Radford et al
        Licence: MIT
        All GPT2 Models are trained on same size vocabulary.
        """,
        ["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
        "05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46",
    ))
    # GPT2 files are recorded relative to the "GPT2" dependency name.
    return append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
end