@@ -1,18 +1,44 @@
 import os
-from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
+from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe, read_file_cached
 
 ENDOFTEXT = "<|endoftext|>"
 FIM_PREFIX = "<|fim_prefix|>"
 FIM_MIDDLE = "<|fim_middle|>"
 FIM_SUFFIX = "<|fim_suffix|>"
 ENDOFPROMPT = "<|endofprompt|>"
 
-ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")
+ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", None)
+
+if "ENCODINGS_HOST" in os.environ:
+    ENCODINGS_HOST = os.environ["ENCODINGS_HOST"]
+    IS_HOSTING_ENCODINGS = True
+else:
+    ENCODINGS_HOST = "https://openaipublic.blob.core.windows.net"
+    IS_HOSTING_ENCODINGS = False
+
+VOCAB_BPE_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe"
+VOCAB_BPE_HASH = "1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5"
+ENCODER_JSON_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json"
+ENCODER_JSON_HASH = "196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783"
+R50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken"
+R50K_BASE_HASH = "306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930"
+P50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken"
+P50K_BASE_HASH = "94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069"
+CL100K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken"
+CL100K_BASE_HASH = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"
 
 
 def gpt2():
+    vocab_bpe_contents = read_file_cached(
+        VOCAB_BPE_FILE,
+        VOCAB_BPE_HASH,
+        IS_HOSTING_ENCODINGS,
+    ).decode()
+    encoder_json_contents = read_file_cached(
+        ENCODER_JSON_FILE,
+        ENCODER_JSON_HASH,
+        IS_HOSTING_ENCODINGS,
+    )
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-        vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
-        encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
-        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
-        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
+        vocab_bpe_contents=vocab_bpe_contents,
+        encoder_json_contents=encoder_json_contents,
     )
     return {
         "name": "gpt2",
@@ -29,10 +55,8 @@ def gpt2():
 
 
 def r50k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken",
-        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
-    )
+    contents = read_file_cached(R50K_BASE_FILE, R50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
@@ -43,10 +67,8 @@ def r50k_base():
 
 
 def p50k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
-        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
-    )
+    contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
@@ -57,10 +79,8 @@ def p50k_base():
 
 
 def p50k_edit():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
-        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
-    )
+    contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
         "name": "p50k_edit",
@@ -71,10 +91,8 @@ def p50k_edit():
 
 
 def cl100k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken",
-        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
-    )
+    contents = read_file_cached(CL100K_BASE_FILE, CL100K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     special_tokens = {
         ENDOFTEXT: 100257,
         FIM_PREFIX: 100258,
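For reference, a minimal usage sketch of the `ENCODINGS_HOST` override introduced in this diff, assuming a self-hosted mirror that serves the encoding files at the same paths as the default host; `https://tiktoken-mirror.internal` is a placeholder, not a real endpoint.

```python
# Sketch (assumed mirror URL): mirror the encoding files at the same paths as
# the default host, e.g. /encodings/cl100k_base.tiktoken and
# /gpt-2/encodings/main/vocab.bpe, then point tiktoken at that mirror.
import os

# Set before the registry module above is imported: the *_FILE constants are
# built from ENCODINGS_HOST at import time.
os.environ["ENCODINGS_HOST"] = "https://tiktoken-mirror.internal"

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # fetched from the mirror; the hashes above still apply
print(enc.encode("hello world"))
```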