66FIM_SUFFIX = "<|fim_suffix|>"
77ENDOFPROMPT = "<|endofprompt|>"
88
9+ # The pattern in the original GPT-2 release is:
10+ # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
11+ # This is equivalent, but executes faster:
12+ _legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++"""
13+
914
1015def gpt2 ():
1116 mergeable_ranks = data_gym_to_mergeable_bpe_ranks (
@@ -17,10 +22,7 @@ def gpt2():
1722 return {
1823 "name" : "gpt2" ,
1924 "explicit_n_vocab" : 50257 ,
20- # The pattern in the original GPT-2 release is:
21- # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22- # This is equivalent, but executes faster:
23- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
25+ "pat_str" : _legacy_splitter_regex ,
2426 "mergeable_ranks" : mergeable_ranks ,
2527 "special_tokens" : {ENDOFTEXT : 50256 },
2628 }
@@ -34,7 +36,7 @@ def r50k_base():
3436 return {
3537 "name" : "r50k_base" ,
3638 "explicit_n_vocab" : 50257 ,
37- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
39+ "pat_str" : _legacy_splitter_regex ,
3840 "mergeable_ranks" : mergeable_ranks ,
3941 "special_tokens" : {ENDOFTEXT : 50256 },
4042 }
@@ -48,7 +50,7 @@ def p50k_base():
4850 return {
4951 "name" : "p50k_base" ,
5052 "explicit_n_vocab" : 50281 ,
51- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
53+ "pat_str" : _legacy_splitter_regex ,
5254 "mergeable_ranks" : mergeable_ranks ,
5355 "special_tokens" : {ENDOFTEXT : 50256 },
5456 }
@@ -62,7 +64,7 @@ def p50k_edit():
6264 special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
6365 return {
6466 "name" : "p50k_edit" ,
65- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
67+ "pat_str" : _legacy_splitter_regex ,
6668 "mergeable_ranks" : mergeable_ranks ,
6769 "special_tokens" : special_tokens ,
6870 }
@@ -82,7 +84,7 @@ def cl100k_base():
8284 }
8385 return {
8486 "name" : "cl100k_base" ,
85- "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" ,
87+ "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""" ,
8688 "mergeable_ranks" : mergeable_ranks ,
8789 "special_tokens" : special_tokens ,
8890 }
0 commit comments