@@ -26,6 +26,11 @@ const IM_SEP: &str = "<|im_sep|>";
26
26
/// Stateless factory type: it has no fields, and its associated functions
/// (such as `gpt2`) build the `Encoding` values for the named tokenizer
/// vocabularies, returning `EncodingFactoryError` on failure.
#[ derive( Clone , Debug , Copy ) ]
27
27
pub struct EncodingFactory { }
28
28
impl EncodingFactory {
29
+ // The pattern in the original GPT-2 release is:
30
+ // r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
31
+ // This is equivalent, but executes faster:
32
+ const LEGACY_SPLITTER_REGEX : & str = r"'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s" ;
33
+
29
34
pub fn gpt2 ( ) -> Result < Encoding , EncodingFactoryError > {
30
35
// todo!
31
36
// vocab_bpe_file: sha256 = 1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
@@ -45,7 +50,7 @@ impl EncodingFactory {
45
50
special_tokens. shrink_to_fit ( ) ;
46
51
Encoding :: new (
47
52
"r50k_base" ,
48
- r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ,
53
+ EncodingFactory :: LEGACY_SPLITTER_REGEX ,
49
54
mergeable_ranks,
50
55
special_tokens,
51
56
Some ( 50257 ) ,
@@ -64,7 +69,7 @@ impl EncodingFactory {
64
69
special_tokens. shrink_to_fit ( ) ;
65
70
Encoding :: new (
66
71
"p50k_base" ,
67
- r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ,
72
+ EncodingFactory :: LEGACY_SPLITTER_REGEX ,
68
73
mergeable_ranks,
69
74
special_tokens,
70
75
Some ( 50281 ) ,
@@ -107,9 +112,12 @@ impl EncodingFactory {
107
112
. map_err ( |_| EncodingFactoryError :: FailedToLoadEncoding ) ?;
108
113
let mut special_tokens: HashMap < String , Rank > = special_tokens. iter ( ) . cloned ( ) . collect ( ) ;
109
114
special_tokens. shrink_to_fit ( ) ;
115
+ // Use the faster pattern from upstream tiktoken; see https://github.com/openai/tiktoken/pull/258/files#r1487668172
116
+ // const PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
117
+ const PATTERN : & str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s" ;
110
118
Encoding :: new (
111
119
"cl100k_base" ,
112
- r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" ,
120
+ PATTERN ,
113
121
mergeable_ranks,
114
122
special_tokens,
115
123
None ,
0 commit comments