@@ -26,6 +26,11 @@ const IM_SEP: &str = "<|im_sep|>";
2626#[ derive( Clone , Debug , Copy ) ]
2727pub struct EncodingFactory { }
2828impl EncodingFactory {
29+ // The pattern in the original GPT-2 release is:
30+ // r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
31+ // This is equivalent, but executes faster:
32+ const LEGACY_SPLITTER_REGEX : & str = r"'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s" ;
33+
2934 pub fn gpt2 ( ) -> Result < Encoding , EncodingFactoryError > {
3035 // todo!
3136 // vocab_bpe_file: sha256 = 1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
@@ -45,7 +50,7 @@ impl EncodingFactory {
4550 special_tokens. shrink_to_fit ( ) ;
4651 Encoding :: new (
4752 "r50k_base" ,
48- r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ,
53+ EncodingFactory :: LEGACY_SPLITTER_REGEX ,
4954 mergeable_ranks,
5055 special_tokens,
5156 Some ( 50257 ) ,
@@ -64,7 +69,7 @@ impl EncodingFactory {
6469 special_tokens. shrink_to_fit ( ) ;
6570 Encoding :: new (
6671 "p50k_base" ,
67- r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ,
72+ EncodingFactory :: LEGACY_SPLITTER_REGEX ,
6873 mergeable_ranks,
6974 special_tokens,
7075 Some ( 50281 ) ,
@@ -107,9 +112,12 @@ impl EncodingFactory {
107112 . map_err ( |_| EncodingFactoryError :: FailedToLoadEncoding ) ?;
108113 let mut special_tokens: HashMap < String , Rank > = special_tokens. iter ( ) . cloned ( ) . collect ( ) ;
109114 special_tokens. shrink_to_fit ( ) ;
115+ // Use the faster, equivalent pattern from upstream tiktoken (https://github.com/openai/tiktoken/pull/258/files#r1487668172); the original pattern is kept below for reference:
116+ // const PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
117+ const PATTERN : & str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s" ;
110118 Encoding :: new (
111119 "cl100k_base" ,
112- r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" ,
120+ PATTERN ,
113121 mergeable_ranks,
114122 special_tokens,
115123 None ,
0 commit comments