Skip to content

Commit 995c7a9

Browse files
authored
use updated regexes from upstream (#19)
via https://github.com/openai/tiktoken/pull/258/files
1 parent a4b3165 commit 995c7a9

File tree

1 file changed

+11
-3
lines changed

1 file changed

+11
-3
lines changed

src/openai_public.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ const IM_SEP: &str = "<|im_sep|>";
2626
#[derive(Clone, Debug, Copy)]
2727
pub struct EncodingFactory {}
2828
impl EncodingFactory {
29+
// The pattern in the original GPT-2 release is:
30+
// r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
31+
// This is equivalent, but executes faster:
32+
const LEGACY_SPLITTER_REGEX: &str = r"'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s";
33+
2934
pub fn gpt2() -> Result<Encoding, EncodingFactoryError> {
3035
// todo!
3136
// vocab_bpe_file: sha256 = 1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
@@ -45,7 +50,7 @@ impl EncodingFactory {
4550
special_tokens.shrink_to_fit();
4651
Encoding::new(
4752
"r50k_base",
48-
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+",
53+
EncodingFactory::LEGACY_SPLITTER_REGEX,
4954
mergeable_ranks,
5055
special_tokens,
5156
Some(50257),
@@ -64,7 +69,7 @@ impl EncodingFactory {
6469
special_tokens.shrink_to_fit();
6570
Encoding::new(
6671
"p50k_base",
67-
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+",
72+
EncodingFactory::LEGACY_SPLITTER_REGEX,
6873
mergeable_ranks,
6974
special_tokens,
7075
Some(50281),
@@ -107,9 +112,12 @@ impl EncodingFactory {
107112
.map_err(|_| EncodingFactoryError::FailedToLoadEncoding)?;
108113
let mut special_tokens: HashMap<String, Rank> = special_tokens.iter().cloned().collect();
109114
special_tokens.shrink_to_fit();
115+
// Use the faster pattern from upstream tiktoken (https://github.com/openai/tiktoken/pull/258/files#r1487668172); the previous pattern is kept below for reference.
116+
// const PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
117+
const PATTERN: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s";
110118
Encoding::new(
111119
"cl100k_base",
112-
r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
120+
PATTERN,
113121
mergeable_ranks,
114122
special_tokens,
115123
None,

0 commit comments

Comments
 (0)