@@ -384,8 +384,10 @@ public static Task<Tokenizer> CreateByModelNameAsync(
384
384
}
385
385
}
386
386
387
- private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" ;
388
- private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
387
+ // Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
388
+
389
// Regex pattern string for the cl100k_base encoding; it is the pattern compiled by Cl100kBaseRegex().
// Alternatives, tried left to right:
//   '(?i:[sdmt]|re|ve|ll)           apostrophe followed by s, d, m, t, re, ve or ll (case-insensitive)
//   (?>[^\r\n\p{L}\p{N}]?)\p{L}+    a run of letters, optionally led by one character that is not
//                                   CR, LF, a letter (\p{L}) or a number (\p{N})
//   \p{N}{1,3}                      one to three number characters
//    ?(?>[^\s\p{L}\p{N}]+)[\r\n]*   optional space, then a run of characters that are neither
//                                   whitespace, letters nor numbers, then any CR/LF characters
//   \s*[\r\n]                       optional whitespace ending in a single CR or LF
//   \s+(?!\S)                       whitespace that is not followed by a non-whitespace character
//   \s+                             any other whitespace run
// The (?>...) groups are atomic: once matched, the engine does not backtrack into them.
// NOTE(review): presumably they stand in for possessive quantifiers in the upstream pattern — confirm
// against the tiktoken source this file cites.
// The /*lang=regex*/ marker is only a comment; presumably a regex-highlighting hint for tooling.
+ private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" ;
390
// Regex pattern string for the p50k_base encoding; it is the pattern used by P50kBaseRegex().
// Alternatives, tried left to right:
//   '(?:[sdmt]|re|ve|ll)      apostrophe followed by s, d, m, t, re, ve or ll (case-sensitive here,
//                             since the group is (?: rather than (?i:)
//    ?\p{L}+                  optional space, then a run of letters
//    ?\p{N}+                  optional space, then a run of number characters
//    ?[^\s\p{L}\p{N}]+        optional space, then a run of characters that are neither whitespace,
//                             letters nor numbers
//   \s+(?!\S)                 whitespace that is not followed by a non-whitespace character
//   \s+                       any other whitespace run
// The /*lang=regex*/ marker is only a comment; presumably a regex-highlighting hint for tooling.
+ private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
389
391
390
392
// URL of the cl100k_base.tiktoken file on openaipublic.blob.core.windows.net.
// NOTE(review): where this constant is consumed is not visible in this hunk.
private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" ;
391
393
// URL of the p50k_base.tiktoken file on openaipublic.blob.core.windows.net.
// NOTE(review): the name says "P50"/"Ranks" while the sibling constant is Cl100kBaseVocabUrl and the
// matching pattern is P50kBaseRegexPattern — naming is inconsistent; a rename would touch callers
// outside this hunk, so it is only flagged here.
private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" ;
@@ -397,13 +399,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
397
399
private static partial Regex Cl100kBaseRegex ( ) ;
398
400
399
401
// Regex accessor for P50kBaseRegexPattern in the branch above the #else directive.
// The method is declared partial and carries [GeneratedRegex], so its body is not written in this file
// (supplied by the regex source generator). This change widens the accessor from private to internal.
// NOTE(review): the #if condition selecting this branch is outside the visible hunk.
[ GeneratedRegex ( P50kBaseRegexPattern ) ]
400
- private static partial Regex P50kBaseRegex ( ) ;
402
+ internal static partial Regex P50kBaseRegex ( ) ;
401
403
#else
402
404
// Cache for the cl100k_base Regex in the #else branch; null until Cl100kBaseRegex() is first called.
private static Regex ? _cl100kBaseRegex ;
403
405
// Returns the cached Regex, constructing it from Cl100kBaseRegexPattern with RegexOptions.Compiled
// on first use (the ??= assigns only when the field is still null).
// NOTE(review): the ??= is not synchronized; concurrent first calls could each construct a Regex —
// presumably harmless because the instances are equivalent, but confirm.
private static Regex Cl100kBaseRegex ( ) => _cl100kBaseRegex ??= new Regex ( Cl100kBaseRegexPattern , RegexOptions . Compiled ) ;
404
406
405
407
// Cache for the p50k_base Regex in the #else branch; null until P50kBaseRegex() is first called.
private static Regex ? _p50kBaseRegex ;
406
// Returns the cached Regex, constructing it from P50kBaseRegexPattern with RegexOptions.Compiled
// on first use (the ??= assigns only when the field is still null). This change widens the
// accessor from private to internal.
// NOTE(review): the ??= is not synchronized; concurrent first calls could each construct a Regex —
// presumably harmless because the instances are equivalent, but confirm.
- private static Regex P50kBaseRegex ( ) => _p50kBaseRegex ??= new Regex ( P50kBaseRegexPattern , RegexOptions . Compiled ) ;
408
+ internal static Regex P50kBaseRegex ( ) => _p50kBaseRegex ??= new Regex ( P50kBaseRegexPattern , RegexOptions . Compiled ) ;
407
409
#endif
408
410
409
411
/// <summary>
0 commit comments