Skip to content

Commit 4b89d98

Browse files
authored
Optimize regexes used in tiktoken (#7020)
* Optimize regexes used in tiktoken
* Add comment and consolidate duplicate regex from Roberta
1 parent a139371 commit 4b89d98

File tree

3 files changed

+8
-16
lines changed

3 files changed

+8
-16
lines changed

src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
using System;
66
using System.Collections.Generic;
7-
using System.Text.RegularExpressions;
87

98
namespace Microsoft.ML.Tokenizers
109
{
@@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer
1817
/// </summary>
1918
public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer();
2019

21-
private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
22-
#if NET7_0_OR_GREATER
23-
[GeneratedRegex(PretokenizePattern)]
24-
private static partial Regex PretokenizeRegex();
25-
#else
26-
private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled);
27-
private static Regex PretokenizeRegex() => _regex;
28-
#endif
29-
3020
/// <summary>
3121
/// Splits the given string in multiple substrings at the word boundary, keeping track of the offsets of said substrings from the original string.
3222
/// </summary>
@@ -40,7 +30,7 @@ public override IEnumerable<Split> PreTokenize(string sentence, bool skipSpecial
4030
return Array.Empty<Split>();
4131
}
4232

43-
return SplitSentence(sentence, PretokenizeRegex());
33+
return SplitSentence(sentence, Tokenizer.P50kBaseRegex());
4434
}
4535
}
4636
}

src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer
1919
/// </summary>
2020
public static WhiteSpace Instance { get; } = new WhiteSpace();
2121

22-
private const string PretokenizePattern = @"\w+|[^\w\s]+";
22+
private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
2323
#if NET7_0_OR_GREATER
2424
[GeneratedRegex(PretokenizePattern)]
2525
private static partial Regex PretokenizeRegex();

src/Microsoft.ML.Tokenizers/Tokenizer.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,10 @@ public static Task<Tokenizer> CreateByModelNameAsync(
384384
}
385385
}
386386

387-
private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
388-
private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
387+
// Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
388+
389+
private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
390+
private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
389391

390392
private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken";
391393
private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
@@ -397,13 +399,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
397399
private static partial Regex Cl100kBaseRegex();
398400

399401
[GeneratedRegex(P50kBaseRegexPattern)]
400-
private static partial Regex P50kBaseRegex();
402+
internal static partial Regex P50kBaseRegex();
401403
#else
402404
private static Regex? _cl100kBaseRegex;
403405
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);
404406

405407
private static Regex? _p50kBaseRegex;
406-
private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
408+
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
407409
#endif
408410

409411
/// <summary>

0 commit comments

Comments
 (0)