Skip to content

Commit ab8e72d

Browse files
authored
Defer the allocation of matchValue. (#61)
matchValue isn't needed on the fast path, so don't allocate it until it's actually needed. Additionally, use a Span-based conversion to UTF-8 bytes to avoid the creation of a temporary char array.
1 parent a1b5ca5 commit ab8e72d

File tree

1 file changed

+28
-3
lines changed

1 file changed

+28
-3
lines changed

src/libs/Tiktoken.Core/CoreBPE.cs

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
using System.Collections.Concurrent;
2+
using System.Runtime.CompilerServices;
23
using System.Text;
34
using System.Text.RegularExpressions;
45
using Tiktoken.Core;
@@ -86,12 +87,12 @@ public int CountTokensNative(string text)
8687
var tokens = 0;
8788
#if NET7_0_OR_GREATER
8889
var textSpan = text.AsSpan();
90+
Span<byte> pieceBytes = stackalloc byte[128];
8991
#endif
9092

9193
#if NET7_0_OR_GREATER
9294
foreach (var match in Regex.EnumerateMatches(textSpan))
9395
{
94-
var matchValue = textSpan.Slice(match.Index, match.Length).ToArray();
9596
var fastKey = new string(textSpan.Slice(match.Index, match.Length));
9697
#else
9798
foreach (Match match in Regex.Matches(text))
@@ -110,7 +111,11 @@ public int CountTokensNative(string text)
110111
continue;
111112
}
112113

114+
#if NET7_0_OR_GREATER
115+
var piece = GetUtf8Bytes(textSpan.Slice(match.Index, match.Length), pieceBytes);
116+
#else
113117
var piece = System.Text.Encoding.UTF8.GetBytes(matchValue);
118+
#endif
114119
if (Encoder.ContainsKey(piece))
115120
{
116121
tokens++;
@@ -148,6 +153,7 @@ public IReadOnlyCollection<int> EncodeNative(
148153
var tokens = new List<int>();
149154
#if NET7_0_OR_GREATER
150155
var textSpan = text.AsSpan();
156+
Span<byte> pieceBytes = stackalloc byte[128];
151157
#endif
152158

153159
var specialTokens = new List<(int Index, int Length)>(capacity: 32);
@@ -181,7 +187,6 @@ public IReadOnlyCollection<int> EncodeNative(
181187
#if NET7_0_OR_GREATER
182188
foreach (var match in Regex.EnumerateMatches(textSpan[start..specialStart]))
183189
{
184-
var matchValue = textSpan.Slice(match.Index, match.Length).ToArray();
185190
var fastKey = new string(textSpan.Slice(match.Index, match.Length));
186191
#else
187192
foreach (Match match in Regex.Matches(text[start..specialStart]))
@@ -199,8 +204,12 @@ public IReadOnlyCollection<int> EncodeNative(
199204
tokens.AddRange(fastTokens);
200205
continue;
201206
}
202-
207+
208+
#if NET7_0_OR_GREATER
209+
var piece = GetUtf8Bytes(textSpan.Slice(match.Index, match.Length), pieceBytes);
210+
#else
203211
var piece = System.Text.Encoding.UTF8.GetBytes(matchValue);
212+
#endif
204213
if (Encoder.TryGetValue(piece, out var token))
205214
{
206215
tokens.Add(token);
@@ -544,4 +553,20 @@ public byte[] DecodeNative(IReadOnlyCollection<int> tokens)
544553
}
545554
return ret.ToArray();
546555
}
556+
557+
#if NETSTANDARD2_1_OR_GREATER || NET5_0_OR_GREATER
558+
/// <summary>
/// Encodes <paramref name="text"/> as UTF-8 and returns the bytes in a newly allocated array.
/// </summary>
/// <param name="text">The UTF-16 characters to encode.</param>
/// <param name="scratch">
/// Caller-provided temporary buffer used as the encoding target when it is guaranteed to be
/// large enough. Its contents are overwritten in that case.
/// </param>
/// <returns>A new array containing exactly the UTF-8 encoding of <paramref name="text"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static byte[] GetUtf8Bytes(ReadOnlySpan<char> text, Span<byte> scratch)
{
    // Each UTF-16 char encodes to at most 3 UTF-8 bytes, so the scratch buffer is safe to use
    // whenever text.Length * 3 <= scratch.Length. Dividing instead of multiplying keeps the
    // comparison free of int overflow for very large inputs.
    if (text.Length <= scratch.Length / 3)
    {
        var written = System.Text.Encoding.UTF8.GetBytes(text, scratch);
        return scratch[..written].ToArray();
    }

    // Too large for the scratch buffer: size the result exactly and encode straight into it,
    // which avoids copying the input into a temporary char array first.
    var bytes = new byte[System.Text.Encoding.UTF8.GetByteCount(text)];
    System.Text.Encoding.UTF8.GetBytes(text, bytes);
    return bytes;
}
571+
#endif
547572
}

0 commit comments

Comments
 (0)