Skip to content

Commit df580f6

Browse files
HavenDV and claude committed
feat: add EncodeUtf8 zero-alloc API for encoding directly from UTF-8 bytes
Add EncodeUtf8(ReadOnlySpan<byte>, Span<int>) that encodes UTF-8 text directly into a caller-provided token buffer. Pairs with existing CountTokens(ReadOnlySpan<byte>) for buffer sizing. Uses stackalloc for small inputs, ArrayPool for large ones. Add EncodeUtf8 benchmark category. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent de4fe89 commit df580f6

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,4 +202,29 @@ public int Tiktoken_DecodeToUtf8()
202202
public int Tiktoken_CountTokens_FromUtf8() => _tiktoken.CountTokens(_dataUtf8.AsSpan());
203203

204204

205+
/// <summary>
/// Baseline for the "EncodeUtf8" category: runs the existing
/// <c>Encode</c> call on <c>Data</c> and returns its token collection.
/// </summary>
[Benchmark(Baseline = true)]
[BenchmarkCategory("EncodeUtf8")]
public IReadOnlyCollection<int> Tiktoken_Encode_Baseline()
{
    return _tiktoken.Encode(Data);
}
208+
209+
/// <summary>
/// Benchmarks the span-based UTF-8 encode path: first sizes the destination with
/// <c>CountTokens</c>, then fills it with <c>EncodeUtf8</c>. Both calls are part
/// of the measured work.
/// </summary>
/// <returns>The value returned by <c>EncodeUtf8</c>.</returns>
[Benchmark]
[BenchmarkCategory("EncodeUtf8")]
public int Tiktoken_EncodeUtf8()
{
    ReadOnlySpan<byte> utf8 = _dataUtf8.AsSpan();
    var required = _tiktoken.CountTokens(utf8);

    // More than 1024 tokens: borrow the destination from the shared pool
    // and hand it back even if encoding throws.
    if (required > 1024)
    {
        var pooled = System.Buffers.ArrayPool<int>.Shared.Rent(required);
        try
        {
            // Rent may return a larger array, so expose only the slice we sized for.
            return _tiktoken.EncodeUtf8(utf8, pooled.AsSpan(0, required));
        }
        finally
        {
            System.Buffers.ArrayPool<int>.Shared.Return(pooled);
        }
    }

    // Small result: the destination lives on the stack.
    Span<int> onStack = stackalloc int[required];
    return _tiktoken.EncodeUtf8(utf8, onStack);
}
205230
}

src/libs/Tiktoken.Core/CoreBPE.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,49 @@ internal int CountTokensFromUtf8(ReadOnlySpan<byte> utf8Text)
229229
System.Buffers.ArrayPool<char>.Shared.Return(rented);
230230
}
231231
}
232+
233+
/// <summary>
/// Tokenizes UTF-8 input and writes the resulting token IDs into
/// <paramref name="tokenDestination"/>.
/// </summary>
/// <remarks>
/// The bytes are first decoded into a temporary UTF-16 buffer. That buffer is
/// stack-allocated when it needs at most 1024 chars and rented from
/// <see cref="System.Buffers.ArrayPool{T}.Shared"/> otherwise. The chars are then
/// passed to <c>EncodeNativeAllDisallowed</c>, and the tokens it returns are copied
/// into the destination.
/// NOTE(review): the intermediate token collection comes from
/// <c>EncodeNativeAllDisallowed</c>; whether that step allocates is not visible here.
/// </remarks>
/// <param name="utf8Text">UTF-8 encoded input text.</param>
/// <param name="tokenDestination">Buffer that receives the token IDs.</param>
/// <param name="disallowedSpecial">
/// Forwarded unchanged to <c>EncodeNativeAllDisallowed</c>; presumably the set of
/// special tokens that must not appear in the input (confirm against that method).
/// </param>
/// <returns>The number of tokens written to <paramref name="tokenDestination"/>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="tokenDestination"/> is shorter than the number of tokens produced.
/// </exception>
internal int EncodeFromUtf8(
    ReadOnlySpan<byte> utf8Text,
    Span<int> tokenDestination,
    HashSet<string> disallowedSpecial)
{
    const int StackAllocCharLimit = 1024;

    var utf8 = System.Text.Encoding.UTF8;
    var charCount = utf8.GetCharCount(utf8Text);

    // Stays null on the stackalloc path; set only when the pool was used.
    char[]? pooled = null;
    Span<char> chars = charCount <= StackAllocCharLimit
        ? stackalloc char[charCount]
        : (pooled = System.Buffers.ArrayPool<char>.Shared.Rent(charCount)).AsSpan(0, charCount);

    try
    {
        utf8.GetChars(utf8Text, chars);
        var tokens = EncodeNativeAllDisallowed(chars, disallowedSpecial);
        var produced = tokens.Count;

        // Encoding has already run at this point; the size check only guards the copy.
        if (produced > tokenDestination.Length)
        {
            throw new ArgumentException(
                "Destination buffer is too small. Use CountTokens to determine the required size.",
                nameof(tokenDestination));
        }

        var next = 0;
        foreach (var id in tokens)
        {
            tokenDestination[next] = id;
            next++;
        }

        return produced;
    }
    finally
    {
        // Return the rented array on every exit path, including the throw above.
        if (pooled != null)
        {
            System.Buffers.ArrayPool<char>.Shared.Return(pooled);
        }
    }
}
232275
#endif
233276

234277
/// <summary>

src/libs/Tiktoken.Core/Encoder.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,19 @@ public int CountTokens(ReadOnlySpan<byte> utf8Text)
6565
{
6666
return _corePbe.CountTokensFromUtf8(utf8Text);
6767
}
68+
69+
/// <summary>
/// Tokenizes UTF-8 text and writes the token IDs into a caller-supplied buffer
/// instead of returning a new collection.
/// Call <see cref="CountTokens(ReadOnlySpan{byte})"/> first to find out how large
/// <paramref name="tokenDestination"/> must be.
/// </summary>
/// <remarks>
/// Delegates to the core encoder's <c>EncodeFromUtf8</c>, passing this encoder's
/// <c>_specialTokensSet</c>.
/// NOTE(review): intended as an allocation-free path; confirm that the core
/// encoder does not allocate internally.
/// </remarks>
/// <param name="utf8Text">The UTF-8 encoded text to tokenize.</param>
/// <param name="tokenDestination">The destination buffer for token IDs.</param>
/// <returns>The number of tokens written to <paramref name="tokenDestination"/>.</returns>
/// <exception cref="ArgumentException">The destination buffer is too small.</exception>
public int EncodeUtf8(ReadOnlySpan<byte> utf8Text, Span<int> tokenDestination)
    => _corePbe.EncodeFromUtf8(utf8Text, tokenDestination, _specialTokensSet);
6881
#endif
6982

7083
/// <summary>

0 commit comments

Comments
 (0)