Skip to content

Commit ae6a023

Browse files
Copilot and gewarren
committed
Add Microsoft.ML.Tokenizers documentation and working examples
Co-authored-by: gewarren <[email protected]>
1 parent 7b925bc commit ae6a023

File tree

7 files changed

+294
-0
lines changed

7 files changed

+294
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Tokenizers;
4+
5+
internal class BpeExample
{
    /// <summary>
    /// Entry point for the BPE tokenizer examples.
    /// </summary>
    public static void Run()
    {
        BasicUsage();
    }

    /// <summary>
    /// Demonstrates encoding text to token IDs, counting tokens, inspecting
    /// individual tokens, and decoding IDs back to text with a Tiktoken-based
    /// BPE tokenizer.
    /// </summary>
    private static void BasicUsage()
    {
        // <BpeBasic>
        // Create a BPE tokenizer using Tiktoken
        // BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string text = "Hello, how are you doing today?";

        // Encode text to token IDs
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(text);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");

        // Count tokens
        int tokenCount = tokenizer.CountTokens(text);
        Console.WriteLine($"Token count: {tokenCount}");

        // Get detailed token information (the normalized text isn't needed here,
        // so discard the out value instead of declaring an unused local)
        IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
        Console.WriteLine("Tokens:");
        foreach (var token in tokens)
        {
            Console.WriteLine($"  ID: {token.Id}, Value: '{token.Value}'");
        }

        // Decode tokens back to text
        string? decoded = tokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // </BpeBasic>
    }
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Net.Http;
5+
using System.Threading.Tasks;
6+
using Microsoft.ML.Tokenizers;
7+
8+
internal class LlamaExample
{
    /// <summary>
    /// Entry point for the Llama tokenizer examples. The basic example downloads
    /// the tokenizer model data over HTTP, so it requires network access.
    /// </summary>
    public static async Task RunAsync()
    {
        await BasicUsageAsync();
        AdvancedOptions();
    }

    /// <summary>
    /// Demonstrates creating a Llama tokenizer from a remote tokenizer.model
    /// stream, then encoding, counting, and decoding text with it.
    /// </summary>
    private static async Task BasicUsageAsync()
    {
        // <LlamaBasic>
        // Open a stream to the remote Llama tokenizer model data file
        using HttpClient httpClient = new();
        const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
        using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);

        // Create the Llama tokenizer using the remote stream
        Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);

        string input = "Hello, world!";

        // Encode text to token IDs
        IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(input);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
        // Output: Token IDs: 1, 15043, 29892, 3186, 29991

        // Count the tokens
        Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
        // Output: Tokens: 5

        // Decode token IDs back to text
        string? decoded = llamaTokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // Output: Decoded: Hello, world!
        // </LlamaBasic>
    }

    /// <summary>
    /// Demonstrates EncodeToIds overloads that bypass normalization and/or
    /// pretokenization. The intermediate results are intentionally unused;
    /// each call only illustrates an option combination.
    /// </summary>
    private static void AdvancedOptions()
    {
        // For demonstration purposes, we'll use a mock tokenizer
        // In real scenarios, you would initialize this properly
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        // <LlamaAdvanced>
        ReadOnlySpan<char> textSpan = "Hello World".AsSpan();

        // Bypass normalization during encoding
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false);

        // Bypass pretokenization during encoding
        ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false);

        // Bypass both normalization and pretokenization
        ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false);
        // </LlamaAdvanced>
    }
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
using System;
using System.Threading.Tasks;

// Entry point: runs each tokenizer example in sequence.
Console.WriteLine("=== Tiktoken Examples ===");
TiktokenExample.Run();

Console.WriteLine("\n=== Llama Examples ===");
try
{
    // The Llama example downloads its tokenizer model over HTTP; treat any
    // failure as best-effort so the remaining examples still run offline.
    await LlamaExample.RunAsync();
}
catch (Exception ex)
{
    Console.WriteLine($"Note: Llama example requires network access to download model files: {ex.Message}");
}

Console.WriteLine("\n=== BPE Examples ===");
BpeExample.Run();
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Tokenizers;
4+
5+
internal class TiktokenExample
{
    /// <summary>
    /// Entry point for the Tiktoken tokenizer examples.
    /// </summary>
    public static void Run()
    {
        BasicUsage();
        TrimText();
    }

    /// <summary>
    /// Demonstrates counting, encoding, and decoding tokens with the
    /// Tiktoken tokenizer for the gpt-4o model.
    /// </summary>
    private static void BasicUsage()
    {
        // <TiktokenBasic>
        // Initialize the tokenizer for the gpt-4o model
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string source = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Count the tokens in the text
        Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
        // Output: Tokens: 16

        // Encode text to token IDs
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
        // Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13

        // Decode token IDs back to text
        string? decoded = tokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens.
        // </TiktokenBasic>
    }

    /// <summary>
    /// Demonstrates trimming text to a token budget from either end using
    /// GetIndexByTokenCountFromEnd and GetIndexByTokenCount. The returned
    /// index is a character offset into the processed text.
    /// </summary>
    private static void TrimText()
    {
        // <TiktokenTrim>
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string source = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Get the last 5 tokens from the text
        int trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _);
        if (processedText is not null)
        {
            Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}");
            // Output: Last 5 tokens: a list of tokens.
        }

        // Get the first 5 tokens from the text
        trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
        if (processedText is not null)
        {
            Console.WriteLine($"First 5 tokens: {processedText.Substring(0, trimIndex)}");
            // Output: First 5 tokens: Text tokenization is the
        }
        // </TiktokenTrim>
    }
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- Core tokenizer library -->
    <PackageReference Include="Microsoft.ML.Tokenizers" Version="1.0.3" />
    <!-- Token vocabulary data used by the Tiktoken gpt-4o examples -->
    <PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.3" />
  </ItemGroup>

</Project>

docs/ai/how-to/use-tokenizers.md

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
---
2+
title: Use Microsoft.ML.Tokenizers for text tokenization
3+
description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize text for AI models, manage token counts, and work with various tokenization algorithms.
4+
ms.topic: how-to
5+
ms.date: 10/29/2025
6+
ai-usage: ai-assisted
7+
#customer intent: As a .NET developer, I want to use the Microsoft.ML.Tokenizers library to tokenize text so I can work with AI models, manage costs, and handle token limits effectively.
8+
---
9+
# Use Microsoft.ML.Tokenizers for text tokenization
10+
11+
The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when working with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models.
12+
13+
This article shows you how to use the library's key features and work with different tokenizer models.
14+
15+
## Prerequisites
16+
17+
- [.NET 9.0 SDK](https://dotnet.microsoft.com/download/dotnet/9.0) or later
18+
19+
## Install the package
20+
21+
Install the Microsoft.ML.Tokenizers NuGet package. For Tiktoken-based models such as gpt-4o, also install the matching token data package (for example, Microsoft.ML.Tokenizers.Data.O200kBase):

```dotnetcli
dotnet add package Microsoft.ML.Tokenizers
dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase
```
26+
27+
## Key features
28+
29+
The Microsoft.ML.Tokenizers library provides:
30+
31+
- **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components.
32+
- **Multiple tokenization algorithms**: Supports BPE (Byte Pair Encoding), Tiktoken, Llama, CodeGen, and more.
33+
- **Token counting and estimation**: Helps manage costs and context limits when working with AI services.
34+
- **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text.
35+
36+
## Use Tiktoken tokenizer
37+
38+
The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. The following example shows how to initialize a Tiktoken tokenizer and perform common operations:
39+
40+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic":::
41+
42+
The tokenizer instance should be cached and reused throughout your application for better performance.
43+
44+
### Manage token limits
45+
46+
When working with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count:
47+
48+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim":::
49+
50+
## Use Llama tokenizer
51+
52+
The Llama tokenizer is designed for the Llama family of models. It requires a tokenizer model file, which you can download from model repositories like Hugging Face:
53+
54+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic":::
55+
56+
### Advanced encoding options
57+
58+
The tokenizer supports advanced encoding options, such as controlling normalization and pretokenization:
59+
60+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced":::
61+
62+
## Use BPE tokenizer
63+
64+
Byte Pair Encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization:
65+
66+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic":::
67+
68+
The library also provides specialized tokenizers like `BpeTokenizer` and `EnglishRobertaTokenizer` that you can configure with custom vocabularies for specific models.
69+
70+
## Common tokenizer operations
71+
72+
All tokenizers in the library implement the `Tokenizer` base class, which provides a consistent API:
73+
74+
- **`EncodeToIds`**: Converts text to a list of token IDs
75+
- **`Decode`**: Converts token IDs back to text
76+
- **`CountTokens`**: Returns the number of tokens in a text string
77+
- **`EncodeToTokens`**: Returns detailed token information including values and IDs
78+
- **`GetIndexByTokenCount`**: Finds the character index for a specific token count from the start
79+
- **`GetIndexByTokenCountFromEnd`**: Finds the character index for a specific token count from the end
80+
81+
## Migration from other libraries
82+
83+
If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
84+
85+
## Related content
86+
87+
- [Understanding tokens](../conceptual/understanding-tokens.md)
88+
- [Microsoft.ML.Tokenizers API reference](/dotnet/api/microsoft.ml.tokenizers)
89+
- [Microsoft.ML.Tokenizers NuGet package](https://www.nuget.org/packages/Microsoft.ML.Tokenizers)

docs/ai/toc.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ items:
7474
href: quickstarts/text-to-image.md
7575
- name: Generate images using OpenAI.Images.ImageClient
7676
href: quickstarts/generate-images.md
77+
- name: Tokenization
78+
items:
79+
- name: Use Microsoft.ML.Tokenizers
80+
href: how-to/use-tokenizers.md
7781
- name: Security and content safety
7882
items:
7983
- name: Authentication for Azure-hosted apps and services

0 commit comments

Comments
 (0)