-
Notifications
You must be signed in to change notification settings - Fork 6.1k
Add Microsoft.ML.Tokenizers documentation to .NET AI docs #49536
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Copilot
wants to merge
7
commits into
main
Choose a base branch
from
copilot/add-documentation-for-tokenizers
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
7b925bc
Initial plan
Copilot ae6a023
Add Microsoft.ML.Tokenizers documentation and working examples
Copilot aa30753
Add data package requirement and cross-references
Copilot 693150c
Add .gitignore to exclude build artifacts
Copilot a5e763a
human edits
gewarren 9be4112
code edits
gewarren cd51406
Update docs/ai/how-to/use-tokenizers.md
gewarren File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 3 additions & 0 deletions
3
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/.gitignore
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| ## Build artifacts | ||
| bin/ | ||
| obj/ |
41 changes: 41 additions & 0 deletions
41
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using Microsoft.ML.Tokenizers; | ||
|
|
||
| internal class BpeExample | ||
| { | ||
| public static void Run() | ||
| { | ||
| BasicUsage(); | ||
| } | ||
|
|
||
| private static void BasicUsage() | ||
| { | ||
| // <BpeBasic> | ||
| // Create a BPE tokenizer using Tiktoken. | ||
| Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); | ||
|
|
||
| string text = "Hello, how are you doing today?"; | ||
|
|
||
| // Encode text to token IDs. | ||
| IReadOnlyList<int> ids = tokenizer.EncodeToIds(text); | ||
| Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); | ||
|
|
||
| // Count tokens. | ||
| int tokenCount = tokenizer.CountTokens(text); | ||
| Console.WriteLine($"Token count: {tokenCount}"); | ||
|
|
||
| // Get detailed token information. | ||
| IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out string? normalizedString); | ||
| Console.WriteLine("Tokens:"); | ||
| foreach (EncodedToken token in tokens) | ||
| { | ||
| Console.WriteLine($" ID: {token.Id}, Value: '{token.Value}'"); | ||
| } | ||
|
|
||
| // Decode tokens back to text. | ||
| string? decoded = tokenizer.Decode(ids); | ||
| Console.WriteLine($"Decoded: {decoded}"); | ||
| // </BpeBasic> | ||
| } | ||
| } |
64 changes: 64 additions & 0 deletions
64
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.IO; | ||
| using System.Net.Http; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.ML.Tokenizers; | ||
|
|
||
| internal class LlamaExample | ||
| { | ||
| public static async Task RunAsync() | ||
| { | ||
| await BasicUsageAsync(); | ||
| AdvancedOptions(); | ||
| } | ||
|
|
||
| private static async Task BasicUsageAsync() | ||
| { | ||
| // <LlamaBasic> | ||
| // Open a stream to the remote Llama tokenizer model data file. | ||
| using HttpClient httpClient = new(); | ||
| const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"; | ||
| using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl); | ||
|
|
||
| // Create the Llama tokenizer using the remote stream. | ||
| Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream); | ||
|
|
||
| string input = "Hello, world!"; | ||
|
|
||
| // Encode text to token IDs. | ||
| IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(input); | ||
| Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); | ||
| // Output: Token IDs: 1, 15043, 29892, 3186, 29991 | ||
|
|
||
| // Count the tokens. | ||
| Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}"); | ||
| // Output: Tokens: 5 | ||
|
|
||
| // Decode token IDs back to text. | ||
| string? decoded = llamaTokenizer.Decode(ids); | ||
| Console.WriteLine($"Decoded: {decoded}"); | ||
| // Output: Decoded: Hello, world! | ||
| // </LlamaBasic> | ||
| } | ||
|
|
||
| private static void AdvancedOptions() | ||
| { | ||
| // For demonstration purposes, we'll use a mock tokenizer. | ||
| // In real scenarios, you would initialize this properly. | ||
| Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); | ||
|
|
||
| // <LlamaAdvanced> | ||
| ReadOnlySpan<char> textSpan = "Hello World".AsSpan(); | ||
|
|
||
| // Bypass normalization during encoding. | ||
| IReadOnlyList<int> ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false); | ||
|
|
||
| // Bypass pretokenization during encoding. | ||
| ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false); | ||
|
|
||
| // Bypass both normalization and pretokenization. | ||
| ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false); | ||
| // </LlamaAdvanced> | ||
| } | ||
| } |
19 changes: 19 additions & 0 deletions
19
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/Program.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| using System; | ||
| using System.Threading.Tasks; | ||
|
|
||
| // Run examples | ||
| Console.WriteLine("=== Tiktoken Examples ==="); | ||
| TiktokenExample.Run(); | ||
|
|
||
| Console.WriteLine("\n=== Llama Examples ==="); | ||
| try | ||
| { | ||
| await LlamaExample.RunAsync(); | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| Console.WriteLine($"Note: Llama example requires network access to download model files: {ex.Message}"); | ||
| } | ||
|
|
||
| Console.WriteLine("\n=== BPE Examples ==="); | ||
| BpeExample.Run(); |
61 changes: 61 additions & 0 deletions
61
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using Microsoft.ML.Tokenizers; | ||
|
|
||
| internal class TiktokenExample | ||
| { | ||
| public static void Run() | ||
| { | ||
| BasicUsage(); | ||
| TrimText(); | ||
| } | ||
|
|
||
| private static void BasicUsage() | ||
| { | ||
| // <TiktokenBasic> | ||
| // Initialize the tokenizer for the gpt-4o model. | ||
| Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); | ||
|
|
||
| string source = "Text tokenization is the process of splitting a string into a list of tokens."; | ||
|
|
||
| // Count the tokens in the text. | ||
| Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}"); | ||
| // Output: Tokens: 16 | ||
|
|
||
| // Encode text to token IDs. | ||
| IReadOnlyList<int> ids = tokenizer.EncodeToIds(source); | ||
| Console.WriteLine($"Token IDs: {string.Join(", ", ids)}"); | ||
| // Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13 | ||
|
|
||
| // Decode token IDs back to text. | ||
| string? decoded = tokenizer.Decode(ids); | ||
| Console.WriteLine($"Decoded: {decoded}"); | ||
| // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens. | ||
| // </TiktokenBasic> | ||
| } | ||
|
|
||
| private static void TrimText() | ||
| { | ||
| // <TiktokenTrim> | ||
| Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); | ||
|
|
||
| string source = "Text tokenization is the process of splitting a string into a list of tokens."; | ||
|
|
||
| // Get the last 5 tokens from the text. | ||
| var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _); | ||
| if (processedText is not null) | ||
| { | ||
| Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}"); | ||
| // Output: Last 5 tokens: a list of tokens. | ||
| } | ||
|
|
||
| // Get the first 5 tokens from the text. | ||
| trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _); | ||
| if (processedText is not null) | ||
| { | ||
| Console.WriteLine($"First 5 tokens: {processedText.Substring(0, trimIndex)}"); | ||
| // Output: First 5 tokens: Text tokenization is the | ||
| } | ||
| // </TiktokenTrim> | ||
| } | ||
| } |
15 changes: 15 additions & 0 deletions
15
docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TokenizersExamples.csproj
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| <Project Sdk="Microsoft.NET.Sdk"> | ||
|
|
||
| <PropertyGroup> | ||
| <OutputType>Exe</OutputType> | ||
| <TargetFramework>net9.0</TargetFramework> | ||
| <ImplicitUsings>enable</ImplicitUsings> | ||
| <Nullable>enable</Nullable> | ||
| </PropertyGroup> | ||
|
|
||
| <ItemGroup> | ||
| <PackageReference Include="Microsoft.ML.Tokenizers" Version="1.0.3" /> | ||
| <PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.3" /> | ||
| </ItemGroup> | ||
|
|
||
| </Project> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| --- | ||
| title: Use Microsoft.ML.Tokenizers for text tokenization | ||
| description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize text for AI models, manage token counts, and work with various tokenization algorithms. | ||
| ms.topic: how-to | ||
| ms.date: 10/29/2025 | ||
| ai-usage: ai-assisted | ||
| --- | ||
| # Use Microsoft.ML.Tokenizers for text tokenization | ||
|
|
||
| The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when you work with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models. | ||
|
|
||
| This article shows you how to use the library's key features and work with different tokenizer models. | ||
|
|
||
| ## Prerequisites | ||
|
|
||
| - [.NET 9 SDK](https://dotnet.microsoft.com/download/dotnet/9.0) or later (the sample project targets `net9.0`) | ||
|
|
||
| ## Install the package | ||
|
|
||
| Install the Microsoft.ML.Tokenizers NuGet package: | ||
|
|
||
| ```dotnetcli | ||
| dotnet add package Microsoft.ML.Tokenizers | ||
| ``` | ||
|
|
||
| For Tiktoken models (like GPT-4), you also need to install the corresponding data package: | ||
|
|
||
| ```dotnetcli | ||
| dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase | ||
| ``` | ||
|
|
||
| ## Key features | ||
|
|
||
| The Microsoft.ML.Tokenizers library provides: | ||
|
|
||
| - **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components. | ||
| - **Multiple tokenization algorithms**: Supports BPE (byte-pair encoding), Tiktoken, Llama, CodeGen, and more. | ||
| - **Token counting and estimation**: Helps manage costs and context limits when working with AI services. | ||
| - **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text. | ||
|
|
||
| ## Use Tiktoken tokenizer | ||
|
|
||
| The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. The following example shows how to initialize a Tiktoken tokenizer and perform common operations: | ||
|
|
||
| :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic"::: | ||
|
|
||
| For better performance, you should cache and reuse the tokenizer instance throughout your app. | ||
|
|
||
| When you work with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count: | ||
|
|
||
| :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim"::: | ||
|
|
||
| ## Use Llama tokenizer | ||
|
|
||
| The Llama tokenizer is designed for the Llama family of models. It requires a tokenizer model file, which you can download from model repositories like Hugging Face: | ||
|
|
||
| :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic"::: | ||
|
|
||
| The tokenizer supports advanced encoding options, such as controlling normalization and pretokenization: | ||
|
|
||
| :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced"::: | ||
|
|
||
| ## Use BPE tokenizer | ||
|
|
||
| *Byte-pair encoding* (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. BPE was initially developed as an algorithm to compress texts, and then used by OpenAI for tokenization when it pretrained the GPT model. The following example demonstrates BPE tokenization: | ||
|
|
||
| :::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic"::: | ||
|
|
||
| The library also provides specialized tokenizers like <xref:Microsoft.ML.Tokenizers.BpeTokenizer> and <xref:Microsoft.ML.Tokenizers.EnglishRobertaTokenizer> that you can configure with custom vocabularies for specific models. | ||
|
|
||
| For more information about BPE, see [Byte-pair encoding tokenization](https://huggingface.co/learn/llm-course/chapter6/5). | ||
|
|
||
| ## Common tokenizer operations | ||
|
|
||
| All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokenizer> base class. The following table shows the available methods. | ||
|
|
||
| | Method | Description | | ||
| |-------------------------------------------------------|--------------------------------------| | ||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToIds*> | Converts text to a list of token IDs | | ||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.Decode*> | Converts token IDs back to text | | ||
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.CountTokens*> | Returns the number of tokens in a text string | | ||
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToTokens*> | Returns detailed token information including values and IDs | | ||
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start | | ||
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| | <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end | | ||
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
gewarren marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| ## Migrate from other libraries | ||
|
|
||
| If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md). | ||
|
|
||
| ## Related content | ||
|
|
||
| - [Understanding tokens](../conceptual/understanding-tokens.md) | ||
| - [Microsoft.ML.Tokenizers API reference](/dotnet/api/microsoft.ml.tokenizers) | ||
| - [Microsoft.ML.Tokenizers NuGet package](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.