Skip to content

Commit ae6a023

Browse files
Copilot and gewarren
committed
Add Microsoft.ML.Tokenizers documentation and working examples
Co-authored-by: gewarren <[email protected]>
1 parent 7b925bc commit ae6a023

File tree

7 files changed

+294
-0
lines changed

7 files changed

+294
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Tokenizers;
4+
5+
internal class BpeExample
{
    /// <summary>
    /// Entry point for the BPE tokenizer examples.
    /// </summary>
    public static void Run()
    {
        BasicUsage();
    }

    /// <summary>
    /// Demonstrates encoding text to token IDs, counting tokens, inspecting
    /// individual tokens, and decoding IDs back to text with a Tiktoken-based
    /// BPE tokenizer.
    /// </summary>
    private static void BasicUsage()
    {
        // <BpeBasic>
        // Create a BPE tokenizer using Tiktoken
        // BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string text = "Hello, how are you doing today?";

        // Encode text to token IDs
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(text);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");

        // Count tokens
        int tokenCount = tokenizer.CountTokens(text);
        Console.WriteLine($"Token count: {tokenCount}");

        // Get detailed token information (the normalized text isn't needed here,
        // so discard the out value instead of declaring an unused local)
        IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out _);
        Console.WriteLine("Tokens:");
        foreach (var token in tokens)
        {
            Console.WriteLine($"  ID: {token.Id}, Value: '{token.Value}'");
        }

        // Decode tokens back to text
        string? decoded = tokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // </BpeBasic>
    }
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Net.Http;
5+
using System.Threading.Tasks;
6+
using Microsoft.ML.Tokenizers;
7+
8+
internal class LlamaExample
{
    /// <summary>
    /// Entry point for the Llama tokenizer examples. The basic example downloads
    /// the tokenizer model data over HTTP, so it requires network access.
    /// </summary>
    public static async Task RunAsync()
    {
        await BasicUsageAsync();
        AdvancedOptions();
    }

    /// <summary>
    /// Demonstrates creating a Llama tokenizer from a remote tokenizer.model
    /// stream, then encoding, counting, and decoding text with it.
    /// </summary>
    private static async Task BasicUsageAsync()
    {
        // <LlamaBasic>
        // Open a stream to the remote Llama tokenizer model data file
        using HttpClient httpClient = new();
        const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
        using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);

        // Create the Llama tokenizer using the remote stream
        Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);

        string input = "Hello, world!";

        // Encode text to token IDs
        IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(input);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
        // Output: Token IDs: 1, 15043, 29892, 3186, 29991

        // Count the tokens
        Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
        // Output: Tokens: 5

        // Decode token IDs back to text
        string? decoded = llamaTokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // Output: Decoded: Hello, world!
        // </LlamaBasic>
    }

    /// <summary>
    /// Demonstrates EncodeToIds overloads that bypass normalization and/or
    /// pretokenization. The intermediate results are intentionally unused;
    /// each call only illustrates an option combination.
    /// </summary>
    private static void AdvancedOptions()
    {
        // For demonstration purposes, we'll use a mock tokenizer
        // In real scenarios, you would initialize this properly
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        // <LlamaAdvanced>
        ReadOnlySpan<char> textSpan = "Hello World".AsSpan();

        // Bypass normalization during encoding
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false);

        // Bypass pretokenization during encoding
        ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false);

        // Bypass both normalization and pretokenization
        ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false);
        // </LlamaAdvanced>
    }
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
using System;
using System.Threading.Tasks;

// Entry point: runs each tokenizer example in sequence.
Console.WriteLine("=== Tiktoken Examples ===");
TiktokenExample.Run();

Console.WriteLine("\n=== Llama Examples ===");
try
{
    // The Llama example downloads its tokenizer model over HTTP; treat any
    // failure as best-effort so the remaining examples still run offline.
    await LlamaExample.RunAsync();
}
catch (Exception ex)
{
    Console.WriteLine($"Note: Llama example requires network access to download model files: {ex.Message}");
}

Console.WriteLine("\n=== BPE Examples ===");
BpeExample.Run();
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Tokenizers;
4+
5+
internal class TiktokenExample
{
    /// <summary>
    /// Entry point for the Tiktoken tokenizer examples.
    /// </summary>
    public static void Run()
    {
        BasicUsage();
        TrimText();
    }

    /// <summary>
    /// Demonstrates counting, encoding, and decoding tokens with the
    /// Tiktoken tokenizer for the gpt-4o model.
    /// </summary>
    private static void BasicUsage()
    {
        // <TiktokenBasic>
        // Initialize the tokenizer for the gpt-4o model
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string source = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Count the tokens in the text
        Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
        // Output: Tokens: 16

        // Encode text to token IDs
        IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
        // Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13

        // Decode token IDs back to text
        string? decoded = tokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");
        // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens.
        // </TiktokenBasic>
    }

    /// <summary>
    /// Demonstrates trimming text to a token budget from either end using
    /// GetIndexByTokenCountFromEnd and GetIndexByTokenCount. The returned
    /// index is a character offset into the processed text.
    /// </summary>
    private static void TrimText()
    {
        // <TiktokenTrim>
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string source = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Get the last 5 tokens from the text
        int trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _);
        if (processedText is not null)
        {
            Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}");
            // Output: Last 5 tokens: a list of tokens.
        }

        // Get the first 5 tokens from the text
        trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
        if (processedText is not null)
        {
            Console.WriteLine($"First 5 tokens: {processedText.Substring(0, trimIndex)}");
            // Output: First 5 tokens: Text tokenization is the
        }
        // </TiktokenTrim>
    }
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- Core tokenizer library -->
    <PackageReference Include="Microsoft.ML.Tokenizers" Version="1.0.3" />
    <!-- Token vocabulary data used by the Tiktoken gpt-4o examples -->
    <PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.3" />
  </ItemGroup>

</Project>

docs/ai/how-to/use-tokenizers.md

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
---
2+
title: Use Microsoft.ML.Tokenizers for text tokenization
3+
description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize text for AI models, manage token counts, and work with various tokenization algorithms.
4+
ms.topic: how-to
5+
ms.date: 10/29/2025
6+
ai-usage: ai-assisted
7+
#customer intent: As a .NET developer, I want to use the Microsoft.ML.Tokenizers library to tokenize text so I can work with AI models, manage costs, and handle token limits effectively.
8+
---
9+
# Use Microsoft.ML.Tokenizers for text tokenization
10+
11+
The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when working with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models.
12+
13+
This article shows you how to use the library's key features and work with different tokenizer models.
14+
15+
## Prerequisites
16+
17+
- [.NET 9.0 SDK](https://dotnet.microsoft.com/download/dotnet/9.0) or later
18+
19+
## Install the package
20+
21+
Install the Microsoft.ML.Tokenizers NuGet package. For Tiktoken-based models such as gpt-4o, also install the matching token data package (for example, Microsoft.ML.Tokenizers.Data.O200kBase):

```dotnetcli
dotnet add package Microsoft.ML.Tokenizers
dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase
```
26+
27+
## Key features
28+
29+
The Microsoft.ML.Tokenizers library provides:
30+
31+
- **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components.
32+
- **Multiple tokenization algorithms**: Supports BPE (Byte Pair Encoding), Tiktoken, Llama, CodeGen, and more.
33+
- **Token counting and estimation**: Helps manage costs and context limits when working with AI services.
34+
- **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text.
35+
36+
## Use Tiktoken tokenizer
37+
38+
The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. The following example shows how to initialize a Tiktoken tokenizer and perform common operations:
39+
40+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic":::
41+
42+
The tokenizer instance should be cached and reused throughout your application for better performance.
43+
44+
### Manage token limits
45+
46+
When working with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count:
47+
48+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim":::
49+
50+
## Use Llama tokenizer
51+
52+
The Llama tokenizer is designed for the Llama family of models. It requires a tokenizer model file, which you can download from model repositories like Hugging Face:
53+
54+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic":::
55+
56+
### Advanced encoding options
57+
58+
The tokenizer supports advanced encoding options, such as controlling normalization and pretokenization:
59+
60+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced":::
61+
62+
## Use BPE tokenizer
63+
64+
Byte Pair Encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization:
65+
66+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic":::
67+
68+
The library also provides specialized tokenizers like `BpeTokenizer` and `EnglishRobertaTokenizer` that you can configure with custom vocabularies for specific models.
69+
70+
## Common tokenizer operations
71+
72+
All tokenizers in the library implement the `Tokenizer` base class, which provides a consistent API:
73+
74+
- **`EncodeToIds`**: Converts text to a list of token IDs
75+
- **`Decode`**: Converts token IDs back to text
76+
- **`CountTokens`**: Returns the number of tokens in a text string
77+
- **`EncodeToTokens`**: Returns detailed token information including values and IDs
78+
- **`GetIndexByTokenCount`**: Finds the character index for a specific token count from the start
79+
- **`GetIndexByTokenCountFromEnd`**: Finds the character index for a specific token count from the end
80+
81+
## Migration from other libraries
82+
83+
If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
84+
85+
## Related content
86+
87+
- [Understanding tokens](../conceptual/understanding-tokens.md)
88+
- [Microsoft.ML.Tokenizers API reference](/dotnet/api/microsoft.ml.tokenizers)
89+
- [Microsoft.ML.Tokenizers NuGet package](https://www.nuget.org/packages/Microsoft.ML.Tokenizers)

docs/ai/toc.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ items:
7474
href: quickstarts/text-to-image.md
7575
- name: Generate images using OpenAI.Images.ImageClient
7676
href: quickstarts/generate-images.md
77+
- name: Tokenization
78+
items:
79+
- name: Use Microsoft.ML.Tokenizers
80+
href: how-to/use-tokenizers.md
7781
- name: Security and content safety
7882
items:
7983
- name: Authentication for Azure-hosted apps and services

0 commit comments

Comments
 (0)