Skip to content

Commit 9be4112

Browse files
committed
code edits
1 parent a5e763a commit 9be4112

File tree

4 files changed

+27
-26
lines changed

4 files changed

+27
-26
lines changed

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,28 @@ public static void Run()
1212
private static void BasicUsage()
1313
{
1414
// <BpeBasic>
15-
// Create a BPE tokenizer using Tiktoken
16-
// BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers
15+
// Create a BPE tokenizer using Tiktoken.
1716
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
1817

1918
string text = "Hello, how are you doing today?";
2019

21-
// Encode text to token IDs
20+
// Encode text to token IDs.
2221
IReadOnlyList<int> ids = tokenizer.EncodeToIds(text);
2322
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
2423

25-
// Count tokens
24+
// Count tokens.
2625
int tokenCount = tokenizer.CountTokens(text);
2726
Console.WriteLine($"Token count: {tokenCount}");
2827

29-
// Get detailed token information
28+
// Get detailed token information.
3029
IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out string? normalizedString);
3130
Console.WriteLine("Tokens:");
32-
foreach (var token in tokens)
31+
foreach (EncodedToken token in tokens)
3332
{
3433
Console.WriteLine($" ID: {token.Id}, Value: '{token.Value}'");
3534
}
3635

37-
// Decode tokens back to text
36+
// Decode tokens back to text.
3837
string? decoded = tokenizer.Decode(ids);
3938
Console.WriteLine($"Decoded: {decoded}");
4039
// </BpeBasic>

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,26 @@ public static async Task RunAsync()
1616
private static async Task BasicUsageAsync()
1717
{
1818
// <LlamaBasic>
19-
// Open a stream to the remote Llama tokenizer model data file
19+
// Open a stream to the remote Llama tokenizer model data file.
2020
using HttpClient httpClient = new();
2121
const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
2222
using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);
2323

24-
// Create the Llama tokenizer using the remote stream
24+
// Create the Llama tokenizer using the remote stream.
2525
Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);
2626

2727
string input = "Hello, world!";
2828

29-
// Encode text to token IDs
29+
// Encode text to token IDs.
3030
IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(input);
3131
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
3232
// Output: Token IDs: 1, 15043, 29892, 3186, 29991
3333

34-
// Count the tokens
34+
// Count the tokens.
3535
Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
3636
// Output: Tokens: 5
3737

38-
// Decode token IDs back to text
38+
// Decode token IDs back to text.
3939
string? decoded = llamaTokenizer.Decode(ids);
4040
Console.WriteLine($"Decoded: {decoded}");
4141
// Output: Decoded: Hello, world!
@@ -44,20 +44,20 @@ private static async Task BasicUsageAsync()
4444

4545
private static void AdvancedOptions()
4646
{
47-
// For demonstration purposes, we'll use a mock tokenizer
48-
// In real scenarios, you would initialize this properly
47+
// For demonstration purposes, we'll use a mock tokenizer.
48+
// In real scenarios, you would initialize this properly.
4949
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
50-
50+
5151
// <LlamaAdvanced>
5252
ReadOnlySpan<char> textSpan = "Hello World".AsSpan();
5353

54-
// Bypass normalization during encoding
54+
// Bypass normalization during encoding.
5555
IReadOnlyList<int> ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false);
5656

57-
// Bypass pretokenization during encoding
57+
// Bypass pretokenization during encoding.
5858
ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false);
5959

60-
// Bypass both normalization and pretokenization
60+
// Bypass both normalization and pretokenization.
6161
ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false);
6262
// </LlamaAdvanced>
6363
}

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,21 @@ public static void Run()
1313
private static void BasicUsage()
1414
{
1515
// <TiktokenBasic>
16-
// Initialize the tokenizer for the gpt-4o model
16+
// Initialize the tokenizer for the gpt-4o model.
1717
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
1818

1919
string source = "Text tokenization is the process of splitting a string into a list of tokens.";
2020

21-
// Count the tokens in the text
21+
// Count the tokens in the text.
2222
Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
2323
// Output: Tokens: 16
2424

25-
// Encode text to token IDs
25+
// Encode text to token IDs.
2626
IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
2727
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
2828
// Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13
2929

30-
// Decode token IDs back to text
30+
// Decode token IDs back to text.
3131
string? decoded = tokenizer.Decode(ids);
3232
Console.WriteLine($"Decoded: {decoded}");
3333
// Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens.
@@ -41,15 +41,15 @@ private static void TrimText()
4141

4242
string source = "Text tokenization is the process of splitting a string into a list of tokens.";
4343

44-
// Get the last 5 tokens from the text
44+
// Get the last 5 tokens from the text.
4545
var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _);
4646
if (processedText is not null)
4747
{
4848
Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}");
4949
// Output: Last 5 tokens: a list of tokens.
5050
}
5151

52-
// Get the first 5 tokens from the text
52+
// Get the first 5 tokens from the text.
5353
trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
5454
if (processedText is not null)
5555
{

docs/ai/how-to/use-tokenizers.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,14 @@ The tokenizer supports advanced encoding options, such as controlling normalizat
6262

6363
## Use BPE tokenizer
6464

65-
Byte-pair encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization:
65+
*Byte-pair encoding* (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. BPE was initially developed as an algorithm to compress texts, and then used by OpenAI for tokenization when it pretrained the GPT model. The following example demonstrates BPE tokenization:
6666

6767
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic":::
6868

6969
The library also provides specialized tokenizers like <xref:Microsoft.ML.Tokenizers.BpeTokenizer> and <xref:Microsoft.ML.Tokenizers.EnglishRobertaTokenizer> that you can configure with custom vocabularies for specific models.
7070

71+
For more information about BPE, see [Byte-pair encoding tokenization](https://huggingface.co/learn/llm-course/chapter6/5).
72+
7173
## Common tokenizer operations
7274

7375
All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokenizer> base class. The following table shows the available methods.
@@ -81,7 +83,7 @@ All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokeni
8183
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start |
8284
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end |
8385

84-
## Migration from other libraries
86+
## Migrate from other libraries
8587

8688
If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
8789

0 commit comments

Comments (0)