11## About  
22
3- Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms .
3+ Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
44
55## Key Features  
66
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
1515
1616``` c# 
1717using  Microsoft .ML .Tokenizers ;
18- using  System .Net .Http ;
1918using  System .IO ;
19+ using  System .Net .Http ;
2020
2121// 
2222//  Using Tiktoken Tokenizer
2323// 
2424
25- //  initialize  the tokenizer for `gpt-4 ` model
26- Tokenizer  tokenizer  =  TiktokenTokenizer .CreateForModel (" gpt-4 "  );
25+ //  Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
26+ Tokenizer  tokenizer  =  TiktokenTokenizer .CreateForModel ("gpt-4o"  );
2727
2828string  source  =  "Text tokenization is the process of splitting a string into a list of tokens."  ;
2929
3030Console .WriteLine ($" Tokens: {tokenizer .CountTokens (source )}"  );
31- //  print : Tokens: 16
31+ //  prints: Tokens: 16
3232
3333var  trimIndex  =  tokenizer .GetIndexByTokenCountFromEnd (source , 5 , out  string  processedText , out  _ );
3434Console .WriteLine ($" 5 tokens from end: {processedText .Substring (trimIndex )}"  );
35- //  5 tokens from end:  a list of tokens.
35+ //  prints:  5 tokens from end:  a list of tokens.
3636
3737trimIndex  =  tokenizer .GetIndexByTokenCount (source , 5 , out  processedText , out  _ );
3838Console .WriteLine ($" 5 tokens from start: {processedText .Substring (0 , trimIndex )}"  );
39- //  5 tokens from start: Text tokenization is the
39+ //  prints:  5 tokens from start: Text tokenization is the
4040
4141IReadOnlyList < int >  ids  =  tokenizer .EncodeToIds (source );
4242Console .WriteLine (string .Join (" , "  , ids ));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
4646//  Using Llama Tokenizer
4747// 
4848
49- //  Open stream of  remote Llama tokenizer model data file
49+ //  Open a  stream to the  remote Llama tokenizer model data file. 
5050using  HttpClient  httpClient  =  new ();
5151const  string  modelUrl  =  @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"  ;
5252using  Stream  remoteStream  =  await  httpClient .GetStreamAsync (modelUrl );
5353
54- //  Create the Llama tokenizer using the remote stream
54+ //  Create the Llama tokenizer using the remote stream. This should be cached for all subsequent use. 
5555Tokenizer  llamaTokenizer  =  LlamaTokenizer .Create (remoteStream );
56+ 
5657string  input  =  "Hello, world!"  ;
5758ids  =  llamaTokenizer .EncodeToIds (input );
5859Console .WriteLine (string .Join (" , "  , ids ));
5960//  prints: 1, 15043, 29892, 3186, 29991
6061
6162Console .WriteLine ($" Tokens: {llamaTokenizer .CountTokens (input )}"  );
62- //  print : Tokens: 5
63+ //  prints: Tokens: 5
6364``` 
6465
6566## Main Types  
0 commit comments