@@ -1013,6 +1013,15 @@ public override OperationStatus Decode(IEnumerable<int> ids, Span<char> destinat
1013
1013
// Special-token literals. Where IMStart, IMEnd and IMSep are consumed is outside this excerpt.
private const string IMStart = "<|im_start|>" ;
1014
1014
private const string IMEnd = "<|im_end|>" ;
1015
1015
private const string IMSep = "<|im_sep|>" ;
1016
// The literals from StartOfText through Call are used as keys of the table built by
// CreateHarmonyEncodingSpecialTokens.
+ private const string StartOfText = "<|startoftext|>" ;
1017
+ private const string Return = "<|return|>" ;
1018
+ private const string Constrain = "<|constrain|>" ;
1019
+ private const string Channel = "<|channel|>" ;
1020
+ private const string Start = "<|start|>" ;
1021
+ private const string End = "<|end|>" ;
1022
+ private const string Message = "<|message|>" ;
1023
+ private const string Call = "<|call|>" ;
1024
// Not a complete token on its own: CreateHarmonyEncodingSpecialTokens appends a numeric id
// and the closing "|>" to form each reserved-token key.
+ private const string ReservedPrefix = "<|reserved_" ;
1016
1025
1017
1026
private enum ModelEncoding
1018
1027
{
@@ -1022,40 +1031,69 @@ private enum ModelEncoding
1022
1031
P50kEdit ,
1023
1032
R50kBase ,
1024
1033
GPT2 ,
1025
- O200kBase
1034
+ O200kBase ,
1035
+ O200kHarmony
1026
1036
}
1027
1037
1028
1038
private const string Phi4ModelName = "phi-4" ;
1029
1039
1030
1040
// Table of (model-name prefix, encoding) pairs.
// NOTE(review): the code that scans this table is outside this excerpt. Presumably entries are
// tested in array order with a starts-with check, which is why "ft:gpt-4o" is listed ahead of
// "ft:gpt-4" - confirm against GetModelEncoding before reordering.
private static readonly ( string Prefix , ModelEncoding Encoding ) [ ] _modelPrefixToEncoding =
1031
1041
[
1032
- // chat
1033
1042
( "o1-" , ModelEncoding . O200kBase ) , // e.g. o1-mini
1034
1043
( "o3-" , ModelEncoding . O200kBase ) , // e.g. o3-mini
1044
+ ( "o4-mini-" , ModelEncoding . O200kBase ) , // e.g. o4-mini-2025-04-16 (the bare "o4-mini" name is an exact entry in _modelToEncoding)
1045
+
1046
+ // chat
1047
+ ( "gpt-5-" , ModelEncoding . O200kBase ) ,
1035
1048
( "gpt-4.1-" , ModelEncoding . O200kBase ) , // e.g., gpt-4.1-mini
1049
+ ( "gpt-4.5-" , ModelEncoding . O200kBase ) , // e.g., gpt-4.5-preview
1036
1050
( "gpt-4o-" , ModelEncoding . O200kBase ) , // e.g., gpt-4o-2024-05-13
1051
+ ( "chatgpt-4o-" , ModelEncoding . O200kBase ) ,
1037
1052
( "gpt-4-" , ModelEncoding . Cl100kBase ) , // e.g., gpt-4-0314, etc., plus gpt-4-32k
1038
1053
( "gpt-3.5-" , ModelEncoding . Cl100kBase ) , // e.g, gpt-3.5-turbo-0301, -0401, etc.
1039
- ( "gpt-35-" , ModelEncoding . Cl100kBase ) // Azure deployment name
1054
+ ( "gpt-35-" , ModelEncoding . Cl100kBase ) , // Azure deployment name
1055
+ ( "gpt-oss-" , ModelEncoding . O200kHarmony ) ,
1056
+
1057
+ // fine-tuned
1058
+ ( "ft:gpt-4o" , ModelEncoding . O200kBase ) ,
1059
+ ( "ft:gpt-4" , ModelEncoding . Cl100kBase ) ,
1060
+ ( "ft:gpt-3.5-turbo" , ModelEncoding . Cl100kBase ) ,
1061
+ ( "ft:davinci-002" , ModelEncoding . Cl100kBase ) ,
1062
+ ( "ft:babbage-002" , ModelEncoding . Cl100kBase ) ,
1040
1063
] ;
1041
1064
1042
1065
private static readonly Dictionary < string , ModelEncoding > _modelToEncoding =
1043
1066
new Dictionary < string , ModelEncoding > ( StringComparer . OrdinalIgnoreCase )
1044
1067
{
1045
- // chat
1046
- { "gpt-4o" , ModelEncoding . O200kBase } ,
1068
+ // reasoning
1047
1069
{ "o1" , ModelEncoding . O200kBase } ,
1048
1070
{ "o3" , ModelEncoding . O200kBase } ,
1049
1071
{ "o4-mini" , ModelEncoding . O200kBase } ,
1072
+
1073
+ // chat
1074
+ { "gpt-5" , ModelEncoding . O200kBase } ,
1050
1075
{ "gpt-4.1" , ModelEncoding . O200kBase } ,
1076
+ { "gpt-4o" , ModelEncoding . O200kBase } ,
1051
1077
{ "gpt-4" , ModelEncoding . Cl100kBase } ,
1052
1078
{ "gpt-3.5-turbo" , ModelEncoding . Cl100kBase } ,
1079
+ { "gpt-3.5" , ModelEncoding . Cl100kBase } ,
1053
1080
{ "gpt-3.5-turbo-16k" , ModelEncoding . Cl100kBase } ,
1054
1081
{ "gpt-35" , ModelEncoding . Cl100kBase } , // Azure deployment name
1055
1082
{ "gpt-35-turbo" , ModelEncoding . Cl100kBase } , // Azure deployment name
1056
1083
{ "gpt-35-turbo-16k" , ModelEncoding . Cl100kBase } , // Azure deployment name
1057
1084
1058
- // text
1085
+ // Base
1086
+ { "davinci-002" , ModelEncoding . Cl100kBase } ,
1087
+ { "babbage-002" , ModelEncoding . Cl100kBase } ,
1088
+
1089
+ // embeddings
1090
+ // https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1091
+ { "text-embedding-ada-002" , ModelEncoding . Cl100kBase } ,
1092
+ { "text-embedding-3-small" , ModelEncoding . Cl100kBase } ,
1093
+ { "text-embedding-3-large" , ModelEncoding . Cl100kBase } ,
1094
+
1095
+ // DEPRECATED MODELS
1096
+ // text (DEPRECATED)
1059
1097
{ "text-davinci-003" , ModelEncoding . P50kBase } ,
1060
1098
{ "text-davinci-002" , ModelEncoding . P50kBase } ,
1061
1099
{ "text-davinci-001" , ModelEncoding . R50kBase } ,
@@ -1067,25 +1105,20 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
1067
1105
{ "babbage" , ModelEncoding . R50kBase } ,
1068
1106
{ "ada" , ModelEncoding . R50kBase } ,
1069
1107
1070
- // code
1108
+ // code (DEPRECATED)
1071
1109
{ "code-davinci-002" , ModelEncoding . P50kBase } ,
1072
1110
{ "code-davinci-001" , ModelEncoding . P50kBase } ,
1073
1111
{ "code-cushman-002" , ModelEncoding . P50kBase } ,
1074
1112
{ "code-cushman-001" , ModelEncoding . P50kBase } ,
1075
1113
{ "davinci-codex" , ModelEncoding . P50kBase } ,
1076
1114
{ "cushman-codex" , ModelEncoding . P50kBase } ,
1077
1115
1078
- // edit
1116
+ // edit (DEPRECATED)
1079
1117
{ "text-davinci-edit-001" , ModelEncoding . P50kEdit } ,
1080
1118
{ "code-davinci-edit-001" , ModelEncoding . P50kEdit } ,
1081
1119
1082
- // embeddings
1083
- // https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1084
- { "text-embedding-ada-002" , ModelEncoding . Cl100kBase } ,
1085
- { "text-embedding-3-small" , ModelEncoding . Cl100kBase } ,
1086
- { "text-embedding-3-large" , ModelEncoding . Cl100kBase } ,
1087
1120
1088
- // old embeddings
1121
+ // old embeddings (DEPRECATED)
1089
1122
{ "text-similarity-davinci-001" , ModelEncoding . R50kBase } ,
1090
1123
{ "text-similarity-curie-001" , ModelEncoding . R50kBase } ,
1091
1124
{ "text-similarity-babbage-001" , ModelEncoding . R50kBase } ,
@@ -1099,6 +1132,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
1099
1132
1100
1133
// open source
1101
1134
{ "gpt2" , ModelEncoding . GPT2 } ,
1135
+ { "gpt-2" , ModelEncoding . GPT2 } ,
1102
1136
1103
1137
// phi-4
1104
1138
{ Phi4ModelName , ModelEncoding . Cl100kBase } ,
@@ -1126,6 +1160,32 @@ private static ModelEncoding GetModelEncoding(string modelName)
1126
1160
return encoder ;
1127
1161
}
1128
1162
1163
/// <summary>
/// Builds the special-token table that GetTiktokenConfigurations returns for
/// <c>ModelEncoding.O200kHarmony</c>.
/// </summary>
/// <remarks>
/// Ids 199998 through 200018 are assigned. Slots without a dedicated name use a key of the
/// form <c>&lt;|reserved_NNNNNN|&gt;</c>, produced by appending the id and <c>|&gt;</c> to
/// <c>ReservedPrefix</c>; the interpolation must not contain any whitespace or the key
/// would no longer match the token text.
/// </remarks>
/// <returns>A newly allocated dictionary on every call, mapping token text to token id.</returns>
private static Dictionary<string, int> CreateHarmonyEncodingSpecialTokens() =>
    new Dictionary<string, int>
    {
        { StartOfText, 199998 },
        { EndOfText, 199999 },
        { $"{ReservedPrefix}200000|>", 200000 },
        { $"{ReservedPrefix}200001|>", 200001 },
        { Return, 200002 },
        { Constrain, 200003 },
        { $"{ReservedPrefix}200004|>", 200004 },
        { Channel, 200005 },
        { Start, 200006 },
        { End, 200007 },
        { Message, 200008 },
        { $"{ReservedPrefix}200009|>", 200009 },
        { $"{ReservedPrefix}200010|>", 200010 },
        { $"{ReservedPrefix}200011|>", 200011 },
        { Call, 200012 },
        { $"{ReservedPrefix}200013|>", 200013 },
        { $"{ReservedPrefix}200014|>", 200014 },
        { $"{ReservedPrefix}200015|>", 200015 },
        { $"{ReservedPrefix}200016|>", 200016 },
        { $"{ReservedPrefix}200017|>", 200017 },
        { EndOfPrompt, 200018 },
    };
1188
+
1129
1189
private static ( Dictionary < string , int > SpecialTokens , Regex Regex , string VocabFile , Type ? DataType , string PackageName ) GetTiktokenConfigurations ( string modelName ) => GetTiktokenConfigurations ( GetModelEncoding ( modelName ) , modelName ) ;
1130
1190
1131
1191
private static ( Dictionary < string , int > SpecialTokens , Regex Regex , string VocabFile , Type ? DataType , string PackageName ) GetTiktokenConfigurations ( ModelEncoding modelEncoding , string ? modelName = null )
@@ -1157,6 +1217,9 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
1157
1217
case ModelEncoding . R50kBase :
1158
1218
return ( new Dictionary < string , int > { { EndOfText , 50256 } } , P50kBaseRegex ( ) , R50RanksFile , Type . GetType ( R50kBaseTypeName ) , R50kBasePackageName ) ;
1159
1219
1220
+ case ModelEncoding . O200kHarmony :
1221
+ return ( CreateHarmonyEncodingSpecialTokens ( ) , O200kBaseRegex ( ) , O200kBaseFile , Type . GetType ( O200kBaseTypeName ) , O200kBasePackageName ) ;
1222
+
1160
1223
default :
1161
1224
throw new NotSupportedException ( $ "The model '{ modelName ?? modelEncoding . ToString ( ) } ' is not supported.") ;
1162
1225
}
@@ -1179,6 +1242,7 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
1179
1242
// Encoding-name string constants.
internal const string P50kEditEncodingName = "p50k_edit" ;
1180
1243
internal const string R50kBaseEncodingName = "r50k_base" ;
1181
1244
internal const string O200kBaseEncodingName = "o200k_base" ;
1245
// CreateForEncoding compares its encodingName argument against this value with
// StringComparison.OrdinalIgnoreCase and, on a match, selects ModelEncoding.O200kHarmony.
+ internal const string O200kHarmonyEncodingName = "o200k_harmony" ;
1182
1246
1183
1247
// Data package-name constants. Sibling constants of this kind (R50kBasePackageName,
// O200kBasePackageName) are returned as the PackageName tuple element by
// GetTiktokenConfigurations; usage of the two below is outside this excerpt.
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase" ;
1184
1248
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2" ;
@@ -1474,6 +1538,10 @@ public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnly
1474
1538
{
1475
1539
modelEncoding = ModelEncoding . O200kBase ;
1476
1540
}
1541
+ else if ( encodingName . Equals ( O200kHarmonyEncodingName , StringComparison . OrdinalIgnoreCase ) )
1542
+ {
1543
+ modelEncoding = ModelEncoding . O200kHarmony ;
1544
+ }
1477
1545
else if ( encodingName . Equals ( P50kBaseEncodingName , StringComparison . OrdinalIgnoreCase ) )
1478
1546
{
1479
1547
modelEncoding = ModelEncoding . P50kBase ;
0 commit comments