Skip to content

Commit fb39755

Browse files
authored
Support OpenAI OSS Models with Tiktoken tokenizer (#7494)
* Support OpenAI OSS Models with Tiktoken tokenizer
* Add gpt-5 support
1 parent d9694c4 commit fb39755

File tree

2 files changed

+181
-39
lines changed

2 files changed

+181
-39
lines changed

src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs

Lines changed: 82 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,15 @@ public override OperationStatus Decode(IEnumerable<int> ids, Span<char> destinat
10131013
private const string IMStart = "<|im_start|>";
10141014
private const string IMEnd = "<|im_end|>";
10151015
private const string IMSep = "<|im_sep|>";
1016+
private const string StartOfText = "<|startoftext|>";
1017+
private const string Return = "<|return|>";
1018+
private const string Constrain = "<|constrain|>";
1019+
private const string Channel = "<|channel|>";
1020+
private const string Start = "<|start|>";
1021+
private const string End = "<|end|>";
1022+
private const string Message = "<|message|>";
1023+
private const string Call = "<|call|>";
1024+
private const string ReservedPrefix = "<|reserved_";
10161025

10171026
private enum ModelEncoding
10181027
{
@@ -1022,40 +1031,69 @@ private enum ModelEncoding
10221031
P50kEdit,
10231032
R50kBase,
10241033
GPT2,
1025-
O200kBase
1034+
O200kBase,
1035+
O200kHarmony
10261036
}
10271037

10281038
private const string Phi4ModelName = "phi-4";
10291039

10301040
private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
10311041
[
1032-
// chat
10331042
( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini
10341043
( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini
1044+
( "o4-mini-", ModelEncoding.O200kBase ), // e.g. o4-mini-2025-04-16
1045+
1046+
// chat
1047+
( "gpt-5-", ModelEncoding.O200kBase),
10351048
( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini
1049+
( "gpt-4.5-", ModelEncoding.O200kBase), // e.g., gpt-4.5-preview
10361050
( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13
1051+
( "chatgpt-4o-", ModelEncoding.O200kBase),
10371052
( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
10381053
( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g., gpt-3.5-turbo-0301, -0401, etc.
1039-
( "gpt-35-", ModelEncoding.Cl100kBase ) // Azure deployment name
1054+
( "gpt-35-", ModelEncoding.Cl100kBase ), // Azure deployment name
1055+
( "gpt-oss-", ModelEncoding.O200kHarmony ),
1056+
1057+
// fine-tuned
1058+
( "ft:gpt-4o", ModelEncoding.O200kBase ),
1059+
( "ft:gpt-4", ModelEncoding.Cl100kBase ),
1060+
( "ft:gpt-3.5-turbo", ModelEncoding.Cl100kBase ),
1061+
( "ft:davinci-002", ModelEncoding.Cl100kBase ),
1062+
( "ft:babbage-002", ModelEncoding.Cl100kBase ),
10401063
];
10411064

10421065
private static readonly Dictionary<string, ModelEncoding> _modelToEncoding =
10431066
new Dictionary<string, ModelEncoding>(StringComparer.OrdinalIgnoreCase)
10441067
{
1045-
// chat
1046-
{ "gpt-4o", ModelEncoding.O200kBase },
1068+
// reasoning
10471069
{ "o1", ModelEncoding.O200kBase },
10481070
{ "o3", ModelEncoding.O200kBase },
10491071
{ "o4-mini", ModelEncoding.O200kBase },
1072+
1073+
// chat
1074+
{ "gpt-5", ModelEncoding.O200kBase },
10501075
{ "gpt-4.1", ModelEncoding.O200kBase },
1076+
{ "gpt-4o", ModelEncoding.O200kBase },
10511077
{ "gpt-4", ModelEncoding.Cl100kBase },
10521078
{ "gpt-3.5-turbo", ModelEncoding.Cl100kBase },
1079+
{ "gpt-3.5", ModelEncoding.Cl100kBase },
10531080
{ "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase },
10541081
{ "gpt-35", ModelEncoding.Cl100kBase }, // Azure deployment name
10551082
{ "gpt-35-turbo", ModelEncoding.Cl100kBase }, // Azure deployment name
10561083
{ "gpt-35-turbo-16k", ModelEncoding.Cl100kBase }, // Azure deployment name
10571084

1058-
// text
1085+
// Base
1086+
{ "davinci-002", ModelEncoding.Cl100kBase },
1087+
{ "babbage-002", ModelEncoding.Cl100kBase },
1088+
1089+
// embeddings
1090+
// https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1091+
{ "text-embedding-ada-002", ModelEncoding.Cl100kBase },
1092+
{ "text-embedding-3-small", ModelEncoding.Cl100kBase },
1093+
{ "text-embedding-3-large", ModelEncoding.Cl100kBase },
1094+
1095+
// DEPRECATED MODELS
1096+
// text (DEPRECATED)
10591097
{ "text-davinci-003", ModelEncoding.P50kBase },
10601098
{ "text-davinci-002", ModelEncoding.P50kBase },
10611099
{ "text-davinci-001", ModelEncoding.R50kBase },
@@ -1067,25 +1105,20 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10671105
{ "babbage", ModelEncoding.R50kBase },
10681106
{ "ada", ModelEncoding.R50kBase },
10691107

1070-
// code
1108+
// code (DEPRECATED)
10711109
{ "code-davinci-002", ModelEncoding.P50kBase },
10721110
{ "code-davinci-001", ModelEncoding.P50kBase },
10731111
{ "code-cushman-002", ModelEncoding.P50kBase },
10741112
{ "code-cushman-001", ModelEncoding.P50kBase },
10751113
{ "davinci-codex", ModelEncoding.P50kBase },
10761114
{ "cushman-codex", ModelEncoding.P50kBase },
10771115

1078-
// edit
1116+
// edit (DEPRECATED)
10791117
{ "text-davinci-edit-001", ModelEncoding.P50kEdit },
10801118
{ "code-davinci-edit-001", ModelEncoding.P50kEdit },
10811119

1082-
// embeddings
1083-
// https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1084-
{ "text-embedding-ada-002", ModelEncoding.Cl100kBase },
1085-
{ "text-embedding-3-small", ModelEncoding.Cl100kBase },
1086-
{ "text-embedding-3-large", ModelEncoding.Cl100kBase },
10871120

1088-
// old embeddings
1121+
// old embeddings (DEPRECATED)
10891122
{ "text-similarity-davinci-001", ModelEncoding.R50kBase },
10901123
{ "text-similarity-curie-001", ModelEncoding.R50kBase },
10911124
{ "text-similarity-babbage-001", ModelEncoding.R50kBase },
@@ -1099,6 +1132,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10991132

11001133
// open source
11011134
{ "gpt2", ModelEncoding.GPT2 },
1135+
{ "gpt-2", ModelEncoding.GPT2 },
11021136

11031137
// phi-4
11041138
{ Phi4ModelName, ModelEncoding.Cl100kBase },
@@ -1126,6 +1160,32 @@ private static ModelEncoding GetModelEncoding(string modelName)
11261160
return encoder;
11271161
}
11281162

1163+
/// <summary>
/// Builds the special-token table returned for <see cref="ModelEncoding.O200kHarmony"/>:
/// a map from special-token text to token id covering ids 199998 through 200018.
/// </summary>
/// <remarks>
/// Named tokens use the string constants declared on this type; every other id in the range
/// is keyed as <c>ReservedPrefix</c> followed by the id and <c>|&gt;</c>
/// (for example <c>&lt;|reserved_200000|&gt;</c>).
/// The dictionary is created without an explicit comparer, and a new instance is built on every call.
/// NOTE(review): EndOfText and EndOfPrompt are declared outside this excerpt; presumably they are
/// the same constants the other encodings use — confirm.
/// </remarks>
/// <returns>A newly allocated dictionary with 21 entries, one per id in the range.</returns>
private static Dictionary<string, int> CreateHarmonyEncodingSpecialTokens() =>
1164+
new Dictionary<string, int>
1165+
{
1166+
{ StartOfText, 199998 },
1167+
{ EndOfText, 199999 },
1168+
{ $"{ReservedPrefix}200000|>", 200000 },
1169+
{ $"{ReservedPrefix}200001|>", 200001 },
1170+
{ Return, 200002 },
1171+
{ Constrain, 200003 },
1172+
{ $"{ReservedPrefix}200004|>", 200004 },
1173+
{ Channel, 200005 },
1174+
{ Start, 200006 },
1175+
{ End, 200007 },
1176+
{ Message, 200008 },
1177+
{ $"{ReservedPrefix}200009|>", 200009 },
1178+
{ $"{ReservedPrefix}200010|>", 200010 },
1179+
{ $"{ReservedPrefix}200011|>", 200011 },
1180+
{ Call, 200012 },
1181+
{ $"{ReservedPrefix}200013|>", 200013 },
1182+
{ $"{ReservedPrefix}200014|>", 200014 },
1183+
{ $"{ReservedPrefix}200015|>", 200015 },
1184+
{ $"{ReservedPrefix}200016|>", 200016 },
1185+
{ $"{ReservedPrefix}200017|>", 200017 },
1186+
// Last id in the table; it is keyed by the EndOfPrompt constant rather than a reserved name.
{ EndOfPrompt, 200018 },
1187+
};
1188+
11291189
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
11301190

11311191
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
@@ -1157,6 +1217,9 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
11571217
case ModelEncoding.R50kBase:
11581218
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
11591219

1220+
case ModelEncoding.O200kHarmony:
1221+
return (CreateHarmonyEncodingSpecialTokens(), O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
1222+
11601223
default:
11611224
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
11621225
}
@@ -1179,6 +1242,7 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
11791242
internal const string P50kEditEncodingName = "p50k_edit";
11801243
internal const string R50kBaseEncodingName = "r50k_base";
11811244
internal const string O200kBaseEncodingName = "o200k_base";
1245+
internal const string O200kHarmonyEncodingName = "o200k_harmony";
11821246

11831247
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase";
11841248
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
@@ -1474,6 +1538,10 @@ public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnly
14741538
{
14751539
modelEncoding = ModelEncoding.O200kBase;
14761540
}
1541+
else if (encodingName.Equals(O200kHarmonyEncodingName, StringComparison.OrdinalIgnoreCase))
1542+
{
1543+
modelEncoding = ModelEncoding.O200kHarmony;
1544+
}
14771545
else if (encodingName.Equals(P50kBaseEncodingName, StringComparison.OrdinalIgnoreCase))
14781546
{
14791547
modelEncoding = ModelEncoding.P50kBase;

0 commit comments

Comments
 (0)