
Commit a30eec9

Cleanup SentencePiece tokenizer (#7427)
1 parent 444573b commit a30eec9

6 files changed: +8 -579 lines changed


src/Microsoft.ML.Tokenizers/Model/SentencePieceBaseModel.cs

Lines changed: 0 additions & 61 deletions
@@ -59,67 +59,6 @@ internal SentencePieceBaseModel(ModelProto modelProto, bool addBos = false, bool
                             specialTokens);
         }
 
-        internal SentencePieceBaseModel(SentencePieceOptions options)
-        {
-            if (options is null)
-            {
-                throw new ArgumentNullException(nameof(options));
-            }
-
-            if (options.Vocabulary is null)
-            {
-                throw new ArgumentNullException(nameof(options.Vocabulary));
-            }
-
-            if (options.BeginningOfSentenceToken is null)
-            {
-                throw new ArgumentNullException(nameof(options.BeginningOfSentenceToken));
-            }
-
-            if (options.EndOfSentenceToken is null)
-            {
-                throw new ArgumentNullException(nameof(options.EndOfSentenceToken));
-            }
-
-            if (options.UnknownToken is null)
-            {
-                throw new ArgumentNullException(nameof(options.UnknownToken));
-            }
-
-            AddBeginningOfSentence = options.AddBeginningOfSentence;
-            AddEndOfSentence = options.AddEndOfSentence;
-            BeginningOfSentenceToken = options.BeginningOfSentenceToken;
-            EndOfSentenceToken = options.EndOfSentenceToken;
-            UnknownToken = options.UnknownToken;
-            AddDummyPrefix = options.AddDummyPrefix;
-            EscapeWhiteSpaces = options.EscapeWhiteSpaces;
-            TreatWhitespaceAsSuffix = options.TreatWhitespaceAsSuffix;
-            ByteFallback = options.ByteFallback;
-            SpecialTokens = options.SpecialTokens;
-
-            if (SpecialTokens is not null && SpecialTokens.Count > 0)
-            {
-                InternalSpecialTokens = new Dictionary<StringSpanOrdinalKey, int>();
-                SpecialTokensReverse = new Dictionary<int, string>();
-
-                foreach (var item in SpecialTokens)
-                {
-                    InternalSpecialTokens.Add(new StringSpanOrdinalKey(item.Key), item.Value);
-                    SpecialTokensReverse.Add(item.Value, item.Key);
-                }
-
-                // We create this Regex object without a timeout, as we expect the match operation to complete in O(N) time complexity. Note that `specialTokens` are treated as constants after the tokenizer is created.
-                SpecialTokensRegex = new Regex(string.Join("|", SpecialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
-            }
-
-            Normalizer = new SentencePieceNormalizer(
-                            options.PrecompiledNormalizationData,
-                            options.RemoveExtraWhiteSpaces,
-                            options.AddDummyPrefix, options.EscapeWhiteSpaces,
-                            options.TreatWhitespaceAsSuffix,
-                            SpecialTokens);
-        }
-
         internal Regex? SpecialTokensRegex { get; }
 
         internal Dictionary<StringSpanOrdinalKey, int>? InternalSpecialTokens { get; }

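For readers tracking what was dropped above: the removed base-model constructor built SpecialTokensRegex by escaping every special-token key and joining the results with '|', so the encoder can find special tokens in a single pass and treat them atomically. A minimal, standalone C# sketch of that same pattern (the token strings and ids below are made up for illustration; only the regex construction mirrors the removed code):

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

class SpecialTokenRegexSketch
{
    static void Main()
    {
        // Hypothetical special tokens; real values would come from the tokenizer's configuration.
        var specialTokens = new Dictionary<string, int> { ["<s>"] = 1, ["</s>"] = 2, ["<unk>"] = 0 };

        // Same construction as the removed constructor: escape each key and join with '|'.
        var specialTokensRegex = new Regex(
            string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))),
            RegexOptions.Compiled);

        // Each match maps a special token directly to its id instead of going through normal tokenization.
        foreach (Match m in specialTokensRegex.Matches("<s>hello world</s>"))
        {
            Console.WriteLine($"'{m.Value}' -> id {specialTokens[m.Value]} at offset {m.Index}");
        }
    }
}
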
src/Microsoft.ML.Tokenizers/Model/SentencePieceBpeModel.cs

Lines changed: 0 additions & 46 deletions
@@ -41,52 +41,6 @@ internal SentencePieceBpeModel(ModelProto modelProto, bool addBos, bool addEos,
             OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; // 0x7F is the maximum value of the one byte UTF-8 character.
         }
 
-        internal SentencePieceBpeModel(SentencePieceOptions options) : base(options)
-        {
-            if (options.PrecompiledNormalizationData is not null)
-            {
-                throw new NotSupportedException("Normalization data is not supported for SentencePieceBpeModel.");
-            }
-
-            Debug.Assert(options.Vocabulary is not null);
-
-            int id = 0;
-            foreach (var item in options.Vocabulary!)
-            {
-                _vocab.Add(new StringSpanOrdinalKey(item.Token), (id, item.Score, (byte)ModelProto.Types.SentencePiece.Types.Type.Normal));
-                _vocabReverse.Add(id++, item.Token);
-            }
-
-            if (options.ByteFallback)
-            {
-                if (!_vocab.TryGetValue("<0x00>", out (int Id, float Score, byte Type) value))
-                {
-                    throw new ArgumentException("'ByteFallback' is enabled but the vocabulary must include a special token for each byte value (0-255) in the format <0xNN>, where NN represents the byte's hexadecimal value.");
-                }
-
-                ByteCodeToIdOffset = value.Id;
-                OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; // 0x7F is the maximum value of the one byte UTF-8 character.
-            }
-
-            if (!_vocab.TryGetValue(options.UnknownToken, out (int Id, float Score, byte Type) unknownToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the unknown token '{options.UnknownToken}'.");
-            }
-            UnknownId = unknownToken.Id;
-
-            if (!_vocab.TryGetValue(options.BeginningOfSentenceToken, out (int Id, float Score, byte Type) beginOfSentenceToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the beginning of sentence token '{options.BeginningOfSentenceToken}'.");
-            }
-            BeginningOfSentenceId = beginOfSentenceToken.Id;
-
-            if (!_vocab.TryGetValue(options.EndOfSentenceToken, out (int Id, float Score, byte Type) endOfSentenceToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the end of sentence token '{options.EndOfSentenceToken}'.");
-            }
-            EndOfSentenceId = endOfSentenceToken.Id;
-        }
-
         public override IReadOnlyDictionary<string, int> Vocabulary
         {
             get

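The byte-fallback checks removed above rely on the '<0xNN>' pieces sitting contiguously in the vocabulary: ByteCodeToIdOffset is the id of '<0x00>', and OneByteUtf8EncodingMaxId is that offset plus 0x7F. Under that assumption, a piece missing from the vocabulary falls back to one id per UTF-8 byte. A small illustrative sketch (the offset value here is hypothetical; only the offset-plus-byte arithmetic mirrors the removed code):

using System;
using System.Text;

class ByteFallbackSketch
{
    // Hypothetical id of the '<0x00>' piece; the real model looks it up in the vocabulary,
    // exactly as the removed constructor did.
    const int ByteCodeToIdOffset = 3;

    static void Main()
    {
        // An out-of-vocabulary piece is emitted as one id per UTF-8 byte.
        foreach (byte b in Encoding.UTF8.GetBytes("é"))
        {
            // Assuming '<0x00>'..'<0xFF>' are contiguous, the id is simply offset + byte value.
            Console.WriteLine($"byte 0x{b:X2} -> id {ByteCodeToIdOffset + b}");
        }
    }
}
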
src/Microsoft.ML.Tokenizers/Model/SentencePieceOptions.cs

Lines changed: 0 additions & 118 deletions
This file was deleted.

src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs

Lines changed: 0 additions & 24 deletions
@@ -30,16 +30,6 @@ internal SentencePieceTokenizer(ModelProto modelProto, bool addBos, bool addEos,
             };
         }
 
-        internal SentencePieceTokenizer(SentencePieceOptions options)
-        {
-            _model = options.ModelType switch
-            {
-                SentencePieceModelType.Bpe => new SentencePieceBpeModel(options),
-                SentencePieceModelType.Unigram => new SentencePieceUnigramModel(options),
-                _ => throw new ArgumentException($"The model type '{options.ModelType}' is not supported.", nameof(options.ModelType))
-            };
-        }
-
         /// <summary>
         /// The special tokens.
         /// </summary>
@@ -467,19 +457,5 @@ public static SentencePieceTokenizer Create(
 
             return new SentencePieceTokenizer(modelProto, addBeginOfSentence, addEndOfSentence, specialTokens);
         }
-
-        /// <summary>
-        /// Creates an instance of SentencePieceTokenizer.
-        /// </summary>
-        /// <param name="options">The options to use for the sentence piece tokenizer.</param>
-        public static SentencePieceTokenizer Create(SentencePieceOptions options)
-        {
-            if (options is null)
-            {
-                throw new ArgumentNullException(nameof(options));
-            }
-
-            return new SentencePieceTokenizer(options);
-        }
     }
 }

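With the options-based Create overload removed, the ModelProto/stream-based factory that remains above is the construction path. A hedged usage sketch, not part of this commit: the file path is a placeholder, the first argument is assumed to be a model stream, and the named parameters are taken from the surviving Create overload visible in the diff.

using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML.Tokenizers;

class CreateFromModelSketch
{
    static void Main()
    {
        // Placeholder path to a serialized SentencePiece model (protobuf ModelProto).
        using Stream modelStream = File.OpenRead("tokenizer.model");

        // The surviving factory parses the ModelProto and constructs the tokenizer from it.
        SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(
            modelStream, addBeginOfSentence: true, addEndOfSentence: false);

        IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello world");
        Console.WriteLine(string.Join(", ", ids));
    }
}
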
src/Microsoft.ML.Tokenizers/Model/SentencePieceUnigramModel.cs

Lines changed: 8 additions & 58 deletions
@@ -27,6 +27,14 @@ internal sealed class SentencePieceUnigramModel : SentencePieceBaseModel
         public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos, IReadOnlyDictionary<string, int>? specialTokens = null) : base(modelProto, addBos, addEos, specialTokens)
         {
             _vocab = new SortedDictionary<string, int>(OrdinalUtf8StringComparer.Instance);
+
+            if (modelProto.TrainerSpec.BosId >= modelProto.Pieces.Count ||
+                modelProto.TrainerSpec.EosId >= modelProto.Pieces.Count ||
+                modelProto.TrainerSpec.UnkId >= modelProto.Pieces.Count)
+            {
+                throw new ArgumentException("The BOS, EOS, or UNK token is not present in the vocabulary.");
+            }
+
             _vocabReverse = new (string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)[modelProto.Pieces.Count];
 
             _minScore = float.MaxValue;
@@ -85,64 +93,6 @@ public SentencePieceUnigramModel(ModelProto modelProto, bool addBos, bool addEos
             }
         }
 
-        public SentencePieceUnigramModel(SentencePieceOptions options) : base(options)
-        {
-            _vocab = new SortedDictionary<string, int>(OrdinalUtf8StringComparer.Instance);
-            // _vocabReverse = new (string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)[];
-
-            // 250_000 using big number to avoid reallocation during the initialization.
-            List<(string Piece, float Score, ModelProto.Types.SentencePiece.Types.Type Type)> vocabReverse = new(250_000);
-
-            _minScore = float.MaxValue;
-            _maxScore = float.MinValue;
-
-            int id = 0;
-            foreach ((string Token, float Score) item in options.Vocabulary!)
-            {
-                _vocab.Add(item.Token, id++);
-                vocabReverse.Add((item.Token, item.Score, ModelProto.Types.SentencePiece.Types.Type.Normal));
-                _minScore = Math.Min(_minScore, item.Score);
-                _maxScore = Math.Max(_maxScore, item.Score);
-            }
-
-            _vocabReverse = vocabReverse.ToArray();
-
-            if (options.ByteFallback)
-            {
-                if (!_vocab.TryGetValue("<0x00>", out id))
-                {
-                    throw new ArgumentException("'ByteFallback' is enabled but the vocabulary must include a special token for each byte value (0-255) in the format <0xNN>, where NN represents the byte's hexadecimal value.");
-                }
-
-                ByteCodeToIdOffset = id;
-                OneByteUtf8EncodingMaxId = ByteCodeToIdOffset + 0x7F; // 0x7F is the maximum value of the one byte UTF-8 character.
-                MaxIdByteFallbackId = ByteCodeToIdOffset + 0xFF; // from <0x00> to <0xFF>.
-            }
-
-            _trie = new DoubleArrayTrie(_vocab);
-
-            _vocabReverse[BeginningOfSentenceId] = (BeginningOfSentenceToken, 0f, 0);
-            _vocabReverse[EndOfSentenceId] = (EndOfSentenceToken, 0f, 0);
-
-            if (!_vocab.TryGetValue(options.UnknownToken, out int unknownToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the unknown token '{options.UnknownToken}'.");
-            }
-            UnknownId = unknownToken;
-
-            if (!_vocab.TryGetValue(options.BeginningOfSentenceToken, out int beginOfSentenceToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the beginning of sentence token '{options.BeginningOfSentenceToken}'.");
-            }
-            BeginningOfSentenceId = beginOfSentenceToken;
-
-            if (!_vocab.TryGetValue(options.EndOfSentenceToken, out int endOfSentenceToken))
-            {
-                throw new ArgumentException($"The vocabulary must include the end of sentence token '{options.EndOfSentenceToken}'.");
-            }
-            EndOfSentenceId = endOfSentenceToken;
-        }
-
         public override IReadOnlyDictionary<string, int> Vocabulary => new ReadOnlyDictionary<string, int>(_vocab);
 
         public int MaxIdByteFallbackId { get; }

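The eight lines added above are the only additions in this commit: TrainerSpec stores the BOS, EOS, and UNK tokens as indices into the Pieces list, so an index at or beyond Pieces.Count can never be resolved to a piece and is now rejected up front rather than failing later. A self-contained sketch of the same guard, with a hypothetical record standing in for the protobuf-generated TrainerSpec:

using System;
using System.Collections.Generic;

class SpecialIdGuardSketch
{
    // Hypothetical stand-in for the protobuf-generated trainer spec.
    record TrainerSpec(int BosId, int EosId, int UnkId);

    static void ValidateSpecialIds(TrainerSpec spec, IReadOnlyList<string> pieces)
    {
        // Mirrors the added check: every special id must index an existing piece.
        if (spec.BosId >= pieces.Count || spec.EosId >= pieces.Count || spec.UnkId >= pieces.Count)
        {
            throw new ArgumentException("The BOS, EOS, or UNK token is not present in the vocabulary.");
        }
    }

    static void Main()
    {
        var pieces = new List<string> { "<unk>", "<s>", "</s>", "▁the" };

        ValidateSpecialIds(new TrainerSpec(BosId: 1, EosId: 2, UnkId: 0), pieces);   // passes
        // ValidateSpecialIds(new TrainerSpec(BosId: 99, EosId: 2, UnkId: 0), pieces); // would throw
        Console.WriteLine("special ids are in range");
    }
}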