Skip to content

Commit 7125a24

Browse files
authored
Fixed Bug with Unicode Tokenizer Vocab Size (#243)
1 parent b5e0ef2 commit 7125a24

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

keras_nlp/tokenizers/unicode_character_tokenizer.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def __init__(
242242
self.replacement_char = replacement_char
243243
self.input_encoding = input_encoding
244244
self.output_encoding = output_encoding
245-
self.vocabulary_size = vocabulary_size
245+
self._vocabulary_size = vocabulary_size
246246

247247
def get_config(self):
248248
config = super().get_config()
@@ -255,15 +255,15 @@ def get_config(self):
255255
"replacement_char": self.replacement_char,
256256
"input_encoding": self.input_encoding,
257257
"output_encoding": self.output_encoding,
258-
"vocabulary_size": self.vocabulary_size,
258+
"vocabulary_size": self._vocabulary_size,
259259
}
260260
)
261261
return config
262262

263263
def vocabulary_size(self) -> int:
264264
"""Get the size of the tokenizer vocabulary. None implies no vocabulary
265265
size was provided"""
266-
return self.vocabulary_size
266+
return self._vocabulary_size
267267

268268
def tokenize(self, inputs):
269269
if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
@@ -299,8 +299,8 @@ def tokenize(self, inputs):
299299

300300
# Optionally clamps the output code point values to be in the
301301
# range [0, vocabulary_size)
302-
if self.vocabulary_size:
303-
tokens = tf.clip_by_value(tokens, 0, self.vocabulary_size - 1)
302+
if self._vocabulary_size:
303+
tokens = tf.clip_by_value(tokens, 0, self._vocabulary_size - 1)
304304

305305
return tokens
306306

0 commit comments

Comments (0)