Skip to content

Commit 7125a24

Browse files
authored
Fixed Bug with Unicode Tokenizer Vocab Size (#243)
1 parent b5e0ef2 commit 7125a24

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

keras_nlp/tokenizers/unicode_character_tokenizer.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def __init__(
242242
self.replacement_char = replacement_char
243243
self.input_encoding = input_encoding
244244
self.output_encoding = output_encoding
245-
self.vocabulary_size = vocabulary_size
245+
self._vocabulary_size = vocabulary_size
246246

247247
def get_config(self):
248248
config = super().get_config()
@@ -255,15 +255,15 @@ def get_config(self):
255255
"replacement_char": self.replacement_char,
256256
"input_encoding": self.input_encoding,
257257
"output_encoding": self.output_encoding,
258-
"vocabulary_size": self.vocabulary_size,
258+
"vocabulary_size": self._vocabulary_size,
259259
}
260260
)
261261
return config
262262

263263
def vocabulary_size(self) -> int:
264264
"""Get the size of the tokenizer vocabulary. None implies no vocabulary
265265
size was provided"""
266-
return self.vocabulary_size
266+
return self._vocabulary_size
267267

268268
def tokenize(self, inputs):
269269
if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
@@ -299,8 +299,8 @@ def tokenize(self, inputs):
299299

300300
# Optionally clamps the output code point values to be in the
301301
# range [0, vocabulary_size)
302-
if self.vocabulary_size:
303-
tokens = tf.clip_by_value(tokens, 0, self.vocabulary_size - 1)
302+
if self._vocabulary_size:
303+
tokens = tf.clip_by_value(tokens, 0, self._vocabulary_size - 1)
304304

305305
return tokens
306306

0 commit comments

Comments (0)