Don't accept a string dtype for unicode tokenizer (#147)

mattdangerw · web-flow · commit 7e678e402e16 · 2022-04-29T17:56:55.000-07:00
* Don't accept a string dtype for unicode tokenizer

This tokenizer cannot output strings, so only integer output dtypes are accepted.

* Cast the output to the layer dtype (`self.compute_dtype`)
diff --git a/keras_nlp/tokenizers/unicode_character_tokenizer.py b/keras_nlp/tokenizers/unicode_character_tokenizer.py
@@ -175,9 +175,9 @@ def __init__(
             kwargs["dtype"] = tf.int32
         else:
             dtype = tf.dtypes.as_dtype(kwargs["dtype"])
-            if not dtype.is_integer and dtype != tf.string:
+            if not dtype.is_integer:
                 raise ValueError(
-                    "Output dtype must be an integer type of a string. "
+                    "Output dtype must be an integer type. "
                     f"Received: dtype={dtype}"
                 )
 
@@ -251,6 +251,7 @@ def tokenize(self, inputs):
             replacement_char=self.replacement_char,
             input_encoding=self.input_encoding,
         )
+        tokens = tf.cast(tokens, self.compute_dtype)
 
         if self.sequence_length:
             output_shape = tokens.shape.as_list()