Skip to content

Commit 375082e

Browse files
aflah02 and mattdangerw authored
Adding Utility to Detokenize as list of Strings to Tokenizer Base Class (#124)
* Added Functions to Base Class * Tightened Logic started Work on Tests * Added tests * Updated Docstring * Fixing Tokenizer * Fixed Broken Tests * Ran format and lint * Fix docstring summary to fit on single line Adds a little more description as well * Remove trailing whitespace * fix * Ported tensor_to_string_list to tensor_utils Co-authored-by: Matt Watson <[email protected]> Co-authored-by: Matt Watson <[email protected]>
1 parent 7e678e4 commit 375082e

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

keras_nlp/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,6 @@
1515
from keras_nlp import layers
1616
from keras_nlp import metrics
1717
from keras_nlp import tokenizers
18+
from keras_nlp import utils
1819

1920
__version__ = "0.2.0-dev.1"

keras_nlp/utils/tensor_utils.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright 2022 The KerasNLP Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import tensorflow as tf
16+
17+
18+
def _decode_strings_to_utf8(inputs):
19+
"""Recursively decodes to list of strings with 'utf-8' encoding."""
20+
if isinstance(inputs, bytes):
21+
# Handles the case when the input is a scalar string.
22+
return inputs.decode("utf-8")
23+
else:
24+
# Recursively iterate when input is a list.
25+
return [_decode_strings_to_utf8(x) for x in inputs]
26+
27+
28+
def tensor_to_string_list(inputs):
    """Convert a tensor of byte strings to nested lists of python strings.

    This is a convenience method which converts each byte string in a
    string tensor to a python string with `"utf-8"` encoding, preserving
    the (possibly ragged) nesting structure of the input.

    Args:
        inputs: Input tensor, or anything convertible to a string tensor
            (e.g. a nested python list of byte strings).

    Returns:
        Nested python lists of python strings matching the shape of
        `inputs`, or a single python string if `inputs` is a scalar.
    """
    if not isinstance(inputs, (tf.RaggedTensor, tf.Tensor)):
        inputs = tf.convert_to_tensor(inputs)
    if isinstance(inputs, tf.RaggedTensor):
        # Ragged tensors convert directly to nested python lists of bytes.
        list_outputs = inputs.to_list()
    elif isinstance(inputs, tf.Tensor):
        list_outputs = inputs.numpy()
        if inputs.shape.rank != 0:
            # A rank-0 tensor yields a single bytes object from `.numpy()`;
            # anything higher-rank is a numpy array we flatten to lists.
            list_outputs = list_outputs.tolist()
    return _decode_strings_to_utf8(list_outputs)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2022 The KerasNLP Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import tensorflow as tf

# Use the package-qualified path so the test runs from the repo root;
# a bare `from tensor_utils import ...` only resolves when the working
# directory is keras_nlp/utils/.
from keras_nlp.utils.tensor_utils import tensor_to_string_list


class TensorToStringListTest(tf.test.TestCase):
    """Tests for converting string tensors to nested python string lists."""

    def test_detokenize_to_strings_for_ragged(self):
        ragged_input = tf.ragged.constant([["▀▁▂▃", "samurai"]])
        output = tensor_to_string_list(ragged_input)
        self.assertAllEqual(output, [["▀▁▂▃", "samurai"]])

    def test_detokenize_to_strings_for_dense(self):
        dense_input = tf.constant([["▀▁▂▃", "samurai"]])
        output = tensor_to_string_list(dense_input)
        self.assertAllEqual(output, [["▀▁▂▃", "samurai"]])

    def test_detokenize_to_strings_for_scalar(self):
        # A rank-0 input should come back as a single python string,
        # not a list.
        scalar_input = tf.constant("▀▁▂▃")
        output = tensor_to_string_list(scalar_input)
        self.assertEqual(output, "▀▁▂▃")

0 commit comments

Comments
 (0)