Commit e318348

fix(model): support saved model and tflite conversion
1 parent: ba6ab51

File tree: 9 files changed (+150, -42 lines)


README.md

Lines changed: 2 additions & 2 deletions

@@ -120,9 +120,9 @@ docker-compose up -d
 
 - For _training, testing and using_ **CTC Models**, run `./scripts/install_ctc_decoders.sh`
 
-- For _training_ **Transducer Models** with RNNT Loss from [warp-transducer](https://github.com/HawkAaron/warp-transducer), run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only `export CUDA_HOME` when you have CUDA)
+- For _training_ **Transducer Models** with RNNT Loss in TF, make sure that [warp-transducer](https://github.com/HawkAaron/warp-transducer) **is not installed** (by simply run `pip3 uninstall warprnnt-tensorflow`) (**Recommended**)
 
-- For _training_ **Transducer Models** with RNNT Loss in TF, make sure that [warp-transducer](https://github.com/HawkAaron/warp-transducer) **is not installed** (by simply run `pip3 uninstall warprnnt-tensorflow`)
+- For _training_ **Transducer Models** with RNNT Loss from [warp-transducer](https://github.com/HawkAaron/warp-transducer), run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only `export CUDA_HOME` when you have CUDA)
 
 - For _mixed precision training_, use flag `--mxp` when running python scripts from [examples](./examples)
 

examples/conformer/saved_model.py renamed to examples/conformer/inference/gen_saved_model.py

Lines changed: 21 additions & 11 deletions

@@ -69,14 +69,24 @@
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
 
-# TODO: Support saved model conversion
-# class ConformerModule(tf.Module):
-#     def __init__(self, model: Conformer, name=None):
-#         super().__init__(name=name)
-#         self.model = model
-#         self.pred = model.make_tflite_function()
-
-
-# model = ConformerModule(model=conformer)
-# tf.saved_model.save(model, args.output_dir)
-conformer.save(args.output_dir, include_optimizer=False, save_format="tf")
+class ConformerModule(tf.Module):
+    def __init__(self, model: Conformer, name=None):
+        super().__init__(name=name)
+        self.model = model
+        self.num_rnns = config.model_config["prediction_num_rnns"]
+        self.rnn_units = config.model_config["prediction_rnn_units"]
+        self.rnn_nstates = 2 if config.model_config["prediction_rnn_type"] == "lstm" else 1
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
+    def pred(self, signal):
+        predicted = tf.constant(0, dtype=tf.int32)
+        states = tf.zeros([self.num_rnns, self.rnn_nstates, 1, self.rnn_units], dtype=tf.float32)
+        features = self.model.speech_featurizer.tf_extract(signal)
+        encoded = self.model.encoder_inference(features)
+        hypothesis = self.model._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states, tflite=False)
+        transcript = self.model.text_featurizer.indices2upoints(hypothesis.prediction)
+        return transcript
+
+
+module = ConformerModule(model=conformer)
+tf.saved_model.save(module, export_dir=args.output_dir, signatures=module.pred.get_concrete_function())
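
For reference, a minimal sketch (not part of this commit) of loading the exported module and invoking its `pred` signature; the paths are hypothetical, and the same flow appears in the runner script added later in this commit:

import tensorflow as tf
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio

# Hypothetical paths, for illustration only
module = tf.saved_model.load(export_dir="/path/to/exported_conformer")
signal = read_raw_audio("/path/to/audio.wav")  # 1-D float32 waveform
transcript = module.pred(signal)               # Unicode code points
print("".join(chr(u) for u in transcript))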

examples/conformer/tflite.py renamed to examples/conformer/inference/gen_tflite_model.py

Lines changed: 5 additions & 10 deletions

@@ -27,30 +27,25 @@
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
+tf.compat.v1.enable_control_flow_v2()
 
 parser = argparse.ArgumentParser(prog="Conformer TFLite")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
+parser.add_argument("--h5", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--subwords", action="store_true", help="Use subwords")
-
-parser.add_argument("--vocabulary", type=str, default=None, required=False,
-                    help="Path to vocabulary. Overrides path in config, if given.")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
 
 parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
 
 args = parser.parse_args()
 
-assert args.saved and args.output
+assert args.h5 and args.output
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.vocabulary is not None:
-    config.decoder_config["vocabulary"] = args.vocabulary
-
 if args.subwords:
     text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
@@ -59,7 +54,7 @@
 # build model
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 conformer.make(speech_featurizer.shape)
-conformer.load_weights(args.saved, by_name=True)
+conformer.load_weights(args.h5, by_name=True)
 conformer.summary(line_length=100)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
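
The hunks shown above stop before the conversion itself. A hedged sketch of how the remainder of the script presumably produces the TFLite file, assuming the model's `make_tflite_function()` (the helper referenced in the commented-out code this commit removes from `gen_saved_model.py`) is the conversion entry point; the converter settings here are illustrative, not taken from this diff:

concrete_func = conformer.make_tflite_function().get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
converter.experimental_new_converter = True
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # standard TFLite kernels
    tf.lite.OpsSet.SELECT_TF_OPS,    # fall back to TF ops where needed
]
tflite_model = converter.convert()

with open(args.output, "wb") as f:
    f.write(tflite_model)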

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from tensorflow_asr.utils import env_util
+
+logger = env_util.setup_environment()
+import tensorflow as tf
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--saved_model", type=str, default=None, help="The file path of saved model")
+
+parser.add_argument("filename", type=str, default=None, help="Audio file path")
+
+args = parser.parse_args()
+
+from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
+
+module = tf.saved_model.load(export_dir=args.saved_model)
+
+signal = read_raw_audio(args.filename)
+transcript = module.pred(signal)
+
+print("Transcript: ", "".join([chr(u) for u in transcript]))

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import tensorflow as tf
+
+from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("filename", metavar="FILENAME", help="Audio file to be played back")
+
+parser.add_argument("--tflite", type=str, default=None, help="Path to conformer tflite")
+
+parser.add_argument("--blank", type=int, default=0, help="Blank index")
+
+parser.add_argument("--num_rnns", type=int, default=1, help="Number of RNN layers in prediction network")
+
+parser.add_argument("--nstates", type=int, default=2, help="Number of RNN states in prediction network")
+
+parser.add_argument("--statesize", type=int, default=320, help="Size of RNN state in prediction network")
+
+args = parser.parse_args()
+
+tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
+
+signal = read_raw_audio(args.filename)
+
+input_details = tflitemodel.get_input_details()
+output_details = tflitemodel.get_output_details()
+tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
+tflitemodel.allocate_tensors()
+tflitemodel.set_tensor(input_details[0]["index"], signal)
+tflitemodel.set_tensor(input_details[1]["index"], tf.constant(args.blank, dtype=tf.int32))
+tflitemodel.set_tensor(input_details[2]["index"], tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32))
+tflitemodel.invoke()
+hyp = tflitemodel.get_tensor(output_details[0]["index"])
+
+print("".join([chr(u) for u in hyp]))

examples/demonstration/conformer.py

Lines changed: 6 additions & 4 deletions

@@ -14,7 +14,7 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import env_util, math_util
+from tensorflow_asr.utils import env_util
 
 logger = env_util.setup_environment()
 import tensorflow as tf
@@ -79,12 +79,14 @@
     logger.info(f"Transcript: {transcript[0].numpy().decode('UTF-8')}")
 elif args.timestamp:
     transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
-        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
+        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state()
+    )
     logger.info(f"Transcript: {transcript}")
     logger.info(f"Start time: {stime}")
     logger.info(f"End time: {etime}")
 else:
     code_points, _, _ = conformer.recognize_tflite(
-        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
-    transcript = tf.strings.unicode_encode(code_points, 'UTF-8').numpy().decode('UTF-8')
+        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state()
+    )
+    transcript = tf.strings.unicode_encode(code_points, "UTF-8").numpy().decode("UTF-8")
     logger.info(f"Transcript: {transcript}")

setup.py

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ def parse_requirements(lines: List[str]):
 
 setup(
     name="TensorFlowASR",
-    version="1.0.3",
+    version="1.0.2",
     author="Huy Le Nguyen",
     author_email="[email protected]",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",

tensorflow_asr/models/layers/embedding.py

Lines changed: 6 additions & 8 deletions

@@ -32,10 +32,7 @@ def __init__(
         self.regularizer = tf.keras.regularizers.get(regularizer)
         self.initializer = tf.keras.initializers.get(initializer)
 
-    def build(
-        self,
-        input_shape,
-    ):
+    def build(self, input_shape):
         self.embeddings = self.add_weight(
             name="embeddings",
             dtype=tf.float32,
@@ -47,10 +44,11 @@ def build(
         )
         self.built = True
 
-    def call(
-        self,
-        inputs,
-    ):
+    def call(self, inputs):
+        outputs = tf.cast(inputs, dtype=tf.int32)
+        return tf.nn.embedding_lookup(self.embeddings, outputs)
+
+    def recognize_tflite(self, inputs):
         outputs = tf.cast(tf.expand_dims(inputs, axis=-1), dtype=tf.int32)
         return tf.gather_nd(self.embeddings, outputs)
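
The two lookup paths return the same vectors for a single predicted token; the split presumably exists because the `tf.gather_nd` form converts more cleanly in the TFLite graph. A minimal standalone sketch (not part of the commit, with made-up sizes) illustrating the equivalence:

import tensorflow as tf

vocab_size, embed_dim = 8, 4
embeddings = tf.random.normal([vocab_size, embed_dim])
ids = tf.constant([[3]], dtype=tf.int32)  # shape [1, 1], one predicted token

# Training/eval path (Embedding.call): batched embedding lookup
a = tf.nn.embedding_lookup(embeddings, ids)                 # [1, 1, embed_dim]

# TFLite inference path (Embedding.recognize_tflite): gather_nd over index tuples
b = tf.gather_nd(embeddings, tf.expand_dims(ids, axis=-1))  # [1, 1, embed_dim]

assert bool(tf.reduce_all(tf.equal(a, b)))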

tensorflow_asr/models/transducer/base_transducer.py

Lines changed: 17 additions & 6 deletions

@@ -104,7 +104,7 @@ def call(self, inputs, training=False, **kwargs):
             outputs = rnn["projection"](outputs, training=training)
         return outputs
 
-    def recognize(self, inputs, states):
+    def recognize(self, inputs, states, tflite: bool = False):
         """Recognize function for prediction network
 
         Args:
@@ -115,7 +115,10 @@ def recognize(self, inputs, states):
             tf.Tensor: outputs with shape [1, 1, P]
             tf.Tensor: new states with shape [num_lstms, 2, 1, P]
         """
-        outputs = self.embed(inputs, training=False)
+        if tflite:
+            outputs = self.embed.recognize_tflite(inputs)
+        else:
+            outputs = self.embed(inputs, training=False)
         outputs = self.do(outputs, training=False)
         new_states = []
         for i, rnn in enumerate(self.rnns):
@@ -390,6 +393,7 @@ def decoder_inference(
         encoded: tf.Tensor,
         predicted: tf.Tensor,
         states: tf.Tensor,
+        tflite: bool = False,
     ):
         """Infer function for decoder
 
@@ -404,7 +408,7 @@ def decoder_inference(
         with tf.name_scope(f"{self.name}_decoder"):
             encoded = tf.reshape(encoded, [1, 1, -1])  # [E] => [1, 1, E]
             predicted = tf.reshape(predicted, [1, 1])  # [] => [1, 1]
-            y, new_states = self.predict_net.recognize(predicted, states)  # [1, 1, P], states
+            y, new_states = self.predict_net.recognize(predicted, states, tflite=tflite)  # [1, 1, P], states
             ytu = tf.nn.log_softmax(self.joint_net([encoded, y], training=False))  # [1, 1, V]
             ytu = tf.reshape(ytu, shape=[-1])  # [1, 1, V] => [V]
             return ytu, new_states
@@ -455,7 +459,7 @@ def recognize_tflite(
        """
        features = self.speech_featurizer.tf_extract(signal)
        encoded = self.encoder_inference(features)
-       hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states)
+       hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states, tflite=True)
        transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
        return transcript, hypothesis.index, hypothesis.states
 
@@ -467,7 +471,7 @@ def recognize_tflite_with_timestamp(
    ):
        features = self.speech_featurizer.tf_extract(signal)
        encoded = self.encoder_inference(features)
-       hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states)
+       hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states, tflite=True)
        indices = self.text_featurizer.normalize_indices(hypothesis.prediction)
        upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1))  # [None, max_subword_length]
 
@@ -540,6 +544,7 @@ def _perform_greedy(
         states: tf.Tensor,
         parallel_iterations: int = 10,
         swap_memory: bool = False,
+        tflite: bool = False,
     ):
         with tf.name_scope(f"{self.name}_greedy"):
             time = tf.constant(0, dtype=tf.int32)
@@ -566,6 +571,7 @@ def body(_time, _hypothesis):
                     encoded=tf.gather_nd(encoded, tf.reshape(_time, shape=[1])),
                     predicted=_hypothesis.index,
                     states=_hypothesis.states,
+                    tflite=tflite,
                 )
                 _predict = tf.argmax(ytu, axis=-1, output_type=tf.int32)  # => argmax []
 
@@ -605,6 +611,7 @@ def _perform_greedy_v2(
         states: tf.Tensor,
         parallel_iterations: int = 10,
         swap_memory: bool = False,
+        tflite: bool = False,
     ):
         """Ref: https://arxiv.org/pdf/1801.00841.pdf"""
         with tf.name_scope(f"{self.name}_greedy_v2"):
@@ -632,6 +639,7 @@ def body(_time, _hypothesis):
                     encoded=tf.gather_nd(encoded, tf.reshape(_time, shape=[1])),
                     predicted=_hypothesis.index,
                     states=_hypothesis.states,
+                    tflite=tflite,
                 )
                 _predict = tf.argmax(ytu, axis=-1, output_type=tf.int32)  # => argmax []
 
@@ -736,6 +744,7 @@ def _perform_beam_search(
         lm: bool = False,
         parallel_iterations: int = 10,
         swap_memory: bool = True,
+        tflite: bool = False,
     ):
         with tf.name_scope(f"{self.name}_beam_search"):
             beam_width = tf.cond(
@@ -834,7 +843,9 @@ def beam_body(beam, beam_width, A, A_i, B):
                     )
                     A_i = tf.cond(tf.equal(A_i, 0), true_fn=lambda: A_i, false_fn=lambda: A_i - 1)
 
-                    ytu, new_states = self.decoder_inference(encoded=encoded_t, predicted=y_hat_index, states=y_hat_states)
+                    ytu, new_states = self.decoder_inference(
+                        encoded=encoded_t, predicted=y_hat_index, states=y_hat_states, tflite=tflite
+                    )
 
                     def predict_condition(pred, A, A_i, B):
                         return tf.less(pred, self.text_featurizer.num_classes)
