Merge pull request #233 from TensorSpeech/fix/models

nglehuy · web-flow · commit 937ef5abec09 · 2021-11-07T18:42:54.000+07:00
Support saved model conversion for transducer models
diff --git a/README.md b/README.md
@@ -120,9 +120,9 @@ docker-compose up -d
 
 - For _training, testing and using_ **CTC Models**, run `./scripts/install_ctc_decoders.sh`
 
-- For _training_ **Transducer Models** with RNNT Loss from [warp-transducer](https://github.com/HawkAaron/warp-transducer), run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only `export CUDA_HOME` when you have CUDA)
+- For _training_ **Transducer Models** with RNNT Loss in TF, make sure that [warp-transducer](https://github.com/HawkAaron/warp-transducer) **is not installed** (simply run `pip3 uninstall warprnnt-tensorflow` to remove it) (**Recommended**)
 
-- For _training_ **Transducer Models** with RNNT Loss in TF, make sure that [warp-transducer](https://github.com/HawkAaron/warp-transducer) **is not installed** (by simply run `pip3 uninstall warprnnt-tensorflow`)
+- For _training_ **Transducer Models** with RNNT Loss from [warp-transducer](https://github.com/HawkAaron/warp-transducer), run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only run `export CUDA_HOME=/usr/local/cuda` if you have CUDA installed)
 
 - For _mixed precision training_, use flag `--mxp` when running python scripts from [examples](./examples)
 
diff --git a/examples/conformer/inference/gen_saved_model.py b/examples/conformer/inference/gen_saved_model.py
@@ -69,14 +69,24 @@
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
 
-# TODO: Support saved model conversion
-# class ConformerModule(tf.Module):
-#     def __init__(self, model: Conformer, name=None):
-#         super().__init__(name=name)
-#         self.model = model
-#         self.pred = model.make_tflite_function()
-
-
-# model = ConformerModule(model=conformer)
-# tf.saved_model.save(model, args.output_dir)
-conformer.save(args.output_dir, include_optimizer=False, save_format="tf")
+class ConformerModule(tf.Module):
+    def __init__(self, model: Conformer, name=None):
+        super().__init__(name=name)
+        self.model = model
+        self.num_rnns = config.model_config["prediction_num_rnns"]
+        self.rnn_units = config.model_config["prediction_rnn_units"]
+        self.rnn_nstates = 2 if config.model_config["prediction_rnn_type"] == "lstm" else 1
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
+    def pred(self, signal):
+        predicted = tf.constant(0, dtype=tf.int32)
+        states = tf.zeros([self.num_rnns, self.rnn_nstates, 1, self.rnn_units], dtype=tf.float32)
+        features = self.model.speech_featurizer.tf_extract(signal)
+        encoded = self.model.encoder_inference(features)
+        hypothesis = self.model._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states, tflite=False)
+        transcript = self.model.text_featurizer.indices2upoints(hypothesis.prediction)
+        return transcript
+
+
+module = ConformerModule(model=conformer)
+tf.saved_model.save(module, export_dir=args.output_dir, signatures=module.pred.get_concrete_function())
diff --git a/examples/conformer/inference/gen_tflite_model.py b/examples/conformer/inference/gen_tflite_model.py
@@ -27,30 +27,25 @@
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
+tf.compat.v1.enable_control_flow_v2()
 
 parser = argparse.ArgumentParser(prog="Conformer TFLite")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
+parser.add_argument("--h5", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--subwords", action="store_true", help="Use subwords")
-
-parser.add_argument("--vocabulary", type=str, default=None, required=False,
-                    help="Path to vocabulary. Overrides path in config, if given.")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
 
 parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
 
 args = parser.parse_args()
 
-assert args.saved and args.output
+assert args.h5 and args.output
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.vocabulary is not None:
-    config.decoder_config["vocabulary"] = args.vocabulary
-
 if args.subwords:
     text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
@@ -59,7 +54,7 @@
 # build model
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 conformer.make(speech_featurizer.shape)
-conformer.load_weights(args.saved, by_name=True)
+conformer.load_weights(args.h5, by_name=True)
 conformer.summary(line_length=100)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
diff --git a/examples/conformer/inference/run_saved_model.py b/examples/conformer/inference/run_saved_model.py
@@ -0,0 +1,42 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from tensorflow_asr.utils import env_util
+
+logger = env_util.setup_environment()
+import tensorflow as tf
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--saved_model", type=str, default=None, help="The file path of saved model")
+
+parser.add_argument("filename", type=str, default=None, help="Audio file path")
+
+args = parser.parse_args()
+
+from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
+
+module = tf.saved_model.load(export_dir=args.saved_model)
+
+signal = read_raw_audio(args.filename)
+transcript = module.pred(signal)
+
+print("Transcript: ", "".join([chr(u) for u in transcript]))
diff --git a/examples/conformer/inference/run_tflite_model.py b/examples/conformer/inference/run_tflite_model.py
@@ -0,0 +1,50 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import tensorflow as tf
+
+from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("filename", metavar="FILENAME", help="Audio file to be played back")
+
+parser.add_argument("--tflite", type=str, default=None, help="Path to conformer tflite")
+
+parser.add_argument("--blank", type=int, default=0, help="Blank index")
+
+parser.add_argument("--num_rnns", type=int, default=1, help="Number of RNN layers in prediction network")
+
+parser.add_argument("--nstates", type=int, default=2, help="Number of RNN states in prediction network")
+
+parser.add_argument("--statesize", type=int, default=320, help="Size of RNN state in prediction network")
+
+args = parser.parse_args()
+
+tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
+
+signal = read_raw_audio(args.filename)
+
+input_details = tflitemodel.get_input_details()
+output_details = tflitemodel.get_output_details()
+tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
+tflitemodel.allocate_tensors()
+tflitemodel.set_tensor(input_details[0]["index"], signal)
+tflitemodel.set_tensor(input_details[1]["index"], tf.constant(args.blank, dtype=tf.int32))
+tflitemodel.set_tensor(input_details[2]["index"], tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32))
+tflitemodel.invoke()
+hyp = tflitemodel.get_tensor(output_details[0]["index"])
+
+print("".join([chr(u) for u in hyp]))
diff --git a/examples/demonstration/conformer.py b/examples/demonstration/conformer.py
@@ -14,7 +14,7 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import env_util, math_util
+from tensorflow_asr.utils import env_util
 
 logger = env_util.setup_environment()
 import tensorflow as tf
@@ -79,12 +79,14 @@
     logger.info(f"Transcript: {transcript[0].numpy().decode('UTF-8')}")
 elif args.timestamp:
     transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
-        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
+        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state()
+    )
     logger.info(f"Transcript: {transcript}")
     logger.info(f"Start time: {stime}")
     logger.info(f"End time: {etime}")
 else:
     code_points, _, _ = conformer.recognize_tflite(
-        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
-    transcript = tf.strings.unicode_encode(code_points, 'UTF-8').numpy().decode('UTF-8')
+        signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state()
+    )
+    transcript = tf.strings.unicode_encode(code_points, "UTF-8").numpy().decode("UTF-8")
     logger.info(f"Transcript: {transcript}")
diff --git a/requirements.txt b/requirements.txt
@@ -8,4 +8,45 @@ librosa==0.8.1
 PyYAML==5.4.1
 Pillow==8.3.2
 black==21.7b0
-flake8==3.9.2
+flake8==3.9.2
+sounddevice==0.4.3
+
+# extra=tf2.3
+tensorflow~=2.3.0
+tensorflow-text~=2.3.0
+tensorflow-io~=0.16.0
+
+# extra=tf2.3-gpu
+tensorflow-gpu~=2.3.0
+tensorflow-text~=2.3.0
+tensorflow-io~=0.16.0
+
+# extra=tf2.4
+tensorflow~=2.4.0
+tensorflow-text~=2.4.0
+tensorflow-io~=0.17.0
+
+# extra=tf2.4-gpu
+tensorflow-gpu~=2.4.0
+tensorflow-text~=2.4.0
+tensorflow-io~=0.17.0
+
+# extra=tf2.5
+tensorflow~=2.5.0
+tensorflow-text~=2.5.0
+tensorflow-io~=0.18.0
+
+# extra=tf2.5-gpu
+tensorflow-gpu~=2.5.0
+tensorflow-text~=2.5.0
+tensorflow-io~=0.18.0
+
+# extra=tf2.6
+tensorflow~=2.6.0
+tensorflow-text~=2.6.0
+tensorflow-io~=0.20.0
+
+# extra=tf2.6-gpu
+tensorflow-gpu~=2.6.0
+tensorflow-text~=2.6.0
+tensorflow-io~=0.20.0
diff --git a/setup.py b/setup.py
@@ -12,23 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
+from setuptools import find_packages, setup
+from typing import List
+from collections import defaultdict
 
-import setuptools
 
-readme_path = os.path.join(os.path.dirname(__file__), "README.md")
+def parse_requirements(lines: List[str]):
+    extras_requires = defaultdict(list)
+    extra = "requires"
+    for line in lines:
+        line = line.strip()
+        if line.startswith("# extra="):
+            extra = line.split("=")[1].strip()
+            continue
+        if line and line[0] != "#":
+            lib_package = line.split("#")[0].strip()  # split comments
+            extras_requires[extra].append(lib_package)
+    install_requires = extras_requires.pop("requires")
+    return install_requires, extras_requires
 
-with open(readme_path, "r", encoding="utf-8") as fh:
-    long_description = fh.read()
 
-requirements_path = os.path.join(os.path.dirname(__file__), "requirements.txt")
+with open("requirements.txt", "r", encoding="utf-8") as fr:
+    install_requires, extras_requires = parse_requirements(fr.readlines())
 
-with open(requirements_path, "r") as fr:
-    requirements = fr.read().splitlines()
-    print(requirements)
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
 
 
-setuptools.setup(
+setup(
     name="TensorFlowASR",
     version="1.0.2",
     author="Huy Le Nguyen",
@@ -37,26 +48,18 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/TensorSpeech/TensorFlowASR",
-    packages=setuptools.find_packages(include=["tensorflow_asr*"]),
-    install_requires=requirements,
-    extras_require={
-        "tf2.3": ["tensorflow~=2.3.0", "tensorflow-text~=2.3.0", "tensorflow-io~=0.16.0"],
-        "tf2.3-gpu": ["tensorflow-gpu~=2.3.0", "tensorflow-text~=2.3.0", "tensorflow-io~=0.16.0"],
-        "tf2.4": ["tensorflow~=2.4.0", "tensorflow-text~=2.4.0", "tensorflow-io~=0.17.0"],
-        "tf2.4-gpu": ["tensorflow-gpu~=2.4.0", "tensorflow-text~=2.4.0", "tensorflow-io~=0.17.0"],
-        "tf2.5": ["tensorflow~=2.5.0", "tensorflow-text~=2.5.0", "tensorflow-io~=0.18.0"],
-        "tf2.5-gpu": ["tensorflow-gpu~=2.5.0", "tensorflow-text~=2.5.0", "tensorflow-io~=0.18.0"],
-        "tf2.6": ["tensorflow~=2.6.0", "tensorflow-text~=2.6.0rc0", "tensorflow-io~=0.20.0"],
-        "tf2.6-gpu": ["tensorflow-gpu~=2.6.0", "tensorflow-text~=2.6.0rc0", "tensorflow-io~=0.20.0"],
-    },
+    packages=find_packages(include=("tensorflow_asr", "tensorflow_asr.*")),
+    install_requires=install_requires,
+    extras_require=extras_requires,
     classifiers=[
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
         "Intended Audience :: Science/Research",
         "Operating System :: POSIX :: Linux",
         "License :: OSI Approved :: Apache Software License",
         "Topic :: Software Development :: Libraries :: Python Modules",
     ],
-    python_requires=">=3.6",
+    python_requires=">=3.6, <4",
 )
diff --git a/tensorflow_asr/models/layers/embedding.py b/tensorflow_asr/models/layers/embedding.py
@@ -32,10 +32,7 @@ def __init__(
         self.regularizer = tf.keras.regularizers.get(regularizer)
         self.initializer = tf.keras.initializers.get(initializer)
 
-    def build(
-        self,
-        input_shape,
-    ):
+    def build(self, input_shape):
         self.embeddings = self.add_weight(
             name="embeddings",
             dtype=tf.float32,
@@ -47,12 +44,13 @@ def build(
         )
         self.built = True
 
-    def call(
-        self,
-        inputs,
-    ):
+    def call(self, inputs):
+        outputs = tf.cast(inputs, dtype=tf.int32)
+        return tf.nn.embedding_lookup(self.embeddings, outputs)
+
+    def recognize_tflite(self, inputs):
         outputs = tf.cast(tf.expand_dims(inputs, axis=-1), dtype=tf.int32)
-        return tf.gather_nd(self.embeddings, outputs)
+        return tf.gather_nd(self.embeddings, outputs)  # https://github.com/tensorflow/tensorflow/issues/42410
 
     def get_config(self):
         conf = super(Embedding, self).get_config()
diff --git a/tensorflow_asr/models/transducer/base_transducer.py b/tensorflow_asr/models/transducer/base_transducer.py