Commit 4d07e9c

✍️ update testing script
1 parent d86d621 commit 4d07e9c

Showing 9 changed files with 91 additions and 113 deletions.

examples/conformer/test.py

Lines changed: 37 additions & 25 deletions
@@ -14,9 +14,9 @@

 import os
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util

-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf

 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")

@@ -33,65 +33,77 @@

 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")

+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
 parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")

+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")

 parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")

-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")

 args = parser.parse_args()

+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

-setup_devices([args.device], cpu=args.cpu)
+env_util.setup_devices([args.device], cpu=args.cpu)

 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.conformer import Conformer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.conformer import Conformer

 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)

 if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)

 tf.random.set_seed(0)
-assert args.saved

 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )

 # build model
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 conformer._build(speech_featurizer.shape)
 conformer.load_weights(args.saved)
-conformer.summary(line_length=120)
+conformer.summary(line_length=100)
 conformer.add_featurizers(speech_featurizer, text_featurizer)

-conformer_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-conformer_tester.compile(conformer)
-conformer_tester.run(test_dataset)
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+results = conformer.predict(test_data_loader)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    print(f"Saving result to {args.output} ...")
+    with open(filepath, "w") as openfile:
+        openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+        for i, entry in test_dataset.entries:
+            groundtruth, greedy, beamsearch = results[i]
+            path, duration, _ = entry
+            openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
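Note on the new output format: the runner classes are gone, and the script now emits a plain TSV that any downstream tool can consume. A minimal sketch of reading it back with only the standard library, assuming nothing beyond the column layout written above (`test.tsv` is the script's default output path):

```python
import csv

# Parse the TSV produced by examples/conformer/test.py.
with open("test.tsv", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        # One utterance per row: audio path, duration, and three transcripts.
        exact = row["GREEDY"] == row["GROUNDTRUTH"]
        print(f"{row['PATH']}: greedy {'matches' if exact else 'differs from'} ground truth")
```

A proper evaluation would feed the GREEDY and BEAMSEARCH columns into a WER/CER metric rather than the exact-match check used here.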

tensorflow_asr/losses/ctc_loss.py

Lines changed: 4 additions & 6 deletions
@@ -21,13 +21,11 @@ def __init__(self, blank=0, global_batch_size=None, name=None):
         self.global_batch_size = global_batch_size

     def call(self, y_true, y_pred):
-        logits, logits_length = y_pred.values()
-        labels, labels_length = y_true.values()
         loss = ctc_loss(
-            y_pred=logits,
-            input_length=logits_length,
-            y_true=labels,
-            label_length=labels_length,
+            y_pred=y_pred["logits"],
+            input_length=y_pred["logits_length"],
+            y_true=y_true["labels"],
+            label_length=y_true["labels_length"],
             blank=self.blank,
             name=self.name
         )
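The motivation for the keyed lookups: unpacking `dict.values()` positionally depends on insertion order, so a caller that builds `y_pred` with the keys in a different order would silently swap logits and lengths. A toy illustration of that failure mode (plain dicts, no TensorFlow required):

```python
# Two dicts with the same keys in different insertion orders.
a = {"logits": "L", "logits_length": 10}
b = {"logits_length": 10, "logits": "L"}

# Positional unpacking silently swaps the fields for `b` ...
logits, logits_length = b.values()
assert (logits, logits_length) == (10, "L")  # wrong values, yet no error

# ... while keyed access is order-independent.
assert a["logits"] == b["logits"] == "L"
```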

tensorflow_asr/losses/rnnt_loss.py

Lines changed: 4 additions & 6 deletions
@@ -37,13 +37,11 @@ def __init__(self, blank=0, global_batch_size=None, name=None):
         self.global_batch_size = global_batch_size

     def call(self, y_true, y_pred):
-        logits, logits_length = y_pred.values()
-        labels, labels_length = y_true.values()
         loss = rnnt_loss(
-            logits=logits,
-            logit_length=logits_length,
-            labels=labels,
-            label_length=labels_length,
+            logits=y_pred["logits"],
+            logit_length=y_pred["logits_length"],
+            labels=y_true["labels"],
+            label_length=y_true["labels_length"],
             blank=self.blank,
             name=self.name
         )
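Both loss classes now share the same dict contract for `y_true` and `y_pred`. A sketch of the expected structure with dummy tensors, assuming only the key names indexed in the two diffs above (in the repo these dicts come from the model's `call` and the dataset pipeline):

```python
import tensorflow as tf

batch, max_time, vocab, max_label = 2, 50, 29, 12

# Keys must match what the loss `call` methods index above.
y_pred = {
    "logits": tf.random.normal([batch, max_time, vocab]),
    "logits_length": tf.constant([50, 42], dtype=tf.int32),
}
y_true = {
    "labels": tf.zeros([batch, max_label], dtype=tf.int32),
    "labels_length": tf.constant([12, 9], dtype=tf.int32),
}
```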

tensorflow_asr/models/base_model.py

Lines changed: 5 additions & 2 deletions
@@ -111,9 +111,12 @@ def predict_step(self, batch):
             [tf.Tensor]: stacked tensor of shape [B, 3] with each row is the text [truth, greedy, beam_search]
         """
         inputs, y_true = batch
-        labels = self.text_featurizer.iextract(y_true)
+        labels = self.text_featurizer.iextract(y_true["labels"])
         greedy_decoding = self.recognize(inputs)
-        beam_search_decoding = self.recognize_beam(inputs)
+        if self.text_featurizer.decoder_config.beam_width == 0:
+            beam_search_decoding = tf.map_fn(lambda _: tf.convert_to_tensor("", dtype=tf.string), labels)
+        else:
+            beam_search_decoding = self.recognize_beam(inputs)
         return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)

     def recognize(self, features, input_lengths, **kwargs):
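The new branch keeps `predict_step`'s [B, 3] output shape even when beam search is disabled (`beam_width == 0`): the third column is simply filled with empty strings instead of running the decoder. A standalone sketch of that fallback, assuming a batch of already-decoded label strings:

```python
import tensorflow as tf

labels = tf.constant(["hello world", "good morning"])

# One empty string per batch element, so
# tf.stack([labels, greedy, beam], axis=-1) still yields shape [B, 3].
beam_search_decoding = tf.map_fn(
    lambda _: tf.convert_to_tensor("", dtype=tf.string), labels
)
print(beam_search_decoding.numpy())  # [b'' b'']
```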

tensorflow_asr/models/ctc/ctc.py

Lines changed: 7 additions & 8 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Union
+from typing import Dict, Union
 import numpy as np
 import tensorflow as tf

@@ -69,19 +69,18 @@ def add_featurizers(self,
         self.text_featurizer = text_featurizer

     def call(self, inputs, training=False, **kwargs):
-        inputs, inputs_length, _, _ = inputs.values()
-        logits = self.encoder(inputs, training=training, **kwargs)
+        logits = self.encoder(inputs["inputs"], training=training, **kwargs)
         logits = self.decoder(logits, training=training, **kwargs)
         return data_util.create_logits(
             logits=logits,
-            logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor)
+            logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
         )

     # -------------------------------- GREEDY -------------------------------------

     @tf.function
-    def recognize(self, features: tf.Tensor, input_length: Optional[tf.Tensor]):
-        logits = self(features, training=False)
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
+        logits = self(inputs["inputs"], training=False)
         probs = tf.nn.softmax(logits)

         def map_fn(prob): return tf.numpy_function(self._perform_greedy, inp=[prob], Tout=tf.string)

@@ -119,8 +118,8 @@ def recognize_tflite(self, signal):
     # -------------------------------- BEAM SEARCH -------------------------------------

     @tf.function
-    def recognize_beam(self, features: tf.Tensor, input_length: Optional[tf.Tensor], lm: bool = False):
-        logits = self(features, training=False)
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
+        logits = self(inputs["inputs"], training=False)
         probs = tf.nn.softmax(logits)

         def map_fn(prob): return tf.numpy_function(self._perform_beam_search, inp=[prob, lm], Tout=tf.string)
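After this refactor the greedy and beam interfaces take a single dict instead of separate feature/length tensors. A hypothetical call site, assuming `model` is a built CTC model with loaded weights (the tensors below are placeholders, not real features):

```python
import tensorflow as tf

# Placeholder featurized batch; in the repo this comes from the dataset pipeline.
features = tf.zeros([1, 100, 80, 1], dtype=tf.float32)
features_length = tf.constant([100], dtype=tf.int32)

# `model` is assumed to be a built tensorflow_asr CTC model with loaded weights.
inputs = {"inputs": features, "inputs_length": features_length}
transcripts = model.recognize(inputs)            # greedy decoding
beam_transcripts = model.recognize_beam(inputs)  # beam-search decoding
```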

tensorflow_asr/models/transducer/contextnet.py

Lines changed: 10 additions & 24 deletions
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
+from typing import Dict, List
 import tensorflow as tf

 from ..encoders.contextnet import ContextNetEncoder, L2
 from .transducer import Transducer
+from ...utils import math_util


 class ContextNet(Transducer):

@@ -95,11 +96,7 @@ def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
     # -------------------------------- GREEDY -------------------------------------

     @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
         """
         RNN Transducer Greedy decoding
         Args:

@@ -108,12 +105,9 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_greedy_batch(
-            encoded, input_length,
-            parallel_iterations=parallel_iterations,
-            swap_memory=swap_memory
-        )
+        encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)

     def recognize_tflite(self, signal, predicted, prediction_states):
         """

@@ -161,12 +155,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, states):
     # -------------------------------- BEAM SEARCH -------------------------------------

     @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
         """
         RNN Transducer Beam Search
         Args:

@@ -176,9 +165,6 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_beam_search_batch(
-            encoded, input_length, lm,
-            parallel_iterations=parallel_iterations,
-            swap_memory=swap_memory
-        )
+        encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm)
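Both decoding paths now derive the encoded length from the raw input length instead of threading it through as an argument. The diff does not show `math_util.get_reduced_length` itself; assuming it performs the usual ceiling division by the time-reduction factor, it would behave like:

```python
import tensorflow as tf

def get_reduced_length(length, reduction_factor):
    # Assumed semantics: ceil(length / reduction_factor), cast back to int32.
    return tf.cast(tf.math.ceil(tf.divide(length, reduction_factor)), tf.int32)

print(get_reduced_length(tf.constant([100, 42]), 4).numpy())  # [25 11]
```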

tensorflow_asr/models/transducer/rnn_transducer.py

Lines changed: 11 additions & 19 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 """ http://arxiv.org/abs/1811.06621 """

+from typing import Dict
 import tensorflow as tf

 from ..layers.subsampling import TimeReduction

@@ -256,11 +257,7 @@ def encoder_inference(self, features: tf.Tensor, states: tf.Tensor):
     # -------------------------------- GREEDY -------------------------------------

     @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
         """
         RNN Transducer Greedy decoding
         Args:

@@ -269,10 +266,10 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_util.shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"])
+        encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size))
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)

     def recognize_tflite(self, signal, predicted, encoder_states, prediction_states):
         """

@@ -321,12 +318,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, prediction_states):
     # -------------------------------- BEAM SEARCH -------------------------------------

     @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
         """
         RNN Transducer Beam Search
         Args:

@@ -336,10 +328,10 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_util.shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"])
+        encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size))
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm)

     # -------------------------------- TFLITE -------------------------------------
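Together with the matching changes in the CTC and ContextNet files above, every model now exposes the same decoding signature, which is what lets `predict_step` in base_model.py stay model-agnostic. An illustrative summary of the shared contract (not an actual class in the repo):

```python
from typing import Dict
import tensorflow as tf

class RecognizerInterface:
    """Decoding interface shared by the CTC and transducer models after this commit."""

    def recognize(self, inputs: Dict[str, tf.Tensor]) -> tf.Tensor:
        """Greedy decoding; expects keys 'inputs' and 'inputs_length'."""
        raise NotImplementedError

    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False) -> tf.Tensor:
        """Beam-search decoding over the same input dict."""
        raise NotImplementedError
```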