# Default number of shard files when serializing a dataset to TFRecord
# (used as the default for ASRTFRecordDataset's tfrecords_shards parameter).
TFRECORD_SHARDS = 16
2929
3030
31- def write_tfrecord_file (splitted_entries ):
32- shard_path , entries = splitted_entries
33- with tf .io .TFRecordWriter (shard_path , options = 'ZLIB' ) as out :
34- for path , audio , indices in entries :
35- feature = {
36- "path" : bytestring_feature ([bytes (path , "utf-8" )]),
37- "audio" : bytestring_feature ([audio ]),
38- "indices" : bytestring_feature ([bytes (indices , "utf-8" )])
39- }
40- example = tf .train .Example (features = tf .train .Features (feature = feature ))
41- out .write (example .SerializeToString ())
42- print_one_line ("Processed:" , path )
43- print (f"\n Created { shard_path } " )
44-
45-
4631class ASRDataset (BaseDataset ):
4732 """ Dataset for ASR using Generator """
4833
@@ -54,40 +39,39 @@ def __init__(self,
5439 augmentations : Augmentation = Augmentation (None ),
5540 cache : bool = False ,
5641 shuffle : bool = False ,
57- use_tf : bool = False ,
5842 drop_remainder : bool = True ,
5943 buffer_size : int = BUFFER_SIZE ):
6044 super (ASRDataset , self ).__init__ (
6145 data_paths = data_paths , augmentations = augmentations ,
6246 cache = cache , shuffle = shuffle , stage = stage , buffer_size = buffer_size ,
63- use_tf = use_tf , drop_remainder = drop_remainder
47+ drop_remainder = drop_remainder
6448 )
6549 self .speech_featurizer = speech_featurizer
6650 self .text_featurizer = text_featurizer
6751
6852 def read_entries (self ):
69- self .lines = []
53+ self .entries = []
7054 for file_path in self .data_paths :
7155 print (f"Reading { file_path } ..." )
7256 with tf .io .gfile .GFile (file_path , "r" ) as f :
7357 temp_lines = f .read ().splitlines ()
7458 # Skip the header of tsv file
75- self .lines += temp_lines [1 :]
59+ self .entries += temp_lines [1 :]
7660 # The files is "\t" seperated
77- self .lines = [line .split ("\t " , 2 ) for line in self .lines ]
78- self . lines = np . array (self .lines )
79- for i , line in enumerate ( self .lines ):
80- self .lines [ i ][ - 1 ] = " " . join ([ str ( x ) for x in self .text_featurizer . extract ( line [ - 1 ]). numpy ()] )
81- if self .shuffle : np .random .shuffle (self .lines ) # Mix transcripts.tsv
82- self .total_steps = len (self .lines )
61+ self .entries = [line .split ("\t " , 2 ) for line in self .entries ]
62+ for i , line in enumerate (self .entries ):
63+ self . entries [ i ][ - 1 ] = " " . join ([ str ( x ) for x in self .text_featurizer . extract ( line [ - 1 ]). numpy ()])
64+ self .entries = np . array ( self .entries )
65+ if self .shuffle : np .random .shuffle (self .entries ) # Mix transcripts.tsv
66+ self .total_steps = len (self .entries )
8367
8468 def generator (self ):
85- for path , _ , indices in self .lines :
86- audio = load_and_convert_to_wav (path )
87- yield path , audio , indices
69+ for path , _ , indices in self .entries :
70+ audio = load_and_convert_to_wav (path ). numpy ()
71+ yield bytes ( path , "utf-8" ), audio , bytes ( indices , "utf-8" )
8872
89- def preprocess (self , path , audio , indices ):
90- def fn (_path , _audio , _indices ):
73+ def preprocess (self , path : tf . Tensor , audio : tf . Tensor , indices : tf . Tensor ):
74+ def fn (_path : bytes , _audio : bytes , _indices : bytes ):
9175 with tf .device ("/CPU:0" ):
9276 signal = read_raw_audio (_audio , self .speech_featurizer .sample_rate )
9377
@@ -111,7 +95,7 @@ def fn(_path, _audio, _indices):
11195 Tout = [tf .string , tf .float32 , tf .int32 , tf .int32 , tf .int32 , tf .int32 , tf .int32 ]
11296 )
11397
114- def tf_preprocess (self , path , audio , indices ):
98+ def tf_preprocess (self , path : tf . Tensor , audio : tf . Tensor , indices : tf . Tensor ):
11599 with tf .device ("/CPU:0" ):
116100 signal = tf_read_raw_audio (audio , self .speech_featurizer .sample_rate )
117101
@@ -130,7 +114,7 @@ def tf_preprocess(self, path, audio, indices):
130114
131115 return path , features , input_length , label , label_length , prediction , prediction_length
132116
133- def process (self , dataset , batch_size ):
117+ def process (self , dataset : tf . data . Dataset , batch_size : int ):
134118 dataset = dataset .map (self .parse , num_parallel_calls = AUTOTUNE )
135119
136120 if self .cache :
@@ -193,18 +177,34 @@ def __init__(self,
193177 tfrecords_shards : int = TFRECORD_SHARDS ,
194178 cache : bool = False ,
195179 shuffle : bool = False ,
196- use_tf : bool = False ,
180+ drop_remainder : bool = True ,
197181 buffer_size : int = BUFFER_SIZE ):
198182 super (ASRTFRecordDataset , self ).__init__ (
199183 stage = stage , speech_featurizer = speech_featurizer , text_featurizer = text_featurizer ,
200184 data_paths = data_paths , augmentations = augmentations , cache = cache , shuffle = shuffle , buffer_size = buffer_size ,
201- use_tf = use_tf
185+ drop_remainder = drop_remainder
202186 )
203187 self .tfrecords_dir = tfrecords_dir
204188 if tfrecords_shards <= 0 : raise ValueError ("tfrecords_shards must be positive" )
205189 self .tfrecords_shards = tfrecords_shards
206190 if not tf .io .gfile .exists (self .tfrecords_dir ): tf .io .gfile .makedirs (self .tfrecords_dir )
207191
192+ @staticmethod
193+ def write_tfrecord_file (splitted_entries ):
194+ shard_path , entries = splitted_entries
195+ with tf .io .TFRecordWriter (shard_path , options = 'ZLIB' ) as out :
196+ for path , _ , indices in entries :
197+ audio = load_and_convert_to_wav (path ).numpy ()
198+ feature = {
199+ "path" : bytestring_feature ([bytes (path , "utf-8" )]),
200+ "audio" : bytestring_feature ([audio ]),
201+ "indices" : bytestring_feature ([bytes (indices , "utf-8" )])
202+ }
203+ example = tf .train .Example (features = tf .train .Features (feature = feature ))
204+ out .write (example .SerializeToString ())
205+ print_one_line ("Processed:" , path )
206+ print (f"\n Created { shard_path } " )
207+
208208 def create_tfrecords (self ):
209209 if not tf .io .gfile .exists (self .tfrecords_dir ):
210210 tf .io .gfile .makedirs (self .tfrecords_dir )
@@ -217,16 +217,15 @@ def create_tfrecords(self):
217217
218218 self .read_entries ()
219219 if not self .total_steps or self .total_steps == 0 : return False
220- entries = np .fromiter (self .generator (), dtype = str )
221220
222221 def get_shard_path (shard_id ):
223222 return os .path .join (self .tfrecords_dir , f"{ self .stage } _{ shard_id } .tfrecord" )
224223
225224 shards = [get_shard_path (idx ) for idx in range (1 , self .tfrecords_shards + 1 )]
226225
227- splitted_entries = np .array_split (entries , self .tfrecords_shards )
226+ splitted_entries = np .array_split (self . entries , self .tfrecords_shards )
228227 with multiprocessing .Pool (self .tfrecords_shards ) as pool :
229- pool .map (write_tfrecord_file , zip (shards , splitted_entries ))
228+ pool .map (self . write_tfrecord_file , zip (shards , splitted_entries ))
230229
231230 return True
232231
@@ -260,12 +259,13 @@ class ASRSliceDataset(ASRDataset):
260259
261260 @staticmethod
262261 def load (record : tf .Tensor ):
263- audio = load_and_convert_to_wav (record [0 ])
262+ def fn (path : bytes ): return load_and_convert_to_wav (path .decode ("utf-8" )).numpy ()
263+ audio = tf .numpy_function (fn , inp = [record [0 ]], Tout = tf .string )
264264 return record [0 ], audio , record [2 ]
265265
266266 def create (self , batch_size : int ):
267267 self .read_entries ()
268268 if not self .total_steps or self .total_steps == 0 : return None
269- dataset = tf .data .Dataset .from_tensor_slices (self .lines )
269+ dataset = tf .data .Dataset .from_tensor_slices (self .entries )
270270 dataset = dataset .map (self .load , num_parallel_calls = AUTOTUNE )
271271 return self .process (dataset , batch_size )
0 commit comments