
Commit fa97b4f

Merge pull request #74 from TensorSpeech/dev/masking

Add support for masking

2 parents: d52852a + 2c5904e

18 files changed: +707 −63 lines

README.md: 3 additions, 8 deletions

```diff
@@ -2,11 +2,11 @@
 <p>TensorFlowASR :zap:</p>
 <p align="center">
 <a href="https://github.com/TensorSpeech/TensorFlowASR/blob/main/LICENSE">
-<img alt="GitHub" src="https://img.shields.io/github/license/TensorSpeech/TensorFlowASR?logo=apache">
+<img alt="GitHub" src="https://img.shields.io/github/license/TensorSpeech/TensorFlowASR?logo=apache&logoColor=green">
 </a>
 <img alt="python" src="https://img.shields.io/badge/python-%3E%3D3.6-blue?logo=python">
 <img alt="tensorflow" src="https://img.shields.io/badge/tensorflow-%3E%3D2.3.0-orange?logo=tensorflow">
-<img alt="PyPI" src="https://img.shields.io/pypi/v/TensorFlowASR?color=%2300B4EF&label=release&logo=pypi&logoColor=%2300B4EF">
+<img alt="PyPI" src="https://img.shields.io/pypi/v/TensorFlowASR?color=%234285F4&label=release&logo=pypi&logoColor=%234285F4">
 </p>
 </h1>
 <h2 align="center">
@@ -19,16 +19,11 @@ TensorFlowASR implements some automatic speech recognition architectures such as
 
 ## What's New?
 
+- (12/12/2020) Add support for using masking
 - (11/14/2020) Supported Gradient Accumulation for Training in Larger Batch Size
 - (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft`
 - (10/31/2020) Update DeepSpeech2 and Supported Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
 - (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621)
-- (10/15/2020) Add gradients accumulation and Refactor to TensorflowASR
-- (10/10/2020) Update documents and upload package to pypi
-- (10/6/2020) Change `nlpaug` version to `>=1.0.1`
-- (9/18/2020) Support `word-pieces` (aka `subwords`) using `tensorflow-datasets`
-- Support `transducer` tflite greedy decoding (conversion and invocation)
-- Distributed training using `tf.distribute.MirroredStrategy`
 
 ## Table of Contents
```

examples/conformer/config.yml: 1 addition, 1 deletion

```diff
@@ -85,7 +85,7 @@ learning_config:
     epsilon: 1e-9
 
   running_config:
-    batch_size: 4
+    batch_size: 2
     accumulation_steps: 4
     num_epochs: 20
     outdir: /mnt/d/Models/local/conformer
```
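With `accumulation_steps: 4` unchanged, this halves the effective per-replica batch (batch_size × accumulation_steps) from 4 × 4 = 16 to 2 × 4 = 8 examples per optimizer step.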
New file: 5 additions, 0 deletions (README for the masking example)

```markdown
# Training Conformer with Attention Masking

This is an example for anyone who wants to apply masking in Conformer.

**Note**: This is not good practice, since Conformer uses time reduction, which leads to incorrect masks being created.
```
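To make the note concrete: the helper below estimates each utterance's reduced length as `ceil(length / time_reduction_factor)`, but the convolutional subsampling that actually performs the time reduction can produce different output lengths. A hypothetical arithmetic sketch, assuming "valid"-padded stride-2 convolutions with kernel size 3 (an assumed configuration, not necessarily the package's exact subsampling layer):

```python
import math

def conv_out_len(length, kernel=3, stride=2):
    # Output length of one "valid"-padded convolution along the time axis
    return (length - kernel) // stride + 1

for length in (7, 10, 16):
    actual = conv_out_len(conv_out_len(length))  # two stride-2 convs: reduction factor 4
    estimated = math.ceil(length / 4)            # what the mask construction assumes
    print(f"length={length}: conv output={actual}, mask assumes {estimated}")
# length=7: conv output=1, mask assumes 2
# length=10: conv output=1, mask assumes 3
# length=16: conv output=3, mask assumes 4
```

Whenever the two disagree, attention positions get masked (or left unmasked) incorrectly, which is what the note above warns about.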
New file: 32 additions, 0 deletions (padding-mask helper)

```python
import tensorflow as tf
from tensorflow_asr.utils.utils import shape_list


def create_padding_mask(features, input_length, time_reduction_factor):
    """
    Create a mask with 0 for padding positions and 1 for non-padding positions.

    Args:
        features (tf.Tensor): audio features with shape [B, T, F, C]
        input_length (tf.Tensor): audio feature lengths with shape [B]
        time_reduction_factor (int)

    Returns:
        tf.Tensor: mask with shape [B, Tquery, Tkey]
    """
    batch_size, padded_time, _, _ = shape_list(features)
    # tf.math.ceil returns a float tensor, so cast to int32 before using the
    # value as a padding amount below
    reduced_padded_time = tf.cast(tf.math.ceil(padded_time / time_reduction_factor), tf.int32)

    def create_mask(length):
        # Number of valid frames after time reduction; cast for use as a shape
        reduced_length = tf.cast(tf.math.ceil(length / time_reduction_factor), tf.int32)
        mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32)
        # Zero-pad the all-ones block out to [reduced_padded_time, reduced_padded_time]
        return tf.pad(
            mask,
            [
                [0, reduced_padded_time - reduced_length],
                [0, reduced_padded_time - reduced_length]
            ],
            mode="CONSTANT",
            constant_values=0.0
        )

    return tf.map_fn(create_mask, input_length,
                     fn_output_signature=tf.TensorSpec([None, None], dtype=tf.float32))
```
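A quick sanity check of the helper above, with a dummy two-utterance batch padded to 16 frames (shapes and values follow directly from the `ceil` arithmetic):

```python
import tensorflow as tf

# Dummy batch: B=2, T=16 padded frames, F=80 features, C=1 channel
features = tf.zeros([2, 16, 80, 1])
input_length = tf.constant([9, 16], dtype=tf.int32)  # true (unpadded) frame counts

mask = create_padding_mask(features, input_length, time_reduction_factor=4)
print(mask.shape)  # (2, 4, 4), since ceil(16 / 4) = 4
print(mask[0].numpy())
# [[1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [0. 0. 0. 0.]]  <- ceil(9 / 4) = 3 valid reduced frames for the first utterance
```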
New file: 131 additions, 0 deletions (training script, character-level via CharFeaturizer)

```python
# Copyright 2020 Huy Le Nguyen (@usimarit)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
import argparse
from tensorflow_asr.utils import setup_environment, setup_strategy

setup_environment()
import tensorflow as tf

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Conformer Training")

parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--max_ckpts", type=int, default=10,
                    help="Max number of checkpoints to keep")

parser.add_argument("--tfrecords", default=False, action="store_true",
                    help="Whether to use tfrecords")

parser.add_argument("--tbs", type=int, default=None,
                    help="Train batch size per replica")

parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
                    help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

parser.add_argument("--mxp", default=False, action="store_true",
                    help="Enable mixed precision")

parser.add_argument("--cache", default=False, action="store_true",
                    help="Enable caching for dataset")

args = parser.parse_args()

tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from trainer import TrainerWithMaskingGA
from tensorflow_asr.models.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

if args.tfrecords:
    train_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )
else:
    train_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )

conformer_trainer = TrainerWithMaskingGA(
    config=config.learning_config.running_config,
    text_featurizer=text_featurizer, strategy=strategy
)

with conformer_trainer.strategy.scope():
    # build model
    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    conformer._build(speech_featurizer.shape)
    conformer.summary(line_length=120)

    optimizer = tf.keras.optimizers.Adam(
        TransformerSchedule(
            d_model=config.model_config["encoder_dmodel"],
            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
        ),
        beta_1=config.learning_config.optimizer_config["beta1"],
        beta_2=config.learning_config.optimizer_config["beta2"],
        epsilon=config.learning_config.optimizer_config["epsilon"]
    )

conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

conformer_trainer.fit(train_dataset, eval_dataset,
                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
```
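`TrainerWithMaskingGA` is imported from the example's local `trainer.py`, which is not among the files shown here. For context, a generic sketch of how a `[B, Tquery, Tkey]` padding mask like the one produced by `create_padding_mask` is typically consumed: attention implementations convert the 0/1 mask into an additive bias before the softmax. This illustrates the general technique, not the package's actual attention code:

```python
import tensorflow as tf

def apply_padding_mask(scores, mask):
    """scores: [B, H, Tq, Tk] attention logits; mask: [B, Tq, Tk] with 1 = keep, 0 = pad."""
    bias = (1.0 - mask) * -1e9             # 0 where kept, very negative where padded
    scores = scores + bias[:, tf.newaxis]  # broadcast the bias across the head axis
    return tf.nn.softmax(scores, axis=-1)  # padded keys get ~zero attention weight
```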
New file: 147 additions, 0 deletions (training script, subword-level via SubwordFeaturizer)

```python
# Copyright 2020 Huy Le Nguyen (@usimarit)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
import argparse
from tensorflow_asr.utils import setup_environment, setup_strategy

setup_environment()
import tensorflow as tf

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Conformer Training")

parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--max_ckpts", type=int, default=10,
                    help="Max number of checkpoints to keep")

parser.add_argument("--tfrecords", default=False, action="store_true",
                    help="Whether to use tfrecords")

parser.add_argument("--tbs", type=int, default=None,
                    help="Train batch size per replica")

parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
                    help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

parser.add_argument("--mxp", default=False, action="store_true",
                    help="Enable mixed precision")

parser.add_argument("--cache", default=False, action="store_true",
                    help="Enable caching for dataset")

parser.add_argument("--subwords", type=str, default=None,
                    help="Path to file that stores generated subwords")

parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                    help="Transcript files for generating subwords")

args = parser.parse_args()

tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from trainer import TrainerWithMaskingGA
from tensorflow_asr.models.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config,
        corpus_files=args.subwords_corpus
    )
    text_featurizer.save_to_file(args.subwords)

if args.tfrecords:
    train_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )
else:
    train_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )

conformer_trainer = TrainerWithMaskingGA(
    config=config.learning_config.running_config,
    text_featurizer=text_featurizer, strategy=strategy
)

with conformer_trainer.strategy.scope():
    # build model
    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    conformer._build(speech_featurizer.shape)
    conformer.summary(line_length=120)

    optimizer = tf.keras.optimizers.Adam(
        TransformerSchedule(
            d_model=config.model_config["encoder_dmodel"],
            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
        ),
        beta_1=config.learning_config.optimizer_config["beta1"],
        beta_2=config.learning_config.optimizer_config["beta2"],
        epsilon=config.learning_config.optimizer_config["epsilon"]
    )

conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

conformer_trainer.fit(train_dataset, eval_dataset,
                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
```
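Both scripts drive the Adam optimizer with `TransformerSchedule`, capped at `max_lr = 0.05 / sqrt(encoder_dmodel)`. Assuming the schedule follows the standard transformer formula from Vaswani et al. (an assumption; the class itself is not shown in this diff), and taking `encoder_dmodel = 144` and `warmup_steps = 10000` as hypothetical config values, the numbers work out as follows:

```python
import math

def transformer_lr(step, d_model=144, warmup_steps=10000, max_lr=None):
    # Standard transformer schedule: d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    lr = (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)
    return min(lr, max_lr) if max_lr is not None else lr

max_lr = 0.05 / math.sqrt(144)                # the scripts' cap: ~4.17e-3
print(transformer_lr(100, max_lr=max_lr))     # early warmup: ~8.33e-06
print(transformer_lr(10000, max_lr=max_lr))   # peak at step == warmup_steps: ~8.33e-04
```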
