
Commit d18aac8 (parent: e05a115)

⚡ Update Ctc and DeepSpeech2, Supported Jasper

23 files changed: +591 −682 lines

examples/deepspeech2/README.md
15 additions, 36 deletions
@@ -6,22 +6,19 @@ References: [https://arxiv.org/abs/1512.02595](https://arxiv.org/abs/1512.02595)
 
 ```yaml
 model_config:
-  conv_conf:
-    conv_type: 2
-    conv_kernels: [[11, 41], [11, 21], [11, 11]]
-    conv_strides: [[2, 2], [1, 2], [1, 2]]
-    conv_filters: [32, 32, 96]
-    conv_dropout: 0
-  rnn_conf:
-    rnn_layers: 5
-    rnn_type: lstm
-    rnn_units: 512
-    rnn_bidirectional: True
-    rnn_rowconv: False
-    rnn_dropout: 0
-  fc_conf:
-    fc_units: [1024]
-    fc_dropout: 0
+  conv_type: conv2d
+  conv_kernels: [[11, 41], [11, 21], [11, 11]]
+  conv_strides: [[2, 2], [1, 2], [1, 2]]
+  conv_filters: [32, 32, 96]
+  conv_dropout: 0.1
+  rnn_nlayers: 5
+  rnn_type: lstm
+  rnn_units: 512
+  rnn_bidirectional: True
+  rnn_rowconv: 0
+  rnn_dropout: 0.1
+  fc_nlayers: 0
+  fc_units: 1024
 ```
 
 ## Architecture
@@ -30,24 +27,6 @@ model_config:
 
 ## Training and Testing
 
-See `python examples/deepspeech2/run_ds2.py --help`
+See `python examples/deepspeech2/train_ds2.py --help`
 
-## Results on VIVOS Dataset
-
-* Features: Spectrogram with `80` frequency channels
-* KenLM: `alpha = 2.0` and `beta = 1.0`
-* Epochs: `20`
-* Train set split ratio: `90:10`
-* Augmentation: `None`
-* Model architecture: same as [vivos.yaml](./configs/vivos.yml)
-
-**CTC Loss**
-
-<img src="./figs/ds2_vivos_ctc_loss.svg" alt="ds2_vivos_ctc_loss" width="300px" />
-
-**Error rates**
-
-|                 | WER (%)        | CER (%)        |
-| :-------------- | :------------: | :------------: |
-| *BeamSearch*    | 43.75243       | 17.991581      |
-| *BeamSearch LM* | **20.7561836** | **11.0304441** |
+See `python examples/deepspeech2/test_ds2.py --help`
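
For context, a minimal sketch (not part of this commit) of how the flattened `model_config` is consumed after this change. It mirrors the constructor call the commit introduces in `train_ds2.py` and `test_ds2.py`; the config path is a placeholder, and passing the same file to `UserConfig` as both default and user config is an assumption made here for brevity.

```python
import os

from tensorflow_asr.configs.user_config import UserConfig
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.models.deepspeech2 import DeepSpeech2

# Placeholder path; the scripts in this commit resolve it relative to __file__.
CONFIG_PATH = os.path.join("examples", "deepspeech2", "config.yml")
config = UserConfig(CONFIG_PATH, CONFIG_PATH, learning=True)

speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
text_featurizer = CharFeaturizer(config["decoder_config"])

# The flat keys (conv_type, rnn_nlayers, fc_units, ...) map one-to-one onto
# constructor keyword arguments; only the vocabulary size is supplied separately.
ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
ds2_model._build(speech_featurizer.shape)
ds2_model.summary(line_length=120)
```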

examples/deepspeech2/configs/vivos.yml renamed to examples/deepspeech2/config.yml
15 additions, 16 deletions
@@ -24,7 +24,7 @@ speech_config:
   normalize_per_feature: False
 
 decoder_config:
-  vocabulary: /mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt
+  vocabulary: ./vocabularies/vietnamese.characters
   blank_at_zero: False
   beam_width: 500
   lm_config:
@@ -33,21 +33,20 @@ decoder_config:
     beta: 1.0
 
 model_config:
-  conv_conf:
-    conv_type: 2
-    conv_kernels: [[11, 41], [11, 21], [11, 11]]
-    conv_strides: [[2, 2], [1, 2], [1, 2]]
-    conv_filters: [32, 32, 96]
-    conv_dropout: 0
-  rnn_conf:
-    rnn_layers: 5
-    rnn_type: lstm
-    rnn_units: 512
-    rnn_bidirectional: True
-    rnn_rowconv: False
-    rnn_dropout: 0
-  fc_conf:
-    fc_units: null
+  name: deepspeech2
+  conv_type: conv2d
+  conv_kernels: [[11, 41], [11, 21], [11, 11]]
+  conv_strides: [[2, 2], [1, 2], [1, 2]]
+  conv_filters: [32, 32, 96]
+  conv_dropout: 0.1
+  rnn_nlayers: 5
+  rnn_type: lstm
+  rnn_units: 512
+  rnn_bidirectional: True
+  rnn_rowconv: 0
+  rnn_dropout: 0.1
+  fc_nlayers: 0
+  fc_units: 1024
 
 learning_config:
   augmentations: null
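
As a quick sanity check when migrating an old `vivos.yml`, one could verify that `model_config` now carries only the flat keys shown above before unpacking it into the constructor. This is an illustrative snippet, not part of the repository; it assumes the file is plain YAML readable with PyYAML.

```python
import yaml

# Flat keys introduced by this commit (taken from the config above).
EXPECTED_KEYS = {
    "name", "conv_type", "conv_kernels", "conv_strides", "conv_filters",
    "conv_dropout", "rnn_nlayers", "rnn_type", "rnn_units", "rnn_bidirectional",
    "rnn_rowconv", "rnn_dropout", "fc_nlayers", "fc_units",
}

with open("examples/deepspeech2/config.yml") as f:
    model_config = yaml.safe_load(f)["model_config"]

# Legacy nested blocks (conv_conf, rnn_conf, fc_conf) or misspelled keys would
# otherwise surface as unexpected keyword arguments at model construction time.
unknown = set(model_config) - EXPECTED_KEYS
assert not unknown, f"unexpected model_config keys: {sorted(unknown)}"
```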

examples/deepspeech2/figs/ds2_vivos_ctc_loss.svg
0 additions, 1 deletion (this file was deleted)

examples/deepspeech2/model.py

Whitespace-only changes.

examples/deepspeech2/test_ds2.py
4 additions, 7 deletions
@@ -19,7 +19,7 @@
 setup_environment()
 import tensorflow as tf
 
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
@@ -54,7 +54,7 @@
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester
-from model import DeepSpeech2
+from tensorflow_asr.models.deepspeech2 import DeepSpeech2
 
 tf.random.set_seed(0)
 assert args.export
@@ -63,13 +63,10 @@
 speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
 text_featurizer = CharFeaturizer(config["decoder_config"])
 # Build DS2 model
-ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
-                        arch_config=config["model_config"],
-                        num_classes=text_featurizer.num_classes,
-                        name="deepspeech2")
+ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
 ds2_model._build(speech_featurizer.shape)
 ds2_model.load_weights(args.saved, by_name=True)
-ds2_model.summary(line_length=150)
+ds2_model.summary(line_length=120)
 ds2_model.add_featurizers(speech_featurizer, text_featurizer)
 
 if args.tfrecords:
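
The constructor change above is plain keyword-argument unpacking: with the flat config, every YAML key can become a named parameter and the dict is splatted straight into the model. A toy illustration only, not the real `DeepSpeech2` signature:

```python
# Toy stand-in for the new call style; the key names mirror the YAML above,
# the builder function itself is hypothetical.
model_config = {"rnn_nlayers": 5, "rnn_type": "lstm", "rnn_units": 512}

def build_model(rnn_nlayers: int, rnn_type: str, rnn_units: int, vocabulary_size: int) -> str:
    return f"{rnn_nlayers} x {rnn_type}({rnn_units}) -> {vocabulary_size} output classes"

# Equivalent in spirit to DeepSpeech2(**config["model_config"], vocabulary_size=...)
print(build_model(**model_config, vocabulary_size=95))
```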

examples/deepspeech2/train_ds2.py
4 additions, 7 deletions
@@ -19,7 +19,7 @@
 setup_environment()
 import tensorflow as tf
 
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
@@ -60,7 +60,7 @@
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.ctc_runners import CTCTrainer
-from model import DeepSpeech2
+from tensorflow_asr.models.deepspeech2 import DeepSpeech2
 
 config = UserConfig(DEFAULT_YAML, args.config, learning=True)
 speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
@@ -100,12 +100,9 @@
 ctc_trainer = CTCTrainer(text_featurizer, config["learning_config"]["running_config"])
 # Build DS2 model
 with ctc_trainer.strategy.scope():
-    ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
-                            arch_config=config["model_config"],
-                            num_classes=text_featurizer.num_classes,
-                            name="deepspeech2")
+    ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
     ds2_model._build(speech_featurizer.shape)
-    ds2_model.summary(line_length=150)
+    ds2_model.summary(line_length=120)
 # Compile
 ctc_trainer.compile(ds2_model, config["learning_config"]["optimizer_config"],
                     max_to_keep=args.max_ckpts)
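
Note that `train_ds2.py` keeps building the model inside `ctc_trainer.strategy.scope()`. A generic `tf.distribute` sketch of why that matters (assumed rationale, independent of this repo): variables created inside the scope are placed and replicated according to the strategy, so a model built outside it would not be tracked correctly when training on multiple devices.

```python
import tensorflow as tf

# Default strategy on a single device; CTCTrainer would typically hold a
# MirroredStrategy (or similar) when several GPUs are available.
strategy = tf.distribute.get_strategy()

with strategy.scope():
    # Variables created here are managed by the strategy, which is the same
    # reason DeepSpeech2 is instantiated and _build() is called inside
    # ctc_trainer.strategy.scope() in the script above.
    model = tf.keras.Sequential([tf.keras.layers.Dense(8, input_shape=(4,))])

model.summary(line_length=120)
```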

examples/jasper/README.md
20 additions, 0 deletions
@@ -0,0 +1,20 @@
+# Jasper
+
+References: [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
+
+## Model YAML Config Structure
+
+```yaml
+model_config:
+
+```
+
+## Architecture
+
+<img src="./figs/jasper_arch.png" alt="jasper_arch" width="500px" />
+
+## Training and Testing
+
+See `python examples/jasper/train_jasper.py --help`
+
+See `python examples/jasper/test_jasper.py --help`

examples/sadeepspeech2/config.yml renamed to examples/jasper/config.yml
29 additions, 22 deletions
@@ -24,7 +24,7 @@ speech_config:
   normalize_per_feature: False
 
 decoder_config:
-  vocabulary: /mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt
+  vocabulary: ./vocabularies/vietnamese.characters
   blank_at_zero: False
   beam_width: 500
   lm_config:
@@ -33,20 +33,27 @@ decoder_config:
     beta: 1.0
 
 model_config:
-  subsampling:
-    filters: 144
-    kernel_size: 32
-    strides: 2
-  att:
-    layers: 16
-    head_size: 36
-    num_heads: 4
-    ffn_size: 1024
-    dropout: 0
-  rnn:
-    layers: 1
-    units: 320
-    dropout: 0
+  name: jasper
+  dense: True
+  first_additional_block_channels: 256
+  first_additional_block_kernels: 11
+  first_additional_block_strides: 2
+  first_additional_block_dilation: 1
+  first_additional_block_dropout: 0.2
+  nsubblocks: 3
+  block_channels: [256, 384, 512, 640, 768]
+  block_kernels: [11, 13, 17, 21, 25]
+  block_dropout: [0.2, 0.2, 0.2, 0.3, 0.3]
+  second_additional_block_channels: 896
+  second_additional_block_kernels: 1
+  second_additional_block_strides: 1
+  second_additional_block_dilation: 2
+  second_additional_block_dropout: 0.4
+  third_additional_block_channels: 1024
+  third_additional_block_kernels: 1
+  third_additional_block_strides: 1
+  third_additional_block_dilation: 1
+  third_additional_block_dropout: 0.4
 
 learning_config:
   augmentations: null
@@ -61,14 +68,14 @@ learning_config:
     tfrecords_dir: /mnt/Data/ML/ASR/Preprocessed/Vivos/TFRecords
 
   optimizer_config:
-    name: transformer_adam
+    class_name: adam
     config:
-      warmup_steps: 10000
+      learning_rate: 0.0001
 
   running_config:
-    batch_size: 2
+    batch_size: 8
     num_epochs: 20
-    outdir: /mnt/Projects/asrk16/trained/local/vivos_self_att_ds2
-    log_interval_steps: 500
-    save_interval_steps: 500
-    eval_interval_steps: 700
+    outdir: /mnt/Projects/asrk16/trained/local/jasper
+    log_interval_steps: 400
+    save_interval_steps: 400
+    eval_interval_steps: 800
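
A sketch (not part of this commit) of how this renamed `config.yml` feeds the new Jasper model; the calls mirror `test_jasper.py` below, the path is a placeholder, and reusing the same file for both `UserConfig` arguments is an assumption.

```python
from tensorflow_asr.configs.user_config import UserConfig
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.models.jasper import Jasper

CONFIG_PATH = "examples/jasper/config.yml"  # placeholder; the scripts resolve it from __file__
config = UserConfig(CONFIG_PATH, CONFIG_PATH, learning=True)

speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
text_featurizer = CharFeaturizer(config["decoder_config"])

# The dense/block_*/additional_block_* keys above become keyword arguments of Jasper.
jasper = Jasper(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
jasper._build(speech_featurizer.shape)
jasper.summary(line_length=120)
```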
(binary file, 279 KB; filename and rich diff not captured in this view)

examples/sadeepspeech2/test_sadeepspeech2.py renamed to examples/jasper/test_jasper.py
29 additions, 22 deletions
@@ -1,3 +1,17 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import argparse
 from tensorflow_asr.utils import setup_environment, setup_devices
@@ -7,16 +21,18 @@
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
-parser = argparse.ArgumentParser(prog="Self Attention DS2")
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser(prog="Jasper Testing")
 
 parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML,
                     help="The file path of model configuration file")
 
 parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+                    help="Path to the model file to be exported")
 
 parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords")
+                    help="Whether to use tfrecords dataset")
 
 parser.add_argument("--mxp", default=False, action="store_true",
                     help="Enable mixed precision")
@@ -33,34 +49,25 @@
 
 setup_devices([args.device])
 
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.configs.user_config import UserConfig
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from model import SelfAttentionDS2
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester
-from ctc_decoders import Scorer
+from tensorflow_asr.models.jasper import Jasper
 
 tf.random.set_seed(0)
-assert args.saved
+assert args.export
 
 config = UserConfig(DEFAULT_YAML, args.config, learning=True)
 speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
 text_featurizer = CharFeaturizer(config["decoder_config"])
-
-text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"],
-                                  vocabulary=text_featurizer.vocab_array))
-
 # Build DS2 model
-satt_ds2_model = SelfAttentionDS2(
-    input_shape=speech_featurizer.shape,
-    arch_config=config["model_config"],
-    num_classes=text_featurizer.num_classes
-)
-satt_ds2_model._build(speech_featurizer.shape)
-satt_ds2_model.load_weights(args.saved, by_name=True)
-satt_ds2_model.summary(line_length=150)
-satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)
+jasper = Jasper(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
+jasper._build(speech_featurizer.shape)
+jasper.load_weights(args.saved, by_name=True)
+jasper.summary(line_length=120)
+jasper.add_featurizers(speech_featurizer, text_featurizer)
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
@@ -82,5 +89,5 @@
     config=config["learning_config"]["running_config"],
     output_name=args.output_name
 )
-ctc_tester.compile(satt_ds2_model)
+ctc_tester.compile(jasper)
 ctc_tester.run(test_dataset)