
Commit f835118

Merge branch 'main' into sentencepiece
2 parents ed1b117 + 3525598


66 files changed: +4882 −1814 lines

README.md

Lines changed: 16 additions & 8 deletions
@@ -59,8 +59,8 @@ TensorFlowASR implements some automatic speech recognition architectures such as

 ### Baselines

-- **CTCModel** (End2end models using CTC Loss for training)
-- **Transducer Models** (End2end models using RNNT Loss for training)
+- **CTCModel** (End2end models using CTC Loss for training, currently supported DeepSpeech2, Jasper)
+- **Transducer Models** (End2end models using RNNT Loss for training, currently supported Conformer, ContextNet, Streaming Transducer)

 ### Publications

@@ -110,7 +110,9 @@ pip install .

 - For _training, testing and using_ **CTC Models**, run `./scripts/install_ctc_decoders.sh`

-- For _training_ **Transducer Models**, run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only `export CUDA_HOME` when you have CUDA)
+- For _training_ **Transducer Models** with RNNT Loss from [warp-transducer](https://github.com/HawkAaron/warp-transducer), run `export CUDA_HOME=/usr/local/cuda && ./scripts/install_rnnt_loss.sh` (**Note**: only `export CUDA_HOME` when you have CUDA)
+
+- For _training_ **Transducer Models** with RNNT Loss in TF, make sure that [warp-transducer](https://github.com/HawkAaron/warp-transducer) **is not installed** (by simply run `pip3 uninstall warprnnt-tensorflow`)

 - For _mixed precision training_, use flag `--mxp` when running python scripts from [examples](./examples)

@@ -166,11 +168,17 @@ speech_config: ...
 model_config: ...
 decoder_config: ...
 learning_config:
-  augmentations: ...
-  dataset_config:
-    train_paths: ...
-    eval_paths: ...
-    test_paths: ...
+  train_dataset_config:
+    augmentation_config: ...
+    data_paths: ...
+    tfrecords_dir: ...
+  eval_dataset_config:
+    augmentation_config: ...
+    data_paths: ...
+    tfrecords_dir: ...
+  test_dataset_config:
+    augmentation_config: ...
+    data_paths: ...
     tfrecords_dir: ...
   optimizer_config: ...
   running_config:
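The last hunk above changes the documented `learning_config` layout from a single shared `dataset_config` block to per-stage `train_dataset_config`, `eval_dataset_config` and `test_dataset_config` blocks. As a rough sketch of how the new layout reads (using plain PyYAML here rather than the repository's own `Config` wrapper, whose constructor is not shown in this commit; the file path is only an example):

```python
import yaml  # PyYAML

# Example path; any YAML file that follows the new learning_config layout works.
with open("examples/conformer/config.yml") as f:
    config = yaml.safe_load(f)

learning = config["learning_config"]
# Each stage now carries its own data_paths, tfrecords_dir and (for training) augmentation_config.
train_cfg = learning["train_dataset_config"]
test_cfg = learning["test_dataset_config"]
print(train_cfg["data_paths"], test_cfg.get("tfrecords_dir"))
```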

examples/conformer/README.md

Lines changed: 5 additions & 78 deletions
@@ -6,81 +6,7 @@ Reference: [https://arxiv.org/abs/2005.08100](https://arxiv.org/abs/2005.08100)

 ## Example Model YAML Config

-```yaml
-speech_config:
-  sample_rate: 16000
-  frame_ms: 25
-  stride_ms: 10
-  feature_type: log_mel_spectrogram
-  num_feature_bins: 80
-  preemphasis: 0.97
-  normalize_signal: True
-  normalize_feature: True
-  normalize_per_feature: False
-
-decoder_config:
-  vocabulary: null
-  target_vocab_size: 1024
-  max_subword_length: 4
-  blank_at_zero: True
-  beam_width: 5
-  norm_score: True
-
-model_config:
-  name: conformer
-  subsampling:
-    type: conv2
-    kernel_size: 3
-    strides: 2
-    filters: 144
-  positional_encoding: sinusoid_concat
-  dmodel: 144
-  num_blocks: 16
-  head_size: 36
-  num_heads: 4
-  mha_type: relmha
-  kernel_size: 32
-  fc_factor: 0.5
-  dropout: 0.1
-  embed_dim: 320
-  embed_dropout: 0.0
-  num_rnns: 1
-  rnn_units: 320
-  rnn_type: lstm
-  layer_norm: True
-  joint_dim: 320
-
-learning_config:
-  augmentations:
-    after:
-      time_masking:
-        num_masks: 10
-        mask_factor: 100
-        p_upperbound: 0.2
-      freq_masking:
-        num_masks: 1
-        mask_factor: 27
-
-  dataset_config:
-    train_paths: ...
-    eval_paths: ...
-    test_paths: ...
-    tfrecords_dir: ...
-
-  optimizer_config:
-    warmup_steps: 10000
-    beta1: 0.9
-    beta2: 0.98
-    epsilon: 1e-9
-
-  running_config:
-    batch_size: 4
-    num_epochs: 22
-    outdir: ...
-    log_interval_steps: 400
-    save_interval_steps: 400
-    eval_interval_steps: 1000
-```
+Go to [config.yml](./config.yml)

 ## Usage

@@ -108,9 +34,10 @@ TFLite Conversion, see `python examples/conformer/tflite_*.py --help`

 **Error Rates**

-| **Test-clean** | WER (%)   | CER (%)    |
-| :------------: | :-------: | :--------: |
-| _Greedy_       | 6.4476862 | 2.51828337 |
+| **Test-clean** |  WER (%)   |  CER (%)   |
+| :------------: | :--------: | :--------: |
+|    _Greedy_    | 6.37933683 | 2.4757576  |
+|  _Greedy V2_   | 7.86670732 | 2.82563138 |

 | **Test-other** | WER (%) | CER (%) |
 | :------------: | :--------: | :--------: |
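For orientation, WER and CER in the tables above are word-level and character-level edit distances normalized by the reference length, expressed as percentages. The repository computes these with its own metric utilities; the snippet below is only a minimal reference sketch of the standard WER definition, not the project's code:

```python
def word_error_rate(reference: str, hypothesis: str) -> float:
    """Word-level Levenshtein distance divided by reference length, as a percentage."""
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edits needed to turn the first i reference words into the first j hypothesis words
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution
    return 100.0 * dp[len(ref)][len(hyp)] / max(len(ref), 1)

print(word_error_rate("the cat sat", "the cat sit"))  # 33.33...
```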

examples/conformer/config.yml

Lines changed: 43 additions & 19 deletions
@@ -24,12 +24,16 @@ speech_config:
   normalize_per_feature: False

 decoder_config:
-  vocabulary: null
-  target_vocab_size: 1024
+  vocabulary: ./vocabularies/librispeech_train_4_4076.subwords
+  target_vocab_size: 4096
   max_subword_length: 4
   blank_at_zero: True
   beam_width: 5
   norm_score: True
+  corpus_files:
+    - /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-clean-100/transcripts.tsv
+    - /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-clean-360/transcripts.tsv
+    - /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-other-500/transcripts.tsv

 model_config:
   name: conformer

@@ -53,31 +57,51 @@ model_config:
   prediction_rnn_units: 320
   prediction_rnn_type: lstm
   prediction_rnn_implementation: 2
-  prediction_layer_norm: True
+  prediction_layer_norm: False
   prediction_projection_units: 0
-  joint_dim: 320
+  joint_dim: 640
   joint_activation: tanh

 learning_config:
-  augmentations:
-    after:
-      time_masking:
-        num_masks: 10
-        mask_factor: 100
-        p_upperbound: 0.05
-      freq_masking:
-        num_masks: 1
-        mask_factor: 27
-
-  dataset_config:
-    train_paths:
+  train_dataset_config:
+    use_tf: True
+    augmentation_config:
+      after:
+        time_masking:
+          num_masks: 10
+          mask_factor: 100
+          p_upperbound: 0.05
+        freq_masking:
+          num_masks: 1
+          mask_factor: 27
+    data_paths:
       - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    eval_paths:
+    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
+    shuffle: True
+    cache: True
+    buffer_size: 100
+    drop_remainder: True
+
+  eval_dataset_config:
+    use_tf: True
+    data_paths:
       - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
       - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    test_paths:
+    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
+    shuffle: False
+    cache: True
+    buffer_size: 100
+    drop_remainder: True
+
+  test_dataset_config:
+    use_tf: True
+    data_paths:
       - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
+    shuffle: False
+    cache: True
+    buffer_size: 100
+    drop_remainder: True

   optimizer_config:
     warmup_steps: 40000
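The new per-stage keys (`shuffle`, `cache`, `buffer_size`, `drop_remainder`) describe how each dataset pipeline is built. The sketch below shows how such flags are conventionally applied to a `tf.data` pipeline; it is an illustration of what the options mean, not the repository's `ASRDataset` implementation, and the function name is made up:

```python
import tensorflow as tf

def build_pipeline(ds: tf.data.Dataset, batch_size: int, shuffle: bool,
                   cache: bool, buffer_size: int, drop_remainder: bool) -> tf.data.Dataset:
    if cache:
        ds = ds.cache()                # keep parsed examples in memory after the first pass
    if shuffle:
        ds = ds.shuffle(buffer_size)   # buffer_size controls how many examples are shuffled at once
    # drop_remainder keeps the final batch the same shape as the others (useful for static shapes)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    return ds.prefetch(tf.data.experimental.AUTOTUNE)
```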

examples/conformer/test_conformer.py

Lines changed: 14 additions & 26 deletions
@@ -25,26 +25,19 @@

 parser = argparse.ArgumentParser(prog="Conformer Testing")

-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")

-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")

-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords as dataset")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")

-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
+parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")

-parser.add_argument("--device", type=int, default=0,
-                    help="Device's id to run test on")
+parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")

-parser.add_argument("--cpu", default=False, action="store_true",
-                    help="Whether to only use cpu")
+parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")

-parser.add_argument("--output_name", type=str, default="test",
-                    help="Result filename name prefix")
+parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")

 args = parser.parse_args()

@@ -53,7 +46,7 @@
 setup_devices([args.device], cpu=args.cpu)

 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -67,19 +60,14 @@
 assert args.saved

 if args.tfrecords:
-    test_dataset = ASRTFRecordTestDataset(
-        data_paths=config.learning_config.dataset_config.test_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="test", shuffle=False
+    test_dataset = ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        **vars(config.learning_config.test_dataset_config)
     )
 else:
-    test_dataset = ASRSliceTestDataset(
-        data_paths=config.learning_config.dataset_config.test_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="test", shuffle=False
+    test_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        **vars(config.learning_config.test_dataset_config)
     )

 # build model
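The refactor above stops reading individual `dataset_config` fields and instead unpacks the whole per-stage config object into the dataset constructor with `**vars(...)`, so every attribute of `test_dataset_config` becomes a keyword argument. A minimal standalone illustration of the idiom (the names and values here are hypothetical, not the repository's classes):

```python
from types import SimpleNamespace

# Stand-in for config.learning_config.test_dataset_config (hypothetical values)
test_dataset_config = SimpleNamespace(
    data_paths=["/data/LibriSpeech/test-clean/transcripts.tsv"],
    tfrecords_dir="/data/LibriSpeech/tfrecords",
    shuffle=False,
    cache=True,
)

def make_dataset(data_paths, tfrecords_dir=None, shuffle=False, cache=False, **kwargs):
    # Receives exactly the attributes defined on the config object
    return {"data_paths": data_paths, "shuffle": shuffle, "cache": cache}

# vars(obj) returns obj.__dict__, so every config attribute becomes a keyword argument
dataset = make_dataset(**vars(test_dataset_config))
print(dataset)
```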

examples/conformer/test_subword_conformer.py

Lines changed: 15 additions & 28 deletions
@@ -25,31 +25,23 @@

 parser = argparse.ArgumentParser(prog="Conformer Testing")

-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")

-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")

-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords as dataset")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")

-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
+parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")

 parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")

-parser.add_argument("--device", type=int, default=0,
-                    help="Device's id to run test on")
+parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")

-parser.add_argument("--cpu", default=False, action="store_true",
-                    help="Whether to only use cpu")
+parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")

-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
+parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")

-parser.add_argument("--output_name", type=str, default="test",
-                    help="Result filename name prefix")
+parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")

 args = parser.parse_args()

@@ -58,7 +50,7 @@
 setup_devices([args.device], cpu=args.cpu)

 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -80,19 +72,14 @@
 assert args.saved

 if args.tfrecords:
-    test_dataset = ASRTFRecordTestDataset(
-        data_paths=config.learning_config.dataset_config.test_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="test", shuffle=False
+    test_dataset = ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        **vars(config.learning_config.test_dataset_config)
    )
 else:
-    test_dataset = ASRSliceTestDataset(
-        data_paths=config.learning_config.dataset_config.test_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="test", shuffle=False
+    test_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        **vars(config.learning_config.test_dataset_config)
     )

 # build model
