TensorSpeech
diff --git a/README.md b/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml
Lines changed: 11 additions & 18 deletions
diff --git a/notebooks/conformer.ipynb b/notebooks/conformer.ipynb
diff --git a/notebooks/contextnet.ipynb b/notebooks/contextnet.ipynb
diff --git a/notebooks/deepspeech2.ipynb b/notebooks/deepspeech2.ipynb
diff --git a/notebooks/jasper.ipynb b/notebooks/jasper.ipynb
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
Lines changed: 6 additions & 2 deletions
diff --git a/tensorflow_asr/models/ctc/ctc.py b/tensorflow_asr/models/ctc/ctc.py
Lines changed: 0 additions & 4 deletions
diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
Lines changed: 8 additions & 6 deletions
@@ -21,6 +21,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as
 
 ## What's New?
 
+- (04/17/2021) Refactor repository with new version 1.x
 - (02/16/2021) Supported for TPU training
 - (12/27/2020) Supported _naive_ token level timestamp, see [demo](./examples/demonstration/conformer.py) with flag `--timestamp`
 - (12/17/2020) Supported ContextNet [http://arxiv.org/abs/2005.03191](http://arxiv.org/abs/2005.03191)
 
@@ -207,8 +207,8 @@ learning_config:
           num_masks: 1
           mask_factor: 27
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
     buffer_size: 100
@@ -217,10 +217,8 @@ learning_config:
 
   eval_dataset_config:
     use_tf: True
-    data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    data_paths: null
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -230,8 +228,8 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/e/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -240,26 +238,21 @@ learning_config:
 
   optimizer_config:
     warmup_steps: 40000
-    beta1: 0.9
-    beta2: 0.98
+    beta_1: 0.9
+    beta_2: 0.98
     epsilon: 1e-9
 
   running_config:
     batch_size: 2
-    accumulation_steps: 4
     num_epochs: 20
-    outdir: /mnt/Miscellanea/Models/local/contextnet
-    log_interval_steps: 300
-    eval_interval_steps: 500
-    save_interval_steps: 1000
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/contextnet/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5
       save_best_only: True
       save_weights_only: False
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/contextnet/states
+    states_dir: /mnt/e/Models/local/contextnet/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/contextnet/tensorboard
+      log_dir: /mnt/e/Models/local/contextnet/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
 
@@ -95,10 +95,12 @@ def train_step(self, batch):
             y_pred = self(inputs, training=True)
             loss = self.loss(y_true, y_pred)
             if self.use_loss_scale:
-                loss = self.optimizer.get_scaled_loss(loss)
-        gradients = tape.gradient(loss, self.trainable_weights)
+                scaled_loss = self.optimizer.get_scaled_loss(loss)
         if self.use_loss_scale:
+            gradients = tape.gradient(scaled_loss, self.trainable_weights)
             gradients = self.optimizer.get_unscaled_gradients(gradients)
+        else:
+            gradients = tape.gradient(loss, self.trainable_weights)
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
         self._metrics["loss"].update_state(loss)
         return {m.name: m.result() for m in self.metrics}
@@ -127,6 +129,8 @@ def predict_step(self, batch):
             beam_search_decoding = self.recognize_beam(inputs)
         return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)
 
+    # -------------------------------- INFERENCE FUNCTIONS -------------------------------------
+
     def recognize(self, features, input_lengths, **kwargs):
         pass
 
 
@@ -38,10 +38,6 @@ def __init__(self,
             self.decoder = decoder
         self.time_reduction_factor = 1
 
-    @property
-    def metrics(self):
-        return [self.loss_metric]
-
     def _build(self, input_shape, batch_size=None):
         inputs = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32)
         inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
 
@@ -17,7 +17,7 @@
 
 from ..encoders.contextnet import ContextNetEncoder, L2
 from .transducer import Transducer
-from ...utils import math_util
+from ...utils import math_util, data_util
 
 
 class ContextNet(Transducer):
@@ -80,11 +80,13 @@ def __init__(self,
         for block in self.encoder.blocks: self.time_reduction_factor *= block.time_reduction_factor
 
     def call(self, inputs, training=False, **kwargs):
-        features, input_length, prediction, prediction_length = inputs
-        enc = self.encoder([features, input_length], training=training, **kwargs)
-        pred = self.predict_net([prediction, prediction_length], training=training, **kwargs)
-        outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return outputs
+        enc = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=training, **kwargs)
+        pred = self.predict_net([inputs["predictions"], inputs["predictions_length"]], training=training, **kwargs)
+        logits = self.joint_net([enc, pred], training=training, **kwargs)
+        return data_util.create_logits(
+            logits=logits,
+            logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        )
 
     def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
         with tf.name_scope(f"{self.name}_encoder"):