@@ -573,14 +573,14 @@ def call(self, inputs, training=False):
 class TFFastSpeechLengthRegulator(tf.keras.layers.Layer):
     """FastSpeech lengthregulator module."""
 
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, enable_tflite_convertible=False, **kwargs):
         """Init variables."""
         super().__init__(**kwargs)
         self.config = config
+        self.enable_tflite_convertible = enable_tflite_convertible
 
     def call(self, inputs, training=False):
         """Call logic.
-
         Args:
             1. encoder_hidden_states, Tensor (float32) shape [batch_size, length, hidden_size]
             2. durations_gt, Tensor (float32/int32) shape [batch_size, length]
@@ -601,83 +601,101 @@ def _length_regulator(self, encoder_hidden_states, durations_gt):
         hidden_size = input_shape[-1]
 
         # initialize output hidden states and encoder masking.
-        outputs = tf.zeros(shape=[0, max_durations, hidden_size], dtype=tf.float32)
-        encoder_masks = tf.zeros(shape=[0, max_durations], dtype=tf.int32)
-
-        def condition(
-            i,
-            batch_size,
-            outputs,
-            encoder_masks,
-            encoder_hidden_states,
-            durations_gt,
-            max_durations,
-        ):
-            return tf.less(i, batch_size)
-
-        def body(
-            i,
-            batch_size,
-            outputs,
-            encoder_masks,
-            encoder_hidden_states,
-            durations_gt,
-            max_durations,
-        ):
-            repeats = durations_gt[i]
+        if self.enable_tflite_convertible:
+            # There is only 1 batch in inference, so we don't have to use
+            # `tf.While` op with 3-D output tensor.
+            repeats = durations_gt[0]
             real_length = tf.reduce_sum(repeats)
             pad_size = max_durations - real_length
+            # masks : [max_durations]
             masks = tf.sequence_mask([real_length], max_durations, dtype=tf.int32)
             repeat_encoder_hidden_states = tf.repeat(
-                encoder_hidden_states[i], repeats=repeats, axis=0
+                encoder_hidden_states[0], repeats=repeats, axis=0
             )
             repeat_encoder_hidden_states = tf.expand_dims(
                 tf.pad(repeat_encoder_hidden_states, [[0, pad_size], [0, 0]]), 0
             )  # [1, max_durations, hidden_size]
-            outputs = tf.concat([outputs, repeat_encoder_hidden_states], axis=0)
-            encoder_masks = tf.concat([encoder_masks, masks], axis=0)
-            return [
-                i + 1,
+
+            outputs = repeat_encoder_hidden_states
+            encoder_masks = masks
+        else:
+            outputs = tf.zeros(shape=[0, max_durations, hidden_size], dtype=tf.float32)
+            encoder_masks = tf.zeros(shape=[0, max_durations], dtype=tf.int32)
+
+            def condition(
+                i,
                 batch_size,
                 outputs,
                 encoder_masks,
                 encoder_hidden_states,
                 durations_gt,
                 max_durations,
-            ]
+            ):
+                return tf.less(i, batch_size)
 
-        # initialize iteration i.
-        i = tf.constant(0, dtype=tf.int32)
-        _, _, outputs, encoder_masks, _, _, _, = tf.while_loop(
-            condition,
-            body,
-            [
+            def body(
                 i,
                 batch_size,
                 outputs,
                 encoder_masks,
                 encoder_hidden_states,
                 durations_gt,
                 max_durations,
-            ],
-            shape_invariants=[
-                i.get_shape(),
-                batch_size.get_shape(),
-                tf.TensorShape([None, None, self.config.hidden_size]),
-                tf.TensorShape([None, None]),
-                encoder_hidden_states.get_shape(),
-                durations_gt.get_shape(),
-                max_durations.get_shape(),
-            ],
-        )
+            ):
+                repeats = durations_gt[i]
+                real_length = tf.reduce_sum(repeats)
+                pad_size = max_durations - real_length
+                masks = tf.sequence_mask([real_length], max_durations, dtype=tf.int32)
+                repeat_encoder_hidden_states = tf.repeat(
+                    encoder_hidden_states[i], repeats=repeats, axis=0
+                )
+                repeat_encoder_hidden_states = tf.expand_dims(
+                    tf.pad(repeat_encoder_hidden_states, [[0, pad_size], [0, 0]]), 0
+                )  # [1, max_durations, hidden_size]
+                outputs = tf.concat([outputs, repeat_encoder_hidden_states], axis=0)
+                encoder_masks = tf.concat([encoder_masks, masks], axis=0)
+                return [
+                    i + 1,
+                    batch_size,
+                    outputs,
+                    encoder_masks,
+                    encoder_hidden_states,
+                    durations_gt,
+                    max_durations,
+                ]
+
+            # initialize iteration i.
+            i = tf.constant(0, dtype=tf.int32)
+            _, _, outputs, encoder_masks, _, _, _, = tf.while_loop(
+                condition,
+                body,
+                [
+                    i,
+                    batch_size,
+                    outputs,
+                    encoder_masks,
+                    encoder_hidden_states,
+                    durations_gt,
+                    max_durations,
+                ],
+                shape_invariants=[
+                    i.get_shape(),
+                    batch_size.get_shape(),
+                    tf.TensorShape([None, None, self.config.hidden_size]),
+                    tf.TensorShape([None, None]),
+                    encoder_hidden_states.get_shape(),
+                    durations_gt.get_shape(),
+                    max_durations.get_shape(),
+                ],
+            )
 
         return outputs, encoder_masks
 
 
 class TFFastSpeech(tf.keras.Model):
     """TF Fastspeech module."""
 
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, enable_tflite_convertible=False, **kwargs):
         """Init layers for fastspeech."""
         super().__init__(**kwargs)
         self.embeddings = TFFastSpeechEmbeddings(config, name="embeddings")
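
For reference, here is a minimal standalone sketch (not part of this commit) of what the new `enable_tflite_convertible` branch above computes for a single utterance: each encoder frame is repeated by its predicted duration with `tf.repeat`, then the result is padded to `max_durations` and masked. Shapes and values below are made up for illustration.

```python
import tensorflow as tf

# Toy inputs: batch of 1, 4 encoder frames, hidden size 8 (illustrative only).
hidden = tf.random.normal([1, 4, 8])               # [batch=1, length, hidden_size]
durations = tf.constant([[2, 1, 3, 1]], tf.int32)  # [batch=1, length]

sum_durations = tf.reduce_sum(durations, axis=-1)  # [batch]
max_durations = tf.reduce_max(sum_durations)

# Batch-1 path: no tf.while_loop, just repeat + pad + mask.
repeats = durations[0]
real_length = tf.reduce_sum(repeats)
pad_size = max_durations - real_length             # always 0 when batch_size == 1
masks = tf.sequence_mask([real_length], max_durations, dtype=tf.int32)
expanded = tf.repeat(hidden[0], repeats=repeats, axis=0)  # [real_length, hidden_size]
outputs = tf.expand_dims(tf.pad(expanded, [[0, pad_size], [0, 0]]), 0)

print(outputs.shape)  # (1, 7, 8): one frame slot per predicted duration step
```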
@@ -686,12 +704,16 @@ def __init__(self, config, **kwargs):
             config, name="duration_predictor"
         )
         self.length_regulator = TFFastSpeechLengthRegulator(
-            config, name="length_regulator"
+            config,
+            enable_tflite_convertible=enable_tflite_convertible,
+            name="length_regulator"
         )
         self.decoder = TFFastSpeechDecoder(config, name="decoder")
         self.mel_dense = tf.keras.layers.Dense(units=config.num_mels, name="mel_before")
         self.postnet = TFTacotronPostnet(config=config, name="postnet")
 
+        self.enable_tflite_convertible = enable_tflite_convertible
+
     def _build(self):
         """Dummy input for building model."""
         # fake inputs
@@ -749,8 +771,8 @@ def call(
         input_signature=[
             tf.TensorSpec(shape=[None, None], dtype=tf.int32),
             tf.TensorSpec(shape=[None, None], dtype=tf.bool),
-            tf.TensorSpec(shape=[None,], dtype=tf.int32),
-            tf.TensorSpec(shape=[None,], dtype=tf.float32),
+            tf.TensorSpec(shape=[None, ], dtype=tf.int32),
+            tf.TensorSpec(shape=[None, ], dtype=tf.float32),
         ],
     )
     def inference(self, input_ids, attention_mask, speaker_ids, speed_ratios):
@@ -799,3 +821,59 @@ def inference(self, input_ids, attention_mask, speaker_ids, speed_ratios):
 
         outputs = (mel_before, mel_after, duration_outputs)
         return outputs
+
+    @tf.function(
+        experimental_relax_shapes=True,
+        input_signature=[
+            tf.TensorSpec(shape=[1, None], dtype=tf.int32),
+            tf.TensorSpec(shape=[1, None], dtype=tf.bool),
+            tf.TensorSpec(shape=[1, ], dtype=tf.int32),
+            tf.TensorSpec(shape=[1, ], dtype=tf.float32),
+        ],
+    )
+    def inference_tflite(self, input_ids, attention_mask, speaker_ids, speed_ratios):
+        """Call logic."""
+        embedding_output = self.embeddings([input_ids, speaker_ids], training=False)
+        encoder_output = self.encoder(
+            [embedding_output, attention_mask], training=False
+        )
+        last_encoder_hidden_states = encoder_output[0]
+
+        # duration predictor, here use last_encoder_hidden_states, u can use more hidden_states layers
+        # rather than just use last_hidden_states of encoder for duration_predictor.
+        duration_outputs = self.duration_predictor(
+            [last_encoder_hidden_states, attention_mask]
+        )  # [batch_size, length]
+        duration_outputs = tf.math.exp(duration_outputs) - 1.0
+
+        if speed_ratios is None:
+            speed_ratios = tf.convert_to_tensor(np.array([1.0]), dtype=tf.float32)
+
+        duration_outputs = tf.cast(
+            tf.math.round(duration_outputs * speed_ratios), tf.int32
+        )
+
+        length_regulator_outputs, encoder_masks = self.length_regulator(
+            [last_encoder_hidden_states, duration_outputs], training=False
+        )
+
+        # create decoder positional embedding
+        decoder_pos = tf.range(
+            1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
+        )
+        masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks
+
+        decoder_output = self.decoder(
+            [length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
+            training=False,
+        )
+        last_decoder_hidden_states = decoder_output[0]
+
+        # here u can use sum or concat more than 1 hidden states layers from decoder.
+        mel_before = self.mel_dense(last_decoder_hidden_states)
+        mel_after = (
+            self.postnet([mel_before, encoder_masks], training=False) + mel_before
+        )
+
+        outputs = (mel_before, mel_after, duration_outputs)
+        return outputs
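
With the fixed batch-1 signature above, the model can be exported by handing the `inference_tflite` concrete function to the TFLite converter. The sketch below shows one possible way to do that; it is not part of this commit, and the `config` object, checkpoint path, and output filename are placeholders.

```python
import tensorflow as tf

# Hypothetical setup: a TFFastSpeech built with the new flag and restored weights.
fastspeech = TFFastSpeech(config, enable_tflite_convertible=True, name="fastspeech")
fastspeech._build()
fastspeech.load_weights("fastspeech.h5")  # placeholder checkpoint path

# Concrete function for the [1, None] / [1, ] signature declared above.
concrete_fn = fastspeech.inference_tflite.get_concrete_function()

converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_fn])
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # standard TFLite ops
    tf.lite.OpsSet.SELECT_TF_OPS,    # fall back to TF ops the builtins cannot cover
]
tflite_model = converter.convert()

with open("fastspeech.tflite", "wb") as f:
    f.write(tflite_model)
```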