Skip to content

Commit 5c7d36f

Browse files
authored
Merge pull request #1 from TensorSpeech/master
Update
2 parents 4d725de + d1ede2e commit 5c7d36f

File tree

10 files changed

+2263
-108
lines changed

10 files changed

+2263
-108
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
:zany_face: TensorflowTTS provides real-time state-of-the-art speech synthesis architectures such as Tacotron-2, Melgan, Multiband-Melgan, FastSpeech, and FastSpeech2, based on TensorFlow 2. With TensorFlow 2, we can speed up training/inference, optimize further by using [fake-quantize aware](https://www.tensorflow.org/model_optimization/guide/quantization/training_comprehensive_guide) and [pruning](https://www.tensorflow.org/model_optimization/guide/pruning/pruning_with_keras), and make TTS models run faster than real-time and deployable on mobile devices or embedded systems.
2020

2121
## What's new
22+
- 2020/07/05 **(New!)** Support for converting Tacotron-2 and FastSpeech to TFLite. Please see the [colab](https://colab.research.google.com/drive/1HudLLpT9CQdh2k04c06bHUwLubhGTWxA?usp=sharing). Thanks to @jaeyoo from the TFLite team for his support.
2223
- 2020/06/20 **(New!)** [FastSpeech2](https://arxiv.org/abs/2006.04558) implementation with Tensorflow is supported.
2324
- 2020/06/07 **(New!)** [Multi-band MelGAN (MB MelGAN)](https://github.com/dathudeptrai/TensorflowTTS/blob/master/examples/multiband_melgan/) implementation with Tensorflow is supported.
2425

examples/fastspeech2/conf/fastspeech2.v1.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ train_max_steps: 200000 # Number of training steps.
6666
save_interval_steps: 5000 # Interval steps to save checkpoint.
6767
eval_interval_steps: 500 # Interval steps to evaluate the network.
6868
log_interval_steps: 200 # Interval steps to record the training log.
69-
delay_f0_energy_steps: 3 # 2 steps use LR outputs only then 1 steps LR + F0 + Energy.
7069
###########################################################
7170
# OTHER SETTING #
7271
###########################################################

examples/fastspeech2/train_fastspeech2.py

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,30 +14,23 @@
1414
# limitations under the License.
1515
"""Train FastSpeech2."""
1616

17+
from tensorflow_tts.optimizers import AdamWeightDecay
18+
from tensorflow_tts.optimizers import WarmUp
19+
from tensorflow_tts.models import TFFastSpeech2
20+
from tensorflow_tts.configs import FastSpeech2Config
21+
from examples.fastspeech2.fastspeech2_dataset import CharactorDurationF0EnergyMelDataset
22+
from examples.fastspeech.train_fastspeech import FastSpeechTrainer
23+
from tqdm import tqdm
24+
import tensorflow_tts
25+
import yaml
26+
import tensorflow as tf
27+
import numpy as np
1728
import argparse
1829
import logging
1930
import os
2031
import sys
2132
sys.path.append(".")
2233

23-
import numpy as np
24-
import tensorflow as tf
25-
import yaml
26-
27-
import tensorflow_tts
28-
29-
from tqdm import tqdm
30-
31-
from examples.fastspeech.train_fastspeech import FastSpeechTrainer
32-
from examples.fastspeech2.fastspeech2_dataset import CharactorDurationF0EnergyMelDataset
33-
34-
from tensorflow_tts.configs import FastSpeech2Config
35-
36-
from tensorflow_tts.models import TFFastSpeech2
37-
38-
from tensorflow_tts.optimizers import WarmUp
39-
from tensorflow_tts.optimizers import AdamWeightDecay
40-
4134

4235
class FastSpeech2Trainer(FastSpeechTrainer):
4336
"""FastSpeech2 Trainer class based on FastSpeechTrainer."""
@@ -88,12 +81,6 @@ def _train_step(self, batch):
8881
self.steps += 1
8982
self.tqdm.update(1)
9083
self._check_train_finish()
91-
self._apply_delay_using_f0_energy()
92-
93-
def _apply_delay_using_f0_energy(self):
94-
self.model.is_use_f0_energy = tf.cast(
95-
(self.steps % self.config["delay_f0_energy_steps"] == 0), tf.float32
96-
)
9784

9885
@tf.function(
9986
experimental_relax_shapes=True,
@@ -157,9 +144,6 @@ def _eval_epoch(self):
157144
"""Evaluate model one epoch."""
158145
logging.info(f"(Steps: {self.steps}) Start evaluation.")
159146

160-
# force to use f0/energy embedding when evaluation.
161-
self.model.is_use_f0_energy = tf.constant(1.0)
162-
163147
# calculate loss for each batch
164148
for eval_steps_per_epoch, batch in enumerate(
165149
tqdm(self.eval_data_loader, desc="[eval]"), 1
@@ -188,7 +172,6 @@ def _eval_epoch(self):
188172

189173
# reset
190174
self.reset_states_eval()
191-
self.model.is_use_f0_energy = tf.constant(0.0)
192175

193176
@tf.function(
194177
experimental_relax_shapes=True,

notebooks/TensorFlowTTS_FastSpeech_with_TFLite.ipynb

Lines changed: 849 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/TensorFlowTTS_Tacotron2_with_TFLite.ipynb

Lines changed: 827 additions & 0 deletions
Large diffs are not rendered by default.

tensorflow_tts/models/fastspeech.py

Lines changed: 131 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -573,14 +573,14 @@ def call(self, inputs, training=False):
573573
class TFFastSpeechLengthRegulator(tf.keras.layers.Layer):
574574
"""FastSpeech lengthregulator module."""
575575

576-
def __init__(self, config, **kwargs):
576+
def __init__(self, config, enable_tflite_convertible=False, **kwargs):
577577
"""Init variables."""
578578
super().__init__(**kwargs)
579579
self.config = config
580+
self.enable_tflite_convertible = enable_tflite_convertible
580581

581582
def call(self, inputs, training=False):
582583
"""Call logic.
583-
584584
Args:
585585
1. encoder_hidden_states, Tensor (float32) shape [batch_size, length, hidden_size]
586586
2. durations_gt, Tensor (float32/int32) shape [batch_size, length]
@@ -601,83 +601,101 @@ def _length_regulator(self, encoder_hidden_states, durations_gt):
601601
hidden_size = input_shape[-1]
602602

603603
# initialize output hidden states and encoder masking.
604-
outputs = tf.zeros(shape=[0, max_durations, hidden_size], dtype=tf.float32)
605-
encoder_masks = tf.zeros(shape=[0, max_durations], dtype=tf.int32)
606-
607-
def condition(
608-
i,
609-
batch_size,
610-
outputs,
611-
encoder_masks,
612-
encoder_hidden_states,
613-
durations_gt,
614-
max_durations,
615-
):
616-
return tf.less(i, batch_size)
617-
618-
def body(
619-
i,
620-
batch_size,
621-
outputs,
622-
encoder_masks,
623-
encoder_hidden_states,
624-
durations_gt,
625-
max_durations,
626-
):
627-
repeats = durations_gt[i]
604+
if self.enable_tflite_convertible:
605+
# The batch size is always 1 at inference, so we don't have to use
606+
# `tf.while_loop` to build the 3-D output tensor.
607+
repeats = durations_gt[0]
628608
real_length = tf.reduce_sum(repeats)
629609
pad_size = max_durations - real_length
610+
# masks : [1, max_durations]
630611
masks = tf.sequence_mask([real_length], max_durations, dtype=tf.int32)
631612
repeat_encoder_hidden_states = tf.repeat(
632-
encoder_hidden_states[i], repeats=repeats, axis=0
613+
encoder_hidden_states[0], repeats=repeats, axis=0
633614
)
634615
repeat_encoder_hidden_states = tf.expand_dims(
635616
tf.pad(repeat_encoder_hidden_states, [[0, pad_size], [0, 0]]), 0
636617
) # [1, max_durations, hidden_size]
637-
outputs = tf.concat([outputs, repeat_encoder_hidden_states], axis=0)
638-
encoder_masks = tf.concat([encoder_masks, masks], axis=0)
639-
return [
640-
i + 1,
618+
619+
outputs = repeat_encoder_hidden_states
620+
encoder_masks = masks
621+
else:
622+
outputs = tf.zeros(shape=[0, max_durations, hidden_size], dtype=tf.float32)
623+
encoder_masks = tf.zeros(shape=[0, max_durations], dtype=tf.int32)
624+
625+
def condition(
626+
i,
641627
batch_size,
642628
outputs,
643629
encoder_masks,
644630
encoder_hidden_states,
645631
durations_gt,
646632
max_durations,
647-
]
633+
):
634+
return tf.less(i, batch_size)
648635

649-
# initialize iteration i.
650-
i = tf.constant(0, dtype=tf.int32)
651-
_, _, outputs, encoder_masks, _, _, _, = tf.while_loop(
652-
condition,
653-
body,
654-
[
636+
def body(
655637
i,
656638
batch_size,
657639
outputs,
658640
encoder_masks,
659641
encoder_hidden_states,
660642
durations_gt,
661643
max_durations,
662-
],
663-
shape_invariants=[
664-
i.get_shape(),
665-
batch_size.get_shape(),
666-
tf.TensorShape([None, None, self.config.hidden_size]),
667-
tf.TensorShape([None, None]),
668-
encoder_hidden_states.get_shape(),
669-
durations_gt.get_shape(),
670-
max_durations.get_shape(),
671-
],
672-
)
644+
):
645+
repeats = durations_gt[i]
646+
real_length = tf.reduce_sum(repeats)
647+
pad_size = max_durations - real_length
648+
masks = tf.sequence_mask([real_length], max_durations, dtype=tf.int32)
649+
repeat_encoder_hidden_states = tf.repeat(
650+
encoder_hidden_states[i], repeats=repeats, axis=0
651+
)
652+
repeat_encoder_hidden_states = tf.expand_dims(
653+
tf.pad(repeat_encoder_hidden_states, [[0, pad_size], [0, 0]]), 0
654+
) # [1, max_durations, hidden_size]
655+
outputs = tf.concat([outputs, repeat_encoder_hidden_states], axis=0)
656+
encoder_masks = tf.concat([encoder_masks, masks], axis=0)
657+
return [
658+
i + 1,
659+
batch_size,
660+
outputs,
661+
encoder_masks,
662+
encoder_hidden_states,
663+
durations_gt,
664+
max_durations,
665+
]
666+
667+
# initialize iteration i.
668+
i = tf.constant(0, dtype=tf.int32)
669+
_, _, outputs, encoder_masks, _, _, _, = tf.while_loop(
670+
condition,
671+
body,
672+
[
673+
i,
674+
batch_size,
675+
outputs,
676+
encoder_masks,
677+
encoder_hidden_states,
678+
durations_gt,
679+
max_durations,
680+
],
681+
shape_invariants=[
682+
i.get_shape(),
683+
batch_size.get_shape(),
684+
tf.TensorShape([None, None, self.config.hidden_size]),
685+
tf.TensorShape([None, None]),
686+
encoder_hidden_states.get_shape(),
687+
durations_gt.get_shape(),
688+
max_durations.get_shape(),
689+
],
690+
)
673691

674692
return outputs, encoder_masks
675693

676694

677695
class TFFastSpeech(tf.keras.Model):
678696
"""TF Fastspeech module."""
679697

680-
def __init__(self, config, **kwargs):
698+
def __init__(self, config, enable_tflite_convertible=False, **kwargs):
681699
"""Init layers for fastspeech."""
682700
super().__init__(**kwargs)
683701
self.embeddings = TFFastSpeechEmbeddings(config, name="embeddings")
@@ -686,12 +704,16 @@ def __init__(self, config, **kwargs):
686704
config, name="duration_predictor"
687705
)
688706
self.length_regulator = TFFastSpeechLengthRegulator(
689-
config, name="length_regulator"
707+
config,
708+
enable_tflite_convertible=enable_tflite_convertible,
709+
name="length_regulator"
690710
)
691711
self.decoder = TFFastSpeechDecoder(config, name="decoder")
692712
self.mel_dense = tf.keras.layers.Dense(units=config.num_mels, name="mel_before")
693713
self.postnet = TFTacotronPostnet(config=config, name="postnet")
694714

715+
self.enable_tflite_convertible = enable_tflite_convertible
716+
695717
def _build(self):
696718
"""Dummy input for building model."""
697719
# fake inputs
@@ -749,8 +771,8 @@ def call(
749771
input_signature=[
750772
tf.TensorSpec(shape=[None, None], dtype=tf.int32),
751773
tf.TensorSpec(shape=[None, None], dtype=tf.bool),
752-
tf.TensorSpec(shape=[None,], dtype=tf.int32),
753-
tf.TensorSpec(shape=[None,], dtype=tf.float32),
774+
tf.TensorSpec(shape=[None, ], dtype=tf.int32),
775+
tf.TensorSpec(shape=[None, ], dtype=tf.float32),
754776
],
755777
)
756778
def inference(self, input_ids, attention_mask, speaker_ids, speed_ratios):
@@ -799,3 +821,59 @@ def inference(self, input_ids, attention_mask, speaker_ids, speed_ratios):
799821

800822
outputs = (mel_before, mel_after, duration_outputs)
801823
return outputs
824+
825+
@tf.function(
826+
experimental_relax_shapes=True,
827+
input_signature=[
828+
tf.TensorSpec(shape=[1, None], dtype=tf.int32),
829+
tf.TensorSpec(shape=[1, None], dtype=tf.bool),
830+
tf.TensorSpec(shape=[1, ], dtype=tf.int32),
831+
tf.TensorSpec(shape=[1, ], dtype=tf.float32),
832+
],
833+
)
834+
def inference_tflite(self, input_ids, attention_mask, speaker_ids, speed_ratios):
835+
"""Call logic."""
836+
embedding_output = self.embeddings([input_ids, speaker_ids], training=False)
837+
encoder_output = self.encoder(
838+
[embedding_output, attention_mask], training=False
839+
)
840+
last_encoder_hidden_states = encoder_output[0]
841+
842+
# duration predictor: uses last_encoder_hidden_states here; you can use more hidden-state layers
843+
# rather than only the encoder's last hidden states for the duration predictor.
844+
duration_outputs = self.duration_predictor(
845+
[last_encoder_hidden_states, attention_mask]
846+
) # [batch_size, length]
847+
duration_outputs = tf.math.exp(duration_outputs) - 1.0
848+
849+
if speed_ratios is None:
850+
speed_ratios = tf.convert_to_tensor(np.array([1.0]), dtype=tf.float32)
851+
852+
duration_outputs = tf.cast(
853+
tf.math.round(duration_outputs * speed_ratios), tf.int32
854+
)
855+
856+
length_regulator_outputs, encoder_masks = self.length_regulator(
857+
[last_encoder_hidden_states, duration_outputs], training=False
858+
)
859+
860+
# create decoder positional embedding
861+
decoder_pos = tf.range(
862+
1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
863+
)
864+
masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks
865+
866+
decoder_output = self.decoder(
867+
[length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
868+
training=False,
869+
)
870+
last_decoder_hidden_states = decoder_output[0]
871+
872+
# here you can sum or concat more than one hidden-state layer from the decoder.
873+
mel_before = self.mel_dense(last_decoder_hidden_states)
874+
mel_after = (
875+
self.postnet([mel_before, encoder_masks], training=False) + mel_before
876+
)
877+
878+
outputs = (mel_before, mel_after, duration_outputs)
879+
return outputs

tensorflow_tts/models/fastspeech2.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,6 @@ def __init__(self, config, **kwargs):
118118
)
119119
self.energy_dropout = tf.keras.layers.Dropout(config.energy_dropout_rate)
120120

121-
# set flag to use f0/energy embedding.
122-
self.is_use_f0_energy = tf.constant(0.0)
123-
124121
def _build(self):
125122
"""Dummy input for building model."""
126123
# fake inputs
@@ -180,11 +177,7 @@ def call(
180177
energy_embedding = self.energy_dropout(energy_embedding, training=True)
181178

182179
# sum features
183-
last_encoder_hidden_states = (
184-
last_encoder_hidden_states
185-
+ self.is_use_f0_energy * f0_embedding
186-
+ self.is_use_f0_energy * energy_embedding
187-
)
180+
last_encoder_hidden_states += f0_embedding + energy_embedding
188181

189182
length_regulator_outputs, encoder_masks = self.length_regulator(
190183
[last_encoder_hidden_states, duration_gts], training=training

0 commit comments

Comments
 (0)