
Commit c8869d3

Merge pull request #147 from machineko/master
Multi-speaker Fast speech 2 with MFA phoneme and durations
2 parents 47f3146 + 1d62494 commit c8869d3


46 files changed, +1847 −119 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -32,3 +32,6 @@ ljspeech
 /datasets
 /examples/tacotron2/exp/
 /temp/
+LibriTTS/
+dataset/
+mfa/
Lines changed: 64 additions & 0 deletions

# FastSpeech 2 multi-speaker (English)

## Prepare
Everything is run from the main repo folder, i.e. TensorFlowTTS/

0. Optional: [Download](http://www.openslr.org/60/) and prepare LibriTTS (a helper to prepare LibriTTS is in examples/fastspeech2_multispeaker/libri_experiment/prepare_libri.ipynb)
- Dataset structure after finishing this step:
```
|- TensorFlowTTS/
| |- LibriTTS/
| |- |- train-clean-100/
| |- |- SPEAKERS.txt
| |- |- ...
| |- dataset/
| |- |- 200/
| |- |- |- 200_124139_000001_000000.txt
| |- |- |- 200_124139_000001_000000.wav
| |- |- |- ...
| |- |- 250/
| |- ...
| |- tensorflow_tts/
| |- models/
| |- ...
```
1. Extract durations (use examples/mfa_extraction or a pretrained Tacotron 2)
2. Optional: build the Docker image
```
bash examples/fastspeech2_multispeaker/scripts/build.sh
```
3. Optional: run the Docker container
```
bash examples/fastspeech2_multispeaker/scripts/interactive.sh
```
4. Preprocessing:
```
tensorflow-tts-preprocess --rootdir ./dataset \
  --outdir ./dump \
  --config preprocess/preprocess_libritts.yaml \
  --dataset multispeaker
```
5. Normalization:
```
tensorflow-tts-normalize --rootdir ./dump \
  --outdir ./dump \
  --config preprocess/preprocess_libritts.yaml \
  --dataset multispeaker
```
6. Change the CharactorDurationF0EnergyMelDataset speaker mapper in fastspeech2_dataset to match your dataset (if you use LibriTTS with mfa_extraction you don't need to change anything); a sketch of the mapping logic is shown after this list.
7. Change train_libri.sh to match your dataset and run:
```
bash examples/fastspeech2_multispeaker/scripts/train_libri.sh
```
8. Optional: if you run into tensor size mismatches, check step 5 in the `examples/mfa_extraction` directory.
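Step 6 refers to the speaker mapper inside CharactorDurationF0EnergyMelDataset, which assigns an integer id to each speaker taken from the utterance-id prefix. A minimal sketch of that logic, assuming LibriTTS-style `<speaker>_<chapter>_<utterance>_<segment>` ids; the helper name and the sample ids below are illustrative only, not part of the repo:

```
def build_speakers_map(utt_ids):
    """Map each speaker name (the prefix before the first '_') to an integer id."""
    speakers_map = {}
    for utt_id in utt_ids:
        sp_name = utt_id.split("_")[0]
        if sp_name not in speakers_map:
            speakers_map[sp_name] = len(speakers_map)
    return speakers_map


# Illustrative LibriTTS-style utterance ids.
utt_ids = [
    "200_124139_000001_000000",
    "200_124139_000002_000000",
    "250_123456_000001_000000",
]
speakers_map = build_speakers_map(utt_ids)                    # {'200': 0, '250': 1}
speakers = [speakers_map[u.split("_")[0]] for u in utt_ids]   # [0, 0, 1]
```

If your files are named differently, only the split rule used to extract the speaker name needs to change.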

## Comments

This version uses the popular train.txt '|' split used in other repos. Training files should look like this:

Wav Path | Text | Speaker Name

Wav Path2 | Text | Speaker Name
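A hypothetical two-line train.txt in that layout (the wav paths, sentences, and exact separator spacing are made up for illustration; check the training script for what it expects):

```
dataset/200/200_124139_000001_000000.wav|The quick brown fox jumps over the lazy dog.|200
dataset/250/250_123456_000001_000000.wav|She sells seashells by the seashore.|250
```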
Lines changed: 75 additions & 0 deletions

# This is the hyperparameter configuration file for FastSpeech2 v1.
# Please make sure this is adjusted for the LibriTTS dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration performs 200k iters, but the best checkpoint is around 150k iters.

###########################################################
#              FEATURE EXTRACTION SETTING                 #
###########################################################
hop_size: 256            # Hop size.
format: "npy"

###########################################################
#            NETWORK ARCHITECTURE SETTING                 #
###########################################################
model_type: fastspeech2

fastspeech2_params:
    n_speakers: 20
    encoder_hidden_size: 384
    encoder_num_hidden_layers: 4
    encoder_num_attention_heads: 2
    encoder_attention_head_size: 192  # hidden_size // num_attention_heads
    encoder_intermediate_size: 1024
    encoder_intermediate_kernel_size: 3
    encoder_hidden_act: "mish"
    decoder_hidden_size: 384
    decoder_num_hidden_layers: 4
    decoder_num_attention_heads: 2
    decoder_attention_head_size: 192  # hidden_size // num_attention_heads
    decoder_intermediate_size: 1024
    decoder_intermediate_kernel_size: 3
    decoder_hidden_act: "mish"
    variant_prediction_num_conv_layers: 2
    variant_predictor_filter: 256
    variant_predictor_kernel_size: 3
    variant_predictor_dropout_rate: 0.5
    num_mels: 80
    hidden_dropout_prob: 0.2
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 2048
    initializer_range: 0.02
    output_attentions: False
    output_hidden_states: False

###########################################################
#                 DATA LOADER SETTING                     #
###########################################################
batch_size: 32              # Batch size.
remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true           # Whether to allow caching in the dataset. If true, it requires cpu memory.
mel_length_threshold: 48    # Remove all targets with mel_length <= 48.
is_shuffle: true            # Shuffle the dataset after each epoch.
###########################################################
#            OPTIMIZER & SCHEDULER SETTING                #
###########################################################
optimizer_params:
    initial_learning_rate: 0.0001
    end_learning_rate: 0.00001
    decay_steps: 120000       # < train_max_steps is recommended.
    warmup_proportion: 0.02
    weight_decay: 0.001


###########################################################
#                   INTERVAL SETTING                      #
###########################################################
train_max_steps: 150000             # Number of training steps.
save_interval_steps: 5000           # Interval steps to save checkpoint.
eval_interval_steps: 5000           # Interval steps to evaluate the network.
log_interval_steps: 200             # Interval steps to record the training log.
###########################################################
#                     OTHER SETTING                       #
###########################################################
use_griffin: true                   # Use Griffin-Lim on evaluation or not.
num_save_intermediate_results: 1    # Number of batches to be saved as intermediate results.
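The attention_head_size entries above are derived quantities (hidden_size // num_attention_heads, i.e. 384 // 2 = 192). A small sketch, assuming PyYAML and a hypothetical path for this config file, that loads the config and checks that relation:

```
import yaml

# The path is an assumption for illustration; point it at this config file.
config_path = "examples/fastspeech2_multispeaker/conf/fastspeech2libritts.yaml"

with open(config_path) as f:
    config = yaml.safe_load(f)

params = config["fastspeech2_params"]
for side in ("encoder", "decoder"):
    expected = params[f"{side}_hidden_size"] // params[f"{side}_num_attention_heads"]
    assert params[f"{side}_attention_head_size"] == expected  # 384 // 2 == 192
```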
Lines changed: 237 additions & 0 deletions

# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset modules."""

import os
import numpy as np
import tensorflow as tf

from tensorflow_tts.datasets.abstract_dataset import AbstractDataset
from tensorflow_tts.utils import find_files, remove_outlier


def average_by_duration(x, durs):
    mel_len = durs.sum()
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))

    # calculate charactor f0/energy
    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    for idx, start, end in zip(range(mel_len), durs_cum[:-1], durs_cum[1:]):
        values = x[start:end][np.where(x[start:end] != 0.0)[0]]
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0  # np.mean([]) = nan.

    return x_char.astype(np.float32)


@tf.function(
    input_signature=[tf.TensorSpec(None, tf.float32), tf.TensorSpec(None, tf.int32)]
)
def tf_average_by_duration(x, durs):
    outs = tf.numpy_function(average_by_duration, [x, durs], tf.float32)
    return outs

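# Example (illustrative values): average_by_duration collapses frame-level
# f0/energy into one value per charactor, skipping zeros inside each span.
#   x    = np.array([1.0, 3.0, 0.0, 4.0, 8.0], dtype=np.float32)
#   durs = np.array([2, 3], dtype=np.int32)
# The first charactor averages x[0:2] -> 2.0; the second averages the non-zero
# part of x[2:5] -> 6.0, so average_by_duration(x, durs) returns [2.0, 6.0].
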

class CharactorDurationF0EnergyMelDataset(AbstractDataset):
    """Tensorflow Charactor Duration F0 Energy Mel dataset."""

    def __init__(
        self,
        root_dir,
        charactor_query="*-ids.npy",
        mel_query="*-norm-feats.npy",
        duration_query="*-durations.npy",
        f0_query="*-raw-f0.npy",
        energy_query="*-raw-energy.npy",
        f0_stat="./dump/stats_f0.npy",
        energy_stat="./dump/stats_energy.npy",
        charactor_load_fn=np.load,
        mel_load_fn=np.load,
        duration_load_fn=np.load,
        f0_load_fn=np.load,
        energy_load_fn=np.load,
        mel_length_threshold=0,
    ):
        """Initialize dataset.

        Args:
            root_dir (str): Root directory including dumped files.
            charactor_query (str): Query to find charactor files in root_dir.
            mel_query (str): Query to find feature files in root_dir.
            duration_query (str): Query to find duration files in root_dir.
            f0_query (str): Query to find f0 files in root_dir.
            energy_query (str): Query to find energy files in root_dir.
            f0_stat (str): str path of f0_stat.
            energy_stat (str): str path of energy_stat.
            charactor_load_fn (func): Function to load charactor file.
            mel_load_fn (func): Function to load feature file.
            duration_load_fn (func): Function to load duration file.
            f0_load_fn (func): Function to load f0 file.
            energy_load_fn (func): Function to load energy file.
            mel_length_threshold (int): Threshold to remove short feature files.

        """
        # find all of charactor and mel files.
        charactor_files = sorted(find_files(root_dir, charactor_query))
        mel_files = sorted(find_files(root_dir, mel_query))
        duration_files = sorted(find_files(root_dir, duration_query))
        f0_files = sorted(find_files(root_dir, f0_query))
        energy_files = sorted(find_files(root_dir, energy_query))

        # assert the number of files
        assert len(mel_files) != 0, f"Not found any mel files in {root_dir}."
        assert (
            len(mel_files)
            == len(charactor_files)
            == len(duration_files)
            == len(f0_files)
            == len(energy_files)
        ), "Number of charactor, mel, duration, f0 and energy files are different"

        if ".npy" in charactor_query:
            suffix = charactor_query[1:]
            utt_ids = [os.path.basename(f).replace(suffix, "") for f in charactor_files]

        # set global params
        self.utt_ids = utt_ids
        self.mel_files = mel_files
        self.charactor_files = charactor_files
        self.duration_files = duration_files
        self.f0_files = f0_files
        self.energy_files = energy_files
        self.mel_load_fn = mel_load_fn
        self.charactor_load_fn = charactor_load_fn
        self.duration_load_fn = duration_load_fn
        self.f0_load_fn = f0_load_fn
        self.energy_load_fn = energy_load_fn
        self.mel_length_threshold = mel_length_threshold

        # map each speaker name (utt_id prefix) to an integer id.
        self.speakers_map = {}  # TODO
        sp_id = 0
        for i in self.utt_ids:
            sp_name = i.split("_")[0]
            if sp_name not in self.speakers_map:
                self.speakers_map[sp_name] = sp_id
                sp_id += 1
        # TODO: change this; at the moment the mfa folder name == speaker name.
        self.speakers = [self.speakers_map[i.split("_")[0]] for i in self.utt_ids]

        self.f0_stat = np.load(f0_stat)
        self.energy_stat = np.load(energy_stat)

    def get_args(self):
        return [self.utt_ids]

    def _norm_mean_std(self, x, mean, std):
        x = remove_outlier(x)
        zero_idxs = np.where(x == 0.0)[0]
        x = (x - mean) / std
        x[zero_idxs] = 0.0
        return x

    def generator(self, utt_ids):
        for i, utt_id in enumerate(utt_ids):
            mel_file = self.mel_files[i]
            charactor_file = self.charactor_files[i]
            duration_file = self.duration_files[i]
            f0_file = self.f0_files[i]
            energy_file = self.energy_files[i]
            mel = self.mel_load_fn(mel_file)
            charactor = self.charactor_load_fn(charactor_file)
            duration = self.duration_load_fn(duration_file)
            f0 = self.f0_load_fn(f0_file)
            energy = self.energy_load_fn(energy_file)

            f0 = self._norm_mean_std(f0, self.f0_stat[0], self.f0_stat[1])
            energy = self._norm_mean_std(
                energy, self.energy_stat[0], self.energy_stat[1]
            )

            # calculate charactor f0/energy
            f0 = tf_average_by_duration(f0, duration)
            energy = tf_average_by_duration(energy, duration)
            speaker_id = self.speakers[i]
            items = {
                "utt_ids": utt_id,
                "input_ids": charactor,
                "speaker_ids": speaker_id,
                "duration_gts": duration,
                "f0_gts": f0,
                "energy_gts": energy,
                "mel_gts": mel,
                "mel_lengths": len(mel),
            }

            yield items

    def create(
        self,
        allow_cache=False,
        batch_size=1,
        is_shuffle=False,
        map_fn=None,
        reshuffle_each_iteration=True,
    ):
        """Create tf.dataset function."""
        output_types = self.get_output_dtypes()
        datasets = tf.data.Dataset.from_generator(
            self.generator, output_types=output_types, args=(self.get_args())
        )

        datasets = datasets.filter(
            lambda x: x["mel_lengths"] > self.mel_length_threshold
        )

        if allow_cache:
            datasets = datasets.cache()

        if is_shuffle:
            datasets = datasets.shuffle(
                self.get_len_dataset(),
                reshuffle_each_iteration=reshuffle_each_iteration,
            )

        # define padded shapes
        padded_shapes = {
            "utt_ids": [],
            "input_ids": [None],
            "speaker_ids": [],
            "duration_gts": [None],
            "f0_gts": [None],
            "energy_gts": [None],
            "mel_gts": [None, None],
            "mel_lengths": [],
        }

        datasets = datasets.padded_batch(batch_size, padded_shapes=padded_shapes)
        datasets = datasets.prefetch(tf.data.experimental.AUTOTUNE)
        return datasets

    def get_output_dtypes(self):
        output_types = {
            "utt_ids": tf.string,
            "input_ids": tf.int32,
            "speaker_ids": tf.int32,
            "duration_gts": tf.int32,
            "f0_gts": tf.float32,
            "energy_gts": tf.float32,
            "mel_gts": tf.float32,
            "mel_lengths": tf.int32,
        }
        return output_types

    def get_len_dataset(self):
        return len(self.utt_ids)

    def __name__(self):
        return "CharactorDurationF0EnergyMelDataset"
