💉 Add libritts processor.

dathudeptrai · dathudeptrai · commit 138f11cbb9aa · 2020-08-15T14:11:12.000+07:00
diff --git a/.gitignore b/.gitignore
@@ -33,4 +33,4 @@ ljspeech
 /examples/tacotron2/exp/
 /temp/
 kss
-
+LibriTTS
diff --git a/tensorflow_tts/bin/preprocess.py b/tensorflow_tts/bin/preprocess.py
@@ -32,9 +32,11 @@
 
 from tensorflow_tts.processor import LJSpeechProcessor
 from tensorflow_tts.processor import KSSProcessor
+from tensorflow_tts.processor import LibriTTSProcessor
 
 from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS
 from tensorflow_tts.processor.kss import KSS_SYMBOLS
+from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS
 
 from tensorflow_tts.utils import remove_outlier
 
@@ -65,7 +67,7 @@ def parse_and_config():
         "--dataset",
         type=str,
         default="ljspeech",
-        choices=["ljspeech", "kss"],
+        choices=["ljspeech", "kss", "libritts"],
         help="Dataset to preprocess.",
     )
     parser.add_argument(
@@ -107,6 +109,64 @@ def parse_and_config():
     return config
 
 
+def ph_based_trim(
+    config,
+    utt_id: str,
+    text_ids: np.array,
+    raw_text: str,
+    audio: np.array,
+    hop_size: int,
+) -> (bool, np.array, np.array):
+    """Trim the start/end of an utterance using per-token durations.
+
+    The first/last token of raw_text decides whether the start/end is cut;
+    the cut length is the matching duration multiplied by hop_size. The
+    trimmed durations are saved to the fixed-durations directory.
+
+    Args:
+        config: Parsed yaml config; reads "rootdir" and the optional
+            "duration_path" / "duration_fixed_path" overrides.
+        utt_id: file name, used to locate "<utt_id>-durations.npy"
+        text_ids: array with text ids
+        raw_text: raw text of file, tokens separated by single spaces
+        audio: parsed wav file
+        hop_size: Hop size
+    Returns: (bool, np.array, np.array) => if trimmed return True, new text_ids, new audio_array
+    """
+
+    duration_path = config.get(
+        "duration_path", os.path.join(config["rootdir"], "durations")
+    )
+    duration_fixed_path = config.get(
+        "duration_fixed_path", os.path.join(config["rootdir"], "trimmed-durations")
+    )
+    # Create the directory that is actually written to below.
+    os.makedirs(duration_fixed_path, exist_ok=True)
+    sil_ph = ["SIL", "END"]  # TODO FIX hardcoded values
+    text = raw_text.split(" ")
+
+    trim_start = text[0] in sil_ph
+    trim_end = text[-1] in sil_ph
+    if not trim_start and not trim_end:
+        return False, text_ids, audio
+
+    idx_start = 1 if trim_start else 0
+    idx_end = -1 if trim_end else len(text_ids)
+    text_ids = text_ids[idx_start:idx_end]
+    durations = np.load(os.path.join(duration_path, f"{utt_id}-durations.npy"))
+    if trim_start:
+        s_trim = int(durations[0] * hop_size)
+        audio = audio[s_trim:]
+    if trim_end:
+        e_trim = int(durations[-1] * hop_size)
+        # audio[:-0] would be empty, so slice with an explicit end index.
+        audio = audio[: max(len(audio) - e_trim, 0)]
+
+    durations = durations[idx_start:idx_end]
+    np.save(os.path.join(duration_fixed_path, f"{utt_id}-durations.npy"), durations)
+    return True, text_ids, audio
+
+
 def gen_audio_features(item, config):
     """Generate audio features and transformations
     Args:
@@ -132,12 +192,29 @@ def gen_audio_features(item, config):
 
     # trim silence
     if config["trim_silence"]:
-        audio, _ = librosa.effects.trim(
-            audio,
-            top_db=config["trim_threshold_in_db"],
-            frame_length=config["trim_frame_size"],
-            hop_length=config["trim_hop_size"],
-        )
+        if "trim_mfa" in config and config["trim_mfa"]:
+            _, item["text_ids"], audio = ph_based_trim(
+                config,
+                utt_id,
+                item["text_ids"],
+                item["raw_text"],
+                audio,
+                config["hop_size"],
+            )
+            if (
+                audio.__len__() < 1
+            ):  # very short files can get trimmed fully if mfa didnt extract any tokens LibriTTS maybe take only longer files?
+                logging.warning(
+                    f"File have only silence or MFA didnt extract any token {utt_id}"
+                )
+                return False, None, None, None, item
+        else:
+            audio, _ = librosa.effects.trim(
+                audio,
+                top_db=config["trim_threshold_in_db"],
+                frame_length=config["trim_frame_size"],
+                hop_length=config["trim_hop_size"],
+            )
 
     # resample audio if necessary
     if "sampling_rate_for_feats" in config:
@@ -207,7 +284,7 @@ def gen_audio_features(item, config):
     item["mel"] = mel
     item["f0"] = f0
     item["energy"] = energy
-    return mel, energy, f0, item
+    return True, mel, energy, f0, item
 
 
 def save_statistics_to_file(scaler_list, config):
@@ -261,14 +338,20 @@ def preprocess():
     dataset_processor = {
         "ljspeech": LJSpeechProcessor,
         "kss": KSSProcessor,
+        "libritts": LibriTTSProcessor,
     }
 
     dataset_symbol = {
         "ljspeech": LJSPEECH_SYMBOLS,
         "kss": KSS_SYMBOLS,
+        "libritts": LIBRITTS_SYMBOLS,
     }
 
-    dataset_cleaner = {"ljspeech": "english_cleaners", "kss": "korean_cleaners"}
+    dataset_cleaner = {
+        "ljspeech": "english_cleaners",
+        "kss": "korean_cleaners",
+        "libritts": None,
+    }
 
     logging.info(f"Selected '{config['dataset']}' processor.")
     processor = dataset_processor[config["dataset"]](
@@ -291,9 +374,21 @@ def preprocess():
     )
 
     # build train test split
-    train_split, valid_split = train_test_split(
-        processor.items, test_size=config["test_size"], random_state=42, shuffle=True,
-    )
+    if config["dataset"] == "libritts":
+        train_split, valid_split, _, _ = train_test_split(
+            processor.items,
+            [i[-1] for i in processor.items],
+            test_size=config["test_size"],
+            random_state=42,
+            shuffle=True,
+        )
+    else:
+        train_split, valid_split = train_test_split(
+            processor.items,
+            test_size=config["test_size"],
+            random_state=42,
+            shuffle=True,
+        )
     logging.info(f"Training items: {len(train_split)}")
     logging.info(f"Validation items: {len(valid_split)}")
 
@@ -327,15 +422,33 @@ def iterator_data(items_list):
     scaler_energy = StandardScaler(copy=False)
     scaler_f0 = StandardScaler(copy=False)
 
-    for mel, energy, f0, features in train_map:
+    id_to_remove = []
+    for result, mel, energy, f0, features in train_map:
+        if not result:
+            id_to_remove.append(features["utt_id"])
+            continue
         save_features_to_file(features, "train", config)
         # remove outliers
         energy = remove_outlier(energy)
+        f0 = remove_outlier(f0)
+        # skip utterances left without any non-zero energy or f0 values
+        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
+            id_to_remove.append(features["utt_id"])
+            continue
         # partial fitting of scalers
         scaler_mel.partial_fit(mel)
         scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
         scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))
 
+    if len(id_to_remove) > 0:
+        np.save(
+            os.path.join(config["outdir"], "train_utt_ids.npy"),
+            [i for i in train_utt_ids if i not in id_to_remove],
+        )
+        logging.info(
+            f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
+        )
+
     # save statistics to file
     logging.info("Saving computed statistics.")
     scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
diff --git a/tensorflow_tts/processor/__init__.py b/tensorflow_tts/processor/__init__.py
@@ -2,3 +2,4 @@
 
 from tensorflow_tts.processor.ljspeech import LJSpeechProcessor
 from tensorflow_tts.processor.kss import KSSProcessor
+from tensorflow_tts.processor.libritts import LibriTTSProcessor
diff --git a/tensorflow_tts/processor/libritts.py b/tensorflow_tts/processor/libritts.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# Copyright 2020 TensorFlowTTS Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Perform preprocessing and raw feature extraction for LibriTTS dataset."""
+
+import os
+from dataclasses import dataclass
+
+import numpy as np
+import soundfile as sf
+from g2p_en import g2p as grapheme_to_phonem
+
+from tensorflow_tts.processor.base_processor import BaseProcessor
+
+g2p = grapheme_to_phonem.G2p()
+
+# Copy the list so SIL/END are not appended to the G2p instance's own list.
+valid_symbols = list(g2p.phonemes)
+valid_symbols.append("SIL")
+valid_symbols.append("END")
+
+_punctuation = "!'(),.:;? "
+_arpabet = ["@" + s for s in valid_symbols]
+
+LIBRITTS_SYMBOLS = _arpabet + list(_punctuation)
+
+
+@dataclass
+class LibriTTSProcessor(BaseProcessor):
+    """Processor for LibriTTS metadata and audio files."""
+
+    mode: str = "train"
+    train_f_name: str = "train.txt"
+    positions = {
+        "file": 0,
+        "text": 1,
+        "speaker_name": 2,
+    }  # positions of file,text,speaker_name after split line
+    f_extension: str = ".wav"
+    cleaner_names: str = None
+
+    def create_items(self):
+        """Fill self.items with [text, wav_path, speaker_name] per metadata line."""
+        with open(
+            os.path.join(self.data_dir, self.train_f_name), mode="r", encoding="utf-8"
+        ) as f:
+            for line in f:
+                parts = line.strip().split(self.delimiter)
+                wav_path = os.path.join(self.data_dir, parts[self.positions["file"]])
+                # Append the audio extension only when it is not already there.
+                if not wav_path.endswith(self.f_extension):
+                    wav_path += self.f_extension
+                text = parts[self.positions["text"]]
+                speaker_name = parts[self.positions["speaker_name"]]
+                self.items.append([text, wav_path, speaker_name])
+
+    def get_one_sample(self, item):
+        """Load one item's audio and convert its text to ids.
+
+        Args:
+            item: [text, wav_path, speaker_name] as built by create_items.
+        Returns: dict with raw_text, text_ids, audio, utt_id, speaker_name, rate
+        """
+        text, wav_path, speaker_name = item
+        audio, rate = sf.read(wav_path, dtype="float32")
+
+        text_ids = np.asarray(self.text_to_sequence(text), np.int32)
+
+        sample = {
+            "raw_text": text,
+            "text_ids": text_ids,
+            "audio": audio,
+            # utt_id is the file name up to its first dot
+            "utt_id": wav_path.split("/")[-1].split(".")[0],
+            "speaker_name": speaker_name,
+            "rate": rate,
+        }
+
+        return sample
+
+    def text_to_sequence(self, text):
+        """Convert text to a list of symbol ids according to self.mode."""
+        if (
+            self.mode == "train"
+        ):  # in train mode text should be already transformed to phonemes
+            return self.symbols_to_ids(self.clean_g2p(text.split(" ")))
+        else:
+            return self.inference_text_to_seq(text)
+
+    def inference_text_to_seq(self, text: str):
+        """Run g2p on raw text and map the resulting symbols to ids."""
+        return self.symbols_to_ids(self.text_to_ph(text))
+
+    def symbols_to_ids(self, symbols_list: list):
+        """Look up every symbol in self.symbol_to_id."""
+        return [self.symbol_to_id[s] for s in symbols_list]
+
+    def text_to_ph(self, text: str):
+        """Convert raw text to the "@"-prefixed symbols produced by clean_g2p."""
+        return self.clean_g2p(g2p(text))
+
+    def clean_g2p(self, g2p_text: list):
+        """Prefix each token with "@"; a space becomes "@SIL", and a final
+        space or "SIL" becomes "@END"."""
+        data = []
+        for i, txt in enumerate(g2p_text):
+            if i == len(g2p_text) - 1:
+                if txt != " " and txt != "SIL":
+                    data.append("@" + txt)
+                else:
+                    data.append(
+                        "@END"
+                    )  # TODO try learning without end token and compare results
+                break
+            if txt != " ":
+                data.append("@" + txt)
+            else:
+                data.append("@SIL")  # TODO change it in inference
+        return data
diff --git a/test/test_base_processor.py b/test/test_base_processor.py
@@ -33,7 +33,7 @@ def processor(tmpdir):
 def mapper_processor(tmpdir):
     copyfile("test/files/train.txt", f"{tmpdir}/train.txt")
     copyfile("test/files/mapper.json", f"{tmpdir}/mapper.json")
-    processor = LJ(data_dir=tmpdir, load_mapper=True)
+    processor = LJ(data_dir=tmpdir, loaded_mapper_path=f"{tmpdir}/mapper.json")
     return processor
 
 

Original file line number	Diff line number	Diff line change
`@@ -2,3 +2,4 @@`
`2`	`2`
`3`	`3`	`from tensorflow_tts.processor.ljspeech import LJSpeechProcessor`
`4`	`4`	`from tensorflow_tts.processor.kss import KSSProcessor`
	`5`	`+from tensorflow_tts.processor.libritts import LibriTTSProcessor`