
Commit 39185be

Merge pull request #105 from TensorSpeech/dev/chinese_example
init chinese example (tacotron2 and mb-melgan)
2 parents (1303ab8 + 0a9d774) · commit 39185be

File tree

20 files changed: +966 −17 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -35,3 +35,4 @@ ljspeech
 LibriTTS/
 dataset/
 mfa/
+kss

README.md

Lines changed: 23 additions & 0 deletions
@@ -184,6 +184,29 @@ After preprocessing, the structure of the project folder should be:
 
 We use suffix (`ids`, `raw-feats`, `raw-energy`, `raw-f0`, `norm-feats` and `wave`) for each type of input.
 
+### Preprocessing Chinese Dataset
+Please download the open dataset from [Data-Baker](https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar) and extract the data like this:
+```
+.
+├── PhoneLabeling
+│   ├── 000001.interval
+│   ├── ...
+│   └── 010000.interval
+├── ProsodyLabeling
+│   └── 000001-010000.txt
+└── Wave
+    ├── 000001.wav
+    ├── ...
+    └── 010000.wav
+```
+
+After installing TensorFlowTTS, you can preprocess the data like this:
+```shell
+tensorflow-tts-preprocess --dataset baker --rootdir ./baker --outdir ./dump --config ./preprocess/baker_preprocess.yaml
+tensorflow-tts-normalize --rootdir ./dump --outdir ./dump --config ./preprocess/baker_preprocess.yaml --dataset baker
+```
+
+
 **IMPORTANT NOTES**:
 - This preprocessing step is based on [ESPnet](https://github.com/espnet/espnet) so you can combine all models here with other models from ESPnet repository.
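Before running the two preprocessing commands added above, it can help to confirm the extracted corpus really matches the layout the README expects. The snippet below is an illustrative addition (not part of this commit); the `./baker` path simply mirrors the `--rootdir` argument used above.

```python
# Illustrative sanity check, not part of this commit: verify the extracted
# Data-Baker (BZNSYP) corpus matches the layout shown in the README diff.
from pathlib import Path

root = Path("./baker")  # same directory passed as --rootdir above

wavs = sorted((root / "Wave").glob("*.wav"))
intervals = sorted((root / "PhoneLabeling").glob("*.interval"))
prosody = root / "ProsodyLabeling" / "000001-010000.txt"

# The corpus ships utterances numbered 000001 through 010000.
assert len(wavs) == 10000, f"expected 10000 wav files, found {len(wavs)}"
assert len(intervals) == 10000, f"expected 10000 interval files, found {len(intervals)}"
assert prosody.exists(), "missing ProsodyLabeling/000001-010000.txt"
print(f"Baker layout looks good: {len(wavs)} utterances")
```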

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# This is the hyperparameter configuration file for FastSpeech2 v2.
# The difference between v2 and v1 is that v2 applies the Linformer technique.
# Please make sure this is adjusted for the Baker dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration performs 200k iters, but the best checkpoint is around 150k iters.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
hop_size: 256            # Hop size.
format: "npy"


###########################################################
#              NETWORK ARCHITECTURE SETTING               #
###########################################################
model_type: "fastspeech2"

fastspeech2_params:
  dataset: baker
  n_speakers: 1
  encoder_hidden_size: 256
  encoder_num_hidden_layers: 3
  encoder_num_attention_heads: 2
  encoder_attention_head_size: 16  # in v1, = 384 // 2
  encoder_intermediate_size: 1024
  encoder_intermediate_kernel_size: 3
  encoder_hidden_act: "mish"
  decoder_hidden_size: 256
  decoder_num_hidden_layers: 3
  decoder_num_attention_heads: 2
  decoder_attention_head_size: 16  # in v1, = 384 // 2
  decoder_intermediate_size: 1024
  decoder_intermediate_kernel_size: 3
  decoder_hidden_act: "mish"
  variant_prediction_num_conv_layers: 2
  variant_predictor_filter: 256
  variant_predictor_kernel_size: 3
  variant_predictor_dropout_rate: 0.5
  num_mels: 80
  hidden_dropout_prob: 0.2
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 2048
  initializer_range: 0.02
  output_attentions: False
  output_hidden_states: False

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 16               # Batch size.
remove_short_samples: true   # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true            # Whether to cache the dataset. If true, it requires cpu memory.
mel_length_threshold: 32     # Remove all targets that have mel_length <= 32.
is_shuffle: true             # Shuffle the dataset after each epoch.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
optimizer_params:
  initial_learning_rate: 0.001
  end_learning_rate: 0.00005
  decay_steps: 150000          # < train_max_steps is recommended.
  warmup_proportion: 0.02
  weight_decay: 0.001


###########################################################
#                     INTERVAL SETTING                    #
###########################################################
train_max_steps: 200000       # Number of training steps.
save_interval_steps: 5000     # Interval steps to save checkpoint.
eval_interval_steps: 500      # Interval steps to evaluate the network.
log_interval_steps: 200       # Interval steps to record the training log.
delay_f0_energy_steps: 3      # 2 steps use LR outputs only, then 1 step uses LR + F0 + Energy.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_save_intermediate_results: 1  # Number of batches to be saved as intermediate results.
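The new file above is the Baker FastSpeech2 v2 training configuration. As a rough sketch of how such a file is typically consumed (an illustrative addition, not part of the commit; the import paths, class names, and yaml filename below are assumptions to verify against the TensorFlowTTS source), the `fastspeech2_params` block maps directly onto the model configuration:

```python
# Illustrative sketch, not part of this commit. Import paths and the yaml
# filename are assumptions; check the TensorFlowTTS examples for the real ones.
import yaml

from tensorflow_tts.configs import FastSpeech2Config  # assumed import path
from tensorflow_tts.models import TFFastSpeech2       # assumed import path

with open("fastspeech2.baker.v2.yaml") as f:           # hypothetical filename
    config = yaml.load(f, Loader=yaml.SafeLoader)

# The "fastspeech2_params" block above becomes the model configuration.
fastspeech2 = TFFastSpeech2(config=FastSpeech2Config(**config["fastspeech2_params"]))
```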
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@

# This is the hyperparameter configuration file for Multi-Band MelGAN.
# Please make sure this is adjusted for the Baker dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration performs 1000k iters.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000
hop_size: 300            # Hop size.
format: "npy"


###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
model_type: "multiband_melgan_generator"

multiband_melgan_generator_params:
  out_channels: 4               # Number of output channels (number of subbands).
  kernel_size: 7                # Kernel size of initial and final conv layers.
  filters: 384                  # Initial number of channels for conv layers.
  upsample_scales: [3, 5, 5]    # List of upsampling scales.
  stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
  stacks: 4                     # Number of stacks in a single residual stack module.
  is_weight_norm: false         # Use weight-norm or not.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
  out_channels: 1                         # Number of output channels.
  scales: 3                               # Number of multi-scales.
  downsample_pooling: "AveragePooling1D"  # Pooling type for the input downsampling.
  downsample_pooling_params:              # Parameters of the above pooling function.
    pool_size: 4
    strides: 2
  kernel_sizes: [5, 3]                    # List of kernel sizes.
  filters: 16                             # Number of channels of the initial conv layer.
  max_downsample_filters: 512             # Maximum number of channels of downsampling layers.
  downsample_scales: [4, 4, 4]            # List of downsampling scales.
  nonlinear_activation: "LeakyReLU"       # Nonlinear activation function.
  nonlinear_activation_params:            # Parameters of the nonlinear activation function.
    alpha: 0.2
  is_weight_norm: false                   # Use weight-norm or not.

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
stft_loss_params:
  fft_lengths: [1024, 2048, 512]   # List of FFT sizes for STFT-based loss.
  frame_steps: [120, 240, 50]      # List of hop sizes for STFT-based loss.
  frame_lengths: [600, 1200, 240]  # List of window lengths for STFT-based loss.

subband_stft_loss_params:
  fft_lengths: [384, 683, 171]     # List of FFT sizes for STFT-based loss.
  frame_steps: [30, 60, 10]        # List of hop sizes for STFT-based loss.
  frame_lengths: [150, 300, 60]    # List of window lengths for STFT-based loss.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_feat_match: 10.0  # Loss balancing coefficient for the feature matching loss.
lambda_adv: 2.5          # Loss balancing coefficient for the adversarial loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 64                 # Batch size.
batch_max_steps: 9600          # Length of each audio in the batch for training. Make sure it is divisible by hop_size.
batch_max_steps_valid: 48000   # Length of each audio for validation. Make sure it is divisible by hop_size.
remove_short_samples: true     # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true              # Whether to cache the dataset. If true, it requires cpu memory.
is_shuffle: true               # Shuffle the dataset after each epoch.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
  lr_fn: "PiecewiseConstantDecay"
  lr_params:
    boundaries: [100000, 200000, 300000, 400000, 500000, 600000, 700000]
    values: [0.001, 0.0005, 0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
  amsgrad: false

discriminator_optimizer_params:
  lr_fn: "PiecewiseConstantDecay"
  lr_params:
    boundaries: [100000, 200000, 300000, 400000, 500000]
    values: [0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
  amsgrad: false

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 200000  # Step at which to begin training the discriminator.
train_max_steps: 4000000                 # Number of training steps.
save_interval_steps: 20000               # Interval steps to save checkpoint.
eval_interval_steps: 5000                # Interval steps to evaluate the network.
log_interval_steps: 200                  # Interval steps to record the training log.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_save_intermediate_results: 1  # Number of batches to be saved as intermediate results.
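One internal consistency worth noting in the generator settings above: each mel frame covers `hop_size` samples, the generator upsamples every subband by `prod(upsample_scales)` samples per frame, and PQMF synthesis interleaves the `out_channels` subbands, so the product has to equal `hop_size`. A quick check (an illustrative addition, assuming the standard Multi-Band MelGAN subband layout):

```python
# Illustrative consistency check, not part of this commit.
import math

hop_size = 300              # FEATURE EXTRACTION SETTING above
out_channels = 4            # number of subbands
upsample_scales = [3, 5, 5]

samples_per_frame = math.prod(upsample_scales) * out_channels  # 75 * 4
assert samples_per_frame == hop_size, (samples_per_frame, hop_size)
print("ok:", samples_per_frame, "samples per mel frame ==", hop_size)
```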

examples/multiband_melgan/decode_mb_melgan.py

Lines changed: 2 additions & 2 deletions
@@ -110,14 +110,14 @@ def main():
 
     # define model and load checkpoint
     mb_melgan = TFMelGANGenerator(
-        config=MultiBandMelGANGeneratorConfig(**config["multiband_melgan_generator"]),
+        config=MultiBandMelGANGeneratorConfig(**config["multiband_melgan_generator_params"]),
         name="multiband_melgan_generator",
     )
     mb_melgan._build()
     mb_melgan.load_weights(args.checkpoint)
 
     pqmf = TFPQMF(
-        config=MultiBandMelGANGeneratorConfig(**config["multiband_melgan_generator"]), name="pqmf"
+        config=MultiBandMelGANGeneratorConfig(**config["multiband_melgan_generator_params"]), name="pqmf"
     )
 
     for data in tqdm(dataset, desc="[Decoding]"):
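This fix makes `decode_mb_melgan.py` read the generator settings from the `multiband_melgan_generator_params` key, which is the key used in the new Baker Multi-Band MelGAN config above. A rough sketch of the corrected usage (an illustrative addition, not part of the commit; the import paths, yaml filename, and checkpoint path are assumptions):

```python
# Illustrative sketch, not part of this commit: load a trained MB-MelGAN
# generator using the corrected "multiband_melgan_generator_params" key.
import yaml

from tensorflow_tts.configs import MultiBandMelGANGeneratorConfig  # assumed path
from tensorflow_tts.models import TFMelGANGenerator, TFPQMF        # assumed path

with open("multiband_melgan.baker.v1.yaml") as f:   # hypothetical filename
    config = yaml.load(f, Loader=yaml.SafeLoader)

gen_config = MultiBandMelGANGeneratorConfig(**config["multiband_melgan_generator_params"])

mb_melgan = TFMelGANGenerator(config=gen_config, name="multiband_melgan_generator")
mb_melgan._build()
mb_melgan.load_weights("checkpoints/generator-940000.h5")  # hypothetical checkpoint

# The PQMF synthesis filter is built from the same generator config.
pqmf = TFPQMF(config=gen_config, name="pqmf")
```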
Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# This is the hyperparameter configuration file for Tacotron2 v1.
# Please make sure this is adjusted for the Baker dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration performs 200k iters, but 65k iters is enough to get a good model.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
hop_size: 256            # Hop size.
format: "npy"


###########################################################
#              NETWORK ARCHITECTURE SETTING               #
###########################################################
model_type: "tacotron2"

tacotron2_params:
  dataset: baker
  embedding_hidden_size: 512
  initializer_range: 0.5
  embedding_dropout_prob: 0.1
  n_speakers: 1
  n_conv_encoder: 5
  encoder_conv_filters: 512
  encoder_conv_kernel_sizes: 5
  encoder_conv_activation: 'relu'
  encoder_conv_dropout_rate: 0.5
  encoder_lstm_units: 256
  n_prenet_layers: 2
  prenet_units: 256
  prenet_activation: 'relu'
  prenet_dropout_rate: 0.5
  n_lstm_decoder: 1
  reduction_factor: 2
  decoder_lstm_units: 1024
  attention_dim: 128
  attention_filters: 32
  attention_kernel: 31
  n_mels: 80
  n_conv_postnet: 5
  postnet_conv_filters: 512
  postnet_conv_kernel_sizes: 5
  postnet_dropout_rate: 0.1
  attention_type: "lsa"

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 32               # Batch size.
remove_short_samples: true   # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true            # Whether to cache the dataset. If true, it requires cpu memory.
mel_length_threshold: 32     # Remove all targets that have mel_length <= 32.
is_shuffle: true             # Shuffle the dataset after each epoch.
use_fixed_shapes: true       # use_fixed_shapes for training (2x speed-up)
                             # refer (https://github.com/dathudeptrai/TensorflowTTS/issues/34#issuecomment-642309118)

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
optimizer_params:
  initial_learning_rate: 0.001
  end_learning_rate: 0.00001
  decay_steps: 150000          # < train_max_steps is recommended.
  warmup_proportion: 0.02
  weight_decay: 0.001


###########################################################
#                     INTERVAL SETTING                    #
###########################################################
train_max_steps: 200000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 500                # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.
start_schedule_teacher_forcing: 200001  # No need to apply scheduled teacher forcing.
start_ratio_value: 0.5                  # Start ratio of scheduled teacher forcing.
schedule_decay_steps: 50000             # Decay steps of scheduled teacher forcing.
end_ratio_value: 0.0                    # End ratio of scheduled teacher forcing.

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_save_intermediate_results: 1  # Number of results to be saved as intermediate results.
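One detail in the interval settings above: `start_schedule_teacher_forcing` (200001) is one step past `train_max_steps` (200000), so scheduled teacher forcing never actually activates for this Baker recipe, exactly as the inline comment says. A small check (an illustrative addition, not part of the commit; the yaml filename is an assumption):

```python
# Illustrative check, not part of this commit.
import yaml

with open("tacotron2.baker.v1.yaml") as f:   # hypothetical filename
    config = yaml.load(f, Loader=yaml.SafeLoader)

disabled = config["start_schedule_teacher_forcing"] > config["train_max_steps"]
print("scheduled teacher forcing disabled:", disabled)  # True, since 200001 > 200000
```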

examples/tacotron2/conf/tacotron2.v1.yaml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ format: "npy"
 model_type: "tacotron2"
 
 tacotron2_params:
+    dataset: ljspeech
     embedding_hidden_size: 512
     initializer_range: 0.02
     embedding_dropout_prob: 0.1

examples/tacotron2/decode_tacotron2.py

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,7 @@
 import tensorflow as tf
 import yaml
 from tqdm import tqdm
+import matplotlib.pyplot as plt
 
 from examples.tacotron2.tacotron_dataset import CharactorMelDataset
 from tensorflow_tts.configs import Tacotron2Config
@@ -109,11 +110,13 @@ def main():
 
     # define data-loader
     dataset = CharactorMelDataset(
+        dataset=config["tacotron2_params"]["dataset"],
         root_dir=args.rootdir,
         charactor_query=char_query,
         mel_query=mel_query,
         charactor_load_fn=char_load_fn,
         mel_load_fn=mel_load_fn,
+        reduction_factor=config["tacotron2_params"]["reduction_factor"]
     )
     dataset = dataset.create(allow_cache=True, batch_size=args.batch_size)
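The two new `CharactorMelDataset` arguments are read from the `tacotron2_params` block, so decoding picks up the dataset name and reduction factor from whichever config is passed in (e.g. `baker` with `reduction_factor: 2` in the Chinese example above). A rough usage sketch (an illustrative addition, not part of the commit; the query patterns, paths, and yaml filename are assumptions, see decode_tacotron2.py for the real values):

```python
# Illustrative sketch, not part of this commit: build the data loader the same
# way the updated decode_tacotron2.py does. Query patterns and paths are
# assumptions; check the script for the actual values.
import numpy as np
import yaml

from examples.tacotron2.tacotron_dataset import CharactorMelDataset

with open("tacotron2.baker.v1.yaml") as f:       # hypothetical filename
    config = yaml.load(f, Loader=yaml.SafeLoader)

dataset = CharactorMelDataset(
    dataset=config["tacotron2_params"]["dataset"],      # "baker" for the Chinese example
    root_dir="./dump/valid",                             # hypothetical dump directory
    charactor_query="*-ids.npy",                         # assumed naming, matches the README suffixes
    mel_query="*-norm-feats.npy",                        # assumed naming, matches the README suffixes
    charactor_load_fn=np.load,
    mel_load_fn=np.load,
    reduction_factor=config["tacotron2_params"]["reduction_factor"],
)
tf_dataset = dataset.create(allow_cache=True, batch_size=32)
```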
