Skip to content

Commit 97db74c

Browse files
authored
Merge pull request #1314 from yt605155624/add_new_tacotron2
[TTS]Add new tacotron2
2 parents 320bb0f + 9632381 commit 97db74c

File tree

46 files changed

+3224
-518
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3224
-518
lines changed

examples/aishell3/tts3/conf/default.yaml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################
@@ -64,14 +64,14 @@ model:
6464
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
6565
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
6666
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
67-
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
67+
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
6868
energy_predictor_layers: 2 # number of conv layers in energy predictor
6969
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
7070
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
7171
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
7272
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
7373
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
74-
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
74+
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
7575
spk_embed_dim: 256 # speaker embedding dimension
7676
spk_embed_integration_type: concat # speaker embedding integration type
7777

@@ -84,7 +84,6 @@ updater:
8484
use_masking: True # whether to apply masking for padded part in loss calculation
8585

8686

87-
8887
###########################################################
8988
# OPTIMIZER SETTING #
9089
###########################################################

examples/aishell3/vc1/conf/default.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################
@@ -64,14 +64,14 @@ model:
6464
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
6565
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
6666
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
67-
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
67+
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
6868
energy_predictor_layers: 2 # number of conv layers in energy predictor
6969
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
7070
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
7171
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
7272
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
7373
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
74-
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
74+
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
7575
spk_embed_dim: 256 # speaker embedding dimension
7676
spk_embed_integration_type: concat # speaker embedding integration type
7777

examples/aishell3/voc1/conf/default.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ generator_params:
3333
aux_context_window: 2 # Context window size for auxiliary feature.
3434
# If set to 2, previous 2 and future 2 frames will be considered.
3535
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
36-
use_weight_norm: true # Whether to use weight norm.
36+
use_weight_norm: True # Whether to use weight norm.
3737
# If set to true, it will be applied to all of the conv layers.
3838
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
3939

@@ -46,8 +46,8 @@ discriminator_params:
4646
kernel_size: 3 # Kernel size of conv layers.
4747
layers: 10 # Number of conv layers.
4848
conv_channels: 64 # Number of channels of conv layers.
49-
bias: true # Whether to use bias parameter in conv.
50-
use_weight_norm: true # Whether to use weight norm.
49+
bias: True # Whether to use bias parameter in conv.
50+
use_weight_norm: True # Whether to use weight norm.
5151
# If set to true, it will be applied to all of the conv layers.
5252
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
5353
nonlinear_activation_params: # Nonlinear function parameters
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# This configuration is for Paddle to train Tacotron 2. Compared to the
2+
# original paper, this configuration additionally uses the guided attention
3+
# loss to accelerate the learning of the diagonal attention. It requires
4+
# only a single GPU with 12 GB memory and it takes ~1 day to finish the
5+
# training on Titan V.
6+
7+
###########################################################
8+
# FEATURE EXTRACTION SETTING #
9+
###########################################################
10+
11+
fs: 24000 # Sampling rate.
12+
n_fft: 2048 # FFT size (samples).
13+
n_shift: 300 # Hop size (samples). 12.5ms
14+
win_length: 1200 # Window length (samples). 50ms
15+
# If set to null, it will be the same as n_fft.
16+
window: "hann" # Window function.
17+
18+
# Only used for feats_type != raw
19+
20+
fmin: 80 # Minimum frequency of Mel basis.
21+
fmax: 7600 # Maximum frequency of Mel basis.
22+
n_mels: 80 # The number of mel basis.
23+
24+
###########################################################
25+
# DATA SETTING #
26+
###########################################################
27+
batch_size: 64
28+
num_workers: 2
29+
30+
###########################################################
31+
# MODEL SETTING #
32+
###########################################################
33+
model: # keyword arguments for the selected model
34+
embed_dim: 512 # char or phn embedding dimension
35+
elayers: 1 # number of blstm layers in encoder
36+
eunits: 512 # number of blstm units
37+
econv_layers: 3 # number of convolutional layers in encoder
38+
econv_chans: 512 # number of channels in convolutional layer
39+
econv_filts: 5 # filter size of convolutional layer
40+
atype: location # attention function type
41+
adim: 512 # attention dimension
42+
aconv_chans: 32 # number of channels in convolutional layer of attention
43+
aconv_filts: 15 # filter size of convolutional layer of attention
44+
cumulate_att_w: True # whether to cumulate attention weight
45+
dlayers: 2 # number of lstm layers in decoder
46+
dunits: 1024 # number of lstm units in decoder
47+
prenet_layers: 2 # number of layers in prenet
48+
prenet_units: 256 # number of units in prenet
49+
postnet_layers: 5 # number of layers in postnet
50+
postnet_chans: 512 # number of channels in postnet
51+
postnet_filts: 5 # filter size of postnet layer
52+
output_activation: null # activation function for the final output
53+
use_batch_norm: True # whether to use batch normalization in encoder
54+
use_concate: True # whether to concatenate encoder embedding with decoder outputs
55+
use_residual: False # whether to use residual connection in encoder
56+
dropout_rate: 0.5 # dropout rate
57+
zoneout_rate: 0.1 # zoneout rate
58+
reduction_factor: 1 # reduction factor
59+
spk_embed_dim: null # speaker embedding dimension
60+
61+
62+
###########################################################
63+
# UPDATER SETTING #
64+
###########################################################
65+
updater:
66+
use_masking: True # whether to apply masking for padded part in loss calculation
67+
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
68+
use_guided_attn_loss: True # whether to use guided attention loss
69+
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
70+
guided_attn_loss_lambda: 1.0 # strength of guided attention loss
71+
72+
73+
##########################################################
74+
# OPTIMIZER SETTING #
75+
##########################################################
76+
optimizer:
77+
optim: adam # optimizer type
78+
learning_rate: 1.0e-03 # learning rate
79+
epsilon: 1.0e-06 # epsilon
80+
weight_decay: 0.0 # weight decay coefficient
81+
82+
###########################################################
83+
# TRAINING SETTING #
84+
###########################################################
85+
max_epoch: 200
86+
num_snapshots: 5
87+
88+
###########################################################
89+
# OTHER SETTING #
90+
###########################################################
91+
seed: 42
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/bin/bash
2+
3+
stage=0
4+
stop_stage=100
5+
6+
config_path=$1
7+
8+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
9+
# get durations from MFA's result
10+
echo "Generate durations.txt from MFA results ..."
11+
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
12+
--inputdir=./baker_alignment_tone \
13+
--output=durations.txt \
14+
--config=${config_path}
15+
fi
16+
17+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
18+
# extract features
19+
echo "Extract features ..."
20+
python3 ${BIN_DIR}/preprocess.py \
21+
--dataset=baker \
22+
--rootdir=~/datasets/BZNSYP/ \
23+
--dumpdir=dump \
24+
--dur-file=durations.txt \
25+
--config=${config_path} \
26+
--num-cpu=20 \
27+
--cut-sil=True
28+
fi
29+
30+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
31+
# get features' stats(mean and std)
32+
echo "Get features' stats ..."
33+
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
34+
--metadata=dump/train/raw/metadata.jsonl \
35+
--field-name="speech"
36+
37+
fi
38+
39+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
40+
# normalize and convert phone to id, dev and test should use train's stats
41+
echo "Normalize ..."
42+
python3 ${BIN_DIR}/normalize.py \
43+
--metadata=dump/train/raw/metadata.jsonl \
44+
--dumpdir=dump/train/norm \
45+
--speech-stats=dump/train/speech_stats.npy \
46+
--phones-dict=dump/phone_id_map.txt \
47+
--speaker-dict=dump/speaker_id_map.txt
48+
49+
python3 ${BIN_DIR}/normalize.py \
50+
--metadata=dump/dev/raw/metadata.jsonl \
51+
--dumpdir=dump/dev/norm \
52+
--speech-stats=dump/train/speech_stats.npy \
53+
--phones-dict=dump/phone_id_map.txt \
54+
--speaker-dict=dump/speaker_id_map.txt
55+
56+
python3 ${BIN_DIR}/normalize.py \
57+
--metadata=dump/test/raw/metadata.jsonl \
58+
--dumpdir=dump/test/norm \
59+
--speech-stats=dump/train/speech_stats.npy \
60+
--phones-dict=dump/phone_id_map.txt \
61+
--speaker-dict=dump/speaker_id_map.txt
62+
fi
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
ckpt_name=$3
6+
7+
FLAGS_allocator_strategy=naive_best_fit \
8+
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
9+
python3 ${BIN_DIR}/../synthesize.py \
10+
--am=tacotron2_csmsc \
11+
--am_config=${config_path} \
12+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
13+
--am_stat=dump/train/speech_stats.npy \
14+
--voc=pwgan_csmsc \
15+
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
16+
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
17+
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
18+
--test_metadata=dump/test/norm/metadata.jsonl \
19+
--output_dir=${train_output_path}/test \
20+
--phones_dict=dump/phone_id_map.txt
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
ckpt_name=$3
6+
7+
stage=0
8+
stop_stage=0
9+
10+
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
11+
FLAGS_allocator_strategy=naive_best_fit \
12+
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
13+
python3 ${BIN_DIR}/../synthesize_e2e.py \
14+
--am=tacotron2_csmsc \
15+
--am_config=${config_path} \
16+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
17+
--am_stat=dump/train/speech_stats.npy \
18+
--voc=pwgan_csmsc \
19+
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
20+
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
21+
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
22+
--lang=zh \
23+
--text=${BIN_DIR}/../sentences.txt \
24+
--output_dir=${train_output_path}/test_e2e \
25+
--inference_dir=${train_output_path}/inference \
26+
--phones_dict=dump/phone_id_map.txt
27+
fi
28+
29+
# for more GAN Vocoders
30+
# multi band melgan
31+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
32+
FLAGS_allocator_strategy=naive_best_fit \
33+
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
34+
python3 ${BIN_DIR}/../synthesize_e2e.py \
35+
--am=tacotron2_csmsc \
36+
--am_config=${config_path} \
37+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
38+
--am_stat=dump/train/speech_stats.npy \
39+
--voc=mb_melgan_csmsc \
40+
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
41+
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
42+
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
43+
--lang=zh \
44+
--text=${BIN_DIR}/../sentences.txt \
45+
--output_dir=${train_output_path}/test_e2e \
46+
--inference_dir=${train_output_path}/inference \
47+
--phones_dict=dump/phone_id_map.txt
48+
fi
49+
50+
# the pretrained models haven't been released yet
51+
# style melgan
52+
# style melgan's Dygraph to Static Graph is not ready now
53+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
54+
FLAGS_allocator_strategy=naive_best_fit \
55+
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
56+
python3 ${BIN_DIR}/../synthesize_e2e.py \
57+
--am=tacotron2_csmsc \
58+
--am_config=${config_path} \
59+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
60+
--am_stat=dump/train/speech_stats.npy \
61+
--voc=style_melgan_csmsc \
62+
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
63+
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
64+
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
65+
--lang=zh \
66+
--text=${BIN_DIR}/../sentences.txt \
67+
--output_dir=${train_output_path}/test_e2e \
68+
--phones_dict=dump/phone_id_map.txt
69+
# --inference_dir=${train_output_path}/inference
70+
fi
71+
72+
# hifigan
73+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
74+
echo "in hifigan syn_e2e"
75+
FLAGS_allocator_strategy=naive_best_fit \
76+
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
77+
python3 ${BIN_DIR}/../synthesize_e2e.py \
78+
--am=tacotron2_csmsc \
79+
--am_config=${config_path} \
80+
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
81+
--am_stat=dump/train/speech_stats.npy \
82+
--voc=hifigan_csmsc \
83+
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
84+
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
85+
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
86+
--lang=zh \
87+
--text=${BIN_DIR}/../sentences.txt \
88+
--output_dir=${train_output_path}/test_e2e \
89+
--inference_dir=${train_output_path}/inference \
90+
--phones_dict=dump/phone_id_map.txt
91+
fi

examples/csmsc/tts0/local/train.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
3+
config_path=$1
4+
train_output_path=$2
5+
6+
python3 ${BIN_DIR}/train.py \
7+
--train-metadata=dump/train/norm/metadata.jsonl \
8+
--dev-metadata=dump/dev/norm/metadata.jsonl \
9+
--config=${config_path} \
10+
--output-dir=${train_output_path} \
11+
--ngpu=1 \
12+
--phones-dict=dump/phone_id_map.txt

examples/csmsc/tts0/path.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
export MAIN_ROOT=`realpath ${PWD}/../../../`
3+
4+
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
5+
export LC_ALL=C
6+
7+
export PYTHONDONTWRITEBYTECODE=1
8+
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
9+
export PYTHONIOENCODING=UTF-8
10+
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
11+
12+
MODEL=new_tacotron2
13+
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

0 commit comments

Comments
 (0)