Skip to content

Commit 1f128a0

Browse files
authored
Merge pull request #2117 from yt605155624/ernie_sat_trainer
[TTS]add ernie sat trainer
2 parents 663cfc0 + 1bf78fa commit 1f128a0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+4992
-91
lines changed
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
###########################################################
2+
# FEATURE EXTRACTION SETTING #
3+
###########################################################
4+
5+
fs: 24000 # Sampling rate (Hz).
6+
n_fft: 2048 # FFT size (samples).
7+
n_shift: 300 # Hop size (samples). 12.5ms
8+
win_length: 1200 # Window length (samples). 50ms
9+
# If set to null, it will be the same as n_fft.
10+
window: "hann" # Window function.
11+
12+
# Only used for feats_type != raw
13+
14+
fmin: 80 # Minimum frequency of Mel basis.
15+
fmax: 7600 # Maximum frequency of Mel basis.
16+
n_mels: 80 # The number of mel basis.
17+
18+
mean_phn_span: 8
19+
mlm_prob: 0.8
20+
21+
###########################################################
22+
# DATA SETTING #
23+
###########################################################
24+
batch_size: 20
25+
num_workers: 2
26+
27+
###########################################################
28+
# MODEL SETTING #
29+
###########################################################
30+
model:
31+
text_masking: false
32+
postnet_layers: 5
33+
postnet_filts: 5
34+
postnet_chans: 256
35+
encoder_type: conformer
36+
decoder_type: conformer
37+
enc_input_layer: sega_mlm
38+
enc_pre_speech_layer: 0
39+
enc_cnn_module_kernel: 7
40+
enc_attention_dim: 384
41+
enc_attention_heads: 2
42+
enc_linear_units: 1536
43+
enc_num_blocks: 4
44+
enc_dropout_rate: 0.2
45+
enc_positional_dropout_rate: 0.2
46+
enc_attention_dropout_rate: 0.2
47+
enc_normalize_before: true
48+
enc_macaron_style: true
49+
enc_use_cnn_module: true
50+
enc_selfattention_layer_type: legacy_rel_selfattn
51+
enc_activation_type: swish
52+
enc_pos_enc_layer_type: legacy_rel_pos
53+
enc_positionwise_layer_type: conv1d
54+
enc_positionwise_conv_kernel_size: 3
55+
dec_cnn_module_kernel: 31
56+
dec_attention_dim: 384
57+
dec_attention_heads: 2
58+
dec_linear_units: 1536
59+
dec_num_blocks: 4
60+
dec_dropout_rate: 0.2
61+
dec_positional_dropout_rate: 0.2
62+
dec_attention_dropout_rate: 0.2
63+
dec_macaron_style: true
64+
dec_use_cnn_module: true
65+
dec_selfattention_layer_type: legacy_rel_selfattn
66+
dec_activation_type: swish
67+
dec_pos_enc_layer_type: legacy_rel_pos
68+
dec_positionwise_layer_type: conv1d
69+
dec_positionwise_conv_kernel_size: 3
70+
71+
###########################################################
72+
# OPTIMIZER SETTING #
73+
###########################################################
74+
scheduler_params:
75+
d_model: 384
76+
warmup_steps: 4000
77+
grad_clip: 1.0
78+
79+
###########################################################
80+
# TRAINING SETTING #
81+
###########################################################
82+
max_epoch: 1500
83+
num_snapshots: 50
84+
85+
###########################################################
86+
# OTHER SETTING #
87+
###########################################################
88+
seed: 0
89+
90+
token_list:
91+
- <blank>
92+
- <unk>
93+
- d
94+
- sp
95+
- sh
96+
- ii
97+
- j
98+
- zh
99+
- l
100+
- x
101+
- b
102+
- g
103+
- uu
104+
- e5
105+
- h
106+
- q
107+
- m
108+
- i1
109+
- t
110+
- z
111+
- ch
112+
- f
113+
- s
114+
- u4
115+
- ix4
116+
- i4
117+
- n
118+
- i3
119+
- iu3
120+
- vv
121+
- ian4
122+
- ix2
123+
- r
124+
- e4
125+
- ai4
126+
- k
127+
- ing2
128+
- a1
129+
- en2
130+
- ui4
131+
- ong1
132+
- uo3
133+
- u2
134+
- u3
135+
- ao4
136+
- ee
137+
- p
138+
- an1
139+
- eng2
140+
- i2
141+
- in1
142+
- c
143+
- ai2
144+
- ian2
145+
- e2
146+
- an4
147+
- ing4
148+
- v4
149+
- ai3
150+
- a5
151+
- ian3
152+
- eng1
153+
- ong4
154+
- ang4
155+
- ian1
156+
- ing1
157+
- iy4
158+
- ao3
159+
- ang1
160+
- uo4
161+
- u1
162+
- iao4
163+
- iu4
164+
- a4
165+
- van2
166+
- ie4
167+
- ang2
168+
- ou4
169+
- iang4
170+
- ix1
171+
- er4
172+
- iy1
173+
- e1
174+
- en1
175+
- ui2
176+
- an3
177+
- ei4
178+
- ong2
179+
- uo1
180+
- ou3
181+
- uo2
182+
- iao1
183+
- ou1
184+
- an2
185+
- uan4
186+
- ia4
187+
- ia1
188+
- ang3
189+
- v3
190+
- iu2
191+
- iao3
192+
- in4
193+
- a3
194+
- ei3
195+
- iang3
196+
- v2
197+
- eng4
198+
- en3
199+
- aa
200+
- uan1
201+
- v1
202+
- ao1
203+
- ve4
204+
- ie3
205+
- ai1
206+
- ing3
207+
- iang1
208+
- a2
209+
- ui1
210+
- en4
211+
- en5
212+
- in3
213+
- uan3
214+
- e3
215+
- ie1
216+
- ve2
217+
- ei2
218+
- in2
219+
- ix3
220+
- uan2
221+
- iang2
222+
- ie2
223+
- ua4
224+
- ou2
225+
- uai4
226+
- er2
227+
- eng3
228+
- uang3
229+
- un1
230+
- ong3
231+
- uang4
232+
- vn4
233+
- un2
234+
- iy3
235+
- iz4
236+
- ui3
237+
- iao2
238+
- iong4
239+
- un4
240+
- van4
241+
- ao2
242+
- uang1
243+
- iy5
244+
- o2
245+
- ei1
246+
- ua1
247+
- iu1
248+
- uang2
249+
- er5
250+
- o1
251+
- un3
252+
- vn1
253+
- vn2
254+
- o4
255+
- ve1
256+
- van3
257+
- ua2
258+
- er3
259+
- iong3
260+
- van1
261+
- ia2
262+
- iy2
263+
- ia3
264+
- iong1
265+
- uo5
266+
- oo
267+
- ve3
268+
- ou5
269+
- uai3
270+
- ian5
271+
- iong2
272+
- uai2
273+
- uai1
274+
- ua3
275+
- vn3
276+
- ia5
277+
- ie5
278+
- ueng1
279+
- o5
280+
- o3
281+
- iang5
282+
- ei5
283+
- <sos/eos>
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/bin/bash
# Data preparation pipeline (AISHELL-3).
#
# Usage: preprocess.sh <config_path>
#
# MAIN_ROOT and BIN_DIR are read from the environment; this script does not
# define them, so the caller must export them beforehand.
#
# Stages (selected with stage / stop_stage below):
#   0: generate durations.txt from the MFA alignment results
#   1: extract features into ./dump
#   2: compute statistics of the "speech" field on the training set
#   3: normalize train/dev/test using the training-set statistics

stage=0
stop_stage=100

config_path=$1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 "${MAIN_ROOT}/utils/gen_duration_from_textgrid.py" \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config="${config_path}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    # Bash does not tilde-expand inside `--opt=~/path` (the word is not a
    # valid variable assignment), so the home directory is spelled via HOME
    # to hand the program a real absolute path.
    python3 "${BIN_DIR}/preprocess.py" \
        --dataset=aishell3 \
        --rootdir="${HOME}/datasets/data_aishell3/" \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config="${config_path}" \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 "${MAIN_ROOT}/utils/compute_statistics.py" \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # normalize and convert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
    # Same invocation for every subset, in the original order; only the
    # metadata/dump paths differ. The stats always come from the train split.
    for subset in train dev test; do
        python3 "${BIN_DIR}/normalize.py" \
            --metadata="dump/${subset}/raw/metadata.jsonl" \
            --dumpdir="dump/${subset}/norm" \
            --speech-stats=dump/train/speech_stats.npy \
            --phones-dict=dump/phone_id_map.txt \
            --speaker-dict=dump/speaker_id_map.txt
    done
fi
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
# Synthesize the test set with a trained checkpoint and a vocoder.
#
# Usage: synthesize.sh <config_path> <train_output_path> <ckpt_name>
#   config_path        passed as --erniesat_config
#   train_output_path  directory holding checkpoints/; results go to <dir>/test
#   ckpt_name          checkpoint file name inside <train_output_path>/checkpoints
#
# BIN_DIR is read from the environment; this script does not define it.
#
# Stages:
#   0: vocoder = pwgan_aishell3
#   1: vocoder = hifigan_aishell3
# With the defaults below (stage=1, stop_stage=1) only stage 1 runs.
# NOTE: both stages write to the same --output_dir.

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=1
stop_stage=1

# pwgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # The FLAGS_* assignments are exported only for this python3 invocation
    # (presumably Paddle runtime flags -- confirm against the framework docs).
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/synthesize.py" \
        --erniesat_config="${config_path}" \
        --erniesat_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --erniesat_stat=dump/train/speech_stats.npy \
        --voc=pwgan_aishell3 \
        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --phones_dict=dump/phone_id_map.txt
fi

# hifigan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # The FLAGS_* assignments are exported only for this python3 invocation
    # (presumably Paddle runtime flags -- confirm against the framework docs).
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/synthesize.py" \
        --erniesat_config="${config_path}" \
        --erniesat_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --erniesat_stat=dump/train/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --phones_dict=dump/phone_id_map.txt
fi

0 commit comments

Comments
 (0)