PaddlePaddle
diff --git a/examples/aishell3/ernie_sat/conf/default.yaml b/examples/aishell3/ernie_sat/conf/default.yaml
Lines changed: 283 additions & 0 deletions
diff --git a/examples/aishell3/ernie_sat/local/preprocess.sh b/examples/aishell3/ernie_sat/local/preprocess.sh
Lines changed: 61 additions & 0 deletions
diff --git a/examples/aishell3/ernie_sat/local/synthesize.sh b/examples/aishell3/ernie_sat/local/synthesize.sh
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,283 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000  # Sampling rate ("sr").
+n_fft: 2048  # FFT size, in samples.
+n_shift: 300  # Hop size, in samples (300 / 24000 = 12.5 ms).
+win_length: 1200  # Window length, in samples (1200 / 24000 = 50 ms).
+# A null win_length falls back to the FFT size (n_fft).
+window: 'hann'  # Window function.
+
+# Used only when feats_type != raw.
+
+fmin: 80  # Lowest frequency of the mel basis.
+fmax: 7600  # Highest frequency of the mel basis.
+n_mels: 80  # Number of mel basis filters.
+
+mean_phn_span: 8  # NOTE(review): presumably the mean length of a masked phoneme span - confirm.
+mlm_prob: 0.8  # NOTE(review): presumably the masking probability used for MLM training - confirm.
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 20  # NOTE(review): presumably the number of samples per batch - confirm against the data loader.
+num_workers: 2  # NOTE(review): presumably the number of data-loading workers - confirm.
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:
+  text_masking: false
+  postnet_layers: 5
+  postnet_filts: 5
+  postnet_chans: 256
+  encoder_type: conformer
+  decoder_type: conformer
+  enc_input_layer: sega_mlm
+  enc_pre_speech_layer: 0
+  enc_cnn_module_kernel: 7  # The decoder uses a larger kernel (dec_cnn_module_kernel: 31).
+  enc_attention_dim: 384  # Same value as dec_attention_dim and scheduler_params.d_model.
+  enc_attention_heads: 2
+  enc_linear_units: 1536
+  enc_num_blocks: 4
+  enc_dropout_rate: 0.2
+  enc_positional_dropout_rate: 0.2
+  enc_attention_dropout_rate: 0.2
+  enc_normalize_before: true  # NOTE(review): no dec_normalize_before key is set in this file - confirm that is intended.
+  enc_macaron_style: true
+  enc_use_cnn_module: true
+  enc_selfattention_layer_type: legacy_rel_selfattn
+  enc_activation_type: swish
+  enc_pos_enc_layer_type: legacy_rel_pos
+  enc_positionwise_layer_type: conv1d
+  enc_positionwise_conv_kernel_size: 3
+  dec_cnn_module_kernel: 31
+  dec_attention_dim: 384
+  dec_attention_heads: 2
+  dec_linear_units: 1536
+  dec_num_blocks: 4
+  dec_dropout_rate: 0.2
+  dec_positional_dropout_rate: 0.2
+  dec_attention_dropout_rate: 0.2
+  dec_macaron_style: true
+  dec_use_cnn_module: true
+  dec_selfattention_layer_type: legacy_rel_selfattn
+  dec_activation_type: swish
+  dec_pos_enc_layer_type: legacy_rel_pos
+  dec_positionwise_layer_type: conv1d
+  dec_positionwise_conv_kernel_size: 3
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+scheduler_params:
+  d_model: 384  # Same value as model.enc_attention_dim / model.dec_attention_dim.
+  warmup_steps: 4000
+grad_clip: 1.0  # NOTE(review): presumably a gradient-clipping threshold - confirm how the trainer applies it.
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 1500  # NOTE(review): presumably the upper bound on training epochs - confirm.
+num_snapshots: 50  # NOTE(review): presumably the number of checkpoints kept - confirm.
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 0  # NOTE(review): presumably the random seed - confirm which generators it reaches.
+
+token_list:  # Token vocabulary. NOTE(review): position presumably defines the token id - do not reorder.
+- <blank>
+- <unk>
+- d
+- sp
+- sh
+- ii
+- j
+- zh
+- l
+- x
+- b
+- g
+- uu
+- e5
+- h
+- q
+- m
+- i1
+- t
+- z
+- ch
+- f
+- s
+- u4
+- ix4
+- i4
+- 'n'  # Quoted: a bare n is a boolean literal under YAML 1.1 resolution rules.
+- i3
+- iu3
+- vv
+- ian4
+- ix2
+- r
+- e4
+- ai4
+- k
+- ing2
+- a1
+- en2
+- ui4
+- ong1
+- uo3
+- u2
+- u3
+- ao4
+- ee
+- p
+- an1
+- eng2
+- i2
+- in1
+- c
+- ai2
+- ian2
+- e2
+- an4
+- ing4
+- v4
+- ai3
+- a5
+- ian3
+- eng1
+- ong4
+- ang4
+- ian1
+- ing1
+- iy4
+- ao3
+- ang1
+- uo4
+- u1
+- iao4
+- iu4
+- a4
+- van2
+- ie4
+- ang2
+- ou4
+- iang4
+- ix1
+- er4
+- iy1
+- e1
+- en1
+- ui2
+- an3
+- ei4
+- ong2
+- uo1
+- ou3
+- uo2
+- iao1
+- ou1
+- an2
+- uan4
+- ia4
+- ia1
+- ang3
+- v3
+- iu2
+- iao3
+- in4
+- a3
+- ei3
+- iang3
+- v2
+- eng4
+- en3
+- aa
+- uan1
+- v1
+- ao1
+- ve4
+- ie3
+- ai1
+- ing3
+- iang1
+- a2
+- ui1
+- en4
+- en5
+- in3
+- uan3
+- e3
+- ie1
+- ve2
+- ei2
+- in2
+- ix3
+- uan2
+- iang2
+- ie2
+- ua4
+- ou2
+- uai4
+- er2
+- eng3
+- uang3
+- un1
+- ong3
+- uang4
+- vn4
+- un2
+- iy3
+- iz4
+- ui3
+- iao2
+- iong4
+- un4
+- van4
+- ao2
+- uang1
+- iy5
+- o2
+- ei1
+- ua1
+- iu1
+- uang2
+- er5
+- o1
+- un3
+- vn1
+- vn2
+- o4
+- ve1
+- van3
+- ua2
+- er3
+- iong3
+- van1
+- ia2
+- iy2
+- ia3
+- iong1
+- uo5
+- oo
+- ve3
+- ou5
+- uai3
+- ian5
+- iong2
+- uai2
+- uai1
+- ua3
+- vn3
+- ia5
+- ie5
+- ueng1
+- o5
+- o3
+- iang5
+- ei5
+- <sos/eos>
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Usage: preprocess.sh CONFIG_PATH
+stage=0          # Stage blocks numbered below this value are skipped.
+stop_stage=100   # Stage blocks numbered above this value are skipped.
+# Fail fast on a missing argument instead of passing an empty --config= to the Python tools.
+config_path="${1:?usage: $0 CONFIG_PATH}"
+
+if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
+    # Stage 0: get durations from MFA's result. Expansions are quoted so paths with spaces survive.
+    echo "Generate durations.txt from MFA results ..."
+    python3 "${MAIN_ROOT}/utils/gen_duration_from_textgrid.py" \
+        --inputdir=./aishell3_alignment_tone \
+        --output durations.txt \
+        --config="${config_path}"
+fi
+
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    # Stage 1: extract features. ${HOME} replaces "~" because bash does not tilde-expand inside --opt=~/path.
+    echo "Extract features ..."
+    python3 "${BIN_DIR}/preprocess.py" \
+        --dataset=aishell3 \
+        --rootdir="${HOME}/datasets/data_aishell3/" \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
+        --config="${config_path}" \
+        --num-cpu=20 \
+        --cut-sil=True
+fi
+
+if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
+    # Stage 2: get the features' stats (mean and std) for the "speech" field of the training metadata.
+    echo "Get features' stats ..."
+    python3 "${MAIN_ROOT}/utils/compute_statistics.py" \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="speech"
+fi
+
+if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
+    # Stage 3: normalize and convert phone/speaker to id; dev and test reuse the train stats file.
+    echo "Normalize ..."
+    python3 "${BIN_DIR}/normalize.py" \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 "${BIN_DIR}/normalize.py" \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 "${BIN_DIR}/normalize.py" \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+fi
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=1
+stop_stage=1
+
+# Stage 0: synthesize with the pwgan_aishell3 vocoder. Uses the same --output_dir as the hifigan stage.
+if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 "${BIN_DIR}/synthesize.py" \
+        --erniesat_config="${config_path}" \
+        --erniesat_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
+        --erniesat_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_aishell3 \
+        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir="${train_output_path}/test" \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# Stage 1: synthesize with the hifigan_aishell3 vocoder. Uses the same --output_dir as the pwgan stage.
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 "${BIN_DIR}/synthesize.py" \
+        --erniesat_config="${config_path}" \
+        --erniesat_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
+        --erniesat_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_aishell3 \
+        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir="${train_output_path}/test" \
+        --phones_dict=dump/phone_id_map.txt
+fi