
Commit b5eca77

Merge branch 'main' into valle_resume

2 parents: 4ad5e5f + 5b71bcf

6 files changed (+46, −8 lines)

config/base.json

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@
         "align_mel_duration": false
     },
     "train": {
-        "ddp": true,
+        "ddp": false,
         "random_seed": 970227,
         "batch_size": 16,
         "max_steps": 1000000,

config/fs2.json

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@
     },
     "train":{
         "batch_size": 16,
+        "max_epoch": 100,
         "sort_sample": true,
         "drop_last": true,
         "group_size": 4,

egs/tts/FastSpeech2/README.md

Lines changed: 29 additions & 3 deletions
@@ -83,6 +83,11 @@ sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]
 
 ## 4. Inference
 
+### Pre-trained FastSpeech 2 and HiFi-GAN Download
+
+We have released a pre-trained Amphion [FastSpeech 2](https://huggingface.co/amphion/fastspeech2_ljspeech) model and a [HiFi-GAN](https://huggingface.co/amphion/hifigan_ljspeech) vocoder trained on LJSpeech. You can download them and generate speech following the inference instructions below.
+
+
 ### Configuration
 
 For inference, you need to specify the following configurations when running `run.sh`:
@@ -96,6 +101,8 @@ For inference, you need to specify the following configurations when running `run.sh`:
 | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
 | `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. |
 | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
+| `--vocoder_dir` | The directory for the vocoder. | "`ckpts/vocoder/hifigan_ljspeech`" |
+
 
 ### Run
 For example, if you want to generate speech of all testing set split from LJSpeech, just run:
@@ -106,7 +113,8 @@ sh egs/tts/FastSpeech2/run.sh --stage 3 \
     --infer_output_dir ckpts/tts/[YourExptName]/result \
     --infer_mode "batch" \
     --infer_dataset "LJSpeech" \
-    --infer_testing_set "test"
+    --infer_testing_set "test" \
+    --vocoder_dir ckpts/vocoder/hifigan_ljspeech/checkpoints
 ```
 
 Or, if you want to generate a single clip of speech from a given text, just run:
@@ -116,10 +124,28 @@ sh egs/tts/FastSpeech2/run.sh --stage 3 \
     --infer_expt_dir ckpts/tts/[YourExptName] \
     --infer_output_dir ckpts/tts/[YourExptName]/result \
     --infer_mode "single" \
-    --infer_text "This is a clip of generated speech with the given text from a TTS model."
+    --infer_text "This is a clip of generated speech with the given text from a TTS model." \
+    --vocoder_dir ckpts/vocoder/hifigan_ljspeech
+```
+
+### Issues and Solutions
+
 ```
+NotImplementedError: Using RTX 3090 or 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which will do this automatically.
+2024-02-24 10:57:49 | INFO | torch.distributed.distributed_c10d | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
+```
+This error is caused by an incompatibility of NVIDIA RTX 3090 and 4000 series GPUs with peer-to-peer (P2P) communication and InfiniBand (IB), which are used for faster inter-GPU communication. It is raised by the Hugging Face `accelerate` library, which facilitates distributed training and inference.
+
+To fix this issue, set the following environment variables in your terminal before running the script:
+```
+export NCCL_P2P_DISABLE=1
+export NCCL_IB_DISABLE=1
+```
+
+### Note
+Extensive logging messages related to `torch._subclasses.fake_tensor` and `torch._dynamo.output_graph` may be observed during inference. We have not found an effective way to suppress these logs, but they do not affect the inference process.
+
 
-We will release a pre-trained FastSpeech2 model trained on LJSpeech. So you can download the pre-trained model and generate speech following the above inference instruction.
 
 
 ```bibtex
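The README additions above reference released checkpoints on Hugging Face. As an illustrative aside, one way to fetch them into the directories used by the README's examples is with git and Git LFS; the target paths below are assumptions (the acoustic-model directory in particular is hypothetical), so adjust them to your setup:

```
# Git LFS is required to pull the actual model weights from Hugging Face.
git lfs install

# Checkpoints referenced in the README above; target directories are illustrative.
git clone https://huggingface.co/amphion/fastspeech2_ljspeech ckpts/tts/fastspeech2_ljspeech
git clone https://huggingface.co/amphion/hifigan_ljspeech ckpts/vocoder/hifigan_ljspeech
```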
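The NCCL workaround documented in the new "Issues and Solutions" section can also be scoped to a single run by prefixing the command instead of exporting the variables, e.g. for the batch-inference example:

```
NCCL_P2P_DISABLE=1 NCCL_IB_DISABLE=1 sh egs/tts/FastSpeech2/run.sh --stage 3 \
    --infer_expt_dir ckpts/tts/[YourExptName] \
    --infer_output_dir ckpts/tts/[YourExptName]/result \
    --infer_mode "batch" \
    --infer_dataset "LJSpeech" \
    --infer_testing_set "test" \
    --vocoder_dir ckpts/vocoder/hifigan_ljspeech/checkpoints
```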

egs/tts/FastSpeech2/exp_config.json

Lines changed: 1 addition & 0 deletions
@@ -17,5 +17,6 @@
     },
     "train": {
         "batch_size": 16,
+        "max_epoch": 100,
     }
 }

egs/tts/FastSpeech2/run.sh

Lines changed: 9 additions & 4 deletions
@@ -21,7 +21,7 @@ echo $mfa_dir
 
 ######## Parse the Given Parameters from the Commond ###########
 # options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
-options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")
+options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage:,vocoder_dir: -- "$@")
 eval set -- "$options"
 
 while true; do
@@ -47,6 +47,8 @@ while true; do
     --infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
     # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single".
     --infer_text) shift; infer_text=$1 ; shift ;;
+    # [Only for Inference] The directory of the vocoder.
+    --vocoder_dir) shift; vocoder_dir=$1 ; shift ;;
 
     --) shift ; break ;;
     *) echo "Invalid option: $1" exit 1 ;;
@@ -104,6 +106,11 @@ if [ $running_stage -eq 3 ]; then
     if [ -z "$infer_output_dir" ]; then
         infer_output_dir="$expt_dir/result"
     fi
+
+    if [ -z "$vocoder_dir" ]; then
+        echo "[Error] Please specify the vocoder directory to reconstruct the waveform from the mel spectrogram."
+        exit 1
+    fi
 
     if [ -z "$infer_mode" ]; then
         echo "[Error] Please specify the inference mode, e.g., "batch", "single""
@@ -143,8 +150,6 @@ if [ $running_stage -eq 3 ]; then
         --testing_set $infer_testing_set \
         --text "$infer_text" \
         --log_level debug \
-        --vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
-
-
+        --vocoder_dir $vocoder_dir
 
 fi
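For readers following the `run.sh` changes, here is a minimal, self-contained sketch of the `getopt` long-option pattern this commit extends, reduced to the two relevant options plus the new required-argument check; the option handling mirrors the script, everything else is illustrative:

```
#!/bin/bash
# Sketch of the long-option parsing pattern used in run.sh (GNU getopt).
options=$(getopt -o "" --long vocoder_dir:,stage: -- "$@")
eval set -- "$options"

while true; do
    case $1 in
        # Each long option consumes its flag, then its value.
        --vocoder_dir) shift; vocoder_dir=$1; shift ;;
        --stage) shift; running_stage=$1; shift ;;
        --) shift; break ;;
        *) echo "Invalid option: $1"; exit 1 ;;
    esac
done

# Fail fast when the now-required vocoder directory is missing, as the commit does.
if [ -z "$vocoder_dir" ]; then
    echo "[Error] Please specify the vocoder directory."
    exit 1
fi

echo "stage=$running_stage, vocoder_dir=$vocoder_dir"
```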

egs/tts/VITS/README.md

Lines changed: 5 additions & 0 deletions
@@ -143,6 +143,11 @@ Here are some example scenarios to better understand how to use these arguments:
 
 ## 4. Inference
 
+### Pre-trained Model Download
+
+We have released a pre-trained Amphion VITS model trained on LJSpeech. You can download it [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the inference instructions below.
+
+
 ### Configuration
 
 For inference, you need to specify the following configurations when running `run.sh`:
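As with the FastSpeech 2 recipe above, the released VITS checkpoint can presumably be fetched with git and Git LFS; the target directory here is again an assumption:

```
git lfs install
git clone https://huggingface.co/amphion/vits-ljspeech ckpts/tts/vits-ljspeech
```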
