
Commit 009dfd4

Merge remote-tracking branch 'origin/main'
2 parents a2e706b + 0973257 commit 009dfd4

4 files changed, +25 -13 lines changed

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -65,7 +65,7 @@ python [export_ckpt.py](export_ckpt.py) --ckpt_path (ckpt path) --save_path …
 
 Because of a pytorch-lightning issue, the actual number of steps during GAN training is half of the displayed step count
 
-If you need to fine-tune the community vocoder weights, it is recommended to use the [ft_hifigan.yaml](configs/ft_hifigan.yaml) config file
+If you need to fine-tune the community vocoder, please use the [ft_hifigan.yaml](configs/ft_hifigan.yaml) config file and specify the weight path with the 'finetune_ckpt_path' option
 
 For how to use the fine-tuning feature, it is recommended to refer to the openvpi/diffsinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
 
@@ -106,11 +106,11 @@ python export_ckpt.py --ckpt_path (your ckpt path) --save_path (output ckpt pat
 # Notes
 The actual number of steps is half of what is displayed
 
-To fine-tune the nsf-hifigan vocoder, unzip the weights from [releases](https://github.com/openvpi/SingingVocoders/releases) into the main directory and use [ft_hifigan.yaml](configs/ft_hifigan.yaml)
+To fine-tune the nsf-hifigan vocoder, download and unzip the weights from [releases](https://github.com/openvpi/SingingVocoders/releases), and set the 'finetune_ckpt_path' option in [ft_hifigan.yaml](configs/ft_hifigan.yaml) to the weight path
 
 For fine-tuning, use 44100 Hz sample-rate audio and do not modify the other mel parameters unless you know exactly what you are doing
 
-For how to use the fine-tuning feature, refer to the openvpi/DiffSinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
+For the other fine-tuning features, refer to the openvpi/DiffSinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
 
 The exported weights can be used in projects such as [DDSP-SVC](https://github.com/yxlllc/DDSP-SVC), [Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC), [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), [DiffSinger (openvpi)](https://github.com/openvpi/DiffSinger) and others
```

README_en.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@ Note that data augmentation may damage the sound quality!
 # Note
 Because of some problems the actual number of steps is half of what it shows
 
-To fine-tune the nsf-hifigan vocoder, please unzip the weights in [releases](https://github.com/openvpi/SingingVocoders/releases) and put them in the main directory, and use [ft_hifigan.yaml](configs%2Fft_hifigan.yaml)
+To fine-tune the nsf-hifigan vocoder, please download and unzip the weights in [releases](https://github.com/openvpi/SingingVocoders/releases), and modify the 'finetune_ckpt_path' item in [ft_hifigan.yaml](configs%2Fft_hifigan.yaml) to the checkpoint file.
 
 
 For fine-tuning please use 44100 Hz samplerate audio and do not modify other mel parameters unless you know exactly what you are doing
```

configs/ft_hifigan.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -142,7 +142,7 @@ seed: 114514
 ###########
 
 finetune_enabled: true
-finetune_ckpt_path: hifi.ckpt
+finetune_ckpt_path: nsf_hifigan_44.1k_hop512_128bin_2024.02.ckpt
 finetune_ignored_params: []
 finetune_strict_shapes: true
```
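For reference, the snippet below is a minimal sketch of how these finetune_* options could be consumed when loading the community vocoder checkpoint. It is not this repository's actual loading code: `build_generator` is a hypothetical placeholder, and the repo's own loader is what really interprets `finetune_strict_shapes`.

```python
import yaml
import torch

with open("configs/ft_hifigan.yaml") as f:
    config = yaml.safe_load(f)

if config["finetune_enabled"]:
    # finetune_ckpt_path names the checkpoint unpacked from the release archive.
    ckpt = torch.load(config["finetune_ckpt_path"], map_location="cpu")
    state_dict = ckpt.get("state_dict", ckpt)

    # finetune_ignored_params lists parameter-name prefixes to skip when loading.
    ignored = config.get("finetune_ignored_params") or []
    state_dict = {k: v for k, v in state_dict.items()
                  if not any(k.startswith(prefix) for prefix in ignored)}

    model = build_generator(config)  # hypothetical factory for the vocoder network
    # strict=False only relaxes key matching; shape checking (finetune_strict_shapes)
    # is handled by the repository's own loading logic, which is not shown here.
    model.load_state_dict(state_dict, strict=False)
```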

training/nsf_HiFigan_task.py

Lines changed: 20 additions & 8 deletions
```diff
@@ -83,13 +83,23 @@ def __getitem__(self, index):
         if random.random() < self.key_aug_prob:
             audio = torch.from_numpy(data['audio'])
             speed = random.uniform(self.config['aug_min'], self.config['aug_max'])
-            audiox = wav_aug(audio, self.config["hop_size"], speed=speed)
-            mel = dynamic_range_compression_torch(self.mel_spec_transform(audiox[None,:]))
-            f0, uv = get_pitch(audio.numpy(), hparams=self.config, speed=speed, interp_uv=True, length=len(mel[0].T))
+            crop_mel_frames = int(np.ceil((self.config['crop_mel_frames'] + 4) * speed))
+            samples_per_frame = self.config['hop_size']
+            crop_wav_samples = crop_mel_frames * samples_per_frame
+            if crop_wav_samples >= audio.shape[0]:
+                return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
+            start = random.randint(0, audio.shape[0] - 1 - crop_wav_samples)
+            end = start + crop_wav_samples
+            audio = audio[start:end]
+            audio_aug = wav_aug(audio, self.config["hop_size"], speed=speed)
+            mel_aug = dynamic_range_compression_torch(self.mel_spec_transform(audio_aug[None,:]))
+            f0, uv = get_pitch(audio.numpy(), hparams=self.config, speed=speed, interp_uv=True, length=mel_aug.shape[-1])
             if f0 is None:
                 return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
-            f0 *= speed
-            return {'f0': f0, 'spectrogram': mel[0].T.numpy(), 'audio': audiox.numpy()}
+            audio_aug = audio_aug[2*samples_per_frame: -2*samples_per_frame].numpy()
+            mel_aug = mel_aug[0, :, 2:-2].T.numpy()
+            f0_aug = f0[2:-2] * speed
+            return {'f0': f0_aug, 'spectrogram': mel_aug, 'audio': audio_aug}
 
         else:
             return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
```
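The added cropping logic above enlarges the crop by 4 extra mel frames and later trims 2 frames from each edge. Below is a small standalone arithmetic check of that bookkeeping; the values of `crop_mel_frames`, `hop_size` and `speed` are illustrative, and it assumes `wav_aug` resamples the clip by the speed factor (consistent with `f0` being multiplied by `speed` afterwards).

```python
import math

# Illustrative values only; the real ones come from the training config.
crop_mel_frames_cfg = 64   # mel frames the collater ultimately crops to
hop_size = 512             # samples per mel frame
speed = 1.25               # key-augmentation factor drawn from [aug_min, aug_max]

# __getitem__ enlarges the crop by 4 frames and by the speed factor up front.
crop_mel_frames = int(math.ceil((crop_mel_frames_cfg + 4) * speed))   # 85
crop_wav_samples = crop_mel_frames * hop_size                          # 43520

# If wav_aug shortens the clip by `speed`, the augmented clip spans about
# crop_mel_frames / speed frames; dropping 2 frames per edge still leaves
# at least crop_mel_frames_cfg frames for the collater to work with.
frames_after_aug = crop_wav_samples / (speed * hop_size)   # 68.0
frames_after_trim = int(frames_after_aug) - 4              # 64
print(crop_mel_frames, crop_wav_samples, frames_after_trim)
```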
```diff
@@ -107,13 +117,15 @@ def collater(self, minibatch):
         for record in minibatch:
 
             # Filter out records that aren't long enough.
-            if len(record['spectrogram']) <= crop_mel_frames:
+            if record['spectrogram'].shape[0] < crop_mel_frames:
                 del record['spectrogram']
                 del record['audio']
                 del record['f0']
                 continue
-
-            start = random.randint(0, record['spectrogram'].shape[0] - 1 - crop_mel_frames)
+            elif record['spectrogram'].shape[0] == crop_mel_frames:
+                start = 0
+            else:
+                start = random.randint(0, record['spectrogram'].shape[0] - 1 - crop_mel_frames)
             end = start + crop_mel_frames
             if self.infer:
                 record['spectrogram'] = record['spectrogram'].T
```
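The collater change mainly fixes the boundary case where a record has exactly `crop_mel_frames` frames, which the old `<=` check discarded. A distilled, standalone version of that branching, for illustration only (not the repo's actual helper):

```python
import random

def pick_crop_start(num_frames: int, crop_mel_frames: int):
    """Illustrative mirror of the collater's crop-start selection."""
    if num_frames < crop_mel_frames:
        return None            # too short: the record is dropped
    if num_frames == crop_mel_frames:
        return 0               # exactly long enough: keep the whole clip
    # Otherwise pick a random start; random.randint's upper bound is inclusive.
    return random.randint(0, num_frames - 1 - crop_mel_frames)

# A 64-frame record is now kept (start 0) instead of being filtered out.
print(pick_crop_start(63, 64), pick_crop_start(64, 64), pick_crop_start(65, 64))
```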
