
Commit 009dfd4

Merge remote-tracking branch 'origin/main'
2 parents a2e706b + 0973257 commit 009dfd4

4 files changed, +25 -13 lines changed

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -65,7 +65,7 @@ python [export_ckpt.py](export_ckpt.py) --ckpt_path (ckpt path) --save_path …
 
 Because of a pytorch-lightning issue, the actual number of steps during GAN training is half of the displayed step count
 
-If you need to fine-tune the community vocoder weights, it is recommended to use the [ft_hifigan.yaml](configs/ft_hifigan.yaml) config file
+If you need to fine-tune the community vocoder, please use the [ft_hifigan.yaml](configs/ft_hifigan.yaml) config file and specify the weight path with the 'finetune_ckpt_path' option
 
 For how to use the fine-tuning feature, it is recommended to refer to the openvpi/diffsinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
 
@@ -106,11 +106,11 @@ python export_ckpt.py --ckpt_path (your ckpt path) --save_path (output ckpt pat
 # Notes
 The actual number of steps is half of what is displayed
 
-To fine-tune the nsf-hifigan vocoder, unzip the weights from [releases](https://github.com/openvpi/SingingVocoders/releases) into the main directory and use [ft_hifigan.yaml](configs/ft_hifigan.yaml)
+To fine-tune the nsf-hifigan vocoder, download and unzip the weights from [releases](https://github.com/openvpi/SingingVocoders/releases), and set the 'finetune_ckpt_path' option in [ft_hifigan.yaml](configs/ft_hifigan.yaml) to the weight path
 
 For fine-tuning, use 44100 Hz sample-rate audio and do not modify the other mel parameters unless you know exactly what you are doing
 
-For how to use the fine-tuning feature, refer to the openvpi/DiffSinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
+For the other fine-tuning features, refer to the openvpi/DiffSinger [project documentation](https://github.com/openvpi/DiffSinger/blob/main/docs/BestPractices.md#fine-tuning-and-parameter-freezing)
 
 The exported weights can be used in projects such as [DDSP-SVC](https://github.com/yxlllc/DDSP-SVC), [Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC), [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), [DiffSinger (openvpi)](https://github.com/openvpi/DiffSinger) and others
```

README_en.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@ Note that data augmentation may damage the sound quality!
 # Note
 Because of some problems the actual number of steps is half of what it shows
 
-To fine-tune the nsf-hifigan vocoder, please unzip the weights in [releases](https://github.com/openvpi/SingingVocoders/releases) and put them in the main directory, and use [ft_hifigan.yaml](configs%2Fft_hifigan.yaml)
+To fine-tune the nsf-hifigan vocoder, please download and unzip the weights in [releases](https://github.com/openvpi/SingingVocoders/releases), and modify the 'finetune_ckpt_path' item in [ft_hifigan.yaml](configs%2Fft_hifigan.yaml) to the checkpoint file.
 
 
 For fine-tuning please use 44100 Hz samplerate audio and do not modify other mel parameters unless you know exactly what you are doing
```

configs/ft_hifigan.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -142,7 +142,7 @@ seed: 114514
 ###########
 
 finetune_enabled: true
-finetune_ckpt_path: hifi.ckpt
+finetune_ckpt_path: nsf_hifigan_44.1k_hop512_128bin_2024.02.ckpt
 finetune_ignored_params: []
 finetune_strict_shapes: true
```
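For reference, the snippet below is a minimal sketch of how these finetune_* options could be consumed when loading the community vocoder checkpoint. It is not this repository's actual loading code: `build_generator` is a hypothetical placeholder, and the repo's own loader is what really interprets `finetune_strict_shapes`.

```python
import yaml
import torch

with open("configs/ft_hifigan.yaml") as f:
    config = yaml.safe_load(f)

if config["finetune_enabled"]:
    # finetune_ckpt_path names the checkpoint unpacked from the release archive.
    ckpt = torch.load(config["finetune_ckpt_path"], map_location="cpu")
    state_dict = ckpt.get("state_dict", ckpt)

    # finetune_ignored_params lists parameter-name prefixes to skip when loading.
    ignored = config.get("finetune_ignored_params") or []
    state_dict = {k: v for k, v in state_dict.items()
                  if not any(k.startswith(prefix) for prefix in ignored)}

    model = build_generator(config)  # hypothetical factory for the vocoder network
    # strict=False only relaxes key matching; shape checking (finetune_strict_shapes)
    # is handled by the repository's own loading logic, which is not shown here.
    model.load_state_dict(state_dict, strict=False)
```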

training/nsf_HiFigan_task.py

Lines changed: 20 additions & 8 deletions
```diff
@@ -83,13 +83,23 @@ def __getitem__(self, index):
         if random.random() < self.key_aug_prob:
             audio = torch.from_numpy(data['audio'])
             speed = random.uniform(self.config['aug_min'], self.config['aug_max'])
-            audiox = wav_aug(audio, self.config["hop_size"], speed=speed)
-            mel = dynamic_range_compression_torch(self.mel_spec_transform(audiox[None,:]))
-            f0, uv = get_pitch(audio.numpy(), hparams=self.config, speed=speed, interp_uv=True, length=len(mel[0].T))
+            crop_mel_frames = int(np.ceil((self.config['crop_mel_frames'] + 4) * speed))
+            samples_per_frame = self.config['hop_size']
+            crop_wav_samples = crop_mel_frames * samples_per_frame
+            if crop_wav_samples >= audio.shape[0]:
+                return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
+            start = random.randint(0, audio.shape[0] - 1 - crop_wav_samples)
+            end = start + crop_wav_samples
+            audio = audio[start:end]
+            audio_aug = wav_aug(audio, self.config["hop_size"], speed=speed)
+            mel_aug = dynamic_range_compression_torch(self.mel_spec_transform(audio_aug[None,:]))
+            f0, uv = get_pitch(audio.numpy(), hparams=self.config, speed=speed, interp_uv=True, length=mel_aug.shape[-1])
             if f0 is None:
                 return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
-            f0 *= speed
-            return {'f0': f0, 'spectrogram': mel[0].T.numpy(), 'audio': audiox.numpy()}
+            audio_aug = audio_aug[2*samples_per_frame: -2*samples_per_frame].numpy()
+            mel_aug = mel_aug[0, :, 2:-2].T.numpy()
+            f0_aug = f0[2:-2] * speed
+            return {'f0': f0_aug, 'spectrogram': mel_aug, 'audio': audio_aug}
 
         else:
             return {'f0': data['f0'], 'spectrogram': data['mel'], 'audio': data['audio']}
```
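The added cropping logic above enlarges the crop by 4 extra mel frames and later trims 2 frames from each edge. Below is a small standalone arithmetic check of that bookkeeping; the values of `crop_mel_frames`, `hop_size` and `speed` are illustrative, and it assumes `wav_aug` resamples the clip by the speed factor (consistent with `f0` being multiplied by `speed` afterwards).

```python
import math

# Illustrative values only; the real ones come from the training config.
crop_mel_frames_cfg = 64   # mel frames the collater ultimately crops to
hop_size = 512             # samples per mel frame
speed = 1.25               # key-augmentation factor drawn from [aug_min, aug_max]

# __getitem__ enlarges the crop by 4 frames and by the speed factor up front.
crop_mel_frames = int(math.ceil((crop_mel_frames_cfg + 4) * speed))   # 85
crop_wav_samples = crop_mel_frames * hop_size                          # 43520

# If wav_aug shortens the clip by `speed`, the augmented clip spans about
# crop_mel_frames / speed frames; dropping 2 frames per edge still leaves
# at least crop_mel_frames_cfg frames for the collater to work with.
frames_after_aug = crop_wav_samples / (speed * hop_size)   # 68.0
frames_after_trim = int(frames_after_aug) - 4              # 64
print(crop_mel_frames, crop_wav_samples, frames_after_trim)
```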
```diff
@@ -107,13 +117,15 @@ def collater(self, minibatch):
         for record in minibatch:
 
             # Filter out records that aren't long enough.
-            if len(record['spectrogram']) <= crop_mel_frames:
+            if record['spectrogram'].shape[0] < crop_mel_frames:
                 del record['spectrogram']
                 del record['audio']
                 del record['f0']
                 continue
-
-            start = random.randint(0, record['spectrogram'].shape[0] - 1 - crop_mel_frames)
+            elif record['spectrogram'].shape[0] == crop_mel_frames:
+                start = 0
+            else:
+                start = random.randint(0, record['spectrogram'].shape[0] - 1 - crop_mel_frames)
             end = start + crop_mel_frames
             if self.infer:
                 record['spectrogram'] = record['spectrogram'].T
```
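The collater change mainly fixes the boundary case where a record has exactly `crop_mel_frames` frames, which the old `<=` check discarded. A distilled, standalone version of that branching, for illustration only (not the repo's actual helper):

```python
import random

def pick_crop_start(num_frames: int, crop_mel_frames: int):
    """Illustrative mirror of the collater's crop-start selection."""
    if num_frames < crop_mel_frames:
        return None            # too short: the record is dropped
    if num_frames == crop_mel_frames:
        return 0               # exactly long enough: keep the whole clip
    # Otherwise pick a random start; random.randint's upper bound is inclusive.
    return random.randint(0, num_frames - 1 - crop_mel_frames)

# A 64-frame record is now kept (start 0) instead of being filtered out.
print(pick_crop_start(63, 64), pick_crop_start(64, 64), pick_crop_start(65, 64))
```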
