Skip to content

Commit 276cfa0

Browse files
authored
Merge pull request #925 from FunAudioLLM/dev/lyuxiang.lx
fix pitch computation
2 parents c6c3f27 + 190840b commit 276cfa0

File tree

5 files changed

+19
-14
lines changed

5 files changed

+19
-14
lines changed

cosyvoice/dataset/processor.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import torchaudio
2121
from torch.nn.utils.rnn import pad_sequence
2222
import torch.nn.functional as F
23+
import pyworld as pw
2324

2425

2526
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
@@ -178,7 +179,7 @@ def compute_fbank(data,
178179
yield sample
179180

180181

181-
def compute_f0(data, pitch_extractor, mode='train'):
182+
def compute_f0(data, sample_rate, hop_size, mode='train'):
182183
""" Extract f0
183184
184185
Args:
@@ -187,15 +188,19 @@ def compute_f0(data, pitch_extractor, mode='train'):
187188
Returns:
188189
Iterable[{key, feat, label}]
189190
"""
191+
frame_period = hop_size * 1000 / sample_rate
190192
for sample in data:
191193
assert 'sample_rate' in sample
192194
assert 'speech' in sample
193195
assert 'utt' in sample
194196
assert 'text_token' in sample
195197
waveform = sample['speech']
196-
mat = pitch_extractor(waveform).transpose(1, 2)
197-
mat = F.interpolate(mat, size=sample['speech_feat'].shape[0], mode='linear')
198-
sample['pitch_feat'] = mat[0, 0]
198+
_f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
199+
if sum(_f0 != 0) < 5: # this happens when the algorithm fails
200+
_f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period) # if harvest fails, try dio
201+
f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
202+
f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
203+
sample['pitch_feat'] = f0
199204
yield sample
200205

201206

cosyvoice/utils/mask.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
# limitations under the License.
1616

1717
import torch
18+
from cosyvoice.utils.file_utils import logging
1819
'''
1920
def subsequent_mask(
2021
size: int,
@@ -230,6 +231,10 @@ def add_optional_chunk_mask(xs: torch.Tensor,
230231
chunk_masks = masks & chunk_masks # (B, L, L)
231232
else:
232233
chunk_masks = masks
234+
assert chunk_masks.dtype == torch.bool
235+
if (chunk_masks.sum(dim=-1) == 0).sum().item() != 0:
236+
logging.warning('get chunk_masks all false at some timestep, force set to true, make sure they are masked in futuer computation!')
237+
chunk_masks[chunk_masks.sum(dim=-1)==0] = True
233238
return chunk_masks
234239

235240

examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -183,12 +183,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
183183
center: False
184184
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
185185
feat_extractor: !ref <feat_extractor>
186-
pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
187-
sample_rate: !ref <sample_rate>
188-
frame_length: 46.4 # match feat_extractor win_size/sampling_rate
189-
frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
190186
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
191-
pitch_extractor: !ref <pitch_extractor>
187+
sample_rate: !ref <sample_rate>
188+
hop_size: 256
192189
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
193190
normalize: True
194191
shuffle: !name:cosyvoice.dataset.processor.shuffle

examples/libritts/cosyvoice/conf/cosyvoice.yaml

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -183,12 +183,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
183183
center: False
184184
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
185185
feat_extractor: !ref <feat_extractor>
186-
pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
187-
sample_rate: !ref <sample_rate>
188-
frame_length: 46.4 # match feat_extractor win_size/sampling_rate
189-
frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
190186
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
191-
pitch_extractor: !ref <pitch_extractor>
187+
sample_rate: !ref <sample_rate>
188+
hop_size: 256
192189
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
193190
normalize: True
194191
shuffle: !name:cosyvoice.dataset.processor.shuffle

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'windows'
2222
openai-whisper==20231117
2323
protobuf==4.25
2424
pydantic==2.7.0
25+
pyworld==0.3.4
2526
rich==13.7.1
2627
soundfile==0.12.1
2728
tensorboard==2.14.0

0 commit comments

Comments
 (0)