diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 64ed8385c..3d7d4c591 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -46,6 +46,7 @@ energy_smooth_width: 0.12
 breathiness_smooth_width: 0.12
 voicing_smooth_width: 0.12
 tension_smooth_width: 0.12
+falsetto_smooth_width: 0.12
 
 use_lang_id: false
 num_lang: 1
@@ -55,6 +56,7 @@ use_energy_embed: false
 use_breathiness_embed: false
 use_voicing_embed: false
 use_tension_embed: false
+use_falsetto_embed: false
 use_key_shift_embed: false
 use_speed_embed: false
 
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
index 59778df99..46202ab96 100644
--- a/configs/templates/config_acoustic.yaml
+++ b/configs/templates/config_acoustic.yaml
@@ -49,6 +49,7 @@ use_energy_embed: false
 use_breathiness_embed: false
 use_voicing_embed: false
 use_tension_embed: false
+use_falsetto_embed: false
 use_key_shift_embed: true
 use_speed_embed: true
 
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
index 7d5b211aa..51447cb7a 100644
--- a/configs/templates/config_variance.yaml
+++ b/configs/templates/config_variance.yaml
@@ -50,6 +50,7 @@ predict_energy: false
 predict_breathiness: false
 predict_voicing: false
 predict_tension: false
+predict_falsetto: false
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -63,6 +64,9 @@ voicing_db_max: -12.0
 tension_logit_min: -10.0
 tension_logit_max: 10.0
 
+falsetto_ratio_min: 0
+falsetto_ratio_max: 1
+
 enc_ffn_kernel_size: 3
 use_rope: true
 hidden_size: 256
@@ -107,11 +111,11 @@ pitch_prediction_args:
 #    strong_cond: true
 
 variances_prediction_args:
-  total_repeat_bins: 48
+  total_repeat_bins: 72
   backbone_type: 'wavenet'
   backbone_args:
     num_layers: 10
-    num_channels: 192
+    num_channels: 256
     dilation_cycle_length: 4
 #  backbone_type: 'lynxnet'
 #  backbone_args:
diff --git a/configs/variance.yaml b/configs/variance.yaml
index 61c508a1b..62e9c60e9 100644
--- a/configs/variance.yaml
+++ b/configs/variance.yaml
@@ -33,6 +33,7 @@ predict_energy: false
 predict_breathiness: false
 predict_voicing: false
 predict_tension: false
+predict_falsetto: false
 
 enc_ffn_kernel_size: 3
 use_rope: true
@@ -86,6 +87,10 @@ tension_logit_min: -10.0
 tension_logit_max: 10.0
 tension_smooth_width: 0.12
 
+falsetto_ratio_min: 0
+falsetto_ratio_max: 1
+falsetto_smooth_width: 0.12
+
 variances_prediction_args:
   total_repeat_bins: 48
   backbone_type: 'wavenet'
diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py
index 8b139f62f..5d62c236e 100644
--- a/inference/ds_acoustic.py
+++ b/inference/ds_acoustic.py
@@ -34,6 +34,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N
                 self.variances_to_embed.add('voicing')
             if hparams.get('use_tension_embed', False):
                 self.variances_to_embed.add('tension')
+            if hparams.get('use_falsetto_embed', False):
+                self.variances_to_embed.add('falsetto_dev')
 
             self.phoneme_dictionary = load_phoneme_dictionary()
             if hparams['use_spk_id']:
diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py
index b6f986bb0..669662c42 100644
--- a/modules/fastspeech/acoustic_encoder.py
+++ b/modules/fastspeech/acoustic_encoder.py
@@ -33,6 +33,7 @@ def __init__(self, vocab_size):
         self.use_breathiness_embed = hparams.get('use_breathiness_embed', False)
         self.use_voicing_embed = hparams.get('use_voicing_embed', False)
         self.use_tension_embed = hparams.get('use_tension_embed', False)
+        self.use_falsetto_embed = hparams.get('use_falsetto_embed', False)
         if self.use_energy_embed:
             self.variance_embed_list.append('energy')
         if self.use_breathiness_embed:
@@ -41,6 +42,8 @@ def __init__(self, vocab_size):
             self.variance_embed_list.append('voicing')
         if self.use_tension_embed:
             self.variance_embed_list.append('tension')
+        if self.use_falsetto_embed:
+            self.variance_embed_list.append('falsetto_dev')
 
         self.use_variance_embeds = len(self.variance_embed_list) > 0
         if self.use_variance_embeds:
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py
index 77ebb8331..ce0016263 100644
--- a/modules/fastspeech/param_adaptor.py
+++ b/modules/fastspeech/param_adaptor.py
@@ -7,7 +7,7 @@ from utils import filter_kwargs
 from utils.hparams import hparams
 
-VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension']
+VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension', 'falsetto_dev']
 
 
 class ParameterAdaptorModule(torch.nn.Module):
@@ -18,6 +18,7 @@ def __init__(self):
         self.predict_breathiness = hparams.get('predict_breathiness', False)
         self.predict_voicing = hparams.get('predict_voicing', False)
         self.predict_tension = hparams.get('predict_tension', False)
+        self.predict_falsetto = hparams.get('predict_falsetto', False)
         if self.predict_energy:
             self.variance_prediction_list.append('energy')
         if self.predict_breathiness:
@@ -26,6 +27,8 @@ def __init__(self):
             self.variance_prediction_list.append('voicing')
         if self.predict_tension:
             self.variance_prediction_list.append('tension')
+        if self.predict_falsetto:
+            self.variance_prediction_list.append('falsetto_dev')
         self.predict_variances = len(self.variance_prediction_list) > 0
 
     def build_adaptor(self, cls=MultiVarianceDiffusion):
@@ -63,6 +66,16 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
                 hparams['tension_logit_max']
             ))
 
+        if self.predict_falsetto:
+            ranges.append((
+                hparams['falsetto_ratio_min'],
+                hparams['falsetto_ratio_max']
+            ))
+            clamps.append((
+                hparams['falsetto_ratio_min'],
+                hparams['falsetto_ratio_max']
+            ))
+
         variances_hparams = hparams['variances_prediction_args']
         total_repeat_bins = variances_hparams['total_repeat_bins']
         assert total_repeat_bins % len(self.variance_prediction_list) == 0, \
diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py
index 0455c4f94..10872deee 100644
--- a/preprocessing/acoustic_binarizer.py
+++ b/preprocessing/acoustic_binarizer.py
@@ -28,6 +28,7 @@
     get_breathiness,
     get_voicing,
     get_tension_base_harmonic,
+    get_falsetto_base_harmonic,
 )
 from utils.decomposed_waveform import DecomposedWaveform
 from utils.hparams import hparams
@@ -44,6 +45,7 @@
     'breathiness',
     'voicing',
     'tension',
+    'falsetto_dev',
     'key_shift',
     'speed',
 ]
@@ -53,6 +55,7 @@
 breathiness_smooth: SinusoidalSmoothingConv1d = None
 voicing_smooth: SinusoidalSmoothingConv1d = None
 tension_smooth: SinusoidalSmoothingConv1d = None
+falsetto_smooth: SinusoidalSmoothingConv1d = None
 
 
 class AcousticBinarizer(BaseBinarizer):
@@ -63,6 +66,7 @@ def __init__(self):
         self.need_breathiness = hparams['use_breathiness_embed']
         self.need_voicing = hparams['use_voicing_embed']
         self.need_tension = hparams['use_tension_embed']
+        self.need_falsetto = hparams['use_falsetto_embed']
         assert hparams['mel_base'] == 'e', (
             "Mel base must be set to \'e\' according to 2nd stage of the migration plan. "
             "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details."
@@ -212,6 +216,21 @@ def process_item(self, item_name, meta_data, binarization_args):
 
             processed_input['tension'] = tension.cpu().numpy()
 
+        if self.need_falsetto:
+            # get ground truth falsetto
+            falsetto = get_falsetto_base_harmonic(
+                dec_waveform, None, None, length=length
+            )
+
+            global falsetto_smooth
+            if falsetto_smooth is None:
+                falsetto_smooth = SinusoidalSmoothingConv1d(
+                    round(hparams['falsetto_smooth_width'] / self.timestep)
+                ).eval().to(self.device)
+            falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0]
+
+            processed_input['falsetto_dev'] = falsetto.cpu().numpy()
+
         if hparams['use_key_shift_embed']:
             processed_input['key_shift'] = 0.
 
diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py
index 84d9ea499..1ce3f3c35 100644
--- a/preprocessing/variance_binarizer.py
+++ b/preprocessing/variance_binarizer.py
@@ -20,6 +20,7 @@
     get_breathiness,
     get_voicing,
     get_tension_base_harmonic,
+    get_falsetto_base_harmonic,
 )
 from utils.decomposed_waveform import DecomposedWaveform
 from utils.hparams import hparams
@@ -48,6 +49,7 @@
     'breathiness',  # frame-level RMS of aperiodic parts (dB), float32[T_s,]
     'voicing',  # frame-level RMS of harmonic parts (dB), float32[T_s,]
     'tension',  # frame-level tension (logit), float32[T_s,]
+    'falsetto_dev',  # frame-level falsetto (ratio), float32[T_s,]
 ]
 
 DS_INDEX_SEP = '#'
@@ -59,6 +61,7 @@
 breathiness_smooth: SinusoidalSmoothingConv1d = None
 voicing_smooth: SinusoidalSmoothingConv1d = None
 tension_smooth: SinusoidalSmoothingConv1d = None
+falsetto_smooth: SinusoidalSmoothingConv1d = None
 
 
 class VarianceBinarizer(BaseBinarizer):
@@ -80,7 +83,8 @@ def __init__(self):
         predict_breathiness = hparams['predict_breathiness']
         predict_voicing = hparams['predict_voicing']
         predict_tension = hparams['predict_tension']
-        self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension
+        predict_falsetto = hparams['predict_falsetto']
+        self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension or predict_falsetto
         self.lr = LengthRegulator().to(self.device)
         self.prefer_ds = self.binarization_args['prefer_ds']
         self.cached_ds = {}
@@ -511,6 +515,37 @@ def process_item(self, item_name, meta_data, binarization_args):
 
             processed_input['tension'] = tension
 
+        # Below: extract falsetto
+        if hparams['predict_falsetto']:
+            falsetto = None
+            falsetto_from_wav = False
+            if self.prefer_ds:
+                falsetto_seq = self.load_attr_from_ds(ds_id, name, 'falsetto', idx=ds_seg_idx)
+                if falsetto_seq is not None:
+                    falsetto = resample_align_curve(
+                        np.array(falsetto_seq.split(), np.float32),
+                        original_timestep=float(self.load_attr_from_ds(
+                            ds_id, name, 'falsetto_timestep', idx=ds_seg_idx
+                        )),
+                        target_timestep=self.timestep,
+                        align_length=length
+                    )
+            if falsetto is None:
+                falsetto = get_falsetto_base_harmonic(
+                    dec_waveform, None, None, length=length
+                )
+                falsetto_from_wav = True
+
+            if falsetto_from_wav:
+                global falsetto_smooth
+                if falsetto_smooth is None:
+                    falsetto_smooth = SinusoidalSmoothingConv1d(
+                        round(hparams['falsetto_smooth_width'] / self.timestep)
+                    ).eval().to(self.device)
+                falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0].cpu().numpy()
+
+            processed_input['falsetto_dev'] = falsetto
+
         return processed_input
 
     def arrange_data_augmentation(self, data_iterator):
diff --git a/training/acoustic_task.py b/training/acoustic_task.py
index ca6a71c65..71c6fc7d5 100644
--- a/training/acoustic_task.py
+++ b/training/acoustic_task.py
@@ -31,6 +31,8 @@ def __init__(self, prefix, preload=False):
             self.required_variances['voicing'] = 0.0
         if hparams['use_tension_embed']:
             self.required_variances['tension'] = 0.0
+        if hparams['use_falsetto_embed']:
+            self.required_variances['falsetto_dev'] = 0.0
 
         self.need_key_shift = hparams['use_key_shift_embed']
         self.need_speed = hparams['use_speed_embed']
@@ -92,6 +94,8 @@ def __init__(self):
             self.required_variances.append('voicing')
         if hparams['use_tension_embed']:
             self.required_variances.append('tension')
+        if hparams['use_falsetto_embed']:
+            self.required_variances.append('falsetto_dev')
         super()._finish_init()
 
     def _build_model(self):
diff --git a/training/variance_task.py b/training/variance_task.py
index 646d9540a..c69dc9af7 100644
--- a/training/variance_task.py
+++ b/training/variance_task.py
@@ -26,7 +26,8 @@ def __init__(self, prefix, preload=False):
         need_breathiness = hparams['predict_breathiness']
         need_voicing = hparams['predict_voicing']
         need_tension = hparams['predict_tension']
-        self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension
+        need_falsetto = hparams['predict_falsetto']
+        self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension or need_falsetto
 
     def collater(self, samples):
         batch = super().collater(samples)
@@ -67,6 +68,8 @@ def collater(self, samples):
             batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0)
         if hparams['predict_tension']:
             batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0)
+        if hparams['predict_falsetto']:
+            batch['falsetto_dev'] = utils.collate_nd([s['falsetto_dev'] for s in samples], 0)
 
         return batch
 
@@ -102,6 +105,7 @@ def __init__(self):
         predict_breathiness = hparams['predict_breathiness']
         predict_voicing = hparams['predict_voicing']
         predict_tension = hparams['predict_tension']
+        predict_falsetto = hparams['predict_falsetto']
         self.variance_prediction_list = []
         if predict_energy:
             self.variance_prediction_list.append('energy')
@@ -111,6 +115,8 @@ def __init__(self):
             self.variance_prediction_list.append('voicing')
         if predict_tension:
             self.variance_prediction_list.append('tension')
+        if predict_falsetto:
+            self.variance_prediction_list.append('falsetto_dev')
         self.predict_variances = len(self.variance_prediction_list) > 0
         self.lambda_var_loss = hparams['lambda_var_loss']
         super()._finish_init()
@@ -180,6 +186,7 @@ def run_model(self, sample, infer=False):
         breathiness = sample.get('breathiness')  # [B, T_s]
         voicing = sample.get('voicing')  # [B, T_s]
         tension = sample.get('tension')  # [B, T_s]
+        falsetto = sample.get('falsetto_dev')  # [B, T_s]
 
         pitch_retake = variance_retake = None
         if (self.predict_pitch or self.predict_variances) and not infer:
@@ -202,7 +209,7 @@ def run_model(self, sample, infer=False):
             note_midi=note_midi, note_rest=note_rest,
             note_dur=note_dur, note_glide=note_glide, mel2note=mel2note,
             base_pitch=base_pitch, pitch=pitch,
-            energy=energy, breathiness=breathiness, voicing=voicing, tension=tension,
+            energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, falsetto_dev=falsetto,
             pitch_retake=pitch_retake, variance_retake=variance_retake,
             spk_id=spk_ids, infer=infer
         )
diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py
index df5216429..b36b2aa42 100644
--- a/utils/binarizer_utils.py
+++ b/utils/binarizer_utils.py
@@ -209,6 +209,45 @@ def get_tension_base_harmonic(
     return tension
 
 
+def get_falsetto_base_harmonic(
+        waveform: Union[np.ndarray, DecomposedWaveform],
+        samplerate, f0, length,
+        *, hop_size=None, fft_size=None, win_size=None
+):
+    """
+    Definition of falsetto: attenuation ratio from the second harmonic (H2) to the fourth harmonic (H4), computed as H2 / (H2 + H4).
+    Refer to: ACOUSTIC MEASURES OF FALSETTO VOICE (DOI:10.1121/1.4877544)
+    :param waveform: All other analysis parameters will not take effect if a DecomposedWaveform is given
+    :param samplerate: sampling rate
+    :param f0: reference f0
+    :param length: expected number of frames
+    :param hop_size: frame width, in number of samples
+    :param fft_size: number of FFT bins
+    :param win_size: window size, in number of samples
+    :return: falsetto
+    """
+    if not isinstance(waveform, DecomposedWaveform):
+        waveform = DecomposedWaveform(
+            waveform=waveform, samplerate=samplerate, f0=f0,
+            hop_size=hop_size, fft_size=fft_size, win_size=win_size
+        )
+    waveform_h2 = waveform.harmonic(1)  # H2
+    waveform_h4 = waveform.harmonic(3)  # H4
+    energy_h2 = get_energy_librosa(
+        waveform_h2, length,
+        hop_size=waveform.hop_size, win_size=waveform.win_size,
+        domain='amplitude'
+    )
+    energy_h4 = get_energy_librosa(
+        waveform_h4, length,
+        hop_size=waveform.hop_size, win_size=waveform.win_size,
+        domain='amplitude'
+    )
+    falsetto = energy_h2 / (energy_h2 + energy_h4 + 1e-5)
+    falsetto = np.clip(falsetto, a_min=0, a_max=1)
+    return falsetto
+
+
 class SinusoidalSmoothingConv1d(torch.nn.Conv1d):
     def __init__(self, kernel_size):
         super().__init__(
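
Note (not part of the patch): `get_falsetto_base_harmonic()` defines the frame-level falsetto measure as the attenuation ratio H2 / (H2 + H4) over the decomposed harmonics, clipped to [0, 1] to match the new `falsetto_ratio_min` / `falsetto_ratio_max` hparams. The standalone sketch below only illustrates that arithmetic on two synthetic sinusoids with made-up amplitudes and a single global RMS per component; it does not use the repository's `DecomposedWaveform` or `get_energy_librosa` helpers, which work frame by frame.

```python
# Illustrative sketch only -- not part of the patch. It mimics the H2 / (H2 + H4)
# definition from get_falsetto_base_harmonic() with plain numpy RMS values,
# using hypothetical harmonic amplitudes instead of DecomposedWaveform output.
import numpy as np

sr = 44100
t = np.arange(sr) / sr
f0 = 220.0

# Hypothetical amplitudes: a falsetto-like tone keeps little energy above H2.
h2 = 0.8 * np.sin(2 * np.pi * 2 * f0 * t)   # second harmonic component
h4 = 0.1 * np.sin(2 * np.pi * 4 * f0 * t)   # fourth harmonic component

rms_h2 = np.sqrt(np.mean(h2 ** 2))
rms_h4 = np.sqrt(np.mean(h4 ** 2))

# Same form as the patch: ratio in [0, 1], with an epsilon guarding against silence.
falsetto = rms_h2 / (rms_h2 + rms_h4 + 1e-5)
print(round(float(falsetto), 3))  # ~0.889 -- approaches 1.0 as H4 weakens
```

With the hparams added in this patch, enabling the feature should presumably come down to setting `use_falsetto_embed: true` in an acoustic config or `predict_falsetto: true` in a variance config, mirroring the existing tension switches.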