Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2b58ef9
Update new variance parameter : falsetto
KakaruHayate Mar 29, 2025
a34ca27
Update new variance parameter : falsetto
KakaruHayate Mar 30, 2025
13406a2
update lynxnet2 backbone
yxlllc Mar 30, 2025
ac05b4d
fix typo
KakaruHayate Apr 2, 2025
de7958e
fix typo
KakaruHayate Apr 2, 2025
51d3d3d
Merge branch 'lynxnet2' into muon_lynxnet2
yxlllc Apr 3, 2025
4a4ee3d
support muon optimizer
yxlllc Apr 3, 2025
f9fda27
optimize
yxlllc Apr 3, 2025
5f7a1be
Merge branch 'main' into muon_lynxnet2
yxlllc Apr 4, 2025
eb3b606
stabilize fp16 training
yxlllc Apr 4, 2025
300676a
stabilize fp16 training
yxlllc Apr 4, 2025
2f0495f
Merge branch 'openvpi:main' into falsetto
KakaruHayate Apr 9, 2025
97cb3e9
Merge pull request #1 from KakaruHayate/falsetto
agentasteriski Apr 14, 2025
82cd534
fix variable typos
agentasteriski Apr 14, 2025
ac94abf
correct variable typo
agentasteriski Apr 14, 2025
53c166c
Merge pull request #18 from agentasteriski/falsetto
KakaruHayate Apr 15, 2025
7d04fe8
change 'falsetto' to 'falsetto_dev'
KakaruHayate Apr 16, 2025
e168058
change 'falsetto' to 'falsetto_dev'
KakaruHayate Apr 16, 2025
01afd71
Change variance model args for more variance embed
KakaruHayate Apr 16, 2025
3ea0861
Change variance model args for more variance embed
KakaruHayate Apr 16, 2025
7e92924
change 'falsetto' to 'falsetto_dev'
KakaruHayate Apr 16, 2025
acf194e
Merge pull request #19 from openvpi/main
KakaruHayate Apr 16, 2025
4f4dece
Merge branch 'muon_dev' into falsetto
KakaruHayate Apr 17, 2025
5169daf
Revert "Merge branch 'muon_dev' into falsetto"
KakaruHayate Apr 17, 2025
8603f7f
Fix when only predict falsetto missing mel2ph
KakaruHayate Apr 22, 2025
0ae9c57
Update config_variance.yaml
KakaruHayate May 29, 2025
2b0969e
retake/inpaint and expr and temperature
KakaruHayate Aug 9, 2025
f50c2ae
Revert "retake/inpaint and expr and temperature"
KakaruHayate Aug 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12
falsetto_smooth_width: 0.12

use_lang_id: false
num_lang: 1
Expand All @@ -55,6 +56,7 @@ use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_falsetto_embed: false
use_key_shift_embed: false
use_speed_embed: false

Expand Down
1 change: 1 addition & 0 deletions configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_falsetto_embed: false

use_key_shift_embed: true
use_speed_embed: true
Expand Down
8 changes: 6 additions & 2 deletions configs/templates/config_variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false
predict_falsetto: false

energy_db_min: -96.0
energy_db_max: -12.0
Expand All @@ -63,6 +64,9 @@ voicing_db_max: -12.0
tension_logit_min: -10.0
tension_logit_max: 10.0

falsetto_ratio_min: 0
falsetto_ratio_max: 1

enc_ffn_kernel_size: 3
use_rope: true
hidden_size: 256
Expand Down Expand Up @@ -107,11 +111,11 @@ pitch_prediction_args:
# strong_cond: true

variances_prediction_args:
total_repeat_bins: 48
total_repeat_bins: 72
backbone_type: 'wavenet'
backbone_args:
num_layers: 10
num_channels: 192
num_channels: 256
dilation_cycle_length: 4
# backbone_type: 'lynxnet'
# backbone_args:
Expand Down
5 changes: 5 additions & 0 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ predict_energy: false
predict_breathiness: false
predict_voicing: false
predict_tension: false
predict_falsetto: false

enc_ffn_kernel_size: 3
use_rope: true
Expand Down Expand Up @@ -86,6 +87,10 @@ tension_logit_min: -10.0
tension_logit_max: 10.0
tension_smooth_width: 0.12

falsetto_ratio_min: 0
falsetto_ratio_max: 1
falsetto_smooth_width: 0.12

variances_prediction_args:
total_repeat_bins: 48
backbone_type: 'wavenet'
Expand Down
2 changes: 2 additions & 0 deletions inference/ds_acoustic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N
self.variances_to_embed.add('voicing')
if hparams.get('use_tension_embed', False):
self.variances_to_embed.add('tension')
if hparams.get('use_falsetto_embed', False):
self.variances_to_embed.add('falsetto_dev')

self.phoneme_dictionary = load_phoneme_dictionary()
if hparams['use_spk_id']:
Expand Down
3 changes: 3 additions & 0 deletions modules/fastspeech/acoustic_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(self, vocab_size):
self.use_breathiness_embed = hparams.get('use_breathiness_embed', False)
self.use_voicing_embed = hparams.get('use_voicing_embed', False)
self.use_tension_embed = hparams.get('use_tension_embed', False)
self.use_falsetto_embed = hparams.get('use_falsetto_embed', False)
if self.use_energy_embed:
self.variance_embed_list.append('energy')
if self.use_breathiness_embed:
Expand All @@ -41,6 +42,8 @@ def __init__(self, vocab_size):
self.variance_embed_list.append('voicing')
if self.use_tension_embed:
self.variance_embed_list.append('tension')
if self.use_falsetto_embed:
self.variance_embed_list.append('falsetto_dev')

self.use_variance_embeds = len(self.variance_embed_list) > 0
if self.use_variance_embeds:
Expand Down
15 changes: 14 additions & 1 deletion modules/fastspeech/param_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from utils import filter_kwargs
from utils.hparams import hparams

VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension']
VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension', 'falsetto_dev']


class ParameterAdaptorModule(torch.nn.Module):
Expand All @@ -18,6 +18,7 @@ def __init__(self):
self.predict_breathiness = hparams.get('predict_breathiness', False)
self.predict_voicing = hparams.get('predict_voicing', False)
self.predict_tension = hparams.get('predict_tension', False)
self.predict_falsetto = hparams.get('predict_falsetto', False)
if self.predict_energy:
self.variance_prediction_list.append('energy')
if self.predict_breathiness:
Expand All @@ -26,6 +27,8 @@ def __init__(self):
self.variance_prediction_list.append('voicing')
if self.predict_tension:
self.variance_prediction_list.append('tension')
if self.predict_falsetto:
self.variance_prediction_list.append('falsetto_dev')
self.predict_variances = len(self.variance_prediction_list) > 0

def build_adaptor(self, cls=MultiVarianceDiffusion):
Expand Down Expand Up @@ -63,6 +66,16 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
hparams['tension_logit_max']
))

if self.predict_falsetto:
ranges.append((
hparams['falsetto_ratio_min'],
hparams['falsetto_ratio_max']
))
clamps.append((
hparams['falsetto_ratio_min'],
hparams['falsetto_ratio_max']
))

variances_hparams = hparams['variances_prediction_args']
total_repeat_bins = variances_hparams['total_repeat_bins']
assert total_repeat_bins % len(self.variance_prediction_list) == 0, \
Expand Down
19 changes: 19 additions & 0 deletions preprocessing/acoustic_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
get_breathiness,
get_voicing,
get_tension_base_harmonic,
get_falsetto_base_harmonic,
)
from utils.decomposed_waveform import DecomposedWaveform
from utils.hparams import hparams
Expand All @@ -44,6 +45,7 @@
'breathiness',
'voicing',
'tension',
'falsetto_dev',
'key_shift',
'speed',
]
Expand All @@ -53,6 +55,7 @@
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None
falsetto_smooth: SinusoidalSmoothingConv1d = None


class AcousticBinarizer(BaseBinarizer):
Expand All @@ -63,6 +66,7 @@ def __init__(self):
self.need_breathiness = hparams['use_breathiness_embed']
self.need_voicing = hparams['use_voicing_embed']
self.need_tension = hparams['use_tension_embed']
self.need_falsetto = hparams['use_falsetto_embed']
assert hparams['mel_base'] == 'e', (
"Mel base must be set to \'e\' according to 2nd stage of the migration plan. "
"See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details."
Expand Down Expand Up @@ -212,6 +216,21 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['tension'] = tension.cpu().numpy()

if self.need_falsetto:
# get ground truth falsetto
falsetto = get_falsetto_base_harmonic(
dec_waveform, None, None, length=length
)

global falsetto_smooth
if falsetto_smooth is None:
falsetto_smooth = SinusoidalSmoothingConv1d(
round(hparams['falsetto_smooth_width'] / self.timestep)
).eval().to(self.device)
falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0]

processed_input['falsetto_dev'] = falsetto.cpu().numpy()

if hparams['use_key_shift_embed']:
processed_input['key_shift'] = 0.

Expand Down
37 changes: 36 additions & 1 deletion preprocessing/variance_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
get_breathiness,
get_voicing,
get_tension_base_harmonic,
get_falsetto_base_harmonic,
)
from utils.decomposed_waveform import DecomposedWaveform
from utils.hparams import hparams
Expand Down Expand Up @@ -48,6 +49,7 @@
'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,]
'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,]
'tension', # frame-level tension (logit), float32[T_s,]
'falsetto_dev', # frame-level falsetto (ratio), float32[T_s,]
]
DS_INDEX_SEP = '#'

Expand All @@ -59,6 +61,7 @@
breathiness_smooth: SinusoidalSmoothingConv1d = None
voicing_smooth: SinusoidalSmoothingConv1d = None
tension_smooth: SinusoidalSmoothingConv1d = None
falsetto_smooth: SinusoidalSmoothingConv1d = None


class VarianceBinarizer(BaseBinarizer):
Expand All @@ -80,7 +83,8 @@ def __init__(self):
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension
predict_falsetto = hparams['predict_falsetto']
self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension or predict_falsetto
self.lr = LengthRegulator().to(self.device)
self.prefer_ds = self.binarization_args['prefer_ds']
self.cached_ds = {}
Expand Down Expand Up @@ -511,6 +515,37 @@ def process_item(self, item_name, meta_data, binarization_args):

processed_input['tension'] = tension

# Below: extract falsetto
if hparams['predict_falsetto']:
falsetto = None
falsetto_from_wav = False
if self.prefer_ds:
falsetto_seq = self.load_attr_from_ds(ds_id, name, 'falsetto', idx=ds_seg_idx)
if falsetto_seq is not None:
falsetto = resample_align_curve(
np.array(falsetto_seq.split(), np.float32),
original_timestep=float(self.load_attr_from_ds(
ds_id, name, 'falsetto_timestep', idx=ds_seg_idx
)),
target_timestep=self.timestep,
align_length=length
)
if falsetto is None:
falsetto = get_falsetto_base_harmonic(
dec_waveform, None, None, length=length
)
falsetto_from_wav = True

if falsetto_from_wav:
global falsetto_smooth
if falsetto_smooth is None:
falsetto_smooth = SinusoidalSmoothingConv1d(
round(hparams['falsetto_smooth_width'] / self.timestep)
).eval().to(self.device)
falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0].cpu().numpy()

processed_input['falsetto_dev'] = falsetto

return processed_input

def arrange_data_augmentation(self, data_iterator):
Expand Down
4 changes: 4 additions & 0 deletions training/acoustic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def __init__(self, prefix, preload=False):
self.required_variances['voicing'] = 0.0
if hparams['use_tension_embed']:
self.required_variances['tension'] = 0.0
if hparams['use_falsetto_embed']:
self.required_variances['falsetto_dev'] = 0.0

self.need_key_shift = hparams['use_key_shift_embed']
self.need_speed = hparams['use_speed_embed']
Expand Down Expand Up @@ -92,6 +94,8 @@ def __init__(self):
self.required_variances.append('voicing')
if hparams['use_tension_embed']:
self.required_variances.append('tension')
if hparams['use_falsetto_embed']:
self.required_variances.append('falsetto_dev')
super()._finish_init()

def _build_model(self):
Expand Down
11 changes: 9 additions & 2 deletions training/variance_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __init__(self, prefix, preload=False):
need_breathiness = hparams['predict_breathiness']
need_voicing = hparams['predict_voicing']
need_tension = hparams['predict_tension']
self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension
need_falsetto = hparams['predict_falsetto']
self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension or need_falsetto

def collater(self, samples):
batch = super().collater(samples)
Expand Down Expand Up @@ -67,6 +68,8 @@ def collater(self, samples):
batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0)
if hparams['predict_tension']:
batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0)
if hparams['predict_falsetto']:
batch['falsetto_dev'] = utils.collate_nd([s['falsetto_dev'] for s in samples], 0)

return batch

Expand Down Expand Up @@ -102,6 +105,7 @@ def __init__(self):
predict_breathiness = hparams['predict_breathiness']
predict_voicing = hparams['predict_voicing']
predict_tension = hparams['predict_tension']
predict_falsetto = hparams['predict_falsetto']
self.variance_prediction_list = []
if predict_energy:
self.variance_prediction_list.append('energy')
Expand All @@ -111,6 +115,8 @@ def __init__(self):
self.variance_prediction_list.append('voicing')
if predict_tension:
self.variance_prediction_list.append('tension')
if predict_falsetto:
self.variance_prediction_list.append('falsetto_dev')
self.predict_variances = len(self.variance_prediction_list) > 0
self.lambda_var_loss = hparams['lambda_var_loss']
super()._finish_init()
Expand Down Expand Up @@ -180,6 +186,7 @@ def run_model(self, sample, infer=False):
breathiness = sample.get('breathiness') # [B, T_s]
voicing = sample.get('voicing') # [B, T_s]
tension = sample.get('tension') # [B, T_s]
falsetto = sample.get('falsetto_dev') # [B, T_s]

pitch_retake = variance_retake = None
if (self.predict_pitch or self.predict_variances) and not infer:
Expand All @@ -202,7 +209,7 @@ def run_model(self, sample, infer=False):
note_midi=note_midi, note_rest=note_rest,
note_dur=note_dur, note_glide=note_glide, mel2note=mel2note,
base_pitch=base_pitch, pitch=pitch,
energy=energy, breathiness=breathiness, voicing=voicing, tension=tension,
energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, falsetto_dev=falsetto,
pitch_retake=pitch_retake, variance_retake=variance_retake,
spk_id=spk_ids, infer=infer
)
Expand Down
39 changes: 39 additions & 0 deletions utils/binarizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,45 @@ def get_tension_base_harmonic(
return tension


def get_falsetto_base_harmonic(
        waveform: Union[np.ndarray, DecomposedWaveform],
        samplerate, f0, length,
        *, hop_size=None, fft_size=None, win_size=None
):
    """
    Compute a frame-level falsetto curve from the harmonics of a waveform.

    Falsetto is defined here as the attenuation ratio from the second harmonic
    to the fourth harmonic, i.e. H2 / (H2 + H4), where H2 and H4 are the
    amplitude-domain energies of those harmonics.
    Refer to : ACOUSTIC MEASURES OF FALSETTO VOICE (DOI:10.1121/1.4877544)

    :param waveform: raw samples or an already built DecomposedWaveform; when a
        DecomposedWaveform is given, samplerate, f0, hop_size, fft_size and
        win_size are not used
    :param samplerate: sampling rate
    :param f0: reference f0
    :param length: Expected number of frames
    :param hop_size: Frame width, in number of samples
    :param fft_size: Number of fft bins
    :param win_size: Window size, in number of samples
    :return: falsetto ratio, clipped to the range [0, 1]
    """
    if isinstance(waveform, DecomposedWaveform):
        decomposed = waveform
    else:
        # Only raw input needs decomposing; the analysis parameters apply here.
        decomposed = DecomposedWaveform(
            waveform=waveform, samplerate=samplerate, f0=f0,
            hop_size=hop_size, fft_size=fft_size, win_size=win_size
        )

    # Extract both harmonics before measuring them: index 1 is H2, index 3 is H4.
    second_harmonic = decomposed.harmonic(1)
    fourth_harmonic = decomposed.harmonic(3)
    amp_h2, amp_h4 = (
        get_energy_librosa(
            component, length,
            hop_size=decomposed.hop_size, win_size=decomposed.win_size,
            domain='amplitude'
        )
        for component in (second_harmonic, fourth_harmonic)
    )

    # The small constant keeps the division defined when both harmonics are silent.
    ratio = amp_h2 / (amp_h2 + amp_h4 + 1e-5)
    return np.clip(ratio, a_min=0, a_max=1)


class SinusoidalSmoothingConv1d(torch.nn.Conv1d):
def __init__(self, kernel_size):
super().__init__(
Expand Down