From 2b58ef9def3d7b7da2d30f66c49a07294f8373eb Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Sat, 29 Mar 2025 22:40:57 +0800 Subject: [PATCH 01/21] Update new variance parameter : falsetto --- utils/binarizer_utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index df5216429..e3f531300 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -209,6 +209,45 @@ def get_tension_base_harmonic( return tension +def get_falestto_base_harmonic( + waveform: Union[np.ndarray, DecomposedWaveform], + samplerate, f0, length, + *, hop_size=None, fft_size=None, win_size=None +): + """ + Definition of falestto: Attenuation ratio from the second harmonic to the fourth harmonic (H2 / (H2 + H4)). + Refer to : ACOUSTIC MEASURES OF FALSETTO VOICE (DOI:10.1121/1.4877544) + :param waveform: All other analysis parameters will not take effect if a DeconstructedWaveform is given + :param samplerate: sampling rate + :param f0: reference f0 + :param length: Expected number of frames + :param hop_size: Frame width, in number of samples + :param fft_size: Number of fft bins + :param win_size: Window size, in number of samples + :return: falestto + """ + if not isinstance(waveform, DecomposedWaveform): + waveform = DecomposedWaveform( + waveform=waveform, samplerate=samplerate, f0=f0, + hop_size=hop_size, fft_size=fft_size, win_size=win_size + ) + waveform_h2 = waveform.harmonic(1) # H2 + waveform_h4 = waveform.harmonic(3) # H4 + energy_h2 = get_energy_librosa( + waveform_h2, length, + hop_size=waveform.hop_size, win_size=waveform.win_size, + domain='amplitude' + ) + energy_h4 = get_energy_librosa( + waveform_h4, length, + hop_size=waveform.hop_size, win_size=waveform.win_size, + domain='amplitude' + ) + falestto = energy_h2 / (energy_h2 + energy_h4 + 1e-5) + falestto = np.clip(falestto, a_min=0, a_max=1) + return falestto + + class SinusoidalSmoothingConv1d(torch.nn.Conv1d): def __init__(self, kernel_size): super().__init__( From a34ca2778b4534a313d18d9e0e72b029b0c438b7 Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Sun, 30 Mar 2025 12:23:45 +0800 Subject: [PATCH 02/21] Update new variance parameter : falsetto --- configs/acoustic.yaml | 2 ++ configs/templates/config_acoustic.yaml | 1 + configs/templates/config_variance.yaml | 4 +++ configs/variance.yaml | 5 ++++ inference/ds_acoustic.py | 2 ++ modules/fastspeech/acoustic_encoder.py | 3 +++ modules/fastspeech/param_adaptor.py | 15 +++++++++++- preprocessing/acoustic_binarizer.py | 19 ++++++++++++++ preprocessing/variance_binarizer.py | 34 ++++++++++++++++++++++++++ training/acoustic_task.py | 4 +++ training/variance_task.py | 11 +++++++-- 11 files changed, 97 insertions(+), 3 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 9f27733f7..1156526a0 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -46,6 +46,7 @@ energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 tension_smooth_width: 0.12 +falsetto_smooth_width: 0.12 use_lang_id: false num_lang: 1 @@ -55,6 +56,7 @@ use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false use_tension_embed: false +use_falsetto_embed: false use_key_shift_embed: false use_speed_embed: false diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 59778df99..46202ab96 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -49,6 +49,7 @@ 
use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false use_tension_embed: false +use_falsetto_embed: false use_key_shift_embed: true use_speed_embed: true diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 7d5b211aa..e62ebca63 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -50,6 +50,7 @@ predict_energy: false predict_breathiness: false predict_voicing: false predict_tension: false +predict_falsetto: false energy_db_min: -96.0 energy_db_max: -12.0 @@ -63,6 +64,9 @@ voicing_db_max: -12.0 tension_logit_min: -10.0 tension_logit_max: 10.0 +falsetto_ratio_min: 0 +falsetto_ratio_max: 1 + enc_ffn_kernel_size: 3 use_rope: true hidden_size: 256 diff --git a/configs/variance.yaml b/configs/variance.yaml index 61c508a1b..62e9c60e9 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -33,6 +33,7 @@ predict_energy: false predict_breathiness: false predict_voicing: false predict_tension: false +predict_falsetto: false enc_ffn_kernel_size: 3 use_rope: true @@ -86,6 +87,10 @@ tension_logit_min: -10.0 tension_logit_max: 10.0 tension_smooth_width: 0.12 +falsetto_ratio_min: 0 +falsetto_ratio_max: 1 +falsetto_smooth_width: 0.12 + variances_prediction_args: total_repeat_bins: 48 backbone_type: 'wavenet' diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 8b139f62f..7ad1b08ab 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -34,6 +34,8 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N self.variances_to_embed.add('voicing') if hparams.get('use_tension_embed', False): self.variances_to_embed.add('tension') + if hparams.get('use_falsetto_embed', False): + self.variances_to_embed.add('falsetto') self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index b6f986bb0..dce265568 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -33,6 +33,7 @@ def __init__(self, vocab_size): self.use_breathiness_embed = hparams.get('use_breathiness_embed', False) self.use_voicing_embed = hparams.get('use_voicing_embed', False) self.use_tension_embed = hparams.get('use_tension_embed', False) + self.use_falsetto_embed = hparams.get('use_falsetto_embed', False) if self.use_energy_embed: self.variance_embed_list.append('energy') if self.use_breathiness_embed: @@ -41,6 +42,8 @@ def __init__(self, vocab_size): self.variance_embed_list.append('voicing') if self.use_tension_embed: self.variance_embed_list.append('tension') + if self.use_falsetto_embed: + self.variance_embed_list.append('falsetto') self.use_variance_embeds = len(self.variance_embed_list) > 0 if self.use_variance_embeds: diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index 77ebb8331..9ad835b18 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -7,7 +7,7 @@ from utils import filter_kwargs from utils.hparams import hparams -VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension'] +VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension', 'falsetto'] class ParameterAdaptorModule(torch.nn.Module): @@ -18,6 +18,7 @@ def __init__(self): self.predict_breathiness = hparams.get('predict_breathiness', False) self.predict_voicing = hparams.get('predict_voicing', False) self.predict_tension = 
hparams.get('predict_tension', False) + self.predict_falsetto = hparams.get('predict_falsetto', False) if self.predict_energy: self.variance_prediction_list.append('energy') if self.predict_breathiness: @@ -26,6 +27,8 @@ def __init__(self): self.variance_prediction_list.append('voicing') if self.predict_tension: self.variance_prediction_list.append('tension') + if self.predict_falsetto: + self.variance_prediction_list.append('falsetto') self.predict_variances = len(self.variance_prediction_list) > 0 def build_adaptor(self, cls=MultiVarianceDiffusion): @@ -63,6 +66,16 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): hparams['tension_logit_max'] )) + if self.predict_falsetto: + ranges.append(( + hparams['falsetto_ratio_min'], + hparams['falsetto_ratio_max'] + )) + clamps.append(( + hparams['falsetto_ratio_min'], + hparams['falsetto_ratio_max'] + )) + variances_hparams = hparams['variances_prediction_args'] total_repeat_bins = variances_hparams['total_repeat_bins'] assert total_repeat_bins % len(self.variance_prediction_list) == 0, \ diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 0455c4f94..7aa5c4f2c 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -28,6 +28,7 @@ get_breathiness, get_voicing, get_tension_base_harmonic, + get_falestto_base_harmonic, ) from utils.decomposed_waveform import DecomposedWaveform from utils.hparams import hparams @@ -44,6 +45,7 @@ 'breathiness', 'voicing', 'tension', + 'falsetto', 'key_shift', 'speed', ] @@ -53,6 +55,7 @@ breathiness_smooth: SinusoidalSmoothingConv1d = None voicing_smooth: SinusoidalSmoothingConv1d = None tension_smooth: SinusoidalSmoothingConv1d = None +falsetto_smooth: SinusoidalSmoothingConv1d = None class AcousticBinarizer(BaseBinarizer): @@ -63,6 +66,7 @@ def __init__(self): self.need_breathiness = hparams['use_breathiness_embed'] self.need_voicing = hparams['use_voicing_embed'] self.need_tension = hparams['use_tension_embed'] + self.need_falsetto = hparams['use_falsetto_embed'] assert hparams['mel_base'] == 'e', ( "Mel base must be set to \'e\' according to 2nd stage of the migration plan. " "See https://github.com/openvpi/DiffSinger/releases/tag/v2.3.0 for more details." @@ -212,6 +216,21 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension.cpu().numpy() + if self.need_falsetto: + # get ground truth falsetto + falsetto = get_falestto_base_harmonic( + dec_waveform, None, None, length=length + ) + + global falsetto_smooth + if falsetto_smooth is None: + falsetto_smooth = SinusoidalSmoothingConv1d( + round(hparams['falsetto_smooth_width'] / self.timestep) + ).eval().to(self.device) + falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0] + + processed_input['falsetto'] = falsetto.cpu().numpy() + if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. 
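Editorial aside (not part of the patch series): the falsetto measure that patch 01 defines above reduces to a few lines of NumPy. A minimal standalone sketch, where the toy per-frame amplitude arrays are assumptions for illustration only:

import numpy as np

def falsetto_ratio(energy_h2, energy_h4, eps=1e-5):
    # attenuation ratio from the 2nd to the 4th harmonic, H2 / (H2 + H4),
    # clipped to [0, 1] as in get_falsetto_base_harmonic above
    ratio = energy_h2 / (energy_h2 + energy_h4 + eps)
    return np.clip(ratio, a_min=0, a_max=1)

h2 = np.array([0.8, 0.5, 0.2], dtype=np.float32)  # assumed frame-level H2 amplitudes
h4 = np.array([0.1, 0.5, 0.6], dtype=np.float32)  # assumed frame-level H4 amplitudes
print(falsetto_ratio(h2, h4))  # values near 1 mean H2 dominates H4 (falsetto-like)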
diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 84d9ea499..53a873c73 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -20,6 +20,7 @@ get_breathiness, get_voicing, get_tension_base_harmonic, + get_falestto_base_harmonic, ) from utils.decomposed_waveform import DecomposedWaveform from utils.hparams import hparams @@ -48,6 +49,7 @@ 'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,] 'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,] 'tension', # frame-level tension (logit), float32[T_s,] + 'falsetto', # frame-level falsetto (ratio), float32[T_s,] ] DS_INDEX_SEP = '#' @@ -59,6 +61,7 @@ breathiness_smooth: SinusoidalSmoothingConv1d = None voicing_smooth: SinusoidalSmoothingConv1d = None tension_smooth: SinusoidalSmoothingConv1d = None +falsetto_smooth: SinusoidalSmoothingConv1d = None class VarianceBinarizer(BaseBinarizer): @@ -511,6 +514,37 @@ def process_item(self, item_name, meta_data, binarization_args): processed_input['tension'] = tension + # Below: extract falsetto + if hparams['predict_falsetto']: + falsetto = None + falsetto_from_wav = False + if self.prefer_ds: + falsetto_seq = self.load_attr_from_ds(ds_id, name, 'falsetto', idx=ds_seg_idx) + if falsetto_seq is not None: + falsetto = resample_align_curve( + np.array(falsetto_seq.split(), np.float32), + original_timestep=float(self.load_attr_from_ds( + ds_id, name, 'falsetto_timestep', idx=ds_seg_idx + )), + target_timestep=self.timestep, + align_length=length + ) + if falsetto is None: + falsetto = get_falsetto_base_harmonic( + dec_waveform, None, None, length=length + ) + falsetto_from_wav = True + + if falsetto_from_wav: + global falsetto_smooth + if falsetto_smooth is None: + falsetto_smooth = SinusoidalSmoothingConv1d( + round(hparams['falsetto_smooth_width'] / self.timestep) + ).eval().to(self.device) + falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0].cpu().numpy() + + processed_input['falsetto'] = falsetto + return processed_input def arrange_data_augmentation(self, data_iterator): diff --git a/training/acoustic_task.py b/training/acoustic_task.py index ca6a71c65..3118e6f82 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -31,6 +31,8 @@ def __init__(self, prefix, preload=False): self.required_variances['voicing'] = 0.0 if hparams['use_tension_embed']: self.required_variances['tension'] = 0.0 + if hparams['use_falsetto_embed']: + self.required_variances['falsetto'] = 0.0 self.need_key_shift = hparams['use_key_shift_embed'] self.need_speed = hparams['use_speed_embed'] @@ -92,6 +94,8 @@ def __init__(self): self.required_variances.append('voicing') if hparams['use_tension_embed']: self.required_variances.append('tension') + if hparams['use_falsetto_embed']: + self.required_variances.append('falsetto') super()._finish_init() def _build_model(self): diff --git a/training/variance_task.py b/training/variance_task.py index 2fdc599f6..63175aaf1 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -25,7 +25,8 @@ def __init__(self, prefix, preload=False): need_breathiness = hparams['predict_breathiness'] need_voicing = hparams['predict_voicing'] need_tension = hparams['predict_tension'] - self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension + need_falsetto = hparams['predict_falsetto'] + self.predict_variances = need_energy or need_breathiness or need_voicing or need_tension or need_falsetto def 
collater(self, samples): batch = super().collater(samples) @@ -66,6 +67,8 @@ def collater(self, samples): batch['voicing'] = utils.collate_nd([s['voicing'] for s in samples], 0) if hparams['predict_tension']: batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0) + if hparams['predict_falsetto']: + batch['falsetto'] = utils.collate_nd([s['falsetto'] for s in samples], 0) return batch @@ -101,6 +104,7 @@ def __init__(self): predict_breathiness = hparams['predict_breathiness'] predict_voicing = hparams['predict_voicing'] predict_tension = hparams['predict_tension'] + predict_tension = hparams['predict_falsetto'] self.variance_prediction_list = [] if predict_energy: self.variance_prediction_list.append('energy') @@ -110,6 +114,8 @@ def __init__(self): self.variance_prediction_list.append('voicing') if predict_tension: self.variance_prediction_list.append('tension') + if predict_falsetto: + self.variance_prediction_list.append('falsetto') self.predict_variances = len(self.variance_prediction_list) > 0 self.lambda_var_loss = hparams['lambda_var_loss'] super()._finish_init() @@ -176,6 +182,7 @@ def run_model(self, sample, infer=False): breathiness = sample.get('breathiness') # [B, T_s] voicing = sample.get('voicing') # [B, T_s] tension = sample.get('tension') # [B, T_s] + falsetto = sample.get('falsetto') # [B, T_s] pitch_retake = variance_retake = None if (self.predict_pitch or self.predict_variances) and not infer: @@ -198,7 +205,7 @@ def run_model(self, sample, infer=False): note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, base_pitch=base_pitch, pitch=pitch, - energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, + energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, falsetto=falsetto, pitch_retake=pitch_retake, variance_retake=variance_retake, spk_id=spk_ids, infer=infer ) From 13406a2832dd4ea76c84a35260a01c3b85271528 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sun, 30 Mar 2025 12:24:04 +0800 Subject: [PATCH 03/21] update lynxnet2 backbone --- configs/acoustic.yaml | 3 +- configs/templates/config_acoustic.yaml | 3 +- configs/templates/config_variance.yaml | 38 +++++----- configs/variance.yaml | 16 ++-- modules/backbones/__init__.py | 4 +- modules/backbones/lynxnet.py | 20 +---- modules/backbones/lynxnet2.py | 101 +++++++++++++++++++++++++ modules/backbones/wavenet.py | 8 +- modules/commons/common_layers.py | 16 ++++ 9 files changed, 151 insertions(+), 58 deletions(-) create mode 100644 modules/backbones/lynxnet2.py diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 9f27733f7..994e49923 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -70,13 +70,12 @@ sampling_steps: 20 diff_accelerator: ddim diff_speedup: 10 hidden_size: 256 -backbone_type: 'lynxnet' +backbone_type: 'lynxnet2' backbone_args: num_channels: 1024 num_layers: 6 kernel_size: 31 dropout_rate: 0.0 - strong_cond: true main_loss_type: l2 main_loss_log_norm: false schedule_type: 'linear' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 59778df99..8d99b01e5 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -76,13 +76,12 @@ T_start: 0.4 T_start_infer: 0.4 K_step: 300 K_step_infer: 300 -backbone_type: 'lynxnet' +backbone_type: 'lynxnet2' backbone_args: num_channels: 1024 num_layers: 6 kernel_size: 31 dropout_rate: 0.0 - strong_cond: true #backbone_type: 'wavenet' #backbone_args: # num_channels: 512 
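Editorial aside (not part of the patches): the new backbone_type value 'lynxnet2' is resolved through the BACKBONES registry that this patch extends in modules/backbones/__init__.py below. A hedged sketch of that dispatch, assuming the repo's filter_kwargs helper takes (kwargs_dict, callable) and drops keys the constructor does not accept; the build_backbone_sketch name is illustrative, not the repo's API:

from modules.backbones import BACKBONES
from utils import filter_kwargs

def build_backbone_sketch(in_dims, n_feats, backbone_type, backbone_args):
    # look up the registered class, e.g. 'lynxnet2' -> LYNXNet2
    cls = BACKBONES[backbone_type]
    # note: LYNXNet2 also reads hparams['hidden_size'] at construction,
    # so the repo's hparams must be loaded before calling this
    return cls(in_dims, n_feats, **filter_kwargs(backbone_args, cls))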
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 7d5b211aa..908deba4a 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -94,31 +94,29 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 - backbone_type: 'wavenet' - backbone_args: - num_layers: 20 - num_channels: 256 - dilation_cycle_length: 5 -# backbone_type: 'lynxnet' +# backbone_type: 'wavenet' # backbone_args: -# num_layers: 6 -# num_channels: 512 -# dropout_rate: 0.0 -# strong_cond: true +# num_layers: 20 +# num_channels: 256 +# dilation_cycle_length: 5 + backbone_type: 'lynxnet2' + backbone_args: + num_layers: 6 + num_channels: 512 + dropout_rate: 0.0 variances_prediction_args: total_repeat_bins: 48 - backbone_type: 'wavenet' - backbone_args: - num_layers: 10 - num_channels: 192 - dilation_cycle_length: 4 -# backbone_type: 'lynxnet' +# backbone_type: 'wavenet' # backbone_args: -# num_layers: 6 -# num_channels: 384 -# dropout_rate: 0.0 -# strong_cond: true +# num_layers: 10 +# num_channels: 192 +# dilation_cycle_length: 4 + backbone_type: 'lynxnet2' + backbone_args: + num_layers: 6 + num_channels: 384 + dropout_rate: 0.0 lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 diff --git a/configs/variance.yaml b/configs/variance.yaml index 61c508a1b..c99b13381 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -65,11 +65,11 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 - backbone_type: 'wavenet' + backbone_type: 'lynxnet2' backbone_args: - num_layers: 20 - num_channels: 256 - dilation_cycle_length: 5 + num_layers: 6 + num_channels: 512 + dropout_rate: 0.0 energy_db_min: -96.0 energy_db_max: -12.0 @@ -88,11 +88,11 @@ tension_smooth_width: 0.12 variances_prediction_args: total_repeat_bins: 48 - backbone_type: 'wavenet' + backbone_type: 'lynxnet2' backbone_args: - num_layers: 10 - num_channels: 192 - dilation_cycle_length: 4 + num_layers: 6 + num_channels: 384 + dropout_rate: 0.0 lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py index 8fce796ab..ebd903456 100644 --- a/modules/backbones/__init__.py +++ b/modules/backbones/__init__.py @@ -1,11 +1,13 @@ import torch.nn from modules.backbones.wavenet import WaveNet from modules.backbones.lynxnet import LYNXNet +from modules.backbones.lynxnet2 import LYNXNet2 from utils import filter_kwargs BACKBONES = { 'wavenet': WaveNet, - 'lynxnet': LYNXNet + 'lynxnet': LYNXNet, + 'lynxnet2': LYNXNet2, } diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 5dbd1d0a1..88b2be348 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -6,26 +6,10 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU +from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU, Conv1d, Transpose from utils.hparams import hparams -class Conv1d(torch.nn.Conv1d): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - nn.init.kaiming_normal_(self.weight) - - -class Transpose(nn.Module): - def __init__(self, dims): - super().__init__() - assert len(dims) == 2, 'dims must be a tuple of two dimensions' - self.dims = dims - - def forward(self, x): - return x.transpose(*self.dims) - - class LYNXConvModule(nn.Module): @staticmethod def calc_same_padding(kernel_size): @@ -150,7 +134,7 @@ def forward(self, spec, diffusion_step, cond): # post-norm x = 
self.norm(x.transpose(1, 2)).transpose(1, 2) - # MLP and GLU + # output_projection x = self.output_projection(x) # [B, 128, T] if self.n_feats == 1: diff --git a/modules/backbones/lynxnet2.py b/modules/backbones/lynxnet2.py new file mode 100644 index 000000000..5a10a856b --- /dev/null +++ b/modules/backbones/lynxnet2.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU, Conv1d, Transpose +from utils.hparams import hparams + + +class LYNXNet2Block(nn.Module): + def __init__(self, dim, expansion_factor, kernel_size=31, dropout=0.): + super().__init__() + inner_dim = int(dim * expansion_factor) + if float(dropout) > 0.: + _dropout = nn.Dropout(dropout) + else: + _dropout = nn.Identity() + self.net = nn.Sequential( + nn.LayerNorm(dim), + Transpose((1, 2)), + nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim), + Transpose((1, 2)), + nn.Linear(dim, inner_dim * 2), + SwiGLU(), + nn.Linear(inner_dim, inner_dim * 2), + SwiGLU(), + nn.Linear(inner_dim, dim), + _dropout + ) + + def forward(self, x): + return x + self.net(x) + + +class LYNXNet2(nn.Module): + def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=1, kernel_size=31, + dropout=0.0): + """ + LYNXNet2(Linear Gated Depthwise Separable Convolution Network Version 2) + """ + super().__init__() + self.in_dims = in_dims + self.n_feats = n_feats + self.input_projection = nn.Linear(in_dims * n_feats, num_channels) + self.conditioner_projection = nn.Linear(hparams['hidden_size'], num_channels) + self.diffusion_embedding = nn.Sequential( + SinusoidalPosEmb(num_channels), + nn.Linear(num_channels, num_channels * 4), + nn.GELU(), + nn.Linear(num_channels * 4, num_channels), + ) + self.residual_layers = nn.ModuleList( + [ + LYNXNet2Block( + dim=num_channels, + expansion_factor=expansion_factor, + kernel_size=kernel_size, + dropout=dropout + ) + for i in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(num_channels) + self.output_projection = nn.Linear(num_channels, in_dims * n_feats) + nn.init.kaiming_normal_(self.input_projection.weight) + nn.init.kaiming_normal_(self.conditioner_projection.weight) + nn.init.zeros_(self.output_projection.weight) + + def forward(self, spec, diffusion_step, cond): + """ + :param spec: [B, F, M, T] + :param diffusion_step: [B, 1] + :param cond: [B, H, T] + :return: + """ + + if self.n_feats == 1: + x = spec[:, 0] # [B, M, T] + else: + x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T] + + x = self.input_projection(x.transpose(1, 2)) # [B, T, F x M] + x = x + self.conditioner_projection(cond.transpose(1, 2)) + x = x + self.diffusion_embedding(diffusion_step).unsqueeze(1) + + for layer in self.residual_layers: + x = layer(x) + + # post-norm + x = self.norm(x) + + # output projection + x = self.output_projection(x).transpose(1, 2) # [B, 128, T] + + if self.n_feats == 1: + x = x[:, None, :, :] + else: + # This is the temporary solution since PyTorch 1.13 + # does not support exporting aten::unflatten to ONNX + # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims)) + x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2]) + return x diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py index 08e57eff4..58724e5aa 100644 --- a/modules/backbones/wavenet.py +++ b/modules/backbones/wavenet.py @@ -5,16 +5,10 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb 
+from modules.commons.common_layers import SinusoidalPosEmb, Conv1d from utils.hparams import hparams -class Conv1d(torch.nn.Conv1d): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - nn.init.kaiming_normal_(self.weight) - - class ResidualBlock(nn.Module): def __init__(self, encoder_hidden, residual_channels, dilation): super().__init__() diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index bf4a2822c..2938d7bb5 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -117,6 +117,22 @@ def forward(self, x): return out * F.silu(gate) +class Conv1d(torch.nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.kaiming_normal_(self.weight) + + +class Transpose(nn.Module): + def __init__(self, dims): + super().__init__() + assert len(dims) == 2, 'dims must be a tuple of two dimensions' + self.dims = dims + + def forward(self, x): + return x.transpose(*self.dims) + + class TransformerFFNLayer(nn.Module): def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() From ac05b4d49cde32cbf8a0b6551d443b27a35cfe6f Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 2 Apr 2025 22:43:01 +0800 Subject: [PATCH 04/21] fix typo --- preprocessing/acoustic_binarizer.py | 2 +- preprocessing/variance_binarizer.py | 2 +- utils/binarizer_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 7aa5c4f2c..e1b535be1 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -28,7 +28,7 @@ get_breathiness, get_voicing, get_tension_base_harmonic, - get_falestto_base_harmonic, + get_falsetto_base_harmonic, ) from utils.decomposed_waveform import DecomposedWaveform from utils.hparams import hparams diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 53a873c73..3c2f7fe15 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -20,7 +20,7 @@ get_breathiness, get_voicing, get_tension_base_harmonic, - get_falestto_base_harmonic, + get_falsetto_base_harmonic, ) from utils.decomposed_waveform import DecomposedWaveform from utils.hparams import hparams diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index e3f531300..5bcffd2a1 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -209,7 +209,7 @@ def get_tension_base_harmonic( return tension -def get_falestto_base_harmonic( +def get_falsetto_base_harmonic( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, *, hop_size=None, fft_size=None, win_size=None From de7958e9ba156ecf52693e9650dde351fe85ac0b Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 2 Apr 2025 22:59:37 +0800 Subject: [PATCH 05/21] fix typo --- training/variance_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/variance_task.py b/training/variance_task.py index 63175aaf1..92fbf82b7 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -104,7 +104,7 @@ def __init__(self): predict_breathiness = hparams['predict_breathiness'] predict_voicing = hparams['predict_voicing'] predict_tension = hparams['predict_tension'] - predict_tension = hparams['predict_falsetto'] + predict_falsetto = hparams['predict_falsetto'] self.variance_prediction_list = [] if predict_energy: self.variance_prediction_list.append('energy') From 
4a4ee3defb4cbbbf985c1064d1c33cb4661f2625 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Thu, 3 Apr 2025 23:59:04 +0800 Subject: [PATCH 06/21] support muon optimizer --- basics/base_task.py | 2 +- configs/acoustic.yaml | 9 +- configs/templates/config_acoustic.yaml | 10 +- configs/templates/config_variance.yaml | 14 ++- configs/variance.yaml | 13 ++- modules/fastspeech/tts_modules.py | 39 +++++--- modules/fastspeech/variance_encoder.py | 3 +- modules/optimizer/chained_optimizer.py | 122 +++++++++++++++++++++++ modules/optimizer/muon.py | 129 +++++++++++++++++++++++++ utils/__init__.py | 3 +- 10 files changed, 316 insertions(+), 28 deletions(-) create mode 100644 modules/optimizer/chained_optimizer.py create mode 100644 modules/optimizer/muon.py diff --git a/basics/base_task.py b/basics/base_task.py index 065f8273a..656893d96 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -307,7 +307,7 @@ def build_optimizer(self, model): optimizer = build_object_from_class_name( optimizer_args['optimizer_cls'], torch.optim.Optimizer, - model.parameters(), + model if optimizer_args['optimizer_cls'] == 'modules.optimizer.muon.Muon_AdamW' else model.parameters(), **optimizer_args ) return optimizer diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 5c94c23de..cdc7c754c 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -104,10 +104,15 @@ lambda_aux_mel_loss: 0.2 # train and eval num_sanity_val_steps: 1 optimizer_args: + optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 + muon_args: + weight_decay: 0.1 + adamw_args: + weight_decay: 0.0 lr_scheduler_args: - step_size: 10000 - gamma: 0.75 + step_size: 5000 + gamma: 0.8 max_batch_frames: 50000 max_batch_size: 64 dataset_size_key: 'lengths' diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 8d99b01e5..65e276dc7 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -101,11 +101,15 @@ shallow_diffusion_args: lambda_aux_mel_loss: 0.2 optimizer_args: + optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 + muon_args: + weight_decay: 0.1 + adamw_args: + weight_decay: 0.0 lr_scheduler_args: - scheduler_cls: torch.optim.lr_scheduler.StepLR - step_size: 10000 - gamma: 0.75 + step_size: 5000 + gamma: 0.8 max_batch_frames: 50000 max_batch_size: 64 max_updates: 160000 diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 908deba4a..7022f2000 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -67,8 +67,8 @@ enc_ffn_kernel_size: 3 use_rope: true hidden_size: 256 dur_prediction_args: - arch: fs2 - hidden_size: 512 + arch: resnet + hidden_size: 256 dropout: 0.1 num_layers: 5 kernel_size: 3 @@ -123,11 +123,15 @@ lambda_pitch_loss: 1.0 lambda_var_loss: 1.0 optimizer_args: + optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 + muon_args: + weight_decay: 0.1 + adamw_args: + weight_decay: 0.0 lr_scheduler_args: - scheduler_cls: torch.optim.lr_scheduler.StepLR - step_size: 10000 - gamma: 0.75 + step_size: 5000 + gamma: 0.8 max_batch_frames: 80000 max_batch_size: 48 max_updates: 160000 diff --git a/configs/variance.yaml b/configs/variance.yaml index c99b13381..5c0048411 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -40,8 +40,8 @@ rel_pos: true hidden_size: 256 dur_prediction_args: - arch: fs2 - hidden_size: 512 + arch: resnet + hidden_size: 256 dropout: 0.1 num_layers: 5 kernel_size: 3 @@ -114,10 +114,15 @@ 
diff_speedup: 10 # train and eval num_sanity_val_steps: 1 optimizer_args: + optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 + muon_args: + weight_decay: 0.1 + adamw_args: + weight_decay: 0.0 lr_scheduler_args: - step_size: 10000 - gamma: 0.75 + step_size: 5000 + gamma: 0.8 max_batch_frames: 80000 max_batch_size: 48 dataset_size_key: 'lengths' diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 391de11ab..16b358e3c 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -62,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse', arch='resnet'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -76,16 +76,29 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size + self.use_resnet = (arch == 'resnet') for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - self.conv.append(torch.nn.Sequential( - torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d - torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), - torch.nn.ReLU(), - LayerNorm(n_chans, dim=1), - torch.nn.Dropout(dropout_rate) - )) - + if self.use_resnet: + self.conv.append(nn.Sequential( + LayerNorm(in_chans, dim=1), + nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), + nn.ReLU(), + nn.Conv1d(n_chans, n_chans, 1), + nn.Dropout(dropout_rate) + )) + else: + self.conv.append(nn.Sequential( + nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d + nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), + nn.ReLU(), + LayerNorm(n_chans, dim=1), + nn.Dropout(dropout_rate) + )) + if self.use_resnet and in_dims != n_chans: + self.res_conv = nn.Conv1d(in_dims, n_chans, 1) + else: + self.res_conv = None self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: self.out_dims = 1 @@ -121,8 +134,12 @@ def forward(self, xs, x_masks=None, infer=True): xs = xs.transpose(1, -1) # (B, idim, Tmax) masks = 1 - x_masks.float() masks_ = masks[:, None, :] - for f in self.conv: - xs = f(xs) # (B, C, Tmax) + for idx, f in enumerate(self.conv): + if self.use_resnet: + residual = self.res_conv(xs) if idx == 0 and self.res_conv is not None else xs + xs = residual + f(xs) + else: + xs = f(xs) if x_masks is not None: xs = xs * masks_ xs = self.linear(xs.transpose(1, -1)) # [B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index deab9ee84..557ee6ea1 100644 --- a/modules/fastspeech/variance_encoder.py +++ b/modules/fastspeech/variance_encoder.py @@ -46,7 +46,8 @@ def __init__(self, vocab_size): dropout_rate=dur_hparams['dropout'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], - dur_loss_type=dur_hparams['loss_type'] + dur_loss_type=dur_hparams['loss_type'], + arch=dur_hparams['arch'] ) def forward( diff --git a/modules/optimizer/chained_optimizer.py b/modules/optimizer/chained_optimizer.py new file mode 100644 index 000000000..b123f58e8 --- /dev/null +++ b/modules/optimizer/chained_optimizer.py @@ -0,0 +1,122 @@ +from torch import Tensor +from torch.optim import Optimizer +from torch.optim.optimizer import ParamsT 
+from dataclasses import dataclass
+from typing import Any, Dict, List, Type, Callable, Optional, Iterable
+
+
+@dataclass
+class OptimizerSpec:
+    """Spec for creating an optimizer that is part of a `ChainedOptimizer`."""
+
+    class_type: Type[Optimizer]
+    init_args: Dict[str, Any]
+    param_filter: Optional[Callable[[Tensor], bool]]
+
+
+class ChainedOptimizer(Optimizer):
+    """
+    A wrapper around multiple optimizers that allows for chaining them together.
+    The optimizers are applied in the order they are passed in the constructor.
+    Each optimizer is responsible for updating a subset of the parameters, which
+    is determined by the `param_filter` function. If no optimizer is found for a
+    parameter group, an exception is raised.
+    """
+
+    def __init__(
+        self,
+        params: ParamsT,
+        optimizer_specs: List[OptimizerSpec],
+        lr: float,
+        weight_decay: float = 0.0,
+        optimizer_selection_callback: Optional[Callable[[Tensor, int], None]] = None,
+        **common_kwargs,
+    ):
+        self.optimizer_specs = optimizer_specs
+        self.optimizer_selection_callback = optimizer_selection_callback
+        self.optimizers: List[Optimizer] = []
+        defaults = dict(lr=lr, weight_decay=weight_decay)
+        super().__init__(params, defaults)
+
+        # Split the params for each optimizer
+        params_for_optimizers = [[] for _ in optimizer_specs]
+        for param_group in self.param_groups:
+            params = param_group["params"]
+            indices = param_group["optimizer_and_param_group_indices"] = set()
+            for param in params:
+                assert isinstance(param, Tensor), f"Expected a Tensor, got {type(param)}"
+                for index, spec in enumerate(optimizer_specs):
+                    if spec.param_filter is None or spec.param_filter(param):
+                        if self.optimizer_selection_callback is not None:
+                            self.optimizer_selection_callback(param, index)
+                        params_for_optimizers[index].append(param)
+                        indices.add((index, 0))
+                        break
+
+        # Initialize the optimizers
+        for spec, selected_params in zip(optimizer_specs, params_for_optimizers):
+            optimizer_args = {
+                'lr': lr,
+                'weight_decay': weight_decay,
+            }
+            optimizer_args.update(common_kwargs)
+            optimizer_args.update(spec.init_args)
+            optimizer = spec.class_type(selected_params, **optimizer_args)
+            self.optimizers.append(optimizer)
+
+    def state_dict(self) -> Dict[str, Any]:
+        return {
+            "optimizers": [opt.state_dict() for opt in self.optimizers],
+            **super().state_dict(),
+        }
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        optimizers = state_dict.pop("optimizers")
+        super().load_state_dict(state_dict)
+        for i in range(len(self.optimizers)):
+            self.optimizers[i].load_state_dict(optimizers[i])
+
+    def zero_grad(self, set_to_none: bool = True) -> None:
+        for opt in self.optimizers:
+            opt.zero_grad(set_to_none=set_to_none)
+
+    def _copy_lr_to_optimizers(self) -> None:
+        for param_group in self.param_groups:
+            indices = param_group["optimizer_and_param_group_indices"]
+            for optimizer_idx, param_group_idx in indices:
+                self.optimizers[optimizer_idx].param_groups[param_group_idx]["lr"] = param_group["lr"]
+
+    def step(self, closure=None) -> None:
+        self._copy_lr_to_optimizers()
+        for opt in self.optimizers:
+            opt.step(closure)
+
+    def add_param_group(self, param_group: Dict[str, Any]) -> None:
+        super().add_param_group(param_group)
+
+        # If optimizer has not been initialized, skip adding the param groups
+        if not self.optimizers:
+            return
+
+        # Split the params for each optimizer
+        params_for_optimizers = [[] for _ in self.optimizer_specs]
+        params = param_group["params"]
+        indices = param_group["optimizer_and_param_group_indices"] = set()
+        for 
param in params: + assert isinstance(param, Tensor), f"Expected a Tensor, got {type(param)}" + found_optimizer = False + for index, spec in enumerate(self.optimizer_specs): + if spec.param_filter is None or spec.param_filter(param): + if self.optimizer_selection_callback is not None: + self.optimizer_selection_callback(param, index) + params_for_optimizers[index].append(param) + indices.add((index, len(self.optimizers[index].param_groups))) + found_optimizer = True + break + if not found_optimizer: + raise ValueError("No valid optimizer found for the given parameter group") + + # Add the selected param group to the optimizers + for optimizer, selected_params in zip(self.optimizers, params_for_optimizers): + if selected_params: + optimizer.add_param_group({"params": selected_params}) diff --git a/modules/optimizer/muon.py b/modules/optimizer/muon.py new file mode 100644 index 000000000..9e59c9a5d --- /dev/null +++ b/modules/optimizer/muon.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.nn import Module, Parameter, Embedding +from typing import List +from .chained_optimizer import ChainedOptimizer, OptimizerSpec + + +def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: + """ + Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a + quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose + of minimizing steps, it turns out to be empirically effective to keep increasing the slope at + zero even beyond the point where the iteration no longer converges all the way to one everywhere + on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T + where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model + performance at all relative to UV^T, where USV^T = G is the SVD. + """ + assert G.ndim == 3 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.float() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = F.normalize(X, p=2.0, dim=(-2, -1), eps=1e-7) + + # Perform the NS iterations + for _ in range(steps): + A = X @ X.mT + B = torch.baddbmm(A, A, A, beta=b, alpha=c) + X = torch.baddbmm(X, B, X, beta=a, alpha=1) + + if G.size(-2) > G.size(-1): + X = X.mT + return X.to(G) + + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Some warnings: + - This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. + + Arguments: + lr: The learning rate used by the internal SGD. + momentum: The momentum used by the internal SGD. + nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) + ns_steps: The number of Newton-Schulz iteration steps to use. 
+ """ + + def __init__(self, params, lr=5e-4, weight_decay=0.1, momentum=0.95, nesterov=True, ns_steps=5): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + for group in self.param_groups: + shape_groups = {} + for p in filter(lambda p: p.grad is not None, group["params"]): + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf: Tensor = state["momentum_buffer"] + key = (p.shape, p.device, p.dtype) + if key not in shape_groups: + shape_groups[key] = {"params": [], "grads": [], "buffers": []} + shape_groups[key]["params"].append(p) + shape_groups[key]["grads"].append(g) + shape_groups[key]["buffers"].append(buf) + for key in shape_groups: + group_data = shape_groups[key] + g = torch.stack(group_data["grads"]) + buf = torch.stack(group_data["buffers"]) + buf.lerp_(g, 1 - group["momentum"]) + g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf + if g.ndim >= 4: # for the case of conv filters + g = g.view(g.size(0), g.size(1), -1) + g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) + for i, p in enumerate(group_data["params"]): + if group["weight_decay"] > 0: + p.data.mul_(1 - group["lr"] * group["weight_decay"]) + p.data.add_(g[i].view_as(p), alpha=-group["lr"] * max(g[i].size()) ** 0.5) + self.state[p]["momentum_buffer"] = buf[i].clone() + + +def get_params_for_muon(model) -> List[Parameter]: + """ + Filter parameters of a module into two groups: those that can be optimized by Muon, + and those that should be optimized by a standard optimizer. + Args: + module: The module to filter parameters for. + Returns: + A list of parameters that should be optimized with muon. 
+ """ + muon_params = [] + for module in model.modules(): + for param in module.parameters(recurse=False): + if not param.requires_grad: + continue + if not isinstance(module, nn.Embedding) and param.ndim >= 2: + muon_params.append(param) + return muon_params + + +class Muon_AdamW(ChainedOptimizer): + def __init__(self, model, lr=0.0005, weight_decay=0.0, muon_args={}, adamw_args={}, verbose=False): + muon_params_id_set = set(id(p) for p in get_params_for_muon(model)) + spec_muon = OptimizerSpec(Muon, muon_args, lambda param: id(param) in muon_params_id_set) + spec_adamw = OptimizerSpec(torch.optim.AdamW, adamw_args, None) + specs = [spec_muon, spec_adamw] + callback = None + if verbose: + callback = lambda p, spec_idx: print( + f"Adding param {p.shape} to optimizer{spec_idx} {str(specs[spec_idx].class_type)}" + ) + super().__init__(model.parameters(), specs, lr=lr, weight_decay=weight_decay, optimizer_selection_callback=callback) \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py index abb5df151..1f4c17c04 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -315,8 +315,9 @@ def helper(params): def simulate_lr_scheduler(optimizer_args, scheduler_args, step_count, num_param_groups=1): + optimizer_cls = optimizer_args['optimizer_cls'] optimizer = build_object_from_class_name( - optimizer_args['optimizer_cls'], + 'torch.optim.AdamW' if optimizer_cls == 'modules.optimizer.muon.Muon_AdamW' else optimizer_cls, torch.optim.Optimizer, [{'params': torch.nn.Parameter(), 'initial_lr': optimizer_args['lr']} for _ in range(num_param_groups)], **optimizer_args From f9fda2781414fe14a293fc116bd74126c1832c25 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Fri, 4 Apr 2025 00:06:59 +0800 Subject: [PATCH 07/21] optimize --- configs/acoustic.yaml | 6 +++--- configs/templates/config_acoustic.yaml | 6 +++--- configs/templates/config_variance.yaml | 4 ++-- configs/variance.yaml | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index cdc7c754c..73648dd1d 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -119,10 +119,10 @@ dataset_size_key: 'lengths' val_with_vocoder: true val_check_interval: 2000 num_valid_plots: 10 -max_updates: 160000 +max_updates: 100000 num_ckpt_keep: 5 -permanent_ckpt_start: 80000 -permanent_ckpt_interval: 20000 +permanent_ckpt_start: 60000 +permanent_ckpt_interval: 10000 finetune_enabled: false finetune_ckpt_path: null diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 65e276dc7..2ea62b33a 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -112,13 +112,13 @@ lr_scheduler_args: gamma: 0.8 max_batch_frames: 50000 max_batch_size: 64 -max_updates: 160000 +max_updates: 100000 num_valid_plots: 10 val_with_vocoder: true val_check_interval: 2000 num_ckpt_keep: 5 -permanent_ckpt_start: 120000 -permanent_ckpt_interval: 20000 +permanent_ckpt_start: 60000 +permanent_ckpt_interval: 10000 pl_trainer_devices: 'auto' pl_trainer_precision: '16-mixed' diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 7022f2000..1fa3be3a1 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -134,12 +134,12 @@ lr_scheduler_args: gamma: 0.8 max_batch_frames: 80000 max_batch_size: 48 -max_updates: 160000 +max_updates: 100000 num_valid_plots: 10 val_check_interval: 2000 num_ckpt_keep: 5 -permanent_ckpt_start: 80000 
+permanent_ckpt_start: 60000 permanent_ckpt_interval: 10000 pl_trainer_devices: 'auto' pl_trainer_precision: '16-mixed' diff --git a/configs/variance.yaml b/configs/variance.yaml index 5c0048411..6ed3d8b94 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -128,9 +128,9 @@ max_batch_size: 48 dataset_size_key: 'lengths' val_check_interval: 2000 num_valid_plots: 10 -max_updates: 160000 +max_updates: 100000 num_ckpt_keep: 5 -permanent_ckpt_start: 80000 +permanent_ckpt_start: 60000 permanent_ckpt_interval: 10000 finetune_enabled: false From eb3b606de6d0b26299dfd525d170b57c0f906f32 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Fri, 4 Apr 2025 20:14:13 +0800 Subject: [PATCH 08/21] stabilize fp16 training --- modules/commons/common_layers.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 2938d7bb5..7a72b1555 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -114,7 +114,15 @@ def forward(self, x): # out, gate = x.chunk(2, dim=self.dim) # Using torch.split instead of chunk for ONNX export compatibility. out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) - return out * F.silu(gate) + gate = F.silu(gate) + if x.dtype == torch.float16: + out_min, out_max = torch.aminmax(out.detach()) + gate_min, gate_max = torch.aminmax(gate.detach()) + max_abs_out = torch.max(-out_min, out_max).float() + max_abs_gate = torch.max(-gate_min, gate_max).float() + if max_abs_out * max_abs_gate > 65504: + return (out.float() * gate.float()).clamp(-65504, 65504).half() + return out * gate class Conv1d(torch.nn.Conv1d): From 300676a8c13036ee31a53ff11d5d2cdf4106c9d8 Mon Sep 17 00:00:00 2001 From: yxlllc Date: Sat, 5 Apr 2025 02:13:07 +0800 Subject: [PATCH 09/21] stabilize fp16 training --- modules/commons/common_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 7a72b1555..77381e2de 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -120,8 +120,8 @@ def forward(self, x): gate_min, gate_max = torch.aminmax(gate.detach()) max_abs_out = torch.max(-out_min, out_max).float() max_abs_gate = torch.max(-gate_min, gate_max).float() - if max_abs_out * max_abs_gate > 65504: - return (out.float() * gate.float()).clamp(-65504, 65504).half() + if max_abs_out * max_abs_gate > 1000: + return (out.float() * gate.float()).clamp(-1000, 1000).half() return out * gate From 82cd534ba8df8cf219b63125c3253ad9899326f8 Mon Sep 17 00:00:00 2001 From: agentasteriski <99069711+agentasteriski@users.noreply.github.com> Date: Mon, 14 Apr 2025 12:19:15 -0400 Subject: [PATCH 10/21] fix variable typos --- utils/binarizer_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index 5bcffd2a1..b36b2aa42 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -215,7 +215,7 @@ def get_falsetto_base_harmonic( *, hop_size=None, fft_size=None, win_size=None ): """ - Definition of falestto: Attenuation ratio from the second harmonic to the fourth harmonic (H2 / (H2 + H4)). + Definition of falsetto: Attenuation ratio from the second harmonic to the fourth harmonic (H2 / (H2 + H4)). 
Refer to : ACOUSTIC MEASURES OF FALSETTO VOICE (DOI:10.1121/1.4877544) :param waveform: All other analysis parameters will not take effect if a DeconstructedWaveform is given :param samplerate: sampling rate @@ -224,7 +224,7 @@ def get_falsetto_base_harmonic( :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples - :return: falestto + :return: falsetto """ if not isinstance(waveform, DecomposedWaveform): waveform = DecomposedWaveform( @@ -243,9 +243,9 @@ def get_falsetto_base_harmonic( hop_size=waveform.hop_size, win_size=waveform.win_size, domain='amplitude' ) - falestto = energy_h2 / (energy_h2 + energy_h4 + 1e-5) - falestto = np.clip(falestto, a_min=0, a_max=1) - return falestto + falsetto = energy_h2 / (energy_h2 + energy_h4 + 1e-5) + falsetto = np.clip(falsetto, a_min=0, a_max=1) + return falsetto class SinusoidalSmoothingConv1d(torch.nn.Conv1d): From ac94abfe2a7203c51bfd2448775e7679676c1a2f Mon Sep 17 00:00:00 2001 From: agentasteriski <99069711+agentasteriski@users.noreply.github.com> Date: Mon, 14 Apr 2025 12:20:10 -0400 Subject: [PATCH 11/21] correct variable typo --- preprocessing/acoustic_binarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index e1b535be1..0bd4f50e9 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -218,7 +218,7 @@ def process_item(self, item_name, meta_data, binarization_args): if self.need_falsetto: # get ground truth falsetto - falsetto = get_falestto_base_harmonic( + falsetto = get_falsetto_base_harmonic( dec_waveform, None, None, length=length ) From 7d04fe86aaf0faae042015699269482a9cff70d9 Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 16 Apr 2025 11:52:18 +0800 Subject: [PATCH 12/21] change 'falsetto' to 'falsetto_dev' --- modules/fastspeech/param_adaptor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index 9ad835b18..ce0016263 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -7,7 +7,7 @@ from utils import filter_kwargs from utils.hparams import hparams -VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension', 'falsetto'] +VARIANCE_CHECKLIST = ['energy', 'breathiness', 'voicing', 'tension', 'falsetto_dev'] class ParameterAdaptorModule(torch.nn.Module): @@ -28,7 +28,7 @@ def __init__(self): if self.predict_tension: self.variance_prediction_list.append('tension') if self.predict_falsetto: - self.variance_prediction_list.append('falsetto') + self.variance_prediction_list.append('falsetto_dev') self.predict_variances = len(self.variance_prediction_list) > 0 def build_adaptor(self, cls=MultiVarianceDiffusion): From e168058f892bca3c87044768c5066ebc52fcad5c Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 16 Apr 2025 11:55:03 +0800 Subject: [PATCH 13/21] change 'falsetto' to 'falsetto_dev' --- modules/fastspeech/acoustic_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/fastspeech/acoustic_encoder.py b/modules/fastspeech/acoustic_encoder.py index dce265568..669662c42 100644 --- a/modules/fastspeech/acoustic_encoder.py +++ b/modules/fastspeech/acoustic_encoder.py @@ -43,7 +43,7 @@ def __init__(self, vocab_size): if self.use_tension_embed: self.variance_embed_list.append('tension') if self.use_falsetto_embed: - 
self.variance_embed_list.append('falsetto') + self.variance_embed_list.append('falsetto_dev') self.use_variance_embeds = len(self.variance_embed_list) > 0 if self.use_variance_embeds: From 01afd71f749ec9f52cf688d6f9de4b6750d7194a Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 16 Apr 2025 12:08:40 +0800 Subject: [PATCH 14/21] Change variance model args for more variance embed --- configs/templates/config_variance.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index e62ebca63..d27cfe888 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -111,11 +111,11 @@ pitch_prediction_args: # strong_cond: true variances_prediction_args: - total_repeat_bins: 48 + total_repeat_bins: 64 backbone_type: 'wavenet' backbone_args: num_layers: 10 - num_channels: 192 + num_channels: 256 dilation_cycle_length: 4 # backbone_type: 'lynxnet' # backbone_args: From 3ea086161c1dec75431b87651edd7bd90f56f393 Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 16 Apr 2025 12:12:34 +0800 Subject: [PATCH 15/21] Change variance model args for more variance embed --- inference/ds_acoustic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/ds_acoustic.py b/inference/ds_acoustic.py index 7ad1b08ab..5d62c236e 100644 --- a/inference/ds_acoustic.py +++ b/inference/ds_acoustic.py @@ -35,7 +35,7 @@ def __init__(self, device=None, load_model=True, load_vocoder=True, ckpt_steps=N if hparams.get('use_tension_embed', False): self.variances_to_embed.add('tension') if hparams.get('use_falsetto_embed', False): - self.variances_to_embed.add('falsetto') + self.variances_to_embed.add('falsetto_dev') self.phoneme_dictionary = load_phoneme_dictionary() if hparams['use_spk_id']: From 7e9292409c7dd9e391f5e853822b4708bb964edd Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Wed, 16 Apr 2025 14:26:16 +0800 Subject: [PATCH 16/21] change 'falsetto' to 'falsetto_dev' --- preprocessing/acoustic_binarizer.py | 4 ++-- preprocessing/variance_binarizer.py | 4 ++-- training/acoustic_task.py | 4 ++-- training/variance_task.py | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 0bd4f50e9..10872deee 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -45,7 +45,7 @@ 'breathiness', 'voicing', 'tension', - 'falsetto', + 'falsetto_dev', 'key_shift', 'speed', ] @@ -229,7 +229,7 @@ def process_item(self, item_name, meta_data, binarization_args): ).eval().to(self.device) falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0] - processed_input['falsetto'] = falsetto.cpu().numpy() + processed_input['falsetto_dev'] = falsetto.cpu().numpy() if hparams['use_key_shift_embed']: processed_input['key_shift'] = 0. 
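Editorial aside (not part of the patches): every extracted curve in these binarizers passes through SinusoidalSmoothingConv1d with a kernel width of smooth_width / timestep frames, as the context lines above show. A self-contained sketch of that call pattern, where the hop size and sample rate are assumed example values:

import numpy as np
import torch
from utils.binarizer_utils import SinusoidalSmoothingConv1d

timestep = 512 / 44100  # assumed hop_size / samplerate
smooth = SinusoidalSmoothingConv1d(round(0.12 / timestep)).eval()  # falsetto_smooth_width = 0.12 s
raw_curve = np.random.rand(1000).astype(np.float32)  # stand-in for a raw frame-level falsetto curve
with torch.no_grad():
    smoothed = smooth(torch.from_numpy(raw_curve)[None])[0].numpy()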
diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3c2f7fe15..e640fa616 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -49,7 +49,7 @@ 'breathiness', # frame-level RMS of aperiodic parts (dB), float32[T_s,] 'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,] 'tension', # frame-level tension (logit), float32[T_s,] - 'falsetto', # frame-level falsetto (ratio), float32[T_s,] + 'falsetto_dev', # frame-level falsetto (ratio), float32[T_s,] ] DS_INDEX_SEP = '#' @@ -543,7 +543,7 @@ def process_item(self, item_name, meta_data, binarization_args): ).eval().to(self.device) falsetto = falsetto_smooth(torch.from_numpy(falsetto).to(self.device)[None])[0].cpu().numpy() - processed_input['falsetto'] = falsetto + processed_input['falsetto_dev'] = falsetto return processed_input diff --git a/training/acoustic_task.py b/training/acoustic_task.py index 3118e6f82..71c6fc7d5 100644 --- a/training/acoustic_task.py +++ b/training/acoustic_task.py @@ -32,7 +32,7 @@ def __init__(self, prefix, preload=False): if hparams['use_tension_embed']: self.required_variances['tension'] = 0.0 if hparams['use_falsetto_embed']: - self.required_variances['falsetto'] = 0.0 + self.required_variances['falsetto_dev'] = 0.0 self.need_key_shift = hparams['use_key_shift_embed'] self.need_speed = hparams['use_speed_embed'] @@ -95,7 +95,7 @@ def __init__(self): if hparams['use_tension_embed']: self.required_variances.append('tension') if hparams['use_falsetto_embed']: - self.required_variances.append('falsetto') + self.required_variances.append('falsetto_dev') super()._finish_init() def _build_model(self): diff --git a/training/variance_task.py b/training/variance_task.py index c9595781f..c69dc9af7 100644 --- a/training/variance_task.py +++ b/training/variance_task.py @@ -69,7 +69,7 @@ def collater(self, samples): if hparams['predict_tension']: batch['tension'] = utils.collate_nd([s['tension'] for s in samples], 0) if hparams['predict_falsetto']: - batch['falsetto'] = utils.collate_nd([s['falsetto'] for s in samples], 0) + batch['falsetto_dev'] = utils.collate_nd([s['falsetto_dev'] for s in samples], 0) return batch @@ -116,7 +116,7 @@ def __init__(self): if predict_tension: self.variance_prediction_list.append('tension') if predict_falsetto: - self.variance_prediction_list.append('falsetto') + self.variance_prediction_list.append('falsetto_dev') self.predict_variances = len(self.variance_prediction_list) > 0 self.lambda_var_loss = hparams['lambda_var_loss'] super()._finish_init() @@ -186,7 +186,7 @@ def run_model(self, sample, infer=False): breathiness = sample.get('breathiness') # [B, T_s] voicing = sample.get('voicing') # [B, T_s] tension = sample.get('tension') # [B, T_s] - falsetto = sample.get('falsetto') # [B, T_s] + falsetto = sample.get('falsetto_dev') # [B, T_s] pitch_retake = variance_retake = None if (self.predict_pitch or self.predict_variances) and not infer: @@ -209,7 +209,7 @@ def run_model(self, sample, infer=False): note_midi=note_midi, note_rest=note_rest, note_dur=note_dur, note_glide=note_glide, mel2note=mel2note, base_pitch=base_pitch, pitch=pitch, - energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, falsetto=falsetto, + energy=energy, breathiness=breathiness, voicing=voicing, tension=tension, falsetto_dev=falsetto, pitch_retake=pitch_retake, variance_retake=variance_retake, spk_id=spk_ids, infer=infer ) From 5169daf8604bce124abdff2eed37230a165e0131 Mon Sep 17 00:00:00 2001 From: KakaruHayate 
Date: Thu, 17 Apr 2025 19:32:57 +0800 Subject: [PATCH 17/21] Revert "Merge branch 'muon_dev' into falsetto" This reverts commit 4f4dece53545687f2f6d88de175350f2f41712ed, reversing changes made to acf194efee6d5c64a6855f0e9d7072f7150e1042. --- basics/base_task.py | 2 +- configs/acoustic.yaml | 18 ++-- configs/templates/config_acoustic.yaml | 19 ++-- configs/templates/config_variance.yaml | 56 ++++++----- configs/variance.yaml | 33 +++---- modules/backbones/__init__.py | 4 +- modules/backbones/lynxnet.py | 20 +++- modules/backbones/lynxnet2.py | 101 ------------------- modules/backbones/wavenet.py | 8 +- modules/commons/common_layers.py | 26 +---- modules/fastspeech/tts_modules.py | 39 +++----- modules/fastspeech/variance_encoder.py | 3 +- modules/optimizer/chained_optimizer.py | 122 ----------------------- modules/optimizer/muon.py | 129 ------------------------- utils/__init__.py | 3 +- 15 files changed, 97 insertions(+), 486 deletions(-) delete mode 100644 modules/backbones/lynxnet2.py delete mode 100644 modules/optimizer/chained_optimizer.py delete mode 100644 modules/optimizer/muon.py diff --git a/basics/base_task.py b/basics/base_task.py index 656893d96..065f8273a 100644 --- a/basics/base_task.py +++ b/basics/base_task.py @@ -307,7 +307,7 @@ def build_optimizer(self, model): optimizer = build_object_from_class_name( optimizer_args['optimizer_cls'], torch.optim.Optimizer, - model if optimizer_args['optimizer_cls'] == 'modules.optimizer.muon.Muon_AdamW' else model.parameters(), + model.parameters(), **optimizer_args ) return optimizer diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 0edeb5d55..3d7d4c591 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -72,12 +72,13 @@ sampling_steps: 20 diff_accelerator: ddim diff_speedup: 10 hidden_size: 256 -backbone_type: 'lynxnet2' +backbone_type: 'lynxnet' backbone_args: num_channels: 1024 num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true main_loss_type: l2 main_loss_log_norm: false schedule_type: 'linear' @@ -106,25 +107,20 @@ lambda_aux_mel_loss: 0.2 # train and eval num_sanity_val_steps: 1 optimizer_args: - optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 - muon_args: - weight_decay: 0.1 - adamw_args: - weight_decay: 0.0 lr_scheduler_args: - step_size: 5000 - gamma: 0.8 + step_size: 10000 + gamma: 0.75 max_batch_frames: 50000 max_batch_size: 64 dataset_size_key: 'lengths' val_with_vocoder: true val_check_interval: 2000 num_valid_plots: 10 -max_updates: 100000 +max_updates: 160000 num_ckpt_keep: 5 -permanent_ckpt_start: 60000 -permanent_ckpt_interval: 10000 +permanent_ckpt_start: 80000 +permanent_ckpt_interval: 20000 finetune_enabled: false finetune_ckpt_path: null diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index e5c921d85..46202ab96 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -77,12 +77,13 @@ T_start: 0.4 T_start_infer: 0.4 K_step: 300 K_step_infer: 300 -backbone_type: 'lynxnet2' +backbone_type: 'lynxnet' backbone_args: num_channels: 1024 num_layers: 6 kernel_size: 31 dropout_rate: 0.0 + strong_cond: true #backbone_type: 'wavenet' #backbone_args: # num_channels: 512 @@ -102,24 +103,20 @@ shallow_diffusion_args: lambda_aux_mel_loss: 0.2 optimizer_args: - optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 - muon_args: - weight_decay: 0.1 - adamw_args: - weight_decay: 0.0 lr_scheduler_args: - step_size: 5000 - gamma: 0.8 + scheduler_cls: torch.optim.lr_scheduler.StepLR + 
step_size: 10000 + gamma: 0.75 max_batch_frames: 50000 max_batch_size: 64 -max_updates: 100000 +max_updates: 160000 num_valid_plots: 10 val_with_vocoder: true val_check_interval: 2000 num_ckpt_keep: 5 -permanent_ckpt_start: 60000 -permanent_ckpt_interval: 10000 +permanent_ckpt_start: 120000 +permanent_ckpt_interval: 20000 pl_trainer_devices: 'auto' pl_trainer_precision: '16-mixed' diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 21a07fd71..d27cfe888 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -71,8 +71,8 @@ enc_ffn_kernel_size: 3 use_rope: true hidden_size: 256 dur_prediction_args: - arch: resnet - hidden_size: 256 + arch: fs2 + hidden_size: 512 dropout: 0.1 num_layers: 5 kernel_size: 3 @@ -98,52 +98,50 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 -# backbone_type: 'wavenet' -# backbone_args: -# num_layers: 20 -# num_channels: 256 -# dilation_cycle_length: 5 - backbone_type: 'lynxnet2' + backbone_type: 'wavenet' backbone_args: - num_layers: 6 - num_channels: 512 - dropout_rate: 0.0 + num_layers: 20 + num_channels: 256 + dilation_cycle_length: 5 +# backbone_type: 'lynxnet' +# backbone_args: +# num_layers: 6 +# num_channels: 512 +# dropout_rate: 0.0 +# strong_cond: true variances_prediction_args: total_repeat_bins: 64 -# backbone_type: 'wavenet' -# backbone_args: -# num_layers: 10 -# num_channels: 192 -# dilation_cycle_length: 4 - backbone_type: 'lynxnet2' + backbone_type: 'wavenet' backbone_args: - num_layers: 6 - num_channels: 384 - dropout_rate: 0.0 + num_layers: 10 + num_channels: 256 + dilation_cycle_length: 4 +# backbone_type: 'lynxnet' +# backbone_args: +# num_layers: 6 +# num_channels: 384 +# dropout_rate: 0.0 +# strong_cond: true lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 lambda_var_loss: 1.0 optimizer_args: - optimizer_cls: modules.optimizer.muon.Muon_AdamW lr: 0.0006 - muon_args: - weight_decay: 0.1 - adamw_args: - weight_decay: 0.0 lr_scheduler_args: - step_size: 5000 - gamma: 0.8 + scheduler_cls: torch.optim.lr_scheduler.StepLR + step_size: 10000 + gamma: 0.75 max_batch_frames: 80000 max_batch_size: 48 -max_updates: 100000 +max_updates: 160000 num_valid_plots: 10 val_check_interval: 2000 num_ckpt_keep: 5 -permanent_ckpt_start: 60000 +permanent_ckpt_start: 80000 permanent_ckpt_interval: 10000 pl_trainer_devices: 'auto' pl_trainer_precision: '16-mixed' diff --git a/configs/variance.yaml b/configs/variance.yaml index 98cb2fda3..62e9c60e9 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -41,8 +41,8 @@ rel_pos: true hidden_size: 256 dur_prediction_args: - arch: resnet - hidden_size: 256 + arch: fs2 + hidden_size: 512 dropout: 0.1 num_layers: 5 kernel_size: 3 @@ -66,11 +66,11 @@ pitch_prediction_args: pitd_clip_min: -12.0 pitd_clip_max: 12.0 repeat_bins: 64 - backbone_type: 'lynxnet2' + backbone_type: 'wavenet' backbone_args: - num_layers: 6 - num_channels: 512 - dropout_rate: 0.0 + num_layers: 20 + num_channels: 256 + dilation_cycle_length: 5 energy_db_min: -96.0 energy_db_max: -12.0 @@ -93,11 +93,11 @@ falsetto_smooth_width: 0.12 variances_prediction_args: total_repeat_bins: 48 - backbone_type: 'lynxnet2' + backbone_type: 'wavenet' backbone_args: - num_layers: 6 - num_channels: 384 - dropout_rate: 0.0 + num_layers: 10 + num_channels: 192 + dilation_cycle_length: 4 lambda_dur_loss: 1.0 lambda_pitch_loss: 1.0 @@ -119,23 +119,18 @@ diff_speedup: 10 # train and eval num_sanity_val_steps: 1 optimizer_args: - optimizer_cls: 
modules.optimizer.muon.Muon_AdamW lr: 0.0006 - muon_args: - weight_decay: 0.1 - adamw_args: - weight_decay: 0.0 lr_scheduler_args: - step_size: 5000 - gamma: 0.8 + step_size: 10000 + gamma: 0.75 max_batch_frames: 80000 max_batch_size: 48 dataset_size_key: 'lengths' val_check_interval: 2000 num_valid_plots: 10 -max_updates: 100000 +max_updates: 160000 num_ckpt_keep: 5 -permanent_ckpt_start: 60000 +permanent_ckpt_start: 80000 permanent_ckpt_interval: 10000 finetune_enabled: false diff --git a/modules/backbones/__init__.py b/modules/backbones/__init__.py index ebd903456..8fce796ab 100644 --- a/modules/backbones/__init__.py +++ b/modules/backbones/__init__.py @@ -1,13 +1,11 @@ import torch.nn from modules.backbones.wavenet import WaveNet from modules.backbones.lynxnet import LYNXNet -from modules.backbones.lynxnet2 import LYNXNet2 from utils import filter_kwargs BACKBONES = { 'wavenet': WaveNet, - 'lynxnet': LYNXNet, - 'lynxnet2': LYNXNet2, + 'lynxnet': LYNXNet } diff --git a/modules/backbones/lynxnet.py b/modules/backbones/lynxnet.py index 88b2be348..5dbd1d0a1 100644 --- a/modules/backbones/lynxnet.py +++ b/modules/backbones/lynxnet.py @@ -6,10 +6,26 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU, Conv1d, Transpose +from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU from utils.hparams import hparams +class Conv1d(torch.nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.kaiming_normal_(self.weight) + + +class Transpose(nn.Module): + def __init__(self, dims): + super().__init__() + assert len(dims) == 2, 'dims must be a tuple of two dimensions' + self.dims = dims + + def forward(self, x): + return x.transpose(*self.dims) + + class LYNXConvModule(nn.Module): @staticmethod def calc_same_padding(kernel_size): @@ -134,7 +150,7 @@ def forward(self, spec, diffusion_step, cond): # post-norm x = self.norm(x.transpose(1, 2)).transpose(1, 2) - # output_projection + # MLP and GLU x = self.output_projection(x) # [B, 128, T] if self.n_feats == 1: diff --git a/modules/backbones/lynxnet2.py b/modules/backbones/lynxnet2.py deleted file mode 100644 index 5a10a856b..000000000 --- a/modules/backbones/lynxnet2.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU, Conv1d, Transpose -from utils.hparams import hparams - - -class LYNXNet2Block(nn.Module): - def __init__(self, dim, expansion_factor, kernel_size=31, dropout=0.): - super().__init__() - inner_dim = int(dim * expansion_factor) - if float(dropout) > 0.: - _dropout = nn.Dropout(dropout) - else: - _dropout = nn.Identity() - self.net = nn.Sequential( - nn.LayerNorm(dim), - Transpose((1, 2)), - nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim), - Transpose((1, 2)), - nn.Linear(dim, inner_dim * 2), - SwiGLU(), - nn.Linear(inner_dim, inner_dim * 2), - SwiGLU(), - nn.Linear(inner_dim, dim), - _dropout - ) - - def forward(self, x): - return x + self.net(x) - - -class LYNXNet2(nn.Module): - def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=1, kernel_size=31, - dropout=0.0): - """ - LYNXNet2(Linear Gated Depthwise Separable Convolution Network Version 2) - """ - super().__init__() - self.in_dims = in_dims - self.n_feats = n_feats - self.input_projection = nn.Linear(in_dims * n_feats, num_channels) - self.conditioner_projection = 
nn.Linear(hparams['hidden_size'], num_channels) - self.diffusion_embedding = nn.Sequential( - SinusoidalPosEmb(num_channels), - nn.Linear(num_channels, num_channels * 4), - nn.GELU(), - nn.Linear(num_channels * 4, num_channels), - ) - self.residual_layers = nn.ModuleList( - [ - LYNXNet2Block( - dim=num_channels, - expansion_factor=expansion_factor, - kernel_size=kernel_size, - dropout=dropout - ) - for i in range(num_layers) - ] - ) - self.norm = nn.LayerNorm(num_channels) - self.output_projection = nn.Linear(num_channels, in_dims * n_feats) - nn.init.kaiming_normal_(self.input_projection.weight) - nn.init.kaiming_normal_(self.conditioner_projection.weight) - nn.init.zeros_(self.output_projection.weight) - - def forward(self, spec, diffusion_step, cond): - """ - :param spec: [B, F, M, T] - :param diffusion_step: [B, 1] - :param cond: [B, H, T] - :return: - """ - - if self.n_feats == 1: - x = spec[:, 0] # [B, M, T] - else: - x = spec.flatten(start_dim=1, end_dim=2) # [B, F x M, T] - - x = self.input_projection(x.transpose(1, 2)) # [B, T, F x M] - x = x + self.conditioner_projection(cond.transpose(1, 2)) - x = x + self.diffusion_embedding(diffusion_step).unsqueeze(1) - - for layer in self.residual_layers: - x = layer(x) - - # post-norm - x = self.norm(x) - - # output projection - x = self.output_projection(x).transpose(1, 2) # [B, 128, T] - - if self.n_feats == 1: - x = x[:, None, :, :] - else: - # This is the temporary solution since PyTorch 1.13 - # does not support exporting aten::unflatten to ONNX - # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims)) - x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2]) - return x diff --git a/modules/backbones/wavenet.py b/modules/backbones/wavenet.py index 58724e5aa..08e57eff4 100644 --- a/modules/backbones/wavenet.py +++ b/modules/backbones/wavenet.py @@ -5,10 +5,16 @@ import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import SinusoidalPosEmb, Conv1d +from modules.commons.common_layers import SinusoidalPosEmb from utils.hparams import hparams +class Conv1d(torch.nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.kaiming_normal_(self.weight) + + class ResidualBlock(nn.Module): def __init__(self, encoder_hidden, residual_channels, dilation): super().__init__() diff --git a/modules/commons/common_layers.py b/modules/commons/common_layers.py index 77381e2de..bf4a2822c 100644 --- a/modules/commons/common_layers.py +++ b/modules/commons/common_layers.py @@ -114,33 +114,9 @@ def forward(self, x): # out, gate = x.chunk(2, dim=self.dim) # Using torch.split instead of chunk for ONNX export compatibility. 
out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim) - gate = F.silu(gate) - if x.dtype == torch.float16: - out_min, out_max = torch.aminmax(out.detach()) - gate_min, gate_max = torch.aminmax(gate.detach()) - max_abs_out = torch.max(-out_min, out_max).float() - max_abs_gate = torch.max(-gate_min, gate_max).float() - if max_abs_out * max_abs_gate > 1000: - return (out.float() * gate.float()).clamp(-1000, 1000).half() - return out * gate + return out * F.silu(gate) -class Conv1d(torch.nn.Conv1d): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - nn.init.kaiming_normal_(self.weight) - - -class Transpose(nn.Module): - def __init__(self, dims): - super().__init__() - assert len(dims) == 2, 'dims must be a tuple of two dimensions' - self.dims = dims - - def forward(self, x): - return x.transpose(*self.dims) - - class TransformerFFNLayer(nn.Module): def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0., act='gelu'): super().__init__() diff --git a/modules/fastspeech/tts_modules.py b/modules/fastspeech/tts_modules.py index 16b358e3c..391de11ab 100644 --- a/modules/fastspeech/tts_modules.py +++ b/modules/fastspeech/tts_modules.py @@ -62,7 +62,7 @@ class DurationPredictor(torch.nn.Module): """ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, - dropout_rate=0.1, offset=1.0, dur_loss_type='mse', arch='resnet'): + dropout_rate=0.1, offset=1.0, dur_loss_type='mse'): """Initialize duration predictor module. Args: in_dims (int): Input dimension. @@ -76,29 +76,16 @@ def __init__(self, in_dims, n_layers=2, n_chans=384, kernel_size=3, self.offset = offset self.conv = torch.nn.ModuleList() self.kernel_size = kernel_size - self.use_resnet = (arch == 'resnet') for idx in range(n_layers): in_chans = in_dims if idx == 0 else n_chans - if self.use_resnet: - self.conv.append(nn.Sequential( - LayerNorm(in_chans, dim=1), - nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), - nn.ReLU(), - nn.Conv1d(n_chans, n_chans, 1), - nn.Dropout(dropout_rate) - )) - else: - self.conv.append(nn.Sequential( - nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d - nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), - nn.ReLU(), - LayerNorm(n_chans, dim=1), - nn.Dropout(dropout_rate) - )) - if self.use_resnet and in_dims != n_chans: - self.res_conv = nn.Conv1d(in_dims, n_chans, 1) - else: - self.res_conv = None + self.conv.append(torch.nn.Sequential( + torch.nn.Identity(), # this is a placeholder for ConstantPad1d which is now merged into Conv1d + torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), + torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), + torch.nn.Dropout(dropout_rate) + )) + self.loss_type = dur_loss_type if self.loss_type in ['mse', 'huber']: self.out_dims = 1 @@ -134,12 +121,8 @@ def forward(self, xs, x_masks=None, infer=True): xs = xs.transpose(1, -1) # (B, idim, Tmax) masks = 1 - x_masks.float() masks_ = masks[:, None, :] - for idx, f in enumerate(self.conv): - if self.use_resnet: - residual = self.res_conv(xs) if idx == 0 and self.res_conv is not None else xs - xs = residual + f(xs) - else: - xs = f(xs) + for f in self.conv: + xs = f(xs) # (B, C, Tmax) if x_masks is not None: xs = xs * masks_ xs = self.linear(xs.transpose(1, -1)) # [B, T, C] diff --git a/modules/fastspeech/variance_encoder.py b/modules/fastspeech/variance_encoder.py index 557ee6ea1..deab9ee84 100644 --- a/modules/fastspeech/variance_encoder.py +++ 
b/modules/fastspeech/variance_encoder.py @@ -46,8 +46,7 @@ def __init__(self, vocab_size): dropout_rate=dur_hparams['dropout'], kernel_size=dur_hparams['kernel_size'], offset=dur_hparams['log_offset'], - dur_loss_type=dur_hparams['loss_type'], - arch=dur_hparams['arch'] + dur_loss_type=dur_hparams['loss_type'] ) def forward( diff --git a/modules/optimizer/chained_optimizer.py b/modules/optimizer/chained_optimizer.py deleted file mode 100644 index b123f58e8..000000000 --- a/modules/optimizer/chained_optimizer.py +++ /dev/null @@ -1,122 +0,0 @@ -from torch import Tensor -from torch.optim import Optimizer -from torch.optim.optimizer import ParamsT -from dataclasses import dataclass -from typing import Any, Dict, List, Type, Callable, Optional, Iterable - - -@dataclass -class OptimizerSpec: - """Spec for creating an optimizer that is part of a `ChainedOptimizer`.""" - - class_type: Type[Optimizer] - init_args: Dict[str, Any] - param_filter: Optional[Callable[[Tensor], bool]] - - -class ChainedOptimizer(Optimizer): - """ - A wrapper around multiple optimizers that allows for chaining them together. - The optimizers are applied in the order they are passed in the constructor. - Each optimizer is responsible for updating a subset of the parameters, which - is determined by the `param_filter` function. If no optimizer is found for a - parameter group, an exception is raised. - """ - - def __init__( - self, - params: ParamsT, - optimizer_specs: List[OptimizerSpec], - lr: float, - weight_decay: float = 0.0, - optimizer_selection_callback: Optional[Callable[[Tensor, int], None]] = None, - **common_kwargs, - ): - self.optimizer_specs = optimizer_specs - self.optimizer_selection_callback = optimizer_selection_callback - self.optimizers: List[Optimizer] = [] - defaults = dict(lr=lr, weight_decay=weight_decay) - super().__init__(params, defaults) - - # Split the params for each optimzier - params_for_optimizers = [[] for _ in optimizer_specs] - for param_group in self.param_groups: - params = param_group["params"] - indices = param_group["optimizer_and_param_group_indices"] = set() - for param in params: - assert isinstance(param, Tensor), f"Expected a Tensor, got {type(param)}" - for index, spec in enumerate(optimizer_specs): - if spec.param_filter is None or spec.param_filter(param): - if self.optimizer_selection_callback is not None: - self.optimizer_selection_callback(param, index) - params_for_optimizers[index].append(param) - indices.add((index, 0)) - break - - # Initialize the optimizers - for spec, selected_params in zip(optimizer_specs, params_for_optimizers): - optimizer_args = { - 'lr': lr, - 'weight_decay': weight_decay, - } - optimizer_args.update(common_kwargs) - optimizer_args.update(spec.init_args) - optimizer = spec.class_type(selected_params, **optimizer_args) - self.optimizers.append(optimizer) - - def state_dict(self) -> Dict[str, Any]: - return { - "optimizers": [opt.state_dict() for opt in self.optimizers], - **super().state_dict(), - } - - def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - optimizers = state_dict.pop("optimizers") - super().load_state_dict(state_dict) - for i in range(len(self.optimizers)): - self.optimizers[i].load_state_dict(optimizers[i]) - - def zero_grad(self, set_to_none: bool = True) -> None: - for opt in self.optimizers: - opt.zero_grad(set_to_none=set_to_none) - - def _copy_lr_to_optimizers(self) -> None: - for param_group in self.param_groups: - indices = param_group["optimizer_and_param_group_indices"] - for optimizer_idx, param_group_idx 
in indices: - self.optimizers[optimizer_idx].param_groups[param_group_idx]["lr"] = param_group["lr"] - - def step(self, closure=None) -> None: - self._copy_lr_to_optimizers() - for opt in self.optimizers: - opt.step(closure) - - def add_param_group(self, param_group: Dict[str, Any]) -> None: - super().add_param_group(param_group) - - # If optimizer has not been initialized, skip adding the param groups - if not self.optimizers: - return - - # Split the params for each optimzier - params_for_optimizers = [[] for _ in self.optimizer_specs] - params = param_group["params"] - indices = param_group["optimizer_and_param_group_indices"] = set() - for param in params: - assert isinstance(param, Tensor), f"Expected a Tensor, got {type(param)}" - found_optimizer = False - for index, spec in enumerate(self.optimizer_specs): - if spec.param_filter is None or spec.param_filter(param): - if self.optimizer_selection_callback is not None: - self.optimizer_selection_callback(param, index) - params_for_optimizers[index].append(param) - indices.add((index, len(self.optimizers[index].param_groups))) - found_optimizer = True - break - if not found_optimizer: - raise ValueError("No valid optimizer found for the given parameter group") - - # Add the selected param group to the optimizers - for optimizer, selected_params in zip(self.optimizers, params_for_optimizers): - if selected_params: - optimizer.add_param_group({"params": selected_params}) diff --git a/modules/optimizer/muon.py b/modules/optimizer/muon.py deleted file mode 100644 index 9e59c9a5d..000000000 --- a/modules/optimizer/muon.py +++ /dev/null @@ -1,129 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torch.nn import Module, Parameter, Embedding -from typing import List -from .chained_optimizer import ChainedOptimizer, OptimizerSpec - - -def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: - """ - Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a - quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose - of minimizing steps, it turns out to be empirically effective to keep increasing the slope at - zero even beyond the point where the iteration no longer converges all the way to one everywhere - on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T - where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model - performance at all relative to UV^T, where USV^T = G is the SVD. - """ - assert G.ndim == 3 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng - a, b, c = (3.4445, -4.7750, 2.0315) - X = G.float() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = F.normalize(X, p=2.0, dim=(-2, -1), eps=1e-7) - - # Perform the NS iterations - for _ in range(steps): - A = X @ X.mT - B = torch.baddbmm(A, A, A, beta=b, alpha=c) - X = torch.baddbmm(X, B, X, beta=a, alpha=1) - - if G.size(-2) > G.size(-1): - X = X.mT - return X.to(G) - - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. 
To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - - Some warnings: - - This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - - Arguments: - lr: The learning rate used by the internal SGD. - momentum: The momentum used by the internal SGD. - nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) - ns_steps: The number of Newton-Schulz iteration steps to use. - """ - - def __init__(self, params, lr=5e-4, weight_decay=0.1, momentum=0.95, nesterov=True, ns_steps=5): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) - super().__init__(params, defaults) - - @torch.no_grad() - def step(self, closure=None): - for group in self.param_groups: - shape_groups = {} - for p in filter(lambda p: p.grad is not None, group["params"]): - g = p.grad - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf: Tensor = state["momentum_buffer"] - key = (p.shape, p.device, p.dtype) - if key not in shape_groups: - shape_groups[key] = {"params": [], "grads": [], "buffers": []} - shape_groups[key]["params"].append(p) - shape_groups[key]["grads"].append(g) - shape_groups[key]["buffers"].append(buf) - for key in shape_groups: - group_data = shape_groups[key] - g = torch.stack(group_data["grads"]) - buf = torch.stack(group_data["buffers"]) - buf.lerp_(g, 1 - group["momentum"]) - g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf - if g.ndim >= 4: # for the case of conv filters - g = g.view(g.size(0), g.size(1), -1) - g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) - for i, p in enumerate(group_data["params"]): - if group["weight_decay"] > 0: - p.data.mul_(1 - group["lr"] * group["weight_decay"]) - p.data.add_(g[i].view_as(p), alpha=-group["lr"] * max(g[i].size()) ** 0.5) - self.state[p]["momentum_buffer"] = buf[i].clone() - - -def get_params_for_muon(model) -> List[Parameter]: - """ - Filter parameters of a module into two groups: those that can be optimized by Muon, - and those that should be optimized by a standard optimizer. - Args: - module: The module to filter parameters for. - Returns: - A list of parameters that should be optimized with muon. 
- """ - muon_params = [] - for module in model.modules(): - for param in module.parameters(recurse=False): - if not param.requires_grad: - continue - if not isinstance(module, nn.Embedding) and param.ndim >= 2: - muon_params.append(param) - return muon_params - - -class Muon_AdamW(ChainedOptimizer): - def __init__(self, model, lr=0.0005, weight_decay=0.0, muon_args={}, adamw_args={}, verbose=False): - muon_params_id_set = set(id(p) for p in get_params_for_muon(model)) - spec_muon = OptimizerSpec(Muon, muon_args, lambda param: id(param) in muon_params_id_set) - spec_adamw = OptimizerSpec(torch.optim.AdamW, adamw_args, None) - specs = [spec_muon, spec_adamw] - callback = None - if verbose: - callback = lambda p, spec_idx: print( - f"Adding param {p.shape} to optimizer{spec_idx} {str(specs[spec_idx].class_type)}" - ) - super().__init__(model.parameters(), specs, lr=lr, weight_decay=weight_decay, optimizer_selection_callback=callback) \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py index 1f4c17c04..abb5df151 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -315,9 +315,8 @@ def helper(params): def simulate_lr_scheduler(optimizer_args, scheduler_args, step_count, num_param_groups=1): - optimizer_cls = optimizer_args['optimizer_cls'] optimizer = build_object_from_class_name( - 'torch.optim.AdamW' if optimizer_cls == 'modules.optimizer.muon.Muon_AdamW' else optimizer_cls, + optimizer_args['optimizer_cls'], torch.optim.Optimizer, [{'params': torch.nn.Parameter(), 'initial_lr': optimizer_args['lr']} for _ in range(num_param_groups)], **optimizer_args From 8603f7fd5222de5622887c8b09260c654f94686a Mon Sep 17 00:00:00 2001 From: Kakaru <97896816+KakaruHayate@users.noreply.github.com> Date: Tue, 22 Apr 2025 11:03:58 +0800 Subject: [PATCH 18/21] Fix missing mel2ph when only falsetto is predicted --- preprocessing/variance_binarizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index e640fa616..1ce3f3c35 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -83,7 +83,8 @@ def __init__(self): predict_breathiness = hparams['predict_breathiness'] predict_voicing = hparams['predict_voicing'] predict_tension = hparams['predict_tension'] - self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension + predict_falsetto = hparams['predict_falsetto'] + self.predict_variances = predict_energy or predict_breathiness or predict_voicing or predict_tension or predict_falsetto self.lr = LengthRegulator().to(self.device) self.prefer_ds = self.binarization_args['prefer_ds'] self.cached_ds = {} From 0ae9c5783e507d5890ac36f921675f23a0f206a2 Mon Sep 17 00:00:00 2001 From: Kakaru <97896816+KakaruHayate@users.noreply.github.com> Date: Thu, 29 May 2025 17:41:42 +0800 Subject: [PATCH 19/21] Update config_variance.yaml --- configs/templates/config_variance.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index d27cfe888..51447cb7a 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -111,7 +111,7 @@ pitch_prediction_args: # strong_cond: true variances_prediction_args: - total_repeat_bins: 64 + total_repeat_bins: 72 backbone_type: 'wavenet' backbone_args: num_layers: 10 num_channels: 256 dilation_cycle_length: 4 From 2b0969eb6ae9a9b47dc388c762ad44161eaaff44 Mon Sep 17 00:00:00 2001 From: Kakaru
<97896816+KakaruHayate@users.noreply.github.com> Date: Sat, 9 Aug 2025 13:14:40 +0800 Subject: [PATCH 20/21] retake/inpaint and expr and temperature --- modules/core/reflow.py | 100 +++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 23 deletions(-) diff --git a/modules/core/reflow.py b/modules/core/reflow.py index f09eb2392..eca705620 100644 --- a/modules/core/reflow.py +++ b/modules/core/reflow.py @@ -64,37 +64,46 @@ def forward(self, condition, gt_spec=None, src_spec=None, infer=True): return self.denorm_spec(x) @torch.no_grad() - def sample_euler(self, x, t, dt, cond): - x += self.velocity_fn(x, self.time_scale_factor * t, cond) * dt + def _get_velocity(self, x, t, cond, noise, base, expr, is_guidance): + v_pred = self.velocity_fn(x, t, cond) + if not is_guidance: + return v_pred + + v_guidance = base - noise + return expr * v_pred + (1 - expr) * v_guidance + + @torch.no_grad() + def sample_euler(self, x, t, dt, cond, noise, base, expr, is_guidance): + x += self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) * dt t += dt return x, t @torch.no_grad() - def sample_rk2(self, x, t, dt, cond): - k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) - k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) + def sample_rk2(self, x, t, dt, cond, noise, base, expr, is_guidance): + k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) + k_2 = self._get_velocity(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) x += k_2 * dt t += dt return x, t @torch.no_grad() - def sample_rk4(self, x, t, dt, cond): - k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) - k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) - k_3 = self.velocity_fn(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) - k_4 = self.velocity_fn(x + k_3 * dt, self.time_scale_factor * (t + dt), cond) + def sample_rk4(self, x, t, dt, cond, noise, base, expr, is_guidance): + k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) + k_2 = self._get_velocity(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) + k_3 = self._get_velocity(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) + k_4 = self._get_velocity(x + k_3 * dt, self.time_scale_factor * (t + dt), cond) x += (k_1 + 2 * k_2 + 2 * k_3 + k_4) * dt / 6 t += dt return x, t @torch.no_grad() - def sample_rk5(self, x, t, dt, cond): - k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) - k_2 = self.velocity_fn(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond) - k_3 = self.velocity_fn(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond) - k_4 = self.velocity_fn(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond) - k_5 = self.velocity_fn(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond) - k_6 = self.velocity_fn(x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, + def sample_rk5(self, x, t, dt, cond, noise, base, expr, is_guidance): + k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) + k_2 = self._get_velocity(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond, noise, base, expr, is_guidance) + k_3 = self._get_velocity(x + 
0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond, noise, base, expr, is_guidance) + k_4 = self._get_velocity(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) + k_5 = self._get_velocity(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond, noise, base, expr, is_guidance) + k_6 = self._get_velocity(x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, self.time_scale_factor * (t + dt), cond) x += (7 * k_1 + 32 * k_3 + 12 * k_4 + 32 * k_5 + 7 * k_6) * dt / 90 @@ -102,20 +111,59 @@ def sample_rk5(self, x, t, dt, cond, noise, base, expr, is_guidance): return x, t @torch.no_grad() - def inference(self, cond, b=1, x_end=None, device=None): - noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) + def inference(self, cond, b=1, x_end=None, device=None, input_mel=None, inpaint_mask=None, inpaint_weight=None, base=None, expr=1.0, temperature=1.0): + # Decide here whether the inpainting mechanism is enabled, and preprocess its inputs + # input_mel is aligned with the inference output ([B, T, M] or [B, F, T, M]) and is reshaped to align with noise ([B, F, M, T]) + # inpaint_mask is a 1-D boolean tensor ([B, T]), **aligned with retake, True marks the masked part**, reshaped to align with the time dimension ([B, 1, 1, T]) + # inpaint_weight is defined here as a frame-level value ([B, F, T] or [B, T]), reshaped to align with the time dimension ([B, F, 1, T]), with values in the range 0~1 + is_inpaint = inpaint_mask is not None and input_mel is not None and inpaint_weight is not None + + if is_inpaint: + inpaint_mask = inpaint_mask.float().to(device).unsqueeze(-2) # [B, F, 1, T] or [B, 1, T] + inpaint_weight = inpaint_weight.float().to(device).unsqueeze(-2) # [B, F, 1, T] or [B, 1, T] + input_mel = self.norm_spec(input_mel).transpose(-2, -1) # [B, F, M, T] or [B, M, T] + if self.num_feats == 1: + inpaint_mask = inpaint_mask[:, None, :, :] # [B, 1, 1, T] + inpaint_weight = inpaint_weight[:, None, :, :] # [B, 1, 1, T] + input_mel = input_mel[:, None, :, :] # [B, 1, M, T] + + # Training-Free Guidance + # base:[B, T] + is_guidance = base is not None and expr < 1.0 + + if is_guidance: + base = self.norm_spec(base).transpose(-2, -1).unsqueeze(-2) + if self.num_feats == 1: + base = base[:, None, :, :] # [B, 1, 1, T] + + # noise is multiplied by temperature here; temperature defaults to 1.0, and lowering temperature reduces the diversity of the results, and vice versa + # when temperature ≠ 1.0, sampling no longer matches training, which in theory degrades quality; in practice, tuning it can help because of differences in the data quality distribution + noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) * temperature t_start = hparams.get('T_start_infer', self.t_start) if self.use_shallow_diffusion and t_start > 0: assert x_end is not None, 'Missing shallow diffusion source.' + # in the shallow diffusion case, x_end is constructed here by combining input_mel with the output of the previous stage + # that is, for the preserved part, rendering also starts from input_mel + # the starting point does not take inpaint_weight into account + if is_inpaint: + x_end = x_end * inpaint_mask + input_mel * (1 - inpaint_mask) if t_start >= 1.: t_start = 1. x = x_end else: x = t_start * x_end + (1 - t_start) * noise else: + # handle the case of running shallow diffusion directly on input_mel, i.e. rendering depth also has to be considered for full-diffusion models + if is_inpaint: + if t_start >= 1.: + t_start = 1. + x = input_mel + else: + x = t_start * input_mel + (1 - t_start) * noise + else: + t_start = 0.
+ x = noise + algorithm = hparams['sampling_algorithm'] infer_step = hparams['sampling_steps'] @@ -132,7 +180,13 @@ def inference(self, cond, b=1, x_end=None, device=None, input_mel=None, inpaint_ dts = torch.tensor([dt]).to(x) for i in tqdm(range(infer_step), desc='sample time step', total=infer_step, disable=not hparams['infer'], leave=False): - x, _ = algorithm_fn(x, t_start + i * dts, dt, cond) + ti = t_start + i * dts + x, _ = algorithm_fn(x, ti, dt, cond, noise, base, expr, is_guidance) + # **Key**: at every step, correct the denoised result toward the preserved part plus its matching noise + # the correction follows inpaint_weight, i.e. how strongly each frame should be preserved + if is_inpaint: + weight = (1 - inpaint_mask) * inpaint_weight + x = x * (1 - weight) + (input_mel * ti + noise * (1 - ti)) * weight x = x.float() x = x.transpose(2, 3).squeeze(1) # [B, F, M, T] => [B, T, M] or [B, F, T, M] return x From f50c2ae8298306cc82e64145171fd9c4d39d4a85 Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Sat, 9 Aug 2025 14:27:05 +0800 Subject: [PATCH 21/21] Revert "retake/inpaint and expr and temperature" This reverts commit 2b0969eb6ae9a9b47dc388c762ad44161eaaff44. --- modules/core/reflow.py | 100 ++++++++++------------------------------- 1 file changed, 23 insertions(+), 77 deletions(-) diff --git a/modules/core/reflow.py b/modules/core/reflow.py index eca705620..f09eb2392 100644 --- a/modules/core/reflow.py +++ b/modules/core/reflow.py @@ -64,46 +64,37 @@ def forward(self, condition, gt_spec=None, src_spec=None, infer=True): return self.denorm_spec(x) @torch.no_grad() - def _get_velocity(self, x, t, cond, noise, base, expr, is_guidance): - v_pred = self.velocity_fn(x, t, cond) - if not is_guidance: - return v_pred - - v_guidance = base - noise - return expr * v_pred + (1 - expr) * v_guidance - - @torch.no_grad() - def sample_euler(self, x, t, dt, cond, noise, base, expr, is_guidance): - x += self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) * dt + def sample_euler(self, x, t, dt, cond): + x += self.velocity_fn(x, self.time_scale_factor * t, cond) * dt t += dt return x, t @torch.no_grad() - def sample_rk2(self, x, t, dt, cond, noise, base, expr, is_guidance): - k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) - k_2 = self._get_velocity(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) + def sample_rk2(self, x, t, dt, cond): + k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) + k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) x += k_2 * dt t += dt return x, t @torch.no_grad() - def sample_rk4(self, x, t, dt, cond, noise, base, expr, is_guidance): - k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) - k_2 = self._get_velocity(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) - k_3 = self._get_velocity(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) - k_4 = self._get_velocity(x + k_3 * dt, self.time_scale_factor * (t + dt), cond) + def sample_rk4(self, x, t, dt, cond): + k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) + k_2 = self.velocity_fn(x + 0.5 * k_1 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) + k_3 = self.velocity_fn(x + 0.5 * k_2 * dt, self.time_scale_factor * (t + 0.5 * dt), cond) + k_4 = self.velocity_fn(x + k_3 * dt, self.time_scale_factor * (t + dt), cond) x += (k_1 + 2 * k_2 + 2 * k_3 + k_4) * dt / 6 t += dt return x, t @torch.no_grad() - def sample_rk5(self, x, t, dt, cond, noise,
base, expr, is_guidance): - k_1 = self._get_velocity(x, self.time_scale_factor * t, cond, noise, base, expr, is_guidance) - k_2 = self._get_velocity(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond, noise, base, expr, is_guidance) - k_3 = self._get_velocity(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond, noise, base, expr, is_guidance) - k_4 = self._get_velocity(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond, noise, base, expr, is_guidance) - k_5 = self._get_velocity(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond, noise, base, expr, is_guidance) - k_6 = self._get_velocity(x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, + def sample_rk5(self, x, t, dt, cond): + k_1 = self.velocity_fn(x, self.time_scale_factor * t, cond) + k_2 = self.velocity_fn(x + 0.25 * k_1 * dt, self.time_scale_factor * (t + 0.25 * dt), cond) + k_3 = self.velocity_fn(x + 0.125 * (k_2 + k_1) * dt, self.time_scale_factor * (t + 0.25 * dt), cond) + k_4 = self.velocity_fn(x + 0.5 * (-k_2 + 2 * k_3) * dt, self.time_scale_factor * (t + 0.5 * dt), cond) + k_5 = self.velocity_fn(x + 0.0625 * (3 * k_1 + 9 * k_4) * dt, self.time_scale_factor * (t + 0.75 * dt), cond) + k_6 = self.velocity_fn(x + (-3 * k_1 + 2 * k_2 + 12 * k_3 - 12 * k_4 + 8 * k_5) * dt / 7, self.time_scale_factor * (t + dt), cond) x += (7 * k_1 + 32 * k_3 + 12 * k_4 + 32 * k_5 + 7 * k_6) * dt / 90 @@ -111,59 +102,20 @@ def sample_rk5(self, x, t, dt, cond, noise, base, expr, is_guidance): return x, t @torch.no_grad() - def inference(self, cond, b=1, x_end=None, device=None, input_mel=None, inpaint_mask=None, inpaint_weight=None, base=None, expr=1.0, temperature=1.0): - # Decide here whether the inpainting mechanism is enabled, and preprocess its inputs - # input_mel is aligned with the inference output ([B, T, M] or [B, F, T, M]) and is reshaped to align with noise ([B, F, M, T]) - # inpaint_mask is a 1-D boolean tensor ([B, T]), **aligned with retake, True marks the masked part**, reshaped to align with the time dimension ([B, 1, 1, T]) - # inpaint_weight is defined here as a frame-level value ([B, F, T] or [B, T]), reshaped to align with the time dimension ([B, F, 1, T]), with values in the range 0~1 - is_inpaint = inpaint_mask is not None and input_mel is not None and inpaint_weight is not None - - if is_inpaint: - inpaint_mask = inpaint_mask.float().to(device).unsqueeze(-2) # [B, F, 1, T] or [B, 1, T] - inpaint_weight = inpaint_weight.float().to(device).unsqueeze(-2) # [B, F, 1, T] or [B, 1, T] - input_mel = self.norm_spec(input_mel).transpose(-2, -1) # [B, F, M, T] or [B, M, T] - if self.num_feats == 1: - inpaint_mask = inpaint_mask[:, None, :, :] # [B, 1, 1, T] - inpaint_weight = inpaint_weight[:, None, :, :] # [B, 1, 1, T] - input_mel = input_mel[:, None, :, :] # [B, 1, M, T] - - # Training-Free Guidance - # base:[B, T] - is_guidance = base is not None and expr < 1.0 - - if is_guidance: - base = self.norm_spec(base).transpose(-2, -1).unsqueeze(-2) - if self.num_feats == 1: - base = base[:, None, :, :] # [B, 1, 1, T] - - # noise is multiplied by temperature here; temperature defaults to 1.0, and lowering temperature reduces the diversity of the results, and vice versa - # when temperature ≠ 1.0, sampling no longer matches training, which in theory degrades quality; in practice, tuning it can help because of differences in the data quality distribution - noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) * temperature + def inference(self, cond, b=1, x_end=None, device=None): + noise = torch.randn(b, self.num_feats, self.out_dims, cond.shape[2], device=device) t_start = hparams.get('T_start_infer', self.t_start) if self.use_shallow_diffusion and t_start > 0: assert x_end is not None, 'Missing shallow diffusion source.'
- # in the shallow diffusion case, x_end is constructed here by combining input_mel with the output of the previous stage - # that is, for the preserved part, rendering also starts from input_mel - # the starting point does not take inpaint_weight into account - if is_inpaint: - x_end = x_end * inpaint_mask + input_mel * (1 - inpaint_mask) if t_start >= 1.: t_start = 1. x = x_end else: x = t_start * x_end + (1 - t_start) * noise else: - # handle the case of running shallow diffusion directly on input_mel, i.e. rendering depth also has to be considered for full-diffusion models - if is_inpaint: - if t_start >= 1.: - t_start = 1. - x = input_mel - else: - x = t_start * input_mel + (1 - t_start) * noise - else: - t_start = 0. - x = noise - + t_start = 0. + x = noise + algorithm = hparams['sampling_algorithm'] infer_step = hparams['sampling_steps'] @@ -180,13 +132,7 @@ def inference(self, cond, b=1, x_end=None, device=None, input_mel=None, inpaint_ dts = torch.tensor([dt]).to(x) for i in tqdm(range(infer_step), desc='sample time step', total=infer_step, disable=not hparams['infer'], leave=False): - ti = t_start + i * dts - x, _ = algorithm_fn(x, ti, dt, cond, noise, base, expr, is_guidance) - # **Key**: at every step, correct the denoised result toward the preserved part plus its matching noise - # the correction follows inpaint_weight, i.e. how strongly each frame should be preserved - if is_inpaint: - weight = (1 - inpaint_mask) * inpaint_weight - x = x * (1 - weight) + (input_mel * ti + noise * (1 - ti)) * weight + x, _ = algorithm_fn(x, t_start + i * dts, dt, cond) x = x.float() x = x.transpose(2, 3).squeeze(1) # [B, F, M, T] => [B, T, M] or [B, F, T, M] return x
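
The retake/inpaint mechanism above lives entirely inside the reflow inference loop and is easy to lose once PATCH 21 reverts it, so a minimal self-contained sketch of its central idea follows. This is an illustrative reconstruction, not the repository's API: velocity_fn stands in for the trained network, the plain Euler loop ignores t_start and shallow diffusion, and shapes are fixed to the single-feature case ([B, 1, M, T] for x and input_mel, [B, 1, 1, T] for the mask and weight).

    import torch

    @torch.no_grad()
    def euler_inpaint_sample(velocity_fn, cond, input_mel, inpaint_mask, inpaint_weight,
                             infer_step=20, temperature=1.0):
        # inpaint_mask: 1 = retake (regenerate), 0 = preserve; inpaint_weight in [0, 1]
        # scales how strongly each preserved frame is pinned to input_mel
        noise = torch.randn_like(input_mel) * temperature  # temperature < 1 reduces diversity
        x = noise  # t = 0 is pure noise under the x_t = t * mel + (1 - t) * noise convention
        dt = 1.0 / infer_step
        for i in range(infer_step):
            ti = i * dt
            x = x + velocity_fn(x, ti, cond) * dt  # one Euler step along the predicted velocity
            # the key step from the patch: project the preserved region back onto the
            # straight-line trajectory of input_mel at time ti
            w = (1 - inpaint_mask) * inpaint_weight
            x = x * (1 - w) + (input_mel * ti + noise * (1 - ti)) * w
        return x

The training-free guidance from the same patch composes with this loop: under the rectified-flow convention, the straight path from noise to a reference base has velocity base - noise, so the blend expr * v_pred + (1 - expr) * (base - noise) in _get_velocity interpolates between the model's prediction and a direct pull toward the reference.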
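
For readers skimming the series as a whole, the surviving change is the falsetto_dev curve introduced in the earlier patches: the attenuation ratio from the second to the fourth harmonic, H2 / (H2 + H4), clipped to [0, 1] and smoothed before the binarizers store it. A hedged standalone sketch, assuming the per-harmonic energies have already been extracted (in the repository they come from DecomposedWaveform.harmonic plus get_energy_librosa) and substituting a Hann-window moving average for SinusoidalSmoothingConv1d; the default timestep of 512 / 44100 is only a placeholder hop_size/samplerate.

    import numpy as np

    def falsetto_dev_curve(energy_h2: np.ndarray, energy_h4: np.ndarray,
                           smooth_width: float = 0.12, timestep: float = 512 / 44100) -> np.ndarray:
        # H2 / (H2 + H4); the eps keeps silent frames from dividing by zero
        ratio = np.clip(energy_h2 / (energy_h2 + energy_h4 + 1e-5), 0.0, 1.0)
        # stand-in smoother: kernel width in frames matches falsetto_smooth_width (seconds)
        kernel = max(1, round(smooth_width / timestep))
        window = np.hanning(kernel + 2)[1:-1]
        return np.convolve(ratio, window / window.sum(), mode='same')

The smoothed curve is stored under the 'falsetto_dev' key, which is why every consumer, from VARIANCE_CHECKLIST to the binarizers and the training tasks, switches from 'falsetto' to 'falsetto_dev' in patches 12 through 16.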