diff --git a/helpers/model_init_scripts/init_dummy_model.py b/helpers/model_init_scripts/init_dummy_model.py
index 25f18e8..0919073 100644
--- a/helpers/model_init_scripts/init_dummy_model.py
+++ b/helpers/model_init_scripts/init_dummy_model.py
@@ -57,7 +57,11 @@
     model.generation_config.eos_token_id = encodec_vocab_size
 
     # set other default generation config params
-    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
+    # In DAC, 'model.audio_encoder.config.frame_rate' is 86.
+    # 'model.generation_config.max_length' should be smaller than the decoder's
+    # 'max_position_embeddings', so default to 20 s of generated audio instead
+    # of 30 s.
+    model.generation_config.max_length = int(20 * model.audio_encoder.config.frame_rate)
     model.generation_config.do_sample = True  # True
     model.generation_config.guidance_scale = 1  # 3.0
 
diff --git a/helpers/model_init_scripts/init_dummy_model_with_encodec.py b/helpers/model_init_scripts/init_dummy_model_with_encodec.py
index 32242b4..0cf0baa 100644
--- a/helpers/model_init_scripts/init_dummy_model_with_encodec.py
+++ b/helpers/model_init_scripts/init_dummy_model_with_encodec.py
@@ -54,7 +54,11 @@
     model.generation_config.eos_token_id = encodec_vocab_size
 
     # set other default generation config params
-    model.generation_config.max_length = int(30 * model.audio_encoder.config.frame_rate)
+    # In EnCodec, 'model.audio_encoder.config.frame_rate' is 75.
+    # 'model.generation_config.max_length' should be smaller than the decoder's
+    # 'max_position_embeddings', so default to 25 s of generated audio instead
+    # of 30 s.
+    model.generation_config.max_length = int(25 * model.audio_encoder.config.frame_rate)
     model.generation_config.do_sample = True  # True
     model.generation_config.guidance_scale = 1  # 3.0
 
diff --git a/training/run_parler_tts_training.py b/training/run_parler_tts_training.py
index 22e091f..726dbf8 100644
--- a/training/run_parler_tts_training.py
+++ b/training/run_parler_tts_training.py
@@ -43,7 +43,7 @@
 
 from accelerate import Accelerator
-from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin
+from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, DistributedDataParallelKwargs
 from accelerate.utils.memory import release_memory
 
 
 from parler_tts import (
@@ -97,7 +97,8 @@ def main():
     padding = "max_length" if data_args.pad_to_max_length else "longest"
 
     ####### A. Preparation
-    kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60))]
+    kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60)),
+                       DistributedDataParallelKwargs(find_unused_parameters=True)]
 
     accelerator = Accelerator(
         gradient_accumulation_steps=training_args.gradient_accumulation_steps,
@@ -432,8 +433,8 @@ def apply_audio_decoder(batch):
 
                 if accelerator.is_main_process:
                     lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16)
-                    rat = generate_labels["ratio"].cpu().squeeze()
-                    lens = generate_labels["len_audio"].cpu().squeeze()
+                    rat = generate_labels["ratio"].cpu().reshape(-1)
+                    lens = generate_labels["len_audio"].cpu().reshape(-1)
                     lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)]
 
                     all_generated_labels.extend(lab)
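
Note on the max_length changes in the two init scripts: 'max_length' counts audio codec frames, i.e. seconds of audio times the codec frame rate, and it has to stay below the decoder's 'max_position_embeddings'. The sketch below only illustrates that arithmetic; the helper name and the 2048-position decoder are assumptions for the example, not values taken from the repository.

    # Hypothetical helper illustrating the constraint described in the comments above.
    def seconds_to_max_length(seconds: float, frame_rate: float, max_position_embeddings: int) -> int:
        max_length = int(seconds * frame_rate)
        # The generated token sequence must fit into the decoder's position embeddings.
        assert max_length < max_position_embeddings, (
            f"max_length={max_length} must be smaller than "
            f"max_position_embeddings={max_position_embeddings}"
        )
        return max_length

    # DAC runs at 86 frames/s and EnCodec at 75 frames/s (per the comments in the diff).
    # With an assumed 2048-position decoder:
    #   30 s * 86 = 2580 frames (too long)    20 s * 86 = 1720 frames (fits)
    #   30 s * 75 = 2250 frames (too long)    25 s * 75 = 1875 frames (fits)
    print(seconds_to_max_length(20, 86, 2048))  # 1720
    print(seconds_to_max_length(25, 75, 2048))  # 1875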
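Note on the kwargs handler change: DistributedDataParallelKwargs(find_unused_parameters=True) tells Accelerate to forward find_unused_parameters=True to torch's DistributedDataParallel when the model is prepared, which is needed when some parameters receive no gradient in a given step (otherwise DDP errors out during the backward pass). A minimal sketch of the wiring, independent of the rest of the training script:

    from datetime import timedelta

    from accelerate import Accelerator
    from accelerate.utils import DistributedDataParallelKwargs, InitProcessGroupKwargs

    # Handlers are supplied once when the Accelerator is constructed; Accelerate
    # applies them when it wraps the model in accelerator.prepare(...).
    kwargs_handlers = [
        InitProcessGroupKwargs(timeout=timedelta(minutes=60)),
        DistributedDataParallelKwargs(find_unused_parameters=True),
    ]
    accelerator = Accelerator(kwargs_handlers=kwargs_handlers)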
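Note on the .squeeze() to .reshape(-1) change: .squeeze() removes every size-1 dimension, so a gathered batch with a single example collapses to a 0-dim tensor that zip() cannot iterate over, while .reshape(-1) always yields a 1-D tensor with one entry per example. A small, self-contained illustration (the shapes are made up for the example, not taken from the training script):

    import torch

    ratio = torch.tensor([[0.5]])  # pretend the gathered batch holds one example

    print(ratio.squeeze().shape)    # torch.Size([])  -> 0-dim, iteration/zip() fails
    print(ratio.reshape(-1).shape)  # torch.Size([1]) -> still one entry per example

    # reshape(-1) keeps the per-example pairing intact even when batch size is 1.
    for r in ratio.reshape(-1):
        print(float(r))  # 0.5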