172172 " !pip install onnx onnxsim #onnx==1.12.0 onnxsim==0.4.10\n " ,
173173 " clear_output()\n " ,
174174 " !aria2c https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip\n " ,
175+ " !aria2c https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip\n " ,
175176 " !unzip -q /content/nsf_hifigan_20221211.zip -d /content/DiffSinger/checkpoints\n " ,
177+ " !unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints\n " ,
178+ " !rm /content/nsf_hifigan_20221211.zip\n " ,
179+ " !rm /content/rmvpe.zip\n " ,
176180 " clear_output()\n " ,
177181 " !pip install --upgrade tensorboard\n " ,
178182 " clear_output()\n " ,
217221 " \n " ,
218222 " data_zip_path = \"\" #@param {type:\" string\" }\n " ,
219223 " \n " ,
224+ " #@markdown ___\n " ,
225+ " \n " ,
226+ " #@markdown this lower section is for variance training\n " ,
220227 " \n " ,
221228 " #@markdown <font size=\" -1.5\" > Use this if you don't have a .csv for the variance dataset (skippable if you are doing acoustic)\n " ,
222229 " \n " ,
223230 " estimate_midi = False # @param {type:\" boolean\" }\n " ,
224231 " \n " ,
232+ " #@markdown <font size=\" -1.5\" > Pitch extractor algorithm - for converting your data to DB-ready format (rmvpe is more accurate but has range limit)\n " ,
233+ " \n " ,
234+ " f0_ext = \" parselmouth\" # @param [\" parselmouth\" , \" rmvpe\" ]\n " ,
235+ " \n " ,
225236 " #@markdown <font size=\" -1.5\" > Use this if your data is not in diffsinger's preferred format (data are under 30 seconds | have \" AP\" label in your lab)\n " ,
226237 " \n " ,
227238 " default_converter_setting = False # @param {type:\" boolean\" }\n " ,
303314 " f.write(\" \" .join(consonant_data))\n " ,
304315 " \n " ,
305316 " # idk i just feel like 800 is a lil low for some people\n " ,
306- " new_f0_max = 1600 \n " ,
317+ " new_f0_max = 1760 \n " ,
307318 " og_script = \" /content/MakeDiffSinger/variance-temp-solution/get_pitch.py\"\n " ,
308319 " with open(og_script, 'r') as file:\n " ,
309320 " mate = file.read()\n " ,
314325 " if no_warn:\n " ,
315326 " !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt 2> /dev/null\n " ,
316327 " clear_output()\n " ,
317- " !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n " ,
328+ " !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_est} 2> /dev/null\n " ,
318329 " clear_output()\n " ,
319330 " !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n " ,
320331 " clear_output()\n " ,
327338 " else:\n " ,
328339 " !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt\n " ,
329340 " clear_output()\n " ,
330- " !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n " ,
341+ " !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_est} \n " ,
331342 " clear_output()\n " ,
332343 " !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n " ,
333344 " clear_output()\n " ,
368379 " #@markdown <font size=\" -1.5\" > Path to where you want to save your binary data for later use\n " ,
369380 " binary_save_dir = \"\" #@param{type:\" string\" }\n " ,
370381 " \n " ,
382+ " #@markdown <font size=\" -1.5\" > Pitch extractor algorithm\n " ,
383+ " \n " ,
384+ " f0_ext = \" parselmouth\" # @param [\" parselmouth\" , \" rmvpe\" ]\n " ,
385+ " if f0_ext == \" rmvpe\" :\n " ,
386+ " pe_ckpt_pth = \" checkpoints/rmvpe/model.pt\"\n " ,
387+ " else:\n " ,
388+ " pe_ckpt_pth = None # no checkpoint path unless rmvpe is selected; yaml.dump writes None as null\n " ,
389+ " \n " ,
371390 " #@markdown <font size=\" -1.5\" > Select this if you want to use data augmentation (default pitch shift and time stretch values)\n " ,
372391 " data_aug = False #@param {type:\" boolean\" }\n " ,
373392 " \n " ,
400419 " bitch_ass_config[\" use_speed_embed\" ] = data_aug\n " ,
401420 " bitch_ass_config[\" max_batch_size\" ] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n " ,
402421 " bitch_ass_config[\" val_check_interval\" ] = save_interval\n " ,
422+ " bitch_ass_config[\" pe\" ] = f0_ext\n " ,
423+ " bitch_ass_config[\" pe_ckpt\" ] = pe_ckpt_pth\n " ,
403424 " with open(\" /content/DiffSinger/configs/acoustic.yaml\" , \" w\" ) as config:\n " ,
404425 " yaml.dump(bitch_ass_config, config)\n " ,
405426 " else:\n " ,
412433 " bitch_ass_config[\" dictionary\" ] = \" dictionaries/custom_dict.txt\"\n " ,
413434 " bitch_ass_config[\" max_batch_size\" ] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n " ,
414435 " bitch_ass_config[\" val_check_interval\" ] = save_interval\n " ,
436+ " bitch_ass_config[\" pe\" ] = f0_ext # i think variance uses it for pitch ref as ground-truth for pitch training soooo\n " ,
437+ " bitch_ass_config[\" pe_ckpt\" ] = pe_ckpt_pth #same goes to this one\n " ,
415438 " with open(\" /content/DiffSinger/configs/variance.yaml\" , \" w\" ) as config:\n " ,
416439 " yaml.dump(bitch_ass_config, config)\n " ,
417440 " \n " ,
476499 " print(\"\\ n\" )\n " ,
477500 " print(f\" speaker name: {spk_name}\" )\n " ,
478501 " print(\"\\ n\" )\n " ,
502+ " print(f\" data augmentation: {data_aug}\" )\n " ,
503+ " print(\"\\ n\" )\n " ,
504+ " print(f\" pitch extractor: {f0_ext}\" )\n " ,
505+ " print(\"\\ n\" )\n " ,
479506 " print(f\" binary data save directory: {binary_save_dir}\" )\n " ,
480507 " print(\"\\ n\" )\n " ,
481508 " print(f\" your model will be saved every: {save_interval} steps\" )\n " ,
745772 },
746773 "execution_count" : null ,
747774 "outputs" : []
775+ },
776+ {
777+ "cell_type" : " markdown" ,
778+ "source" : [
779+ " # Last Section Note\n " ,
780+ " Wow you made it to the very bottom.... Why though lmao hahahahhshahhasdksajidhasjl\n " ,
781+ " \n " ,
782+ " Anyways, now that you are here i guess ill tell you my plan/todo list for this notebook \\\n " ,
783+ " (feel free to suggest anything via [discord](https://discord.com/invite/wwbu2JUMjj) my user display name is MLo7 and my user name is ghin_mlo7)\n " ,
784+ " \n " ,
785+ " todo list:\n " ,
786+ " - add support for premade/refined data\n " ,
787+ " - add multi-singer training\n " ,
788+ " - add OpenUtau voicebank builder\n " ,
789+ " - add link to vocoder training notebook (yet to be ready) or add a vocoder training section\n " ,
790+ " \n " ,
791+ " If you want to add anything to this list then again, just ping or message me lmao"
792+ ],
793+ "metadata" : {
794+ "id" : " Ljl8Yr6wM3Ma"
795+ }
748796 }
749797 ]
750798}
0 commit comments