Skip to content
This repository was archived by the owner on Dec 28, 2025. It is now read-only.

Commit 88bab4a

Browse files
committed
Added an option to use rmvpe
1 parent b5bc292 commit 88bab4a

File tree

1 file changed

+51
-3
lines changed

1 file changed

+51
-3
lines changed

DiffSinger_colab_notebook.ipynb

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,11 @@
172172
"!pip install onnx onnxsim #onnx==1.12.0 onnxsim==0.4.10\n",
173173
"clear_output()\n",
174174
"!aria2c https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip\n",
175+
"!aria2c https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip\n",
175176
"!unzip -q /content/nsf_hifigan_20221211.zip -d /content/DiffSinger/checkpoints\n",
177+
"!unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints\n",
178+
"!rm /content/nsf_hifigan_20221211.zip\n",
179+
"!rm /content/rmvpe.zip\n",
176180
"clear_output()\n",
177181
"!pip install --upgrade tensorboard\n",
178182
"clear_output()\n",
@@ -217,11 +221,18 @@
217221
"\n",
218222
"data_zip_path = \"\" #@param {type:\"string\"}\n",
219223
"\n",
224+
"#@markdown ___\n",
225+
"\n",
226+
"#@markdown this lower section is for variance training\n",
220227
"\n",
221228
"#@markdown <font size=\"-1.5\"> Use this if you don't have the .csv that is for the variance dataset (skippable if you are doing acoustic)\n",
222229
"\n",
223230
"estimate_midi = False # @param {type:\"boolean\"}\n",
224231
"\n",
232+
"#@markdown <font size=\"-1.5\"> Pitch extractor algorithm - for converting your data to DB-ready format (rmvpe is more accurate but has range limit)\n",
233+
"\n",
234+
"f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\"]\n",
235+
"\n",
225236
"#@markdown <font size=\"-1.5\"> Use this if your data is not in diffsinger's preferred format (data are under 30 seconds | have \"AP\" label in your lab)\n",
226237
"\n",
227238
"default_converter_setting = False # @param {type:\"boolean\"}\n",
@@ -303,7 +314,7 @@
303314
" f.write(\" \".join(consonant_data))\n",
304315
"\n",
305316
" # idk i just feel like 800 is a lil low for some people\n",
306-
" new_f0_max = 1600\n",
317+
" new_f0_max = 1760\n",
307318
" og_script = \"/content/MakeDiffSinger/variance-temp-solution/get_pitch.py\"\n",
308319
" with open(og_script, 'r') as file:\n",
309320
" mate = file.read()\n",
@@ -314,7 +325,7 @@
314325
" if no_warn:\n",
315326
" !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt 2> /dev/null\n",
316327
" clear_output()\n",
317-
" !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n",
328+
" !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_est} 2> /dev/null\n",
318329
" clear_output()\n",
319330
" !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n",
320331
" clear_output()\n",
@@ -327,7 +338,7 @@
327338
" else:\n",
328339
" !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt\n",
329340
" clear_output()\n",
330-
" !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n",
341+
" !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_est}\n",
331342
" clear_output()\n",
332343
" !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n",
333344
" clear_output()\n",
@@ -368,6 +379,14 @@
368379
"#@markdown <font size=\"-1.5\"> Path to where you want to save your binary data for later use\n",
369380
"binary_save_dir = \"\" #@param{type:\"string\"}\n",
370381
"\n",
382+
"#@markdown <font size=\"-1.5\"> Pitch extractor algorithm\n",
383+
"\n",
384+
"f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\"]\n",
385+
"if f0_ext == \"rmvpe\":\n",
386+
" pe_ckpt_pth = \"checkpoints/rmvpe/model.pt\"\n",
387+
"else:\n",
388+
" pe_ckpt_pth = null\n",
389+
"\n",
371390
"#@markdown <font size=\"-1.5\"> Select this if you want to use data augmentation (default pitch shift and time stretch values)\n",
372391
"data_aug = False #@param {type:\"boolean\"}\n",
373392
"\n",
@@ -400,6 +419,8 @@
400419
" bitch_ass_config[\"use_speed_embed\"] = data_aug\n",
401420
" bitch_ass_config[\"max_batch_size\"] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n",
402421
" bitch_ass_config[\"val_check_interval\"] = save_interval\n",
422+
" bitch_ass_config[\"pe\"] = f0_ext\n",
423+
" bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth\n",
403424
" with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"w\") as config:\n",
404425
" yaml.dump(bitch_ass_config, config)\n",
405426
"else:\n",
@@ -412,6 +433,8 @@
412433
" bitch_ass_config[\"dictionary\"] = \"dictionaries/custom_dict.txt\"\n",
413434
" bitch_ass_config[\"max_batch_size\"] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n",
414435
" bitch_ass_config[\"val_check_interval\"] = save_interval\n",
436+
" bitch_ass_config[\"pe\"] = f0_ext # i think variance uses it for pitch ref as ground-truth for pitch training soooo\n",
437+
" bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth #same goes to this one\n",
415438
" with open(\"/content/DiffSinger/configs/variance.yaml\", \"w\") as config:\n",
416439
" yaml.dump(bitch_ass_config, config)\n",
417440
"\n",
@@ -476,6 +499,10 @@
476499
"print(\"\\n\")\n",
477500
"print(f\"speaker name: {spk_name}\")\n",
478501
"print(\"\\n\")\n",
502+
"print(f\"data augmentation: {data_aug}\")\n",
503+
"print(\"\\n\")\n",
504+
"print(f\"pitch extractor: {f0_ext}\")\n",
505+
"print(\"\\n\")\n",
479506
"print(f\"binary data save directory: {binary_save_dir}\")\n",
480507
"print(\"\\n\")\n",
481508
"print(f\"your model will be saved every: {save_interval} steps\")\n",
@@ -745,6 +772,27 @@
745772
},
746773
"execution_count": null,
747774
"outputs": []
775+
},
776+
{
777+
"cell_type": "markdown",
778+
"source": [
779+
"# Last Section Note\n",
780+
"Wow you made it to the very bottom.... Why though lmao hahahahhshahhasdksajidhasjl\n",
781+
"\n",
782+
"Anyways, now that you are here i guess ill tell you my plan/todo list for this notebook \\\n",
783+
"(feel free to suggest anything via [discord](https://discord.com/invite/wwbu2JUMjj) my user display name is MLo7 and my user name is ghin_mlo7)\n",
784+
"\n",
785+
"todo list:\n",
786+
"- add support for premade/refined data\n",
787+
"- add multi-singer training\n",
788+
"- add OpenUtau voicebank builder\n",
789+
"- add link to vocoder training notebook (yet to be ready) or add a vocoder training section\n",
790+
"\n",
791+
"If you want to add anything to this list then again, just ping or message me lmao"
792+
],
793+
"metadata": {
794+
"id": "Ljl8Yr6wM3Ma"
795+
}
748796
}
749797
]
750798
}

0 commit comments

Comments
 (0)