Added an option to use rmvpe

MLo7Ghinsan · MLo7Ghinsan · commit 88bab4a97e48 · 2023-08-31T19:59:57.000-05:00
diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb
@@ -172,7 +172,11 @@
         "!pip install onnx onnxsim #onnx==1.12.0 onnxsim==0.4.10\n",
         "clear_output()\n",
         "!aria2c https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip\n",
+        "!aria2c https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip\n",
         "!unzip -q /content/nsf_hifigan_20221211.zip -d /content/DiffSinger/checkpoints\n",
+        "!unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints\n",
+        "!rm /content/nsf_hifigan_20221211.zip\n",
+        "!rm /content/rmvpe.zip\n",
         "clear_output()\n",
         "!pip install --upgrade tensorboard\n",
         "clear_output()\n",
@@ -217,11 +221,18 @@
         "\n",
         "data_zip_path = \"\" #@param {type:\"string\"}\n",
         "\n",
+        "#@markdown ___\n",
+        "\n",
+        "#@markdown this lower section is for variance training\n",
         "\n",
         "#@markdown <font size=\"-1.5\"> Use this if you don't have .cvs that is for variance dataset (skippable if you are doing acoustic)\n",
         "\n",
         "estimate_midi = False # @param {type:\"boolean\"}\n",
         "\n",
+        "#@markdown <font size=\"-1.5\"> Pitch extractor algorithm - for converting your data to DB-ready format (rmvpe is more accurate but has a limited pitch range)\n",
+        "\n",
+        "f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\"]\n",
+        "\n",
         "#@markdown <font size=\"-1.5\"> Use this if your data is not in diffsinger's preferred format (data are under 30 seconds | have \"AP\" label in your lab)\n",
         "\n",
         "default_converter_setting = False # @param {type:\"boolean\"}\n",
@@ -303,7 +314,7 @@
         "        f.write(\" \".join(consonant_data))\n",
         "\n",
         "    # idk i just feel like 800 is a lil low for some people\n",
-        "    new_f0_max = 1600\n",
+        "    new_f0_max = 1760\n",
         "    og_script = \"/content/MakeDiffSinger/variance-temp-solution/get_pitch.py\"\n",
         "    with open(og_script, 'r') as file:\n",
         "        mate = file.read()\n",
@@ -314,7 +325,7 @@
         "    if no_warn:\n",
         "        !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt 2> /dev/null\n",
         "        clear_output()\n",
-        "        !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n",
+        "        !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_ext} 2> /dev/null\n",
         "        clear_output()\n",
         "        !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs 2> /dev/null\n",
         "        clear_output()\n",
@@ -327,7 +338,7 @@
         "    else:\n",
         "        !python /content/MakeDiffSinger/variance-temp-solution/add_ph_num.py {all_shits}/diffsinger_db/transcriptions.csv --vowels /content/DiffSinger/dictionaries/vowels.txt --consonants /content/DiffSinger/dictionaries/consonants.txt\n",
         "        clear_output()\n",
-        "        !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n",
+        "        !python /content/MakeDiffSinger/variance-temp-solution/estimate_midi.py {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs --pe {f0_ext}\n",
         "        clear_output()\n",
         "        !python /content/MakeDiffSinger/variance-temp-solution/convert_ds.py csv2ds {all_shits}/diffsinger_db/transcriptions.csv {all_shits}/diffsinger_db/wavs\n",
         "        clear_output()\n",
@@ -368,6 +379,14 @@
         "#@markdown <font size=\"-1.5\"> Path to where you want to save your binary data for later use\n",
         "binary_save_dir = \"\" #@param{type:\"string\"}\n",
         "\n",
+        "#@markdown <font size=\"-1.5\"> Pitch extractor algorithm\n",
+        "\n",
+        "f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\"]\n",
+        "if f0_ext == \"rmvpe\":\n",
+        "    pe_ckpt_pth = \"checkpoints/rmvpe/model.pt\"\n",
+        "else:\n",
+        "    pe_ckpt_pth = None  # Python None (not `null`); yaml.dump writes it out as null\n",
+        "\n",
         "#@markdown <font size=\"-1.5\"> Select this is you want to use data augmentation (default pitch shift and time stretch values)\n",
         "data_aug = False #@param {type:\"boolean\"}\n",
         "\n",
@@ -400,6 +419,8 @@
         "    bitch_ass_config[\"use_speed_embed\"] = data_aug\n",
         "    bitch_ass_config[\"max_batch_size\"] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n",
         "    bitch_ass_config[\"val_check_interval\"] = save_interval\n",
+        "    bitch_ass_config[\"pe\"] = f0_ext\n",
+        "    bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth\n",
         "    with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"w\") as config:\n",
         "        yaml.dump(bitch_ass_config, config)\n",
         "else:\n",
@@ -412,6 +433,8 @@
         "    bitch_ass_config[\"dictionary\"] = \"dictionaries/custom_dict.txt\"\n",
         "    bitch_ass_config[\"max_batch_size\"] = 9 #ive never tried reaching the limit so ill trust kei's setting for this\n",
         "    bitch_ass_config[\"val_check_interval\"] = save_interval\n",
+        "    bitch_ass_config[\"pe\"] = f0_ext # i think variance uses it for pitch ref as ground-truth for pitch training soooo\n",
+        "    bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth #same goes to this one\n",
         "    with open(\"/content/DiffSinger/configs/variance.yaml\", \"w\") as config:\n",
         "        yaml.dump(bitch_ass_config, config)\n",
         "\n",
@@ -476,6 +499,10 @@
         "print(\"\\n\")\n",
         "print(f\"speaker name: {spk_name}\")\n",
         "print(\"\\n\")\n",
+        "print(f\"data augmentation: {data_aug}\")\n",
+        "print(\"\\n\")\n",
+        "print(f\"pitch extractor: {f0_ext}\")\n",
+        "print(\"\\n\")\n",
         "print(f\"binary data save directory: {binary_save_dir}\")\n",
         "print(\"\\n\")\n",
         "print(f\"your model will be saved every: {save_interval} steps\")\n",
@@ -745,6 +772,27 @@
       },
       "execution_count": null,
       "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Last Section Note\n",
+        "Wow you made it to the very bottom.... Why though lmao hahahahhshahhasdksajidhasjl\n",
+        "\n",
+        "Anyway, now that you're here, I guess I'll tell you my plan/todo list for this notebook \\\n",
+        "(feel free to suggest anything via [discord](https://discord.com/invite/wwbu2JUMjj) my user display name is MLo7 and my user name is ghin_mlo7)\n",
+        "\n",
+        "todo list:\n",
+        "- add support for premade/refined data\n",
+        "- add multi-singer training\n",
+        "- add OpenUtau voicebank builder\n",
+        "- add link to vocoder training notebook (yet to be ready) or add a vocoder training section\n",
+        "\n",
+        "If you want to add anything to this list then again, just ping or message me lmao"
+      ],
+      "metadata": {
+        "id": "Ljl8Yr6wM3Ma"
+      }
     }
   ]
 }