Replace 300M references with 600M and Mini

ylacombe · ylacombe · commit 5b593f5855b5 · 2024-04-10T13:29:00.000+02:00
diff --git a/README.md b/README.md
@@ -33,8 +33,8 @@ import torch
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_300M_v0.1").to(device)
-tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_300M_v0.1")
+model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
+tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")
 
 prompt = "Hey, how are you doing today?"
 description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
diff --git a/helpers/gradio_demo/app.py b/helpers/gradio_demo/app.py
@@ -6,7 +6,7 @@
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-repo_id = "parler-tts/parler_tts_300M_v0.1"
+repo_id = "parler-tts/parler_tts_mini_v0.1"
 
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
diff --git a/helpers/model_init_scripts/init_model_300M.py b/helpers/model_init_scripts/init_model_300M.py
@@ -64,4 +64,4 @@
     model.config.pad_token_id = encodec_vocab_size
     model.config.decoder_start_token_id = encodec_vocab_size+1
 
-    model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-300M/"))
+    model.save_pretrained(os.path.join(args.save_directory, "parler-tts-untrained-600M/"))
diff --git a/helpers/push_to_hub_scripts/push_trained_parler_tts_to_hub.py b/helpers/push_to_hub_scripts/push_trained_parler_tts_to_hub.py
@@ -2,7 +2,7 @@
 from transformers import AutoTokenizer, AutoFeatureExtractor
 
 path = "TODO"
-repo_id = "parler_tts_300M"
+repo_id = "parler_tts_600M"
 
 
 AutoFeatureExtractor.from_pretrained("ylacombe/dac_44khZ_8kbps").push_to_hub(repo_id)
diff --git a/helpers/training_configs/librispeech_tts_r_300M_dummy.json b/helpers/training_configs/librispeech_tts_r_300M_dummy.json
@@ -1,5 +1,5 @@
 {
-    "model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/",
+    "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
     "save_to_disk":  "./tmp_dataset_audio/",
     "temporary_save_to_disk": "./audio_code_tmp/",
 
diff --git a/helpers/training_configs/starting_point_0.01.json b/helpers/training_configs/starting_point_0.01.json
@@ -1,5 +1,5 @@
 {
-    "model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/",
+    "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",
     "save_to_disk":  "./tmp_dataset_audio/",
     "temporary_save_to_disk": "./audio_code_tmp/",
 
diff --git a/training/README.md b/training/README.md
@@ -71,18 +71,18 @@ And then enter an authentication token from https://huggingface.co/settings/toke
 
 Depending on your compute resources and your dataset, you need to choose between fine-tuning a pre-trained model and training a new model from scratch.
 
-In that sense, we released a 300M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_300M_v0.1`](https://huggingface.co/parler-tts/parler_tts_300M_v0.1), that you can fine-tune for your own use-case.
+In that sense, we released a 600M checkpoint trained on 10.5K hours of annotated data under the repository id: [`parler-tts/parler_tts_mini_v0.1`](https://huggingface.co/parler-tts/parler_tts_mini_v0.1), that you can fine-tune for your own use-case.
 
 You can also train your own model from scratch. You can find [here](/helpers/model_init_scripts/) examples on how to initialize a model from scratch. For example, you can initialize a dummy model with:
 
 ```sh
 python helpers/model_init_scripts/init_dummy_model.py ./parler-tts-untrained-dummy --text_model "google-t5/t5-small" --audio_model "parler-tts/dac_44khZ_8kbps"
 ```
 
-In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 300-M parameters that we'll initialize with:
+In the rest of this guide, and to reproduce the Parler-TTS v0.1 training recipe, we'll use a 600M-parameter model that we'll initialize with:
 
 ```sh
-python helpers/model_init_scripts/init_model_300M.py ./parler-tts-untrained-300M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps"
+python helpers/model_init_scripts/init_model_600M.py ./parler-tts-untrained-600M --text_model "google/flan-t5-base" --audio_model "parler-tts/dac_44khZ_8kbps"
 ```
 
 
@@ -113,7 +113,7 @@ To train Parler-TTS v0.1, we roughly used:
 
 ```sh
 accelerate launch ./training/run_parler_tts_training.py \
-    --model_name_or_path "./parler-tts-untrained-300M/parler-tts-untrained-300M/" \
+    --model_name_or_path "./parler-tts-untrained-600M/parler-tts-untrained-600M/" \
     --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
     --description_tokenizer_name "google/flan-t5-base" \
     --prompt_tokenizer_name "google/flan-t5-base" \
@@ -202,4 +202,4 @@ And finally, two additional comments:
 
 > [!TIP]
 > Fine-tuning is as easy as modifying `model_name_or_path` to a pre-trained model.
-> For example: `--model_name_or_path parler-tts/parler_tts_300M_v0.1`.
+> For example: `--model_name_or_path parler-tts/parler_tts_mini_v0.1`.

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`		`- "model_name_or_path": "./parler-tts-untrained-300M/parler-tts-untrained-300M/",`
	`2`	`+ "model_name_or_path": "./parler-tts-untrained-600M/parler-tts-untrained-600M/",`
`3`	`3`	`"save_to_disk": "./tmp_dataset_audio/",`
`4`	`4`	`"temporary_save_to_disk": "./audio_code_tmp/",`
`5`	`5`