|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "466ccdc5", |
7 | 7 | "metadata": {}, |
8 | | - "outputs": [ |
9 | | - { |
10 | | - "name": "stderr", |
11 | | - "output_type": "stream", |
12 | | - "text": [ |
13 | | - "[NeMo W 2025-02-03 20:06:44 nemo_logging:361] /usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
14 | | - " from .autonotebook import tqdm as notebook_tqdm\n", |
15 | | - " \n" |
16 | | - ] |
17 | | - } |
18 | | - ], |
| 8 | + "outputs": [], |
19 | 9 | "source": [ |
20 | 10 | "from nemo.collections.tts.models import T5TTS_Model\n", |
21 | 11 | "from nemo.collections.tts.data.text_to_speech_dataset import T5TTSDataset, DatasetSample\n", |
|
48 | 38 | "# Checkpoint and Hparams Paths\n", |
49 | 39 | "# hparams_file = \"/datap/misc/Experiments/SimpleT5Explore/LocalTraining_LRH/T5TTS/0/hparams.yaml\"\n", |
50 | 40 | "# checkpoint_file = \"/datap/misc/Experiments/SimpleT5Explore/LocalTraining_LRH/T5TTS/0/checkpoints/test.ckpt\"\n", |
| 41 | + "checkpoint_file = \"/datap/misc/continuouscheckpoints/chal/2502_finetune_challenging_LR1e-5_T5TTS--val_loss=5.2330-epoch=0-last.ckpt\" #T5TTS--val_loss=5.8671-epoch=1-last.ckpt\"\n", |
51 | 42 | "hparams_file = \"/home/rfejgin/release_2502/hparams__final_xform__yt_weight0.25_plus_18k_single_stage_enc3_fixes_phoneme_only.yaml\"\n", |
52 | | - "codecmodel_path = \"/home/rfejgin/release_2502/dpo_fine_tuning_beta0.1__final_xform_enc3_T5TTS--val_loss_0.3899-epoch_24.ckpt\"\n", |
53 | 43 | "\n", |
54 | 44 | "# Temp out dir for saving audios\n", |
55 | 45 | "out_dir = \"/datap/misc/t5tts_inference_notebook_samples\"\n", |
|
67 | 57 | }, |
68 | 58 | { |
69 | 59 | "cell_type": "code", |
70 | | - "execution_count": 3, |
| 60 | + "execution_count": null, |
71 | 61 | "id": "87bf66f9", |
72 | 62 | "metadata": {}, |
73 | | - "outputs": [ |
74 | | - { |
75 | | - "name": "stderr", |
76 | | - "output_type": "stream", |
77 | | - "text": [ |
78 | | - "[NeMo W 2025-02-03 20:06:51 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n", |
79 | | - "[NeMo W 2025-02-03 20:06:52 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n", |
80 | | - "[NeMo W 2025-02-03 20:06:52 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n", |
81 | | - "[NeMo W 2025-02-03 20:06:52 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n", |
82 | | - "[NeMo W 2025-02-03 20:06:53 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n", |
83 | | - "[NeMo W 2025-02-03 20:06:53 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n", |
84 | | - "[NeMo W 2025-02-03 20:06:53 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n", |
85 | | - "[NeMo W 2025-02-03 20:06:54 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n", |
86 | | - "[NeMo W 2025-02-03 20:06:54 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n", |
87 | | - "[NeMo W 2025-02-03 20:06:54 zh_cn_pinyin:100] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n", |
88 | | - "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n" |
89 | | - ] |
90 | | - }, |
91 | | - { |
92 | | - "ename": "TypeError", |
93 | | - "evalue": "Transformer.__init__() got an unexpected keyword argument 'pos_emb'", |
94 | | - "output_type": "error", |
95 | | - "traceback": [ |
96 | | - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
97 | | - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", |
98 | | - "Cell \u001b[0;32mIn[3], line 28\u001b[0m\n\u001b[1;32m 24\u001b[0m model_cfg\u001b[38;5;241m.\u001b[39mtrain_ds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 25\u001b[0m model_cfg\u001b[38;5;241m.\u001b[39mvalidation_ds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 28\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mT5TTS_Model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcfg\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_cfg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoading weights from checkpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 30\u001b[0m ckpt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(checkpoint_file)\n", |
99 | | - "File \u001b[0;32m/home/rfejgin/NeMo/nemo/collections/tts/models/t5tts.py:153\u001b[0m, in \u001b[0;36mT5TTS_Model.__init__\u001b[0;34m(self, cfg, trainer)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_type \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdecoder_pretrain_synthesizer\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 151\u001b[0m \u001b[38;5;66;03m# Decoder pretrain synthesizer doesn't have transcript encoder/text embeddings\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext_embedding \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mEmbedding(num_tokens, cfg\u001b[38;5;241m.\u001b[39membedding_dim)\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mt5_encoder \u001b[38;5;241m=\u001b[39m \u001b[43mt5tts_transformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mdict\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mt5_encoder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mt5_decoder \u001b[38;5;241m=\u001b[39m t5tts_transformer\u001b[38;5;241m.\u001b[39mTransformer(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mdict\u001b[39m(cfg\u001b[38;5;241m.\u001b[39mt5_decoder))\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_proj \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mLinear(cfg\u001b[38;5;241m.\u001b[39mt5_decoder\u001b[38;5;241m.\u001b[39md_model, cfg\u001b[38;5;241m.\u001b[39mnum_audio_codebooks \u001b[38;5;241m*\u001b[39m cfg\u001b[38;5;241m.\u001b[39mnum_audio_tokens_per_codebook)\n", |
100 | | - "\u001b[0;31mTypeError\u001b[0m: Transformer.__init__() got an unexpected keyword argument 'pos_emb'" |
101 | | - ] |
102 | | - } |
103 | | - ], |
| 63 | + "outputs": [], |
104 | 64 | "source": [ |
105 | 65 | "#hparams_file = \"yt_weight_0.25_plus18k__dim1536__enc3_fixes_hparams.yaml\"\n", |
106 | 66 | "#checkpoint_file = \"yt_weight_0.25_plus18k__dim1536__enc3_fixes_val_loss_5.1870_epoch_25.ckpt\"\n", |
107 | 67 | "\n", |
108 | 68 | "#hparams_file = \"/data/t5_new_cp/configs/unnormalizedLalign005_singleencoder_kernel3_hparams.yaml\"\n", |
109 | 69 | "#checkpoint_file = \"/data/t5_new_cp/checkpoints/unnormalizedLalign005_singleencoder_kernel3_epoch_20.ckpt\" #\"/datap/misc/continuouscheckpoints/edresson_epoch21.ckpt\"\n", |
110 | | - "hparams_file = \"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_hparams.yaml\"\n", |
111 | | - "checkpoint_file =\"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_epoch_61.ckpt\" \n", |
| 70 | + "#hparams_file = \"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_hparams.yaml\"\n", |
| 71 | + "#checkpoint_file =\"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_epoch_61.ckpt\" \n", |
112 | 72 | "#hparams_file = \"/datap/misc/continuouscheckpoints/decoder_context_large_hparams.yaml\"\n", |
113 | 73 | "#checkpoint_file =\"/datap/misc/continuouscheckpoints/decoder_context_large_epoch_14.ckpt\" \n", |
114 | 74 | "\n", |
|
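The output stripped from this cell ended in "TypeError: Transformer.__init__() got an unexpected keyword argument 'pos_emb'", i.e. the saved hparams yaml and the checked-out t5tts_transformer code had drifted apart. For reference, here is a minimal sketch of the load path the removed traceback implies; the OmegaConf usage, the nesting of the model config inside the yaml, and the "state_dict" key are assumptions, not something this diff confirms.

    # Hedged sketch of the checkpoint-loading flow visible in the deleted traceback.
    import torch
    from omegaconf import OmegaConf
    from nemo.collections.tts.models import T5TTS_Model

    cfg = OmegaConf.load(hparams_file)
    model_cfg = cfg.get("cfg", cfg)       # assumption: the model config may be nested under "cfg"
    model_cfg.train_ds = None             # skip dataset construction for inference
    model_cfg.validation_ds = None
    model = T5TTS_Model(cfg=model_cfg)    # raises if the yaml no longer matches the Transformer signature
    ckpt = torch.load(checkpoint_file, map_location="cpu")
    model.load_state_dict(ckpt.get("state_dict", ckpt), strict=False)  # assumption: Lightning-style checkpoint
    model.eval()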
219 | 179 | }, |
220 | 180 | { |
221 | 181 | "cell_type": "code", |
222 | | - "execution_count": 4, |
| 182 | + "execution_count": 12, |
223 | 183 | "id": "74683d11", |
224 | 184 | "metadata": {}, |
225 | 185 | "outputs": [], |
226 | 186 | "source": [ |
227 | 187 | "usg_cfg = True\n", |
228 | | - "cfg_scale = 1.8\n", |
| 188 | + "cfg_scale = 2.5\n", |
229 | 189 | "audio_dir = \"/home/rfejgin/kb-snippets\"\n", |
230 | 190 | "#audio_dir = \"/data/NV-RESTRICTED/JHSD/22khz\"\n", |
231 | | - "texts = [\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
232 | | - " \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
233 | | - " \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
234 | | - " \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
235 | | - " \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
236 | | - " \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
237 | | - " \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
238 | | - " \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
239 | | - " \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\"]\n", |
| 191 | + "texts = [\"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n", |
| 192 | + " \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n", |
| 193 | + " \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n", |
| 194 | + " \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n", |
| 195 | + " \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n", |
| 196 | + " \"hello\",\n", |
| 197 | + " \"hello\",\n", |
| 198 | + " \"hello\",\n", |
| 199 | + " \"hello\",\n", |
| 200 | + " \"hello\",\n", |
| 201 | + "]\n", |
| 202 | + " #\"hi\", \"hi\",\"hi\",\"hi\",\"hi\",\n", |
| 203 | + " # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n", |
| 204 | + " # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n", |
| 205 | + " # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n", |
| 206 | + " # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n", |
| 207 | + " # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\"]\n", |
| 208 | + "# texts = [\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
| 209 | + "# \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
| 210 | + "# \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
| 211 | + "# \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n", |
| 212 | + "# \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
| 213 | + "# \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
| 214 | + "# \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
| 215 | + "# \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n", |
| 216 | + "# \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\"]\n", |
240 | 217 | "# texts = [\"NVIDIA's Riva is a powerful speech AI toolkit that offers state-of-the-art ASR and TTS capabilities.\",\n", |
241 | 218 | "# 'The platform supports multiple languages and provides enterprise-grade speech technology through GPU-accelerated SDKs and APIs.',\n", |
242 | 219 | "# 'What makes Riva unique is its ability to be customized for specific use cases while maintaining high performance and accuracy.',\n", |
|
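The hunk above also raises cfg_scale from 1.8 to 2.5; together with usg_cfg this points at classifier-free guidance at sampling time, with the higher scale weighting the text/context conditioning more strongly. How T5TTS_Model actually blends the two passes is not shown in this diff; the conventional formulation is sketched below.

    # Conventional classifier-free guidance blend (a sketch, not the model's own code).
    # Both tensors come from the same decoding step, one computed with the conditioning dropped.
    def apply_cfg(cond_logits, uncond_logits, cfg_scale):
        return uncond_logits + cfg_scale * (cond_logits - uncond_logits)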
264 | 241 | " \"duration\": 4.89,\n", |
265 | 242 | " \"text\": text,\n", |
266 | 243 | " \"speaker\": \"dummy\",\n", |
267 | | - " \"context_audio_filepath\": \"roy2_22050.wav\",#\"AMP20_KEYNOTE-VOOnly-44khz-16bit-mono_6.wav\",#\"adi-snippet1.wav\",\n", |
268 | | - " \"context_audio_duration\": 4.89\n", |
| 244 | + " \"context_text\": \"Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_WIZWIKI |\"\n", |
| 245 | + " #\"context_audio_filepath\": \"roy2_22050.wav\",#\"AMP20_KEYNOTE-VOOnly-44khz-16bit-mono_6.wav\",#\"adi-snippet1.wav\",\n", |
| 246 | + " #\"context_audio_duration\": 4.89\n", |
269 | 247 | " }\n", |
270 | 248 | " entries.append(entry)\n", |
271 | 249 | "data_samples = [DatasetSample(\n", |
|
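The manifest hunk above trades the audio reference prompt (context_audio_filepath / context_audio_duration, now commented out) for a textual speaker-and-emotion tag in context_text. A small sketch of toggling between the two context forms, assuming T5TTSDataset accepts either one; the diff implies this but does not show the dataset code.

    # Hypothetical toggle between audio-prompt and text-tag conditioning for one manifest entry.
    use_text_context = True
    if use_text_context:
        entry["context_text"] = "Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_WIZWIKI |"
    else:
        entry["context_audio_filepath"] = "roy2_22050.wav"   # resolved relative to audio_dir
        entry["context_audio_duration"] = 4.89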
291 | 269 | }, |
292 | 270 | { |
293 | 271 | "cell_type": "code", |
294 | | - "execution_count": 5, |
| 272 | + "execution_count": 13, |
295 | 273 | "id": "b7374d3f", |
296 | 274 | "metadata": {}, |
297 | 275 | "outputs": [], |
|
456 | 434 | "outputs": [], |
457 | 435 | "source": [ |
458 | 436 | "print(f\"Checkpoint: {checkpoint_file}\")\n", |
459 | | - "context_filepath = os.path.join(audio_dir, entry['context_audio_filepath'])\n", |
460 | | - "display(Audio(context_filepath))\n" |
| 437 | + "if 'context_audio_filepath' in entry:\n", |
| 438 | + " context_filepath = os.path.join(audio_dir, entry['context_audio_filepath'])\n", |
| 439 | + " display(Audio(context_filepath))\n" |
461 | 440 | ] |
462 | 441 | }, |
463 | 442 | { |
464 | 443 | "cell_type": "code", |
465 | | - "execution_count": null, |
| 444 | + "execution_count": 16, |
466 | 445 | "id": "0a72ccec", |
467 | 446 | "metadata": {}, |
468 | 447 | "outputs": [], |
469 | 448 | "source": [ |
470 | | - "entry['context_audio_filepath']" |
| 449 | + "if 'context_audio_filepath' in entry: entry['context_audio_filepath']" |
471 | 450 | ] |
472 | 451 | }, |
473 | 452 | { |
|