DPO tweaks (make text-list input files optional in create_text_contextpairs.py) and notebook changes

rfejgin · rfejgin · commit 38104905e9a5 · 2025-02-07T18:13:47.000-08:00
diff --git a/scripts/t5tts/dpo/create_text_contextpairs.py b/scripts/t5tts/dpo/create_text_contextpairs.py
@@ -33,18 +33,27 @@ def main():
     parser.add_argument("--output_manifest", type=str)
     args = parser.parse_args()
     
-    with open(args.challenging_texts, 'r') as f:
-        challenging_texts = f.readlines()
-        challenging_texts = [text.strip() for text in challenging_texts if text.strip() != '']
-    
-    with open(args.regular_texts_for_audiocontext, 'r') as f:
-        regular_texts_for_audiocontext = f.readlines()
-        regular_texts_for_audiocontext = [text.strip() for text in regular_texts_for_audiocontext if text.strip() != '']
-    
-    with open(args.regular_texts_for_textcontext, 'r') as f:
-        regular_texts_for_textcontext = f.readlines()
-        regular_texts_for_textcontext = [text.strip() for text in regular_texts_for_textcontext if text.strip() != '']
+    if args.challenging_texts is not None:
+        with open(args.challenging_texts, 'r') as f:
+            challenging_texts = f.readlines()
+            challenging_texts = [text.strip() for text in challenging_texts if text.strip() != '']
+    else:
+        challenging_texts = None
+        
+    if args.regular_texts_for_audiocontext is not None:
+        with open(args.regular_texts_for_audiocontext, 'r') as f:
+            regular_texts_for_audiocontext = f.readlines()
+            regular_texts_for_audiocontext = [text.strip() for text in regular_texts_for_audiocontext if text.strip() != '']
+    else:
+        regular_texts_for_audiocontext = []
     
+    if args.regular_texts_for_textcontext is not None:
+        with open(args.regular_texts_for_textcontext, 'r') as f:
+            regular_texts_for_textcontext = f.readlines()
+            regular_texts_for_textcontext = [text.strip() for text in regular_texts_for_textcontext if text.strip() != '']
+    else:
+        regular_texts_for_textcontext = None
+
     with open(args.audio_contexts, 'r') as f:
         audio_contexts = f.readlines()
         audio_contexts = [json.loads(context.strip()) for context in audio_contexts if context.strip() != '']
@@ -66,18 +75,19 @@ def main():
             text_context = random.choice(text_contexts)
             record = create_text_context_record(challenging_text, text_context, dummy_audio_filepath, 'challenging', dummy_target_audio_codes_path)
             all_records.append(record)
-    
-    for regular_text in regular_texts_for_audiocontext:
-        for _ in range(args.n_audio_contexts_per_regular_text):
-            audio_context = random.choice(audio_contexts)
-            record = create_audio_context_record(regular_text, audio_context, 'regular')
-            all_records.append(record)
-    
-    for regular_text in regular_texts_for_textcontext:
-        for _ in range(args.n_text_contexts_per_regular_text):
-            text_context = random.choice(text_contexts)
-            record = create_text_context_record(regular_text, text_context, dummy_audio_filepath, 'regular', dummy_target_audio_codes_path)
-            all_records.append(record)
+
+    if regular_texts_for_audiocontext is not None:  
+        for regular_text in regular_texts_for_audiocontext:
+            for _ in range(args.n_audio_contexts_per_regular_text):
+                audio_context = random.choice(audio_contexts)
+                record = create_audio_context_record(regular_text, audio_context, 'regular')
+                all_records.append(record)
+    if regular_texts_for_textcontext is not None:
+        for regular_text in regular_texts_for_textcontext:
+            for _ in range(args.n_text_contexts_per_regular_text):
+                text_context = random.choice(text_contexts)
+                record = create_text_context_record(regular_text, text_context, dummy_audio_filepath, 'regular', dummy_target_audio_codes_path)
+                all_records.append(record)
     
     random.shuffle(all_records)
     repeated_records = []
diff --git a/t5tts_inference.ipynb b/t5tts_inference.ipynb
@@ -2,20 +2,10 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "466ccdc5",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[NeMo W 2025-02-03 20:06:44 nemo_logging:361] /usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "      from .autonotebook import tqdm as notebook_tqdm\n",
-      "    \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from nemo.collections.tts.models import T5TTS_Model\n",
     "from nemo.collections.tts.data.text_to_speech_dataset import T5TTSDataset, DatasetSample\n",
@@ -48,8 +38,8 @@
     "# Checkpoint and Hparams Paths\n",
     "# hparams_file = \"/datap/misc/Experiments/SimpleT5Explore/LocalTraining_LRH/T5TTS/0/hparams.yaml\"\n",
     "# checkpoint_file = \"/datap/misc/Experiments/SimpleT5Explore/LocalTraining_LRH/T5TTS/0/checkpoints/test.ckpt\"\n",
+    "checkpoint_file = \"/datap/misc/continuouscheckpoints/chal/2502_finetune_challenging_LR1e-5_T5TTS--val_loss=5.2330-epoch=0-last.ckpt\" #T5TTS--val_loss=5.8671-epoch=1-last.ckpt\"\n",
     "hparams_file = \"/home/rfejgin/release_2502/hparams__final_xform__yt_weight0.25_plus_18k_single_stage_enc3_fixes_phoneme_only.yaml\"\n",
-    "codecmodel_path = \"/home/rfejgin/release_2502/dpo_fine_tuning_beta0.1__final_xform_enc3_T5TTS--val_loss_0.3899-epoch_24.ckpt\"\n",
     "\n",
     "# Temp out dir for saving audios\n",
     "out_dir = \"/datap/misc/t5tts_inference_notebook_samples\"\n",
@@ -67,48 +57,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "87bf66f9",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[NeMo W 2025-02-03 20:06:51 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:52 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
-      "[NeMo W 2025-02-03 20:06:52 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:52 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:53 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
-      "[NeMo W 2025-02-03 20:06:53 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:53 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:54 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
-      "[NeMo W 2025-02-03 20:06:54 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready for production yet. Use at your own risk.\n",
-      "[NeMo W 2025-02-03 20:06:54 zh_cn_pinyin:100] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
-      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "Transformer.__init__() got an unexpected keyword argument 'pos_emb'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[3], line 28\u001b[0m\n\u001b[1;32m     24\u001b[0m     model_cfg\u001b[38;5;241m.\u001b[39mtrain_ds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     25\u001b[0m     model_cfg\u001b[38;5;241m.\u001b[39mvalidation_ds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 28\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mT5TTS_Model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcfg\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_cfg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     29\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoading weights from checkpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     30\u001b[0m ckpt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(checkpoint_file)\n",
-      "File \u001b[0;32m/home/rfejgin/NeMo/nemo/collections/tts/models/t5tts.py:153\u001b[0m, in \u001b[0;36mT5TTS_Model.__init__\u001b[0;34m(self, cfg, trainer)\u001b[0m\n\u001b[1;32m    150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_type \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdecoder_pretrain_synthesizer\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m    151\u001b[0m     \u001b[38;5;66;03m# Decoder pretrain synthesizer doesn't have transcript encoder/text embeddings\u001b[39;00m\n\u001b[1;32m    152\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext_embedding \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mEmbedding(num_tokens, cfg\u001b[38;5;241m.\u001b[39membedding_dim)\n\u001b[0;32m--> 153\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mt5_encoder \u001b[38;5;241m=\u001b[39m \u001b[43mt5tts_transformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mdict\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mt5_encoder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    155\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mt5_decoder \u001b[38;5;241m=\u001b[39m t5tts_transformer\u001b[38;5;241m.\u001b[39mTransformer(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mdict\u001b[39m(cfg\u001b[38;5;241m.\u001b[39mt5_decoder))\n\u001b[1;32m    157\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_proj \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mLinear(cfg\u001b[38;5;241m.\u001b[39mt5_decoder\u001b[38;5;241m.\u001b[39md_model, cfg\u001b[38;5;241m.\u001b[39mnum_audio_codebooks \u001b[38;5;241m*\u001b[39m 
cfg\u001b[38;5;241m.\u001b[39mnum_audio_tokens_per_codebook)\n",
-      "\u001b[0;31mTypeError\u001b[0m: Transformer.__init__() got an unexpected keyword argument 'pos_emb'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "#hparams_file = \"yt_weight_0.25_plus18k__dim1536__enc3_fixes_hparams.yaml\"\n",
     "#checkpoint_file = \"yt_weight_0.25_plus18k__dim1536__enc3_fixes_val_loss_5.1870_epoch_25.ckpt\"\n",
     "\n",
     "#hparams_file = \"/data/t5_new_cp/configs/unnormalizedLalign005_singleencoder_kernel3_hparams.yaml\"\n",
     "#checkpoint_file = \"/data/t5_new_cp/checkpoints/unnormalizedLalign005_singleencoder_kernel3_epoch_20.ckpt\" #\"/datap/misc/continuouscheckpoints/edresson_epoch21.ckpt\"\n",
-    "hparams_file = \"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_hparams.yaml\"\n",
-    "checkpoint_file =\"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_epoch_61.ckpt\" \n",
+    "#hparams_file = \"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_hparams.yaml\"\n",
+    "#checkpoint_file =\"/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_epoch_61.ckpt\" \n",
     "#hparams_file = \"/datap/misc/continuouscheckpoints/decoder_context_large_hparams.yaml\"\n",
     "#checkpoint_file =\"/datap/misc/continuouscheckpoints/decoder_context_large_epoch_14.ckpt\" \n",
     "\n",
@@ -219,24 +179,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 12,
    "id": "74683d11",
    "metadata": {},
    "outputs": [],
    "source": [
     "usg_cfg = True\n",
-    "cfg_scale = 1.8\n",
+    "cfg_scale = 2.5\n",
     "audio_dir = \"/home/rfejgin/kb-snippets\"\n",
     "#audio_dir = \"/data/NV-RESTRICTED/JHSD/22khz\"\n",
-    "texts = [\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
-    "        \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
-    "        \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
-    "        \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
-    "        \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
-    "        \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
-    "        \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
-    "        \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
-    "        \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\"]\n",
+    "texts = [\"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n",
+    "         \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n",
+    "         \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n",
+    "         \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n",
+    "         \"Let me confirm S D S D two two two two, one, two, four, four, h t t p, four, five, six, seven, eight. Is that correct?\",\n",
+    "         \"hello\",\n",
+    "         \"hello\",\n",
+    "         \"hello\",\n",
+    "         \"hello\",\n",
+    "         \"hello\",\n",
+    "]\n",
+    "    #\"hi\", \"hi\",\"hi\",\"hi\",\"hi\",\n",
+    "        # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n",
+    "        # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n",
+    "        # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n",
+    "        # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\",\n",
+    "        # \"Let me confirm that number: two, one, two, four, four, four, five, six, seven, eight. Is that correct?\"]\n",
+    "# texts = [\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
+    "#         \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
+    "#         \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
+    "#         \"Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.\",\n",
+    "#         \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
+    "#         \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
+    "#         \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
+    "#         \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\",\n",
+    "#         \"When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.\"]\n",
     "# texts = [\"NVIDIA's Riva is a powerful speech AI toolkit that offers state-of-the-art ASR and TTS capabilities.\",\n",
     "#  'The platform supports multiple languages and provides enterprise-grade speech technology through GPU-accelerated SDKs and APIs.',\n",
     "#  'What makes Riva unique is its ability to be customized for specific use cases while maintaining high performance and accuracy.',\n",
@@ -264,8 +241,9 @@
     "             \"duration\": 4.89,\n",
     "             \"text\": text,\n",
     "             \"speaker\": \"dummy\",\n",
-    "             \"context_audio_filepath\": \"roy2_22050.wav\",#\"AMP20_KEYNOTE-VOOnly-44khz-16bit-mono_6.wav\",#\"adi-snippet1.wav\",\n",
-    "             \"context_audio_duration\":  4.89\n",
+    "             \"context_text\": \"Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_WIZWIKI |\"\n",
+    "             #\"context_audio_filepath\": \"roy2_22050.wav\",#\"AMP20_KEYNOTE-VOOnly-44khz-16bit-mono_6.wav\",#\"adi-snippet1.wav\",\n",
+    "             #\"context_audio_duration\":  4.89\n",
     "    }\n",
     "    entries.append(entry)\n",
     "data_samples = [DatasetSample(\n",
@@ -291,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 13,
    "id": "b7374d3f",
    "metadata": {},
    "outputs": [],
@@ -456,18 +434,19 @@
    "outputs": [],
    "source": [
     "print(f\"Checkpoint: {checkpoint_file}\")\n",
-    "context_filepath = os.path.join(audio_dir, entry['context_audio_filepath'])\n",
-    "display(Audio(context_filepath))\n"
+    "if 'context_audio_filepath' in entry:\n",
+    "    context_filepath = os.path.join(audio_dir, entry['context_audio_filepath'])\n",
+    "    display(Audio(context_filepath))\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "id": "0a72ccec",
    "metadata": {},
    "outputs": [],
    "source": [
-    "entry['context_audio_filepath']"
+    "if 'context_audio_filepath' in entry: entry['context_audio_filepath']"
    ]
   },
   {