Commit ace180b

[TTS] Allow inference without reference audio (#15213)
In some cases we need to run inference on manifests that do not include context audio and/or ground truth audio:

* Text context manifests may not have a context audio included (and it wouldn't be very relevant anyway).
* When generating arbitrary text, the ground truth audio cannot be assumed to be available.

Note that even without the context and ground truth audio, there are still useful metrics that can be calculated, such as WER, UTMOS, and inference speed. This change adapts the inference scripts and the data loader they use to allow the absence of context and/or ground truth audio. When either is missing, the metrics that depend on it are set to NaN (see the manifest sketch below).
1 parent 194f1d5 commit ace180b
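For concreteness, here is a minimal sketch of the manifest shapes this change is meant to accept. Only audio_filepath and context_audio_filepath appear in the diffs below; every other field name and path is an illustrative assumption, not taken from this commit:

import json

# Hypothetical evaluation manifest entries ('text' and the file paths are
# illustrative; only 'audio_filepath' and 'context_audio_filepath' come from
# the diffs below).
entries = [
    # Ground truth audio and context audio both present.
    {"text": "hello world", "audio_filepath": "wavs/utt1.wav",
     "context_audio_filepath": "wavs/spk1_context.wav"},
    # Text context only: no context audio.
    {"text": "good morning", "audio_filepath": "wavs/utt2.wav"},
    # Arbitrary generated text: no ground truth audio either.
    {"text": "this sentence has no recorded reference"},
]

with open("eval_manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")

Before this change, entries shaped like the last two failed with KeyErrors: collate_fn indexed example["audio_filepath"] unconditionally, and the evaluation script indexed record['audio_filepath']. The diffs below make both lookups conditional.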

File tree

2 files changed: +68 −39 lines


nemo/collections/tts/data/text_to_speech_dataset.py

Lines changed: 8 additions & 7 deletions
@@ -459,7 +459,7 @@ def __getitem__(self, index):
         if 'audio_filepath' in data.manifest_entry:
             # If audio_filepath is available, then use the actual audio file path.
             example['audio_filepath'] = data.manifest_entry['audio_filepath']
-        else:
+        elif 'audio_filepath' in data.manifest_entry:
             # Only load audio if codes are not available
             audio_array, _, audio_filepath_rel = load_audio(
                 manifest_entry=data.manifest_entry,
@@ -661,13 +661,15 @@ def collate_fn(self, batch: List[dict]):
         speaker_indices_list = []
         for example in batch:
             dataset_name_list.append(example["dataset_name"])
-            audio_filepath_list.append(example["audio_filepath"])
             raw_text_list.append(example["raw_text"])
             language_list.append(example["language"])
 
             token_list.append(example["tokens"])
             token_len_list.append(example["text_len"])
 
+            if 'audio_filepath' in example:
+                audio_filepath_list.append(example["audio_filepath"])
+
             if 'audio' in example:
                 audio_list.append(example["audio"])
                 audio_len_list.append(example["audio_len"])
@@ -774,14 +776,13 @@ def collate_fn(self, batch: List[dict]):
         if len(speaker_indices_list) > 0:
             batch_dict['speaker_indices'] = torch.tensor(speaker_indices_list, dtype=torch.int64)
 
-        # Assert only ONE of context_audio or context_audio_codes in the batch
-        assert ('audio' in batch_dict) ^ ('audio_codes' in batch_dict)
+        # Assert no more than one of audio or audio_codes in the batch
+        if 'audio' in batch_dict:
+            assert 'audio_codes' not in batch_dict
 
-        # Assert only ONE of context_audio or context_audio_codes in the batch
+        # Assert no more than one of context_audio or context_audio_codes in the batch
         if 'context_audio' in batch_dict:
             assert 'context_audio_codes' not in batch_dict
-        if 'context_audio_codes' in batch_dict:
-            assert 'context_audio' not in batch_dict
 
         return batch_dict
 

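The collate change follows one pattern: gather an optional per-example field only when it is present, and replace the XOR assert (which rejected the valid "neither present" case) with a one-sided check. A self-contained sketch of that pattern, using illustrative names rather than the actual collate_fn:

from typing import List

import torch


def collate_optional(batch: List[dict]) -> dict:
    batch_dict = {}
    # Gather the optional 'audio' field only from examples that carry it.
    audio_list = [example["audio"] for example in batch if "audio" in example]
    if audio_list:
        batch_dict["audio"] = torch.nn.utils.rnn.pad_sequence(audio_list, batch_first=True)
        batch_dict["audio_lens"] = torch.tensor([a.shape[0] for a in audio_list])
    # "No more than one": unlike `a ^ b`, this passes when neither key is present.
    if "audio" in batch_dict:
        assert "audio_codes" not in batch_dict
    return batch_dict


# A mixed batch where only one example carries audio still collates cleanly.
batch = [{"audio": torch.randn(16000)}, {"text": "no audio here"}]
print(collate_optional(batch)["audio_lens"])  # tensor([16000])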
nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py

Lines changed: 60 additions & 32 deletions
@@ -246,9 +246,9 @@ def evaluate(
     gt_audio_texts = []
     total_generated_audio_seconds = 0.0
     for ridx, record in enumerate(records):
-        gt_audio_filepath = record['audio_filepath']
+        gt_audio_filepath = record.get('audio_filepath', None)
         context_audio_filepath = record.get('context_audio_filepath', None)
-        if audio_dir is not None:
+        if audio_dir is not None and gt_audio_filepath is not None:
             gt_audio_filepath = os.path.join(audio_dir, gt_audio_filepath)
         if context_audio_filepath is not None:
             context_audio_filepath = os.path.join(audio_dir, context_audio_filepath)
@@ -265,17 +265,25 @@ def evaluate(
             with torch.inference_mode():
                 pred_text = asr_model.transcribe([pred_audio_filepath], batch_size=1, use_lhotse=False)[0].text
                 pred_text = process_text(pred_text)
-                gt_audio_text = asr_model.transcribe([gt_audio_filepath], batch_size=1, use_lhotse=False)[0].text
-                gt_audio_text = process_text(gt_audio_text)
+                if gt_audio_filepath is not None:
+                    gt_audio_text = asr_model.transcribe([gt_audio_filepath], batch_size=1, use_lhotse=False)[
+                        0
+                    ].text
+                    gt_audio_text = process_text(gt_audio_text)
+                else:
+                    gt_audio_text = None
         else:
             pred_text = transcribe_with_whisper(
                 whisper_model, whisper_processor, pred_audio_filepath, language, device
             )
             pred_text = process_text(pred_text)
-            gt_audio_text = transcribe_with_whisper(
-                whisper_model, whisper_processor, gt_audio_filepath, language, device
-            )
-            gt_audio_text = process_text(gt_audio_text)
+            if gt_audio_filepath is not None:
+                gt_audio_text = transcribe_with_whisper(
+                    whisper_model, whisper_processor, gt_audio_filepath, language, device
+                )
+                gt_audio_text = process_text(gt_audio_text)
+            else:
+                gt_audio_text = None
     except Exception as e:
         logging.info("Error during ASR: {}".format(e))
         pred_text = ""
@@ -318,19 +326,29 @@ def evaluate(
             sv_model_type=sv_model_type,
         )
 
-        # Ground truth vs. predicted
-        gt_speaker_embedding = extract_embedding_fn(audio_path=gt_audio_filepath)
-        pred_speaker_embedding = extract_embedding_fn(audio_path=pred_audio_filepath)
-        pred_gt_ssim = torch.nn.functional.cosine_similarity(
-            gt_speaker_embedding, pred_speaker_embedding, dim=0
-        ).item()
+        # Initialize SSIMs with a default since the context or ground truth audio
+        # may be unavailable.
+        pred_context_ssim = float('NaN')
+        gt_context_ssim = float('NaN')
+        pred_context_ssim_alternate = float('NaN')
+        gt_context_ssim_alternate = float('NaN')
+        pred_gt_ssim = float('NaN')
+        pred_gt_ssim_alternate = float('NaN')
+
+        if gt_audio_filepath is not None:
+            # Ground truth vs. predicted
+            gt_speaker_embedding = extract_embedding_fn(audio_path=gt_audio_filepath)
+            pred_speaker_embedding = extract_embedding_fn(audio_path=pred_audio_filepath)
+            pred_gt_ssim = torch.nn.functional.cosine_similarity(
+                gt_speaker_embedding, pred_speaker_embedding, dim=0
+            ).item()
 
-        # Ground truth vs. predicted (alternate model)
-        gt_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=gt_audio_filepath)
-        pred_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=pred_audio_filepath)
-        pred_gt_ssim_alternate = torch.nn.functional.cosine_similarity(
-            gt_speaker_embedding_alternate, pred_speaker_embedding_alternate, dim=0
-        ).item()
+            # Ground truth vs. predicted (alternate model)
+            gt_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=gt_audio_filepath)
+            pred_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=pred_audio_filepath)
+            pred_gt_ssim_alternate = torch.nn.functional.cosine_similarity(
+                gt_speaker_embedding_alternate, pred_speaker_embedding_alternate, dim=0
+            ).item()
 
         if context_audio_filepath is not None:
             context_speaker_embedding = extract_embedding_fn(audio_path=context_audio_filepath)
@@ -341,18 +359,20 @@ def evaluate(
                 pred_speaker_embedding, context_speaker_embedding, dim=0
             ).item()
             # Ground truth vs. context
-            gt_context_ssim = torch.nn.functional.cosine_similarity(
-                gt_speaker_embedding, context_speaker_embedding, dim=0
-            ).item()
+            if gt_audio_filepath is not None:
+                gt_context_ssim = torch.nn.functional.cosine_similarity(
+                    gt_speaker_embedding, context_speaker_embedding, dim=0
+                ).item()
 
             # Predicted vs. context (alternate model)
             pred_context_ssim_alternate = torch.nn.functional.cosine_similarity(
                 pred_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
            ).item()
             # Ground truth vs. context (alternate model)
-            gt_context_ssim_alternate = torch.nn.functional.cosine_similarity(
-                gt_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
-            ).item()
+            if gt_audio_filepath is not None:
+                gt_context_ssim_alternate = torch.nn.functional.cosine_similarity(
+                    gt_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
+                ).item()
         total_generated_audio_seconds += get_wav_file_duration(pred_audio_filepath)
 
         filewise_metrics.append(
@@ -415,12 +435,20 @@ def evaluate(
         avg_metrics['ssim_gt_context_avg_alternate'] = sum(
             [m['gt_context_ssim_alternate'] for m in filewise_metrics]
         ) / len(filewise_metrics)
-    avg_metrics["cer_gt_audio_cumulative"] = word_error_rate_detail(
-        hypotheses=gt_audio_texts, references=gt_texts, use_cer=True
-    )[0]
-    avg_metrics["wer_gt_audio_cumulative"] = word_error_rate_detail(
-        hypotheses=gt_audio_texts, references=gt_texts, use_cer=False
-    )[0]
+    if not None in gt_audio_texts:
+        avg_metrics["cer_gt_audio_cumulative"] = word_error_rate_detail(
+            hypotheses=gt_audio_texts, references=gt_texts, use_cer=True
+        )[0]
+        avg_metrics["wer_gt_audio_cumulative"] = word_error_rate_detail(
+            hypotheses=gt_audio_texts, references=gt_texts, use_cer=False
+        )[0]
+    else:
+        avg_metrics["cer_gt_audio_cumulative"] = float('NaN')
+        avg_metrics["wer_gt_audio_cumulative"] = float('NaN')
+        logging.warning(
+            "Ground truth audio files are missing. Setting cumulative CER and WER for ground truth audio to NaN."
+        )
+
     avg_metrics["utmosv2_avg"] = sum([m['utmosv2'] for m in filewise_metrics]) / len(filewise_metrics)
     avg_metrics["total_gen_audio_seconds"] = total_generated_audio_seconds
     pprint.pprint(avg_metrics)

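Defaulting the per-file SSIMs to NaN keeps every record's metrics dict the same shape, and NaN (unlike 0.0) cannot be mistaken for a genuinely computed score. One consequence worth noting: NaN propagates through sum(), so any average over filewise metrics that includes a missing-input record also comes out NaN. A small illustrative sketch (not code from this commit):

import math

filewise_metrics = [
    {"pred_gt_ssim": 0.91},          # ground truth audio was available
    {"pred_gt_ssim": float('NaN')},  # ground truth audio was missing
]

ssims = [m["pred_gt_ssim"] for m in filewise_metrics]
print(sum(ssims) / len(ssims))  # nan -- the NaN propagates through the average

# A NaN-aware partial average, if skipping missing records is preferred:
valid = [s for s in ssims if not math.isnan(s)]
print(sum(valid) / len(valid) if valid else float('NaN'))  # 0.91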