Clean up and add missing files

rfejgin · rfejgin · commit 3fc5f37e9151 · 2025-12-22T23:15:26.000-08:00
* address some CI linting issues
* include a file that was missed in last commit

Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
diff --git a/examples/tts/magpietts_inference.py b/examples/tts/magpietts_inference.py
@@ -101,6 +101,7 @@ def append_metrics_to_csv(csv_path: str, checkpoint_name: str, dataset: str, met
         metrics.get('wer_gt_audio_cumulative', ''),
         metrics.get('utmosv2_avg', ''),
         metrics.get('total_gen_audio_seconds', ''),
+        metrics.get('frechet_codec_distance', ''),
     ]
     with open(csv_path, "a") as f:
         f.write(",".join(str(v) for v in values) + "\n")
@@ -181,7 +182,7 @@ def run_inference_and_evaluation(
         "wer_cumulative,ssim_pred_gt_avg,ssim_pred_context_avg,ssim_gt_context_avg,"
         "ssim_pred_gt_avg_alternate,ssim_pred_context_avg_alternate,"
         "ssim_gt_context_avg_alternate,cer_gt_audio_cumulative,wer_gt_audio_cumulative,"
-        "utmosv2_avg,total_gen_audio_seconds"
+        "utmosv2_avg,total_gen_audio_seconds,frechet_codec_distance"
     )
 
     for dataset in datasets:
@@ -222,7 +223,7 @@ def run_inference_and_evaluation(
                     f"Dataset length mismatch: {len(test_dataset)} vs {len(manifest_records)} manifest records"
                 )
 
-            rtf_metrics_list, _ = runner.run_inference_on_dataset(
+            rtf_metrics_list, _, codec_file_paths = runner.run_inference_on_dataset(
                 dataset=test_dataset,
                 output_dir=repeat_audio_dir,
                 manifest_records=manifest_records,
@@ -246,6 +247,7 @@ def run_inference_and_evaluation(
                 asr_model_name=eval_config.asr_model_name,
                 language=language,
                 with_utmosv2=eval_config.with_utmosv2,
+                codec_model_path=eval_config.codec_model_path,
             )
 
             metrics, filewise_metrics = evaluate_generated_audio_dir(
@@ -272,6 +274,10 @@ def run_inference_and_evaluation(
             violin_path = Path(eval_dir) / f"{dataset}_violin_{repeat_idx}.png"
             create_violin_plot(filewise_metrics, violin_plot_metrics, violin_path)
 
+            # Delete temporary predicted codes files
+            for codec_file_path in codec_file_paths:
+                os.remove(codec_file_path)
+
         if skip_evaluation or not metrics_all_repeats:
             continue
 
@@ -463,6 +469,7 @@ def create_argument_parser() -> argparse.ArgumentParser:
         nargs='*',
         default=['cer', 'pred_context_ssim', 'utmosv2'],
     )
+    eval_group.add_argument('--disable_fcd', action='store_true', help="Disable Frechet Codec Distance computation")
 
     # Quality targets (for CI/CD)
     target_group = parser.add_argument_group('Quality Targets')
@@ -520,6 +527,7 @@ def main():
         sv_model=args.sv_model,
         asr_model_name=args.asr_model_name,
         with_utmosv2=not args.disable_utmosv2,
+        codec_model_path=args.codecmodel_path if not args.disable_fcd else None,
     )
 
     cer, ssim = None, None
diff --git a/nemo/collections/tts/metrics/frechet_codec_distance.py b/nemo/collections/tts/metrics/frechet_codec_distance.py
@@ -50,9 +50,10 @@ def num_features(self) -> int:
 class FrechetCodecDistance(FrechetInceptionDistance):
     def __init__(self, codec_name: str):
         if codec_name.endswith(".nemo"):
+            # Local .nemo file
             codec = AudioCodecModel.restore_from(codec_name, strict=False)
         elif codec_name.startswith("nvidia/"):
-            # HuggingFace or NGC model name
+            # Model on HuggingFace or NGC
             codec = AudioCodecModel.from_pretrained(codec_name)
         else:
             raise ValueError(
@@ -82,7 +83,7 @@ def encode_from_file(self, audio_path: str) -> Tensor:
 
     def update(self, codes: Tensor, codes_len: Tensor, is_real: bool):
         if codes.numel() == 0:
-            logging.warning(f"FCD: No valid codes to update, skipping update")
+            logging.warning("FCD: No valid codes to update, skipping update")
             return
         if codes.shape[1] != self.codec.num_codebooks:
             logging.warning(
@@ -97,7 +98,7 @@ def update(self, codes: Tensor, codes_len: Tensor, is_real: bool):
         # combine into a single tensor. We treat each timestep independently so we can concatenate them all.
         codes_batch_all = torch.cat(codes_batch_all, dim=-1).permute(1, 0)  # (B*T, C)
         if len(codes_batch_all) == 0:
-            logging.warning(f"FCD: No valid codes to update, skipping update")
+            logging.warning("FCD: No valid codes to update, skipping update")
             return
         # update
         super().update(codes_batch_all, real=is_real)
@@ -113,7 +114,7 @@ def update_from_audio_file(self, audio_path: str, is_real: bool):
 
     def compute(self) -> Tensor:
         if not self.updated_since_last_reset:
-            logging.warning(f"FCD: No updates since last reset, returning 0")
+            logging.warning("FCD: No updates since last reset, returning 0")
             return torch.tensor(0.0, device=self.device)
         fcd = super().compute()
         min_allowed_fcd = -0.01  # a bit of tolerance for numerical issues
diff --git a/nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py b/nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py
@@ -32,7 +32,6 @@
 import nemo.collections.asr as nemo_asr
 from nemo.collections.asr.metrics.wer import word_error_rate_detail
 from nemo.collections.tts.metrics.frechet_codec_distance import FrechetCodecDistance
-from nemo.collections.tts.models import AudioCodecModel
 from nemo.utils import logging
 
 # Optional import for UTMOSv2 (audio quality metric)