
Commit 8df83ac

[TTS] MagpieTTS Inference Refactoring (#15178)
* Modularize magpie inference code, move inference code from scripts to example (Signed-off-by: subhankar-ghosh <[email protected]>)
* Modify magpie CI with inference changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Renaming magpietts inference scripts from magpie to magpietts (Signed-off-by: subhankar-ghosh <[email protected]>)
* infer_batch returns dataclass object (Signed-off-by: subhankar-ghosh <[email protected]>)
* Fixed context embedding without context encoder (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Remove unnecessary configurations. Removed multiple long manifest configurations from evalset_config.py. (Signed-off-by: Subhankar Ghosh <[email protected]>)
* Removing unused imports (Signed-off-by: subhankar-ghosh <[email protected]>)
* Copilot suggested changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Copilot suggested changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Move inference helper modules from examples to tts collection (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Review changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Changes suggested in compute_mean_with_confidence_interval (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Linting issue (Signed-off-by: subhankar-ghosh <[email protected]>)
* support multiple voices - baked context embeddings (Signed-off-by: subhankar-ghosh <[email protected]>)
* move evalset_config to json (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Modularize magpie inference code, move inference code from scripts to example (Signed-off-by: subhankar-ghosh <[email protected]>)
* Modify magpie CI with inference changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Renaming magpietts inference scripts from magpie to magpietts (Signed-off-by: subhankar-ghosh <[email protected]>)
* infer_batch returns dataclass object (Signed-off-by: subhankar-ghosh <[email protected]>)
* Fixed context embedding without context encoder (Signed-off-by: subhankar-ghosh <[email protected]>)
* Removing unused imports (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Remove unnecessary configurations. Removed multiple long manifest configurations from evalset_config.py. (Signed-off-by: Subhankar Ghosh <[email protected]>)
* Copilot suggested changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Copilot suggested changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Move inference helper modules from examples to tts collection (Signed-off-by: subhankar-ghosh <[email protected]>)
* Review changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Changes suggested in compute_mean_with_confidence_interval (Signed-off-by: subhankar-ghosh <[email protected]>)
* Linting issue (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* support multiple voices - baked context embeddings (Signed-off-by: subhankar-ghosh <[email protected]>)
* move evalset_config to json (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* simplifying get_baked_context_embeddings_batch (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Fix logging (Signed-off-by: subhankar-ghosh <[email protected]>)
* Apply isort and black reformatting (Signed-off-by: subhankar-ghosh <[email protected]>)
* Remove unused imports (Signed-off-by: subhankar-ghosh <[email protected]>)
* logging changes (Signed-off-by: subhankar-ghosh <[email protected]>)
* Changed baked_context_embeddings from tensor to flattened embeddings, print->logging (Signed-off-by: subhankar-ghosh <[email protected]>)
* Removed comments and print statements (Signed-off-by: subhankar-ghosh <[email protected]>)

---------

Signed-off-by: subhankar-ghosh <[email protected]>
Signed-off-by: subhankar-ghosh <[email protected]>
Signed-off-by: Subhankar Ghosh <[email protected]>
Co-authored-by: subhankar-ghosh <[email protected]>
1 parent a599d89 commit 8df83ac

File tree

15 files changed (+1976 / -924 lines changed)

examples/tts/magpietts_inference.py

Lines changed: 612 additions & 0 deletions
Large diffs are not rendered by default.
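The new 612-line example script is not rendered in this view. As orientation only, here is a minimal sketch of how the refactored public API could be driven, based on the subpackage docstring added later in this commit; the file paths are placeholders, and only construction of the runner is shown, since the runner's entry-point methods do not appear in the rendered portion of the diff.

# Sketch based on the magpietts_inference subpackage docstring in this commit.
# Paths are placeholders; how the runner is invoked on a manifest is not shown here.
from nemo.collections.tts.modules.magpietts_inference import (
    InferenceConfig,
    MagpieInferenceRunner,
    ModelLoadConfig,
    load_magpie_model,
)

# Load the MagpieTTS checkpoint together with its codec model.
model_config = ModelLoadConfig(
    nemo_file="/path/to/model.nemo",
    codecmodel_path="/path/to/codec.nemo",
)
model, checkpoint_name = load_magpie_model(model_config)

# Configure sampling and build the inference runner.
inference_config = InferenceConfig(temperature=0.6, topk=80)
runner = MagpieInferenceRunner(model, inference_config)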

nemo/collections/tts/models/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,7 @@
 from nemo.collections.tts.models.fastpitch import FastPitchModel
 from nemo.collections.tts.models.fastpitch_ssl import FastPitchModel_SSL
 from nemo.collections.tts.models.hifigan import HifiGanModel
-from nemo.collections.tts.models.magpietts import MagpieTTSModel
+from nemo.collections.tts.models.magpietts import InferBatchOutput, MagpieTTSModel
 from nemo.collections.tts.models.magpietts_preference_optimization import (
     MagpieTTSModelOfflinePO,
     MagpieTTSModelOfflinePODataGen,
@@ -41,6 +41,7 @@
     "SSLDisentangler",
     "GriffinLimModel",
     "HifiGanModel",
+    "InferBatchOutput",
     "MelPsuedoInverseModel",
     "MixerTTSModel",
     "RadTTSModel",

nemo/collections/tts/models/magpietts.py

Lines changed: 241 additions & 32 deletions
Large diffs are not rendered by default.
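The magpietts.py diff, which introduces the InferBatchOutput dataclass, is also not rendered. Below is a minimal sketch of the new calling convention, inferred from the changes to magpietts_preference_optimization.py shown next; the attribute names and infer_batch keyword arguments are taken from that diff, while the sampling values and the surrounding function are illustrative assumptions.

# Sketch only: infer_batch now returns a dataclass, so results are unpacked by
# attribute name instead of by tuple position. Names follow the diff below.
from nemo.collections.tts.models import MagpieTTSModel


def run_inference(model: MagpieTTSModel, batch):
    output = model.infer_batch(
        batch,
        max_decoder_steps=500,
        temperature=0.6,
        topk=80,
        use_cfg=False,
        cfg_scale=1.0,
    )
    # Previously: predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, _ = ...
    return (
        output.predicted_audio,
        output.predicted_audio_lens,
        output.predicted_codes,
        output.predicted_codes_lens,
    )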

nemo/collections/tts/models/magpietts_preference_optimization.py

Lines changed: 10 additions & 2 deletions
@@ -98,14 +98,18 @@ def test_step(self, batch, batch_idx):
         topk = self.cfg.get('inference_topk', 80)
         use_cfg = self.cfg.get('inference_use_cfg', False)
         cfg_scale = self.cfg.get('inference_cfg_scale', 1.0)
-        predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, _ = self.infer_batch(
+        output = self.infer_batch(
             batch,
             max_decoder_steps=self.cfg.get('max_decoder_steps', 500),
             temperature=temperature,
             topk=topk,
             use_cfg=use_cfg,
             cfg_scale=cfg_scale,
         )
+        predicted_audio = output.predicted_audio
+        predicted_audio_lens = output.predicted_audio_lens
+        predicted_codes = output.predicted_codes
+        predicted_codes_lens = output.predicted_codes_lens
         predicted_audio_paths = []
         audio_durations = []
         batch_invalid = False
@@ -612,7 +616,7 @@ def generate_and_reward(
         use_cfg = random.random() < self.cfg.inference_cfg_prob
         cfg_scale = self.cfg.get('inference_cfg_scale', 1.0)

-        predicted_audio, predicted_audio_lens, predicted_codes, predicted_codes_lens, _ = self.infer_batch(
+        output = self.infer_batch(
             batch_repeated,
             max_decoder_steps=self.max_decoder_steps,
             temperature=temperature,
@@ -622,6 +626,10 @@
             use_local_transformer_for_inference=use_local_transformer_for_inference,
             use_LT_kv_cache=False,  # We don't use KV caching for local transformer in GRPO due to issues.
         )
+        predicted_audio = output.predicted_audio
+        predicted_audio_lens = output.predicted_audio_lens
+        predicted_codes = output.predicted_codes
+        predicted_codes_lens = output.predicted_codes_lens
         predicted_audio_paths = []
         audio_durations = []
         for idx in range(predicted_audio.size(0)):
nemo/collections/tts/modules/magpietts_inference/__init__.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MagpieTTS inference and evaluation subpackage.
+
+This package provides modular components for:
+- Model loading and configuration (utils.py)
+- Batch inference (inference.py)
+- Audio quality evaluation (evaluation.py)
+- Metrics visualization (visualization.py)
+
+Example Usage:
+    from examples.tts.magpietts import (
+        InferenceConfig,
+        MagpieInferenceRunner,
+        load_magpie_model,
+        ModelLoadConfig,
+    )
+
+    # Load model
+    model_config = ModelLoadConfig(
+        nemo_file="/path/to/model.nemo",
+        codecmodel_path="/path/to/codec.nemo",
+    )
+    model, checkpoint_name = load_magpie_model(model_config)
+
+    # Create runner and run inference
+    inference_config = InferenceConfig(temperature=0.6, topk=80)
+    runner = MagpieInferenceRunner(model, inference_config)
+"""
+
+from nemo.collections.tts.modules.magpietts_inference.evaluation import (
+    DEFAULT_VIOLIN_METRICS,
+    STANDARD_METRIC_KEYS,
+    EvaluationConfig,
+    compute_mean_with_confidence_interval,
+    evaluate_generated_audio_dir,
+)
+from nemo.collections.tts.modules.magpietts_inference.inference import InferenceConfig, MagpieInferenceRunner
+from nemo.collections.tts.modules.magpietts_inference.utils import ModelLoadConfig, load_magpie_model
+from nemo.collections.tts.modules.magpietts_inference.visualization import create_combined_box_plot, create_violin_plot
+
+__all__ = [
+    # Utils
+    "ModelLoadConfig",
+    "load_magpie_model",
+    # Inference
+    "InferenceConfig",
+    "MagpieInferenceRunner",
+    # Evaluation
+    "EvaluationConfig",
+    "evaluate_generated_audio_dir",
+    "compute_mean_with_confidence_interval",
+    "STANDARD_METRIC_KEYS",
+    "DEFAULT_VIOLIN_METRICS",
+    # Visualization
+    "create_violin_plot",
+    "create_combined_box_plot",
+]
nemo/collections/tts/modules/magpietts_inference/evalset_config.json

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+{
+    "an4_val_ci": {
+        "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1.json",
+        "audio_dir": "/",
+        "feature_dir": null
+    }
+}
+

scripts/magpietts/evaluate_generated_audio.py renamed to nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py

Lines changed: 29 additions & 37 deletions
@@ -16,24 +16,36 @@
 """
 import argparse
 import json
-import logging
 import os
 import pprint
 import string
 import tempfile
 import time
-from contextlib import contextmanager
 from functools import partial
+from pathlib import Path

 import librosa
 import numpy as np
-import scripts.magpietts.evalset_config as evalset_config
 import soundfile as sf
 import torch
 from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector, WhisperForConditionalGeneration, WhisperProcessor

 import nemo.collections.asr as nemo_asr
 from nemo.collections.asr.metrics.wer import word_error_rate_detail
+from nemo.utils import logging
+
+# Path to evalset config JSON
+EVALSET_CONFIG_PATH = Path(__file__).parent / 'evalset_config.json'
+
+
+def load_evalset_config(config_path: str = None) -> dict:
+    """Load dataset meta info from JSON config file."""
+    if config_path is None:
+        config_path = EVALSET_CONFIG_PATH
+    with open(config_path, 'r') as f:
+        return json.load(f)
+
+
 from nemo.collections.tts.modules.utmosv2 import UTMOSv2Calculator

@@ -126,31 +138,12 @@ def pad_audio_to_min_length(audio_np: np.ndarray, sampling_rate: int, min_second
     min_samples = round(min_seconds * sampling_rate)

     if n_samples < min_samples:
-        print(f"Padding audio from {n_samples/sampling_rate} seconds to {min_samples/sampling_rate} seconds")
+        logging.info(f"Padding audio from {n_samples/sampling_rate} seconds to {min_samples/sampling_rate} seconds")
         padding_needed = min_samples - n_samples
         audio_np = np.pad(audio_np, (0, padding_needed), mode='constant', constant_values=0)
     return audio_np


-@contextmanager
-def nemo_log_level(level):
-    """
-    A context manager that temporarily sets the logging level for the NeMo logger
-    and restores the original level when the context manager is exited.
-
-    Args:
-        level (int): The logging level to set.
-    """
-    logger = logging.getLogger("nemo_logger")
-    original_level = logger.level
-    logger.setLevel(level)
-    try:
-        yield
-    finally:
-        # restore the original level when the context manager is exited (even if an exception was raised)
-        logger.setLevel(original_level)
-
-
 def extract_embedding(model, extractor, audio_path, device, sv_model_type):
     speech_array, sampling_rate = librosa.load(audio_path, sr=16000)
     # pad to 0.5 seconds as the extractor may not be able to handle very short signals

@@ -170,14 +163,14 @@ def extract_embedding(model, extractor, audio_path, device, sv_model_type):


 def compute_utmosv2_scores(audio_dir, device):
-    print(f"\nComputing UTMOSv2 scores for files in {audio_dir}...")
+    logging.info(f"\nComputing UTMOSv2 scores for files in {audio_dir}...")
     start_time = time.time()
     utmosv2_calculator = UTMOSv2Calculator(device=device)
     utmosv2_scores = utmosv2_calculator.process_directory(audio_dir)
     # convert to to a dictionary indexed by file path
     utmosv2_scores_dict = {os.path.normpath(item['file_path']): item['predicted_mos'] for item in utmosv2_scores}
     end_time = time.time()
-    print(f"UTMOSv2 scores computed for {len(utmosv2_scores)} files in {end_time - start_time:.2f} seconds\n")
+    logging.info(f"UTMOSv2 scores computed for {len(utmosv2_scores)} files in {end_time - start_time:.2f} seconds\n")
     return utmosv2_scores_dict


@@ -221,12 +214,11 @@ def evaluate(
     )
     speaker_verification_model = speaker_verification_model.to(device)
     speaker_verification_model.eval()
-    with nemo_log_level(logging.ERROR):
-        # The model `titanet_small` prints thousands of lines during initialization, so suppress logs temporarily
-        print("Loading `titanet_small` model...")
-        speaker_verification_model_alternate = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
-            model_name='titanet_small'
-        )
+    # The model `titanet_small` prints thousands of lines during initialization, so suppress logs temporarily
+    logging.info("Loading `titanet_small` model...")
+    speaker_verification_model_alternate = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
+        model_name='titanet_small'
+    )
     speaker_verification_model_alternate = speaker_verification_model_alternate.to(device)
     speaker_verification_model_alternate.eval()

@@ -269,7 +261,7 @@
             )
             gt_audio_text = process_text(gt_audio_text)
         except Exception as e:
-            print("Error during ASR: {}".format(e))
+            logging.info("Error during ASR: {}".format(e))
             pred_text = ""
             gt_audio_text = ""

@@ -283,10 +275,10 @@
         detailed_cer = word_error_rate_detail(hypotheses=[pred_text], references=[gt_text], use_cer=True)
         detailed_wer = word_error_rate_detail(hypotheses=[pred_text], references=[gt_text], use_cer=False)

-        print("{} GT Text:".format(ridx), gt_text)
-        print("{} Pr Text:".format(ridx), pred_text)
+        logging.info(f"{ridx} GT Text: {gt_text}")
+        logging.info(f"{ridx} Pr Text: {pred_text}")
         # Format cer and wer to 2 decimal places
-        print("CER:", "{:.4f} | WER: {:.4f}".format(detailed_cer[0], detailed_wer[0]))
+        logging.info("CER:", "{:.4f} | WER: {:.4f}".format(detailed_cer[0], detailed_wer[0]))

         pred_texts.append(pred_text)
         gt_texts.append(gt_text)

@@ -431,8 +423,8 @@ def main():
     args = parser.parse_args()

     if args.evalset is not None:
-        dataset_meta_info = evalset_config.dataset_meta_info
-        assert args.evalset in dataset_meta_info
+        dataset_meta_info = load_evalset_config()
+        assert args.evalset in dataset_meta_info, f"Dataset '{args.evalset}' not found in evalset_config.json"
         args.manifest_path = dataset_meta_info[args.evalset]['manifest_path']
         args.audio_dir = dataset_meta_info[args.evalset]['audio_dir']
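Since evalset definitions now live in a JSON file next to the module instead of a Python config, here is a short usage sketch of the new lookup; load_evalset_config and the an4_val_ci entry come from this diff, and anything beyond that is illustrative.

# Sketch: dataset metadata is now read from evalset_config.json via load_evalset_config().
from nemo.collections.tts.modules.magpietts_inference.evaluate_generated_audio import load_evalset_config

dataset_meta_info = load_evalset_config()   # defaults to EVALSET_CONFIG_PATH next to the module
info = dataset_meta_info["an4_val_ci"]      # one entry per evaluation set
print(info["manifest_path"], info["audio_dir"], info["feature_dir"])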
