Commit b944578

Merge remote-tracking branch 'nvidia/main' into tts_2512_removetorchaudio

2 parents: 0d7c14c + 527b8c4

64 files changed: +3457 −439 lines

.github/workflows/cicd-main-nemo2.yml

Lines changed: 2 additions & 2 deletions
@@ -201,8 +201,8 @@ jobs:
             runner: self-hosted-azure-gpus-1
           - script: L2_NeMo_2_Auto_Configurator_bert_TP1_PP1_MBS124
             runner: self-hosted-azure-gpus-1
-          - script: L2_NeMo_2_Auto_Configurator_t5_TP1_PP1_MBS124
-            runner: self-hosted-azure-gpus-1
+          # - script: L2_NeMo_2_Auto_Configurator_t5_TP1_PP1_MBS124 #skipping t5 hanging tests
+          #   runner: self-hosted-azure-gpus-1
           - script: L2_NeMo_2_Auto_Configurator_callbacks
             runner: self-hosted-azure-gpus-1
           - script: L2_NeMo_2_Conversion_Test_Baichuan2

.github/workflows/cicd-main-speech.yml

Lines changed: 4 additions & 0 deletions
@@ -131,6 +131,10 @@ jobs:
             script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Streaming_Inference
+          - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Speech_to_Text_Inference_Boost_GT
+          - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Speech_to_Text_Transcribe_Boost_GT
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
           - runner: self-hosted-azure

examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py

Lines changed: 79 additions & 2 deletions
@@ -65,13 +65,15 @@
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = alloc_conf


+import librosa
 import lightning.pytorch as pl
 import torch
 from omegaconf import OmegaConf, open_dict
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

 from nemo.collections.asr.models import EncDecHybridRNNTCTCModel, EncDecRNNTModel
+from nemo.collections.asr.parts.context_biasing.biasing_multi_model import BiasingRequestItemConfig
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.collections.asr.parts.submodules.transducer_decoding.label_looping_base import (
     GreedyBatchedLabelLoopingComputerBase,

@@ -95,6 +97,7 @@
 )
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
+from nemo.utils.timers import SimpleTimer


 def make_divisible_by(num, factor: int) -> int:

@@ -113,6 +116,7 @@ class TranscriptionConfig:
     pretrained_name: Optional[str] = None  # Name of a pretrained model
     audio_dir: Optional[str] = None  # Path to a directory which contains audio files
     dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest
+    sort_by_duration: bool = True  # sort manifest/audio files by duration (descending)

     # General configs
     output_filename: Optional[str] = None

@@ -145,6 +149,8 @@ class TranscriptionConfig:

     # Decoding strategy for RNNT models
     decoding: RNNTDecodingConfig = field(default_factory=RNNTDecodingConfig)
+    # Per-utterance biasing with biasing config in the manifest
+    use_per_stream_biasing: bool = False

     timestamps: bool = False  # output timestamps

@@ -154,6 +160,8 @@ class TranscriptionConfig:
     langid: str = "en"  # specify this for convert_num_to_words step in groundtruth cleaning
     use_cer: bool = False

+    calculate_rtfx: bool = False
+

 @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
 def main(cfg: TranscriptionConfig) -> TranscriptionConfig:

@@ -216,6 +224,8 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
     asr_model = asr_model.to(asr_model.device)
     asr_model.to(compute_dtype)

+    use_per_stream_biasing = cfg.use_per_stream_biasing
+
     # Change Decoding Config
     with open_dict(cfg.decoding):
         if cfg.decoding.strategy != "greedy_batch" or cfg.decoding.greedy.loop_labels is not True:

@@ -226,6 +236,8 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
         cfg.decoding.greedy.preserve_alignments = False
         cfg.decoding.fused_batch_size = -1  # temporarily stop fused batch during inference.
         cfg.decoding.beam.return_best_hypothesis = True  # return and write the best hypothesis only
+        if use_per_stream_biasing:
+            cfg.decoding.greedy.enable_per_stream_biasing = use_per_stream_biasing

     # Setup decoding strategy
     if hasattr(asr_model, 'change_decoding_strategy'):

@@ -250,6 +262,14 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
         assert filepaths is not None
         records = [{"audio_filepath": audio_file} for audio_file in filepaths]

+    if cfg.sort_by_duration:
+        filepath2order = dict()
+        for i, record in enumerate(records):
+            if "duration" not in record:
+                record["duration"] = librosa.get_duration(path=record["audio_filepath"])
+            filepath2order[record["audio_filepath"]] = i
+        records.sort(key=lambda record: record["duration"], reverse=True)
+
     asr_model.preprocessor.featurizer.dither = 0.0
     asr_model.preprocessor.featurizer.pad_to = 0
     asr_model.eval()
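
Note: this hunk introduces the first half of a sort-then-restore pattern (the second half appears near the end of this file). A minimal self-contained sketch of the idea, with made-up file names and durations rather than NeMo APIs:

# Sketch of the sort-by-duration / restore-order pattern used above.
# Records are sorted longest-first so batches group similar lengths (less padding),
# and the original order is recovered afterwards via a filepath -> index map.
records = [
    {"audio_filepath": "a.wav", "duration": 1.2},  # hypothetical entries
    {"audio_filepath": "b.wav", "duration": 7.5},
    {"audio_filepath": "c.wav", "duration": 3.1},
]
filepath2order = {r["audio_filepath"]: i for i, r in enumerate(records)}
records.sort(key=lambda r: r["duration"], reverse=True)  # decode in this order
hyps = [r["audio_filepath"].upper() for r in records]    # stand-in for decoding output
restored = sorted(zip(records, hyps), key=lambda rh: filepath2order[rh[0]["audio_filepath"]])
records, hyps = map(list, zip(*restored))
assert [r["audio_filepath"] for r in records] == ["a.wav", "b.wav", "c.wav"]
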
@@ -289,8 +309,27 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
     latency_secs = (context_samples.chunk + context_samples.right) / audio_sample_rate
     logging.info(f"Theoretical latency: {latency_secs:.2f} seconds")

+    biasing_requests: list[BiasingRequestItemConfig | None] | None
+    if use_per_stream_biasing:
+        biasing_requests = [
+            (
+                BiasingRequestItemConfig(
+                    **OmegaConf.to_container(
+                        OmegaConf.merge(OmegaConf.structured(BiasingRequestItemConfig), record["biasing_request"])
+                    )
+                )
+                if "biasing_request" in record
+                else None
+            )
+            for record in records
+        ]
+    else:
+        biasing_requests = None
+
     audio_dataset = SimpleAudioDataset(
-        audio_filenames=[record["audio_filepath"] for record in records], sample_rate=audio_sample_rate
+        audio_filenames=[record["audio_filepath"] for record in records],
+        sample_rate=audio_sample_rate,
+        biasing_requests=biasing_requests,
     )
     audio_dataloader = DataLoader(
         dataset=audio_dataset,
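
Note: per-stream biasing is driven by an optional `biasing_request` field in each manifest record, merged onto the structured `BiasingRequestItemConfig` schema as above. A sketch of that merge for a single record; the `key_phrases_list` field name is a hypothetical placeholder, since the dataclass fields are not shown in this diff:

# Sketch of how one manifest record's "biasing_request" becomes a config object,
# mirroring the list comprehension above. "key_phrases_list" is a guessed field.
from omegaconf import OmegaConf

from nemo.collections.asr.parts.context_biasing.biasing_multi_model import BiasingRequestItemConfig

record = {
    "audio_filepath": "utt1.wav",
    "biasing_request": {"key_phrases_list": ["NVIDIA", "NeMo"]},  # hypothetical fields
}
merged = OmegaConf.merge(
    OmegaConf.structured(BiasingRequestItemConfig),  # schema with defaults
    record["biasing_request"],                       # per-utterance overrides
)
request = BiasingRequestItemConfig(**OmegaConf.to_container(merged))
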
@@ -302,9 +341,11 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
         in_order=True,
     )

+    timer = SimpleTimer()
     with torch.no_grad(), torch.inference_mode():
         all_hyps = []
         audio_data: AudioBatch
+        timer.start(device=map_location)
         for audio_data in tqdm(audio_dataloader):
             # get audio
             # NB: preprocessor runs on torch.float32, no need to cast dtype here

@@ -313,8 +354,21 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
             batch_size = audio_batch.shape[0]
             device = audio_batch.device

-            # decode audio by chunks
+            # add biasing requests to the decoder
+            if use_per_stream_biasing:
+                multi_biasing_ids = torch.full([batch_size], fill_value=-1, dtype=torch.long, device=map_location)
+                if audio_data.biasing_requests is not None:
+                    for batch_i, request in enumerate(audio_data.biasing_requests):
+                        if request is not None:
+                            biasing_model = request.get_model(tokenizer=asr_model.tokenizer)
+                            if biasing_model is not None:
+                                multi_model_id = decoding_computer.biasing_multi_model.add_model(biasing_model)
+                                request.multi_model_id = multi_model_id
+                                multi_biasing_ids[batch_i] = multi_model_id
+            else:
+                multi_biasing_ids = None

+            # decode audio by chunks
             current_batched_hyps: BatchedHyps | None = None
             state = None
             left_sample = 0

@@ -368,6 +422,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                         encoder_context_batch.chunk,
                     ),
                     prev_batched_state=state,
+                    multi_biasing_ids=multi_biasing_ids,
                 )
                 # merge hyps with previous hyps
                 if current_batched_hyps is None:

@@ -380,7 +435,14 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                 left_sample = right_sample
                 right_sample = min(right_sample + context_samples.chunk, audio_batch.shape[1])  # add next chunk

+            # remove biasing requests from the decoder
+            if use_per_stream_biasing and audio_data.biasing_requests is not None:
+                for request in audio_data.biasing_requests:
+                    if request is not None and request.multi_model_id is not None:
+                        decoding_computer.biasing_multi_model.remove_model(request.multi_model_id)
+                        request.multi_model_id = None
             all_hyps.extend(batched_hyps_to_hypotheses(current_batched_hyps, None, batch_size=batch_size))
+        timer.stop(device=map_location)

     # convert text
     for i, hyp in enumerate(all_hyps):

@@ -394,11 +456,26 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
         )
         all_hyps[i] = hyp

+    if cfg.sort_by_duration:
+        # restore order for all_hyps and records (all_hyps are consistent with records)
+        order_restored = sorted(
+            zip(records, all_hyps), key=lambda records_hyps: filepath2order[records_hyps[0]["audio_filepath"]]
+        )
+        records, all_hyps = map(list, zip(*order_restored))
+
     output_filename, pred_text_attr_name = write_transcription(
         all_hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, timestamps=cfg.timestamps
     )
     logging.info(f"Finished writing predictions to {output_filename}!")

+    if cfg.calculate_rtfx:
+        durations = [
+            record["duration"] if "duration" in record else librosa.get_duration(path=record["audio_filepath"])
+            for record in records
+        ]
+        rtfx = sum(durations) / timer.total_sec()
+        logging.info(f"RTFx: {rtfx:.2f}")
+
     if cfg.calculate_wer:
         output_manifest_w_wer, total_res, _ = cal_write_wer(
             pred_manifest=output_filename,
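
Note: the new `calculate_rtfx` branch computes the inverse real-time factor, RTFx = total audio duration / wall-clock decoding time, so RTFx = 100 means one hour of audio is decoded in 36 seconds. A standalone sketch of the same arithmetic with made-up numbers:

# RTFx = seconds of audio processed per second of compute (higher is better).
durations = [120.0, 240.0, 3240.0]   # hypothetical per-file durations, 3600 s total
processing_time_sec = 36.0           # hypothetical wall-clock decoding time
rtfx = sum(durations) / processing_time_sec
print(f"RTFx: {rtfx:.2f}")           # -> RTFx: 100.00
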

examples/asr/asr_streaming_inference/asr_streaming_infer.py

Lines changed: 30 additions & 7 deletions
@@ -42,15 +42,17 @@
 """


-from time import time
-
 import hydra
+from omegaconf import OmegaConf

 from nemo.collections.asr.inference.factory.pipeline_builder import PipelineBuilder
+from nemo.collections.asr.inference.streaming.framing.request_options import ASRRequestOptions
 from nemo.collections.asr.inference.utils.manifest_io import calculate_duration, dump_output, get_audio_filepaths
 from nemo.collections.asr.inference.utils.pipeline_eval import calculate_pipeline_laal, evaluate_pipeline
 from nemo.collections.asr.inference.utils.progressbar import TQDMProgressBar
+from nemo.collections.asr.parts.context_biasing.biasing_multi_model import BiasingRequestItemConfig
 from nemo.utils import logging
+from nemo.utils.timers import SimpleTimer

 # disable nemo_text_processing logging
 try:

@@ -80,15 +82,36 @@ def main(cfg):
     pipeline = PipelineBuilder.build_pipeline(cfg)
     progress_bar = TQDMProgressBar()

+    # Add biasing requests
+    if manifest:
+        options = [
+            ASRRequestOptions(
+                biasing_cfg=(
+                    BiasingRequestItemConfig(
+                        **OmegaConf.to_container(
+                            OmegaConf.merge(OmegaConf.structured(BiasingRequestItemConfig), record["biasing_request"])
+                        )
+                    )
+                    if "biasing_request" in record
+                    else None
+                )
+            )
+            for record in manifest
+        ]
+    else:
+        options = None
+
     # Run the pipeline
-    start = time()
-    output = pipeline.run(audio_filepaths, progress_bar=progress_bar)
-    exec_dur = time() - start
+    timer = SimpleTimer()
+    timer.start(pipeline.device)
+    output = pipeline.run(audio_filepaths, progress_bar=progress_bar, options=options)
+    timer.stop(pipeline.device)
+    exec_dur = timer.total_sec()

-    # Calculate RTFX
+    # Calculate RTFx
     data_dur, durations = calculate_duration(audio_filepaths)
     rtfx = data_dur / exec_dur if exec_dur > 0 else float('inf')
-    logging.info(f"RTFX: {rtfx:.2f} ({data_dur:.2f}s / {exec_dur:.2f}s)")
+    logging.info(f"RTFx: {rtfx:.2f} ({data_dur:.2f}s / {exec_dur:.2f}s)")

     # Calculate LAAL
     laal = calculate_pipeline_laal(output, durations, manifest, cfg)
examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml

Lines changed: 6 additions & 1 deletion
@@ -13,6 +13,7 @@ asr:
   fused_batch_size: -1
   greedy:
     use_cuda_graph_decoder: true
+    enable_per_stream_biasing: true # Per-stream biasing in decoder
     max_symbols: 10
   # n-gram LM
   ngram_lm_model: null # The path to built '.nemo' NGPU-LM model

@@ -22,7 +23,11 @@ asr:
     model_path: null # The path to built '.nemo' boosting tree model
     key_phrases_file: null # The path to the context-biasing list file (one phrase per line)
     key_phrases_list: null # The list of context-biasing phrases ['word1', 'word2', 'word3', ...]
-    source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer)
+    key_phrase_items_list: null # The list of context-biasing phrases with custom fields
+                                # in CLI: [{phrase:"word1",lang:en},{phrase:"frase dos",lang:es}]
+                                # in code: [PhraseItem(phrase="word1", lang="en"), PhraseItem(phrase="frase dos", lang="es")]
+    source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer),
+                      # used with `key_phrases_file` and `key_phrases_list`
     boosting_tree_alpha: 0.0

examples/asr/conf/asr_streaming_inference/cache_aware_ctc.yaml

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ streaming:
   use_cache: true # Whether to use cache for streaming
   use_feat_cache: true # Whether to cache mel-spec features, set false to re-calculate all mel-spec features in audio buffer
   chunk_size_in_secs: null # Amount of audio to load for each streaming step, e.g., 0.08s for FastConformer. Set to `null` for using default size equal to 1+lookahead frames.
-  request_type: frame # Type of request: frame, only frame is supported for cache-aware streaming
+  request_type: frame # Type of request: frame or feature_buffer
   num_slots: 1024 # Number of slots in the context manager: must be >= batch_size

examples/asr/conf/asr_streaming_inference/cache_aware_rnnt.yaml

Lines changed: 10 additions & 5 deletions
@@ -2,7 +2,7 @@
 # ASR Configuration
 # ================================
 asr:
-  model_name: stt_en_fastconformer_hybrid_large_streaming_multi # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path
+  model_name: nvidia/nemotron-speech-streaming-en-0.6b # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path
   device: cuda # Device for inference: 'cuda' or 'cpu'
   device_id: 0 # GPU device ID
   compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'

@@ -13,6 +13,7 @@ asr:
   fused_batch_size: -1
   greedy:
     use_cuda_graph_decoder: false # Disabled due to issues with decoding
+    enable_per_stream_biasing: false # Per-stream biasing in decoder
     max_symbols: 10
   # n-gram LM
   ngram_lm_model: null # The path to built '.nemo' NGPU-LM model

@@ -22,7 +23,11 @@ asr:
     model_path: null # The path to built '.nemo' boosting tree model
     key_phrases_file: null # The path to the context-biasing list file (one phrase per line)
     key_phrases_list: null # The list of context-biasing phrases ['word1', 'word2', 'word3', ...]
-    source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer)
+    key_phrase_items_list: null # The list of context-biasing phrases with custom fields
+                                # in CLI: [{phrase:"word1",lang:en},{phrase:"frase dos",lang:es}]
+                                # in code: [PhraseItem(phrase="word1", lang="en"), PhraseItem(phrase="frase dos", lang="es")]
+    source_lang: "en" # The source language of the context-biasing phrases (for aggregate tokenizer),
+                      # used with `key_phrases_file` and `key_phrases_list`
     boosting_tree_alpha: 0.0 # Weight of the boosting tree

@@ -85,14 +90,14 @@ endpointing:
 # ========================
 streaming:
   sample_rate: 16000 # Audio sample rate in Hz
-  batch_size: 256 # Number of audio frames per batch
+  batch_size: 64 # Number of audio frames per batch
   word_boundary_tolerance: 4 # Tolerance for word boundaries
   att_context_size: [70,13] # Attention context size: [70,13],[70,6],[70,1],[70,0]
   use_cache: true # Whether to use cache for streaming
   use_feat_cache: true # Whether to cache mel-spec features, set false to re-calculate all mel-spec features in audio buffer
   chunk_size_in_secs: null # Amount of audio to load for each streaming step, e.g., 0.08s for FastConformer. Set to `null` for using default size equal to 1+lookahead frames.
-  request_type: frame # Type of request: frame, only frame is supported for cache-aware streaming
-  num_slots: 1024 # Number of slots in the context manager: must be >= batch_size
+  request_type: frame # Type of request: frame or feature_buffer
+  num_slots: 256 # Number of slots in the context manager: must be >= batch_size


 # ========================
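
Note: per the comments above, `key_phrase_items_list` can be passed inline on the CLI or built in code from `PhraseItem` objects. A sketch of the in-code form; the import path is a guess, since the diff only references the `PhraseItem` name:

# Building a key_phrase_items_list in code, following the YAML comment.
# The import path below is hypothetical; the diff does not show where PhraseItem lives.
from nemo.collections.asr.parts.context_biasing import PhraseItem  # hypothetical path

key_phrase_items_list = [
    PhraseItem(phrase="word1", lang="en"),
    PhraseItem(phrase="frase dos", lang="es"),
]
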

examples/tts/evalset_config.json

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,11 @@
         "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1.json",
         "audio_dir": "/",
         "feature_dir": null
+    },
+    "an4_val_tiny_ci": {
+        "manifest_path": "/home/TestData/an4_dataset/an4_val_context_v1_tiny.json",
+        "audio_dir": "/",
+        "feature_dir": null
     }
 }
examples/tts/magpietts_inference.py

Lines changed: 7 additions & 3 deletions
@@ -504,10 +504,14 @@ def create_argument_parser() -> argparse.ArgumentParser:
     return parser


-def main():
-    """Main entry point."""
+def main(argv=None):
+    """Entry point for MagpieTTS inference and evaluation.
+
+    Args:
+        argv: Command-line arguments. If None, uses sys.argv.
+    """
     parser = create_argument_parser()
-    args = parser.parse_args()
+    args = parser.parse_args(argv)

     dataset_meta_info = load_evalset_config(args.datasets_json_path)
     datasets = filter_datasets(dataset_meta_info, args.datasets)
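
Note: threading `argv` through to `parse_args` makes the entry point callable from tests without patching `sys.argv`. A usage sketch; the flag spellings are assumptions inferred from `args.datasets_json_path` and `args.datasets`, and the parser in `create_argument_parser()` is authoritative:

# Invoke main() programmatically, e.g. from a test. Flag names are guesses
# inferred from the attribute names used above.
main(argv=["--datasets_json_path", "examples/tts/evalset_config.json",
           "--datasets", "an4_val_tiny_ci"])
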
