NVIDIA-NeMo
diff --git a/‎examples/asr/asr_streaming_inference/README.md‎
Lines changed: 1 addition & 0 deletions b/‎examples/asr/asr_streaming_inference/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/asr/asr_streaming_inference/asr_streaming_infer.py‎
Lines changed: 17 additions & 6 deletions b/‎examples/asr/asr_streaming_inference/asr_streaming_infer.py‎
Lines changed: 17 additions & 6 deletions
diff --git a/‎examples/asr/conf/asr_streaming_inference/buffered_ctc.yaml‎
Lines changed: 44 additions & 1 deletion b/‎examples/asr/conf/asr_streaming_inference/buffered_ctc.yaml‎
Lines changed: 44 additions & 1 deletion
diff --git a/‎examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml‎
Lines changed: 43 additions & 1 deletion b/‎examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml‎
Lines changed: 43 additions & 1 deletion
diff --git a/‎examples/asr/conf/asr_streaming_inference/cache_aware_ctc.yaml‎
Lines changed: 43 additions & 0 deletions b/‎examples/asr/conf/asr_streaming_inference/cache_aware_ctc.yaml‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎examples/asr/conf/asr_streaming_inference/cache_aware_rnnt.yaml‎
Lines changed: 44 additions & 1 deletion b/‎examples/asr/conf/asr_streaming_inference/cache_aware_rnnt.yaml‎
Lines changed: 44 additions & 1 deletion
@@ -6,6 +6,7 @@ Beyond streaming ASR, the script also supports:
 
 * **Inverse Text Normalization (ITN)**
 * **End-of-Utterance (EoU) Detection**
+* **Streaming Speech Translation (requires vLLM installation)**
 * **Word-level and Segment-level Output**
 
 All related configurations can be found in the `../conf/asr_streaming_inference/` directory.
@@ -30,7 +30,8 @@
         output_filename=<path to output jsonfile> \
         lang=en \
         enable_pnc=False \
-        enable_itn=True \
+        enable_itn=False \
+        enable_nmt=False \
         asr_output_granularity=segment \
         ...
         # See ../conf/asr_streaming_inference/*.yaml for all available options
@@ -45,9 +46,9 @@
 
 import hydra
 
-
 from nemo.collections.asr.inference.factory.pipeline_builder import PipelineBuilder
 from nemo.collections.asr.inference.utils.manifest_io import calculate_duration, dump_output, get_audio_filepaths
+from nemo.collections.asr.inference.utils.pipeline_eval import calculate_pipeline_laal, evaluate_pipeline
 from nemo.collections.asr.inference.utils.progressbar import TQDMProgressBar
 from nemo.utils import logging
 
@@ -69,8 +70,11 @@ def main(cfg):
     logging.setLevel(cfg.log_level)
 
     # Reading audio filepaths
-    audio_filepaths = get_audio_filepaths(cfg.audio_file, sort_by_duration=True)
+    audio_filepaths, manifest = get_audio_filepaths(cfg.audio_file, sort_by_duration=True)
     logging.info(f"Found {len(audio_filepaths)} audio files")
+    if manifest:
+        keys = list(manifest[0].keys())
+        logging.info(f"Found {len(keys)} keys in the input manifest: {keys}")
 
     # Build the pipeline
     pipeline = PipelineBuilder.build_pipeline(cfg)
@@ -82,13 +86,20 @@ def main(cfg):
     exec_dur = time() - start
 
     # Calculate RTFX
-    data_dur = calculate_duration(audio_filepaths)
+    data_dur, durations = calculate_duration(audio_filepaths)
     rtfx = data_dur / exec_dur if exec_dur > 0 else float('inf')
     logging.info(f"RTFX: {rtfx:.2f} ({data_dur:.2f}s / {exec_dur:.2f}s)")
 
+    # Calculate LAAL
+    laal = calculate_pipeline_laal(output, durations, manifest, cfg)
+    if laal is not None:
+        logging.info(f"LAAL: {laal:.2f}ms")
+
     # Dump the transcriptions to an output file
-    dump_output(output, cfg.output_filename, cfg.output_dir)
-    logging.info(f"Transcriptions written to {cfg.output_filename}")
+    dump_output(output, cfg.output_filename, cfg.output_dir, manifest)
+
+    # Evaluate the pipeline
+    evaluate_pipeline(cfg.output_filename, cfg)
     logging.info("Done!")
 
 
 
@@ -22,6 +22,27 @@ itn:
   n_jobs: 16                                    # Number of parallel jobs for ITN processing
 
 
+# ================================
+# Neural Machine Translation Configuration
+# ================================
+nmt:
+  model_name: "utter-project/EuroLLM-1.7B-Instruct"  # vLLM-supported model name
+  source_language: "English"                         # Source language name
+  target_language: "Russian"                         # Target language name
+  waitk: -1                                          # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
+  device: cuda                                       # Device for translation: 'cuda'. 'cpu' is not supported.
+  device_id: 1                                       # GPU device ID for translation
+  batch_size: 16                                     # Batch size for translation; if -1, the ASR batch size is used
+  llm_params:                                        # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
+    dtype: "auto"                                    # Compute precision
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+  sampling_params:                                   # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
+    max_tokens: 100                                  # Maximum number of tokens to generate with LLM
+    temperature: 0.0                                 # LLM sampling temperature, default for translation is 0 (greedy)
+    top_p: 0.9                                       # The cumulative probability threshold for nucleus sampling
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+
+
 # ========================
 # Confidence estimation
 # ========================
@@ -67,14 +88,36 @@ asr_decoding_type: ctc                        # Decoding method: ctc or rnnt
 
 
 # ========================
-# Runtime arguments defined at runtime   via command line
+# Runtime arguments defined at runtime via command line
 # ========================
 audio_file: null                              # Path to audio file, directory, or manifest JSON
 output_filename: null                         # Path to output transcription JSON file
 output_dir: null                              # Directory to save time-aligned output
 enable_pnc: false                             # Whether to apply punctuation & capitalization
 enable_itn: false                             # Whether to apply inverse text normalization
+enable_nmt: false                             # Whether to apply neural machine translation
 asr_output_granularity: segment               # Output granularity: word or segment
 cache_dir: null                               # Directory to store cache (e.g., .far files)
 lang: null                                    # Language code for ASR model
 return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
+calculate_wer: true                           # Whether to calculate WER
+calculate_bleu: true                          # Whether to calculate BLEU score
+
+
+# ========================
+# Metrics
+# ========================
+metrics:
+  asr:
+    gt_text_attr_name: text                     # Attribute name for ground truth text
+    clean_groundtruth_text: false               # Whether to clean ground truth text
+    langid: en                                  # Language code for text normalization; only "en" is supported
+    use_cer: false                              # Whether to use character error rate
+    ignore_capitalization: true                 # Whether to ignore capitalization
+    ignore_punctuation: true                    # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
+  nmt:
+    gt_text_attr_name: answer                   # Attribute name for ground truth text
+    ignore_capitalization: false                # Whether to ignore capitalization
+    ignore_punctuation: false                   # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
@@ -39,6 +39,27 @@ itn:
   n_jobs: 16                                    # Number of parallel jobs for ITN processing
 
 
+# ================================
+# Neural Machine Translation Configuration
+# ================================
+nmt:
+  model_name: "utter-project/EuroLLM-1.7B-Instruct"  # vLLM-supported model name
+  source_language: "English"                         # Source language name
+  target_language: "Russian"                         # Target language name
+  waitk: -1                                          # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
+  device: cuda                                       # Device for translation: 'cuda'. 'cpu' is not supported.
+  device_id: 1                                       # GPU device ID for translation
+  batch_size: 16                                     # Batch size for translation; if -1, the ASR batch size is used
+  llm_params:                                        # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
+    dtype: "auto"                                    # Compute precision
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+  sampling_params:                                   # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
+    max_tokens: 100                                  # Maximum number of tokens to generate with LLM
+    temperature: 0.0                                 # LLM sampling temperature, default for translation is 0 (greedy)
+    top_p: 0.9                                       # The cumulative probability threshold for nucleus sampling
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+
+
 # ========================
 # Confidence estimation
 # ========================
@@ -85,14 +106,35 @@ asr_decoding_type: rnnt                    # Decoding method: ctc or rnnt
 
 
 # ========================
-# Runtime arguments defined at runtime   via command line
+# Runtime arguments defined at runtime via command line
 # ========================
 audio_file: null                              # Path to audio file, directory, or manifest JSON
 output_filename: null                         # Path to output transcription JSON file
 output_dir: null                              # Directory to save time-aligned output
 enable_pnc: false                             # Whether to apply punctuation & capitalization
 enable_itn: false                             # Whether to apply inverse text normalization
+enable_nmt: false                             # Whether to apply neural machine translation
 asr_output_granularity: segment               # Output granularity: word or segment
 cache_dir: null                               # Directory to store cache (e.g., .far files)
 lang: null                                    # Language code for ASR model
 return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
+calculate_wer: true                           # Whether to calculate WER
+calculate_bleu: true                          # Whether to calculate BLEU score
+
+# ========================
+# Metrics
+# ========================
+metrics:
+  asr:
+    gt_text_attr_name: text                     # Attribute name for ground truth text
+    clean_groundtruth_text: false               # Whether to clean ground truth text
+    langid: en                                  # Language code for text normalization; only "en" is supported
+    use_cer: false                              # Whether to use character error rate
+    ignore_capitalization: true                 # Whether to ignore capitalization
+    ignore_punctuation: true                    # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
+  nmt:
+    gt_text_attr_name: answer                   # Attribute name for ground truth text
+    ignore_capitalization: false                # Whether to ignore capitalization
+    ignore_punctuation: false                   # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
@@ -22,6 +22,27 @@ itn:
   n_jobs: 16                                    # Number of parallel jobs for ITN processing
 
 
+# ================================
+# Neural Machine Translation Configuration
+# ================================
+nmt:
+  model_name: "utter-project/EuroLLM-1.7B-Instruct"  # vLLM-supported model name
+  source_language: "English"                         # Source language name
+  target_language: "Russian"                         # Target language name
+  waitk: -1                                          # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
+  device: cuda                                       # Device for translation: 'cuda'. 'cpu' is not supported.
+  device_id: 1                                       # GPU device ID for translation
+  batch_size: 16                                     # Batch size for translation; if -1, the ASR batch size is used
+  llm_params:                                        # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
+    dtype: "auto"                                    # Compute precision
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+  sampling_params:                                   # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
+    max_tokens: 100                                  # Maximum number of tokens to generate with LLM
+    temperature: 0.0                                 # LLM sampling temperature, default for translation is 0 (greedy)
+    top_p: 0.9                                       # The cumulative probability threshold for nucleus sampling
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+
+
 # ========================
 # Confidence estimation
 # ========================
@@ -74,7 +95,29 @@ output_filename: null                         # Path to output transcription JSO
 output_dir: null                              # Directory to save time-aligned output
 enable_pnc: false                             # Whether to apply punctuation & capitalization
 enable_itn: false                             # Whether to apply inverse text normalization
+enable_nmt: false                             # Whether to apply neural machine translation
 asr_output_granularity: segment               # Output granularity: word or segment
 cache_dir: null                               # Directory to store cache (e.g., .far files)
 lang: null                                    # Language code for ASR model
 return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
+calculate_wer: true                           # Whether to calculate WER
+calculate_bleu: true                          # Whether to calculate BLEU score
+
+
+# ========================
+# Metrics
+# ========================
+metrics:
+  asr:
+    gt_text_attr_name: text                     # Attribute name for ground truth text
+    clean_groundtruth_text: false               # Whether to clean ground truth text
+    langid: en                                  # Language code for text normalization; only "en" is supported
+    use_cer: false                              # Whether to use character error rate
+    ignore_capitalization: true                 # Whether to ignore capitalization
+    ignore_punctuation: true                    # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
+  nmt:
+    gt_text_attr_name: answer                   # Attribute name for ground truth text
+    ignore_capitalization: false                # Whether to ignore capitalization
+    ignore_punctuation: false                   # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
@@ -38,6 +38,27 @@ itn:
   n_jobs: 16                                    # Number of parallel jobs for ITN processing
 
 
+# ================================
+# Neural Machine Translation Configuration
+# ================================
+nmt:
+  model_name: "utter-project/EuroLLM-1.7B-Instruct"  # vLLM-supported model name
+  source_language: "English"                         # Source language name
+  target_language: "Russian"                         # Target language name
+  waitk: -1                                          # Max allowed lag (in words) between ASR transcript and translation; -1 disables it and uses only the longest common prefix between current and previous translations.
+  device: cuda                                       # Device for translation: 'cuda'. 'cpu' is not supported.
+  device_id: 1                                       # GPU device ID for translation
+  batch_size: 16                                     # Batch size for translation; if -1, the ASR batch size is used
+  llm_params:                                        # See https://docs.vllm.ai/en/v0.8.1/api/offline_inference/llm.html for more details
+    dtype: "auto"                                    # Compute precision
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+  sampling_params:                                   # See https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html for more details
+    max_tokens: 100                                  # Maximum number of tokens to generate with LLM
+    temperature: 0.0                                 # LLM sampling temperature, default for translation is 0 (greedy)
+    top_p: 0.9                                       # The cumulative probability threshold for nucleus sampling
+    seed: 42                                         # The seed to initialize the random number generator for sampling
+
+
 # ========================
 # Confidence estimation
 # ========================
@@ -84,14 +105,36 @@ asr_decoding_type: rnnt                       # Decoding method: ctc or rnnt
 
 
 # ========================
-# Runtime arguments defined at runtime   via command line
+# Runtime arguments defined at runtime via command line
 # ========================
 audio_file: null                              # Path to audio file, directory, or manifest JSON
 output_filename: null                         # Path to output transcription JSON file
 output_dir: null                              # Directory to save time-aligned output
 enable_pnc: false                             # Whether to apply punctuation & capitalization
 enable_itn: false                             # Whether to apply inverse text normalization
+enable_nmt: false                             # Whether to apply neural machine translation
 asr_output_granularity: segment               # Output granularity: word or segment
 cache_dir: null                               # Directory to store cache (e.g., .far files)
 lang: null                                    # Language code for ASR model
 return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
+calculate_wer: true                           # Whether to calculate WER
+calculate_bleu: true                          # Whether to calculate BLEU score
+
+
+# ========================
+# Metrics
+# ========================
+metrics:
+  asr:
+    gt_text_attr_name: text                     # Attribute name for ground truth text
+    clean_groundtruth_text: false               # Whether to clean ground truth text
+    langid: en                                  # Language code for text normalization; only "en" is supported
+    use_cer: false                              # Whether to use character error rate
+    ignore_capitalization: true                 # Whether to ignore capitalization
+    ignore_punctuation: true                    # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space
+  nmt:
+    gt_text_attr_name: answer                   # Attribute name for ground truth text
+    ignore_capitalization: false                # Whether to ignore capitalization
+    ignore_punctuation: false                   # Whether to ignore punctuation
+    strip_punc_space: false                     # Whether to strip punctuation and space