Commit 454fabc

Canary2 with NFA (#14121)

Authored by monica-sekoyan and nune-tadevosyan
* adding nfa to canary
* remove comments
* Apply isort and black reformatting
* modify external model loading
* Apply isort and black reformatting
* fix audio padding
* reseting
* Apply isort and black reformatting
* handle non-possible alignment
* Apply isort and black reformatting
* add offset refinement
* Apply isort and black reformatting
* Revert "reseting" (this reverts commit 6d74ad0)
* Revert "Apply isort and black reformatting" (this reverts commit 1d8c363)
* handle merge case for timestamps
* add timestamp_type
* Apply isort and black reformatting
* add timestamps support chunked inference
* refactor ctc timestamps to use utils
* correct restore_token_cased with unk_token
* use timestamps utils in rnnt_decoding
* change external timestamps asr model loading
* add forced aligned method tests
* modify nfa to match new setup and utils
* Apply isort and black reformatting
* remove unused imports
* merge conflicts
* remove unused errors
* Apply isort and black reformatting
* remove unused import
* addressing comments, linting and flake8
* Apply isort and black reformatting
* handle decode_ids_to_str change
* Apply isort and black reformatting
* correct usage of decode_tokens_to_str
* update nfa docs
* Apply isort and black reformatting
* revert jupyter settings
* correct description
* make private
* rewrite restore_timestamps_asr_model
* Apply isort and black reformatting
* fix word offset logic
* Apply isort and black reformatting

Signed-off-by: Monica Sekoyan <msekoyan@nvidia.com>
Signed-off-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
Signed-off-by: Nune <ntadevosyan@nvidia.com>
Signed-off-by: nune-tadevosyan <nune-tadevosyan@users.noreply.github.com>
Signed-off-by: monica-sekoyan <msekoyan@vidia.com>
Co-authored-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
Co-authored-by: Nune <ntadevosyan@nvidia.com>
Co-authored-by: nune-tadevosyan <nune-tadevosyan@users.noreply.github.com>
Co-authored-by: monica-sekoyan <msekoyan@vidia.com>
1 parent e295dbc commit 454fabc

29 files changed: +2743 -1819 lines changed

docs/source/tools/nemo_forced_aligner.rst

Lines changed: 2 additions & 2 deletions

@@ -64,9 +64,9 @@ Optional parameters:

 * ``use_local_attention``: boolean flag specifying whether to try to use local attention for the ASR Model (will only work if the ASR Model is a Conformer model). If local attention is used, we will set the local attention context size to [64,64].

-* ``additional_segment_grouping_separator``: an optional string used to separate the text into smaller segments. If this is not specified, then the whole text will be treated as a single segment. (Default: ``None``. Cannot be empty string or space (" "), as NFA will automatically produce word-level timestamps for substrings separated by spaces).
+* ``additional_segment_grouping_separator``: a list of strings used to separate the text into smaller segments. If set to ``None``, the whole text will be treated as a single segment. (Default: ``['.', '?', '!', '...']``. A separator cannot be the empty string or a space (" "), as NFA automatically produces word-level timestamps for substrings separated by spaces.)

-.. note:: the ``additional_segment_grouping_separator`` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if ``additional_segment_grouping_separator="|"``, the following texts will be treated equivalently: ``"abc|def"``, ``"abc |def"``, ``"abc| def"``, ``"abc | def"``.
+.. note:: Starting in NeMo 2.5.0, separators are preserved in the segment text after splitting. If ``additional_segment_grouping_separator=['.', '?', '!', '...']`` (the default), the text ``"Hi, have you updated your NeMo? Yes. Sure!"`` yields the segments ``["Hi, have you updated your NeMo?", "Yes.", "Sure!"]``.

 * ``remove_blank_tokens_from_ctm``: a boolean denoting whether to remove <blank> tokens from token-level output CTMs. (Default: False).
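The splitting behavior described in the note can be sketched in plain Python. This is an illustrative re-implementation, not NFA's actual code; the helper name `split_segments` and the regex approach are assumptions.

```python
import re

def split_segments(text, separators=('...', '.', '?', '!')):
    # Longest separators first so "..." is not consumed as three "." splits.
    pattern = '|'.join(re.escape(s) for s in sorted(separators, key=len, reverse=True))
    # A capturing group makes re.split keep each matched separator,
    # so it can be re-attached to the text that precedes it.
    parts = re.split(f'({pattern})', text)
    segments = []
    for chunk, sep in zip(parts[0::2], parts[1::2] + ['']):
        merged = (chunk + sep).strip()
        if merged:
            segments.append(merged)
    return segments

print(split_segments("Hi, have you updated your NeMo? Yes. Sure!"))
# ['Hi, have you updated your NeMo?', 'Yes.', 'Sure!']
```

This reproduces the example from the note: separators stay attached to their segment instead of being stripped as in pre-2.5.0 behavior.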

nemo/collections/asr/metrics/bleu.py

Lines changed: 1 addition & 1 deletion

@@ -184,7 +184,7 @@ def update(
         tgt_lenths_cpu_tensor = targets_lengths.long().cpu()
         for idx, tgt_len in enumerate(tgt_lenths_cpu_tensor):
             target = targets_cpu_tensor[idx][:tgt_len].tolist()
-            reference = self.decoding.decode_tokens_to_str(target)
+            reference = self.decoding.decode_ids_to_str(target)
             tok = tokenizers[idx] if tokenizers else None  # `None` arg uses default tokenizer

             # TODO: the backend implementation of this has a lot of cpu to gpu operations. Should reimplement

nemo/collections/asr/metrics/wer.py

Lines changed: 1 addition & 1 deletion

@@ -324,7 +324,7 @@ def update(
         for ind in range(targets_cpu_tensor.shape[0]):
             tgt_len = tgt_lenths_cpu_tensor[ind].item()
             target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist()
-            reference = self.decoding.decode_tokens_to_str(target)
+            reference = self.decoding.decode_ids_to_str(target)
             references.append(reference)
         hypotheses = (
             self.decode(predictions, predictions_lengths, predictions_mask, input_ids)
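Both metric updates follow the same pattern: slice each padded target row to its true length, then decode the IDs to a reference string. A minimal sketch with a toy stand-in for the decoding object (the `ToyDecoding` class and its vocabulary are hypothetical; only the `decode_ids_to_str` name comes from the diff):

```python
class ToyDecoding:
    """Hypothetical stand-in for the model's decoding object."""
    vocab = {0: 'hello', 1: 'world', 2: 'again'}

    def decode_ids_to_str(self, ids):
        # The real method maps token IDs back to text via the tokenizer.
        return ' '.join(self.vocab[i] for i in ids)

decoding = ToyDecoding()
# Padded batch of target IDs plus the true length of each row,
# mirroring the loop in the WER/BLEU update() methods.
targets = [[0, 1, 2], [0, 1, 0]]
target_lengths = [3, 2]

references = []
for row, tgt_len in zip(targets, target_lengths):
    references.append(decoding.decode_ids_to_str(row[:tgt_len]))

print(references)  # ['hello world again', 'hello world']
```

Slicing with `[:tgt_len]` is what keeps padding tokens out of the reference text.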

nemo/collections/asr/models/aed_multitask_models.py

Lines changed: 100 additions & 18 deletions

@@ -18,6 +18,7 @@
 from dataclasses import dataclass, field
 from math import ceil
 from typing import Any, Dict, List, Optional, Union
+
 import numpy as np
 import torch
 from lightning.pytorch import Trainer
@@ -40,14 +41,18 @@
 from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
 from nemo.collections.asr.parts.submodules.token_classifier import TokenClassifier
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
-from nemo.collections.asr.parts.utils.timestamp_utils import process_aed_timestamp_outputs
+from nemo.collections.asr.parts.utils.timestamp_utils import (
+    get_forced_aligned_timestamps_with_external_model,
+    process_aed_timestamp_outputs,
+)
 from nemo.collections.common import tokenizers
 from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config
 from nemo.collections.common.metrics import GlobalAverageLossMetric
 from nemo.collections.common.parts import transformer_weights_init
 from nemo.collections.common.parts.preprocessing.manifest import get_full_path
 from nemo.collections.common.prompts.formatter import PromptFormatter
 from nemo.core.classes.common import typecheck
+from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
 from nemo.core.neural_types import (
     AudioSignal,
     ChannelType,
@@ -59,6 +64,7 @@
     SpectrogramType,
 )
 from nemo.utils import logging, model_utils
+from nemo.utils.app_state import AppState

 __all__ = ['EncDecMultiTaskModel']

@@ -241,6 +247,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         # Setup encoder adapters (from ASRAdapterModelMixin)
         self.setup_adapters()

+        timestamps_asr_model = self.__restore_timestamps_asr_model()
+        # Using object.__setattr__ to bypass PyTorch's module registration
+        object.__setattr__(self, 'timestamps_asr_model', timestamps_asr_model)
+
     def change_decoding_strategy(self, decoding_cfg: DictConfig):
         """
         Changes decoding strategy used during Multi Task decoding process.
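The `object.__setattr__` call in `__init__` keeps the helper model out of the parent module's registered state, because `torch.nn.Module` intercepts normal attribute assignment. The mechanism can be shown without torch, using a stand-in class (`Registry` here is hypothetical, purely to illustrate the bypass):

```python
class Registry:
    """Minimal stand-in for torch.nn.Module's attribute interception."""

    def __init__(self):
        object.__setattr__(self, '_registered', {})

    def __setattr__(self, name, value):
        # Normal assignment is intercepted and the value is recorded
        # (nn.Module does something similar for parameters and submodules).
        self._registered[name] = value
        object.__setattr__(self, name, value)

m = Registry()
m.tracked = 'submodule'                     # intercepted: gets registered
object.__setattr__(m, 'hidden', 'helper')   # bypasses interception entirely

print('tracked' in m._registered)  # True
print('hidden' in m._registered)   # False
print(m.hidden)                    # helper
```

In the diff this keeps `timestamps_asr_model` from being saved and checkpointed as a child of the main model.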
@@ -518,16 +528,18 @@ def transcribe(
             as paths2audio_files
         """
         if timestamps is not None:
-            # TODO: Handle this key gracefully later
-
-            if timestamps is True:
-                timestamps = 'yes'
-            elif timestamps is False:
-                timestamps = 'no'
+            if self.timestamps_asr_model is None:
+                # TODO: Handle this key gracefully later
+                if timestamps is True:
+                    timestamps = 'yes'
+                elif timestamps is False:
+                    timestamps = 'no'
+                else:
+                    timestamps = str(timestamps)
+                assert timestamps in ('yes', 'no', 'timestamp', 'notimestamp', '1', '0')
+                prompt['timestamp'] = timestamps
             else:
-                timestamps = str(timestamps)
-                assert timestamps in ('yes', 'no', 'timestamp', 'notimestamp', '1', '0')
-                prompt['timestamp'] = timestamps
+                prompt['timestamp'] = 'no'

         if override_config is None:
             trcfg = MultiTaskTranscriptionConfig(
@@ -538,6 +550,7 @@ def transcribe(
                 augmentor=augmentor,
                 verbose=verbose,
                 prompt=prompt,
+                timestamps=timestamps,
             )
         else:
             if not isinstance(override_config, MultiTaskTranscriptionConfig):
@@ -546,6 +559,7 @@ def transcribe(
                     f"but got {type(override_config)}"
                 )
             trcfg = override_config
+            trcfg.timestamps = timestamps

         return super().transcribe(audio=audio, override_config=trcfg)
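The boolean/string normalization performed in `transcribe()` can be isolated as a small helper. This sketches the same if/elif/else logic from the diff; `normalize_timestamp_flag` is a hypothetical name, not an actual NeMo function:

```python
def normalize_timestamp_flag(timestamps):
    # Booleans map to the prompt strings 'yes'/'no'; anything else is
    # stringified and validated against the accepted prompt values.
    if timestamps is True:
        timestamps = 'yes'
    elif timestamps is False:
        timestamps = 'no'
    else:
        timestamps = str(timestamps)
    assert timestamps in ('yes', 'no', 'timestamp', 'notimestamp', '1', '0')
    return timestamps

print(normalize_timestamp_flag(True))   # yes
print(normalize_timestamp_flag(0))      # 0
```

Per the diff, this normalization now only runs when no external timestamps model is attached; otherwise the prompt is forced to `'no'` and alignment is delegated to the external model.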

@@ -856,6 +870,9 @@ def _transcribe_on_begin(self, audio, trcfg: MultiTaskTranscriptionConfig):
             trcfg._internal.primary_language = self.tokenizer.langs[0]
             logging.debug(f"Transcribing with default setting of {trcfg._internal.primary_language}.")

+        if trcfg.timestamps and self.timestamps_asr_model is not None:
+            self.timestamps_asr_model.to(trcfg._internal.device)
+
     def _transcribe_input_manifest_processing(
         self, audio_files: List[str], temp_dir: str, trcfg: MultiTaskTranscriptionConfig
     ) -> Dict[str, Any]:
@@ -955,6 +972,7 @@ def _transcribe_forward(
             encoder_states=enc_states,
             encoder_mask=enc_mask,
             decoder_input_ids=decoder_input_ids,
+            batch=batch,
         )

     def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionConfig) -> GenericTranscriptionType:
@@ -976,6 +994,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo
         enc_states = outputs.pop('encoder_states')
         enc_mask = outputs.pop('encoder_mask')
         decoder_input_ids = outputs.pop('decoder_input_ids')
+        batch = outputs.pop('batch')

         del log_probs, encoded_len

@@ -988,10 +1007,19 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo

         del enc_states, enc_mask, decoder_input_ids

-        hypotheses = process_aed_timestamp_outputs(
-            hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride']
-        )
-
+        if trcfg.timestamps and self.timestamps_asr_model is not None:
+            hypotheses = get_forced_aligned_timestamps_with_external_model(
+                audio=[audio.squeeze()[:audio_len] for audio, audio_len in zip(batch.audio, batch.audio_lens)],
+                batch_size=len(batch.audio),
+                external_ctc_model=self.timestamps_asr_model,
+                main_model_predictions=hypotheses,
+                timestamp_type=['word', 'segment'],
+                viterbi_device=trcfg._internal.device,
+            )
+        elif trcfg.timestamps:
+            hypotheses = process_aed_timestamp_outputs(
+                hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride']
+            )
         return hypotheses

     def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
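When no external model is attached, `process_aed_timestamp_outputs` receives the encoder subsampling factor and the preprocessor window stride; together these convert encoder frame indices into seconds. A back-of-the-envelope sketch (the helper name `frame_to_seconds` and the default values are assumptions, not NeMo API):

```python
def frame_to_seconds(frame_idx, subsampling_factor=8, window_stride=0.01):
    # One encoder frame spans `subsampling_factor` feature frames,
    # each `window_stride` seconds long, so time = frame * factor * stride.
    return frame_idx * subsampling_factor * window_stride

# e.g. encoder frame 125 with 8x subsampling and a 10 ms stride:
print(frame_to_seconds(125))  # 10.0
```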
@@ -1062,6 +1090,7 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult
         # This method is a legacy helper for Canary that checks whether prompt slot values were provided
         # in the input manifest and if not, it injects the defaults.
         out_json_items = []
+        timestamps_required = False
         for item in json_items:
             if isinstance(item, str):
                 # assume it is a path to audio file
@@ -1099,7 +1128,21 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult
                 if k not in entry:
                     # last-chance fallback injecting legacy Canary defaults if none were provided.
                     entry[k] = default_turn.get(k, dv)
+                if k == "timestamp":
+                    if (
+                        str(entry[k]).lower() not in ['notimestamp', "no", "false", "0"]
+                        and self.timestamps_asr_model is not None
+                    ):
+                        timestamps_required = True
+                        entry[k] = 'notimestamp'
             out_json_items.append(entry)
+
+        if timestamps_required:
+            trcfg.timestamps = True
+            logging.warning(
+                "Timestamps are enabled for at least one of the input items. "
+                "Setting timestamps to True for all the input items, as the current model is using external ASR model for alignment."
+            )
         return out_json_items

     @classmethod
@@ -1113,7 +1156,12 @@ def get_transcribe_config(cls) -> MultiTaskTranscriptionConfig:
         return MultiTaskTranscriptionConfig()

     def predict_step(
-        self, batch: PromptedAudioToTextMiniBatch, batch_idx=0, dataloader_idx=0, has_processed_signal=False
+        self,
+        batch: PromptedAudioToTextMiniBatch,
+        batch_idx=0,
+        dataloader_idx=0,
+        has_processed_signal=False,
+        timestamps=False,
     ):
         if has_processed_signal:
             processed_signal = batch.audio
@@ -1140,9 +1188,10 @@ def predict_step(
             return_hypotheses=False,
         )

-        hypotheses = process_aed_timestamp_outputs(
-            hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride']
-        )
+        if timestamps and self.timestamps_asr_model is None:
+            hypotheses = process_aed_timestamp_outputs(
+                hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride']
+            )

         if batch.cuts:
             return list(zip(batch.cuts, hypotheses))
@@ -1182,6 +1231,39 @@ def oomptimizer_schema(self) -> dict:
             ],
         }

+    def __restore_timestamps_asr_model(self):
+        """
+        This method is used to restore the external timestamp ASR model that will be used for forced alignment in `.transcribe()`.
+        The config and weights are expected to be in the main .nemo file and be named `timestamps_asr_model_config.yaml` and `timestamps_asr_model_weights.ckpt` respectively.
+        """
+        app_state = AppState()
+        model_restore_path = app_state.model_restore_path
+
+        if not model_restore_path:
+            return None
+
+        save_restore_connector = SaveRestoreConnector()
+
+        filter_fn = lambda name: "timestamps_asr_model" in name
+        members = save_restore_connector._filtered_tar_info(model_restore_path, filter_fn=filter_fn)
+
+        if not members:
+            return None
+
+        try:
+            save_restore_connector.model_config_yaml = "timestamps_asr_model_config.yaml"
+            save_restore_connector.model_weights_ckpt = "timestamps_asr_model_weights.ckpt"
+            external_timestamps_model = ASRModel.restore_from(
+                model_restore_path, save_restore_connector=save_restore_connector
+            )
+            external_timestamps_model.eval()
+        except Exception as e:
+            raise RuntimeError(
+                f"Error restoring external timestamps ASR model with timestamps_asr_model_config.yaml and timestamps_asr_model_weights.ckpt: {e}"
+            )
+
+        return external_timestamps_model
+

 def parse_multitask_prompt(prompt: dict | None) -> list[dict]:
     if prompt is None or not prompt:
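`__restore_timestamps_asr_model` first checks whether the `.nemo` archive (a tar file) contains any member whose name mentions the nested model. That filtering step can be illustrated with the standard library alone; the archive contents below are fabricated stubs, and `_filtered_tar_info` itself is NeMo-internal:

```python
import io
import tarfile

# A .nemo checkpoint is a tar archive; the nested timestamps model is
# detected by filtering member names (mirroring _filtered_tar_info's role).
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w') as tar:
    for name in ('model_config.yaml',
                 'timestamps_asr_model_config.yaml',
                 'timestamps_asr_model_weights.ckpt'):
        data = b'stub'  # placeholder bytes standing in for real artifacts
        info = tarfile.TarInfo(name)
        info.size = len(data)
        tar.addfile(info, io.BytesIO(data))

buf.seek(0)
filter_fn = lambda name: 'timestamps_asr_model' in name
with tarfile.open(fileobj=buf, mode='r') as tar:
    members = [m.name for m in tar.getmembers() if filter_fn(m.name)]

print(members)
# ['timestamps_asr_model_config.yaml', 'timestamps_asr_model_weights.ckpt']
```

If the filter finds no matching members, the method returns `None` and the model falls back to its built-in timestamp processing.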

nemo/collections/asr/models/k2_aligner_model.py

Lines changed: 24 additions & 17 deletions

@@ -81,7 +81,8 @@ def _init_ctc_alignment_specific(self, cfg: DictConfig):
             self.graph_decoder.split_batch_size = self.decode_batch_size
         else:
             self.graph_decoder = ViterbiDecoderWithGraph(
-                num_classes=self.blank_id, split_batch_size=self.decode_batch_size,
+                num_classes=self.blank_id,
+                split_batch_size=self.decode_batch_size,
             )
         # override decoder args if a config is provided
         decoder_module_cfg = cfg.get("decoder_module_cfg", None)
@@ -119,16 +120,18 @@ def _init_rnnt_alignment_specific(self, cfg: DictConfig):

         from nemo.collections.asr.parts.k2.utils import apply_rnnt_prune_ranges, get_uniform_rnnt_prune_ranges

-        self.prepare_pruned_outputs = lambda encoder_outputs, encoded_len, decoder_outputs, transcript_len: apply_rnnt_prune_ranges(
-            encoder_outputs,
-            decoder_outputs,
-            get_uniform_rnnt_prune_ranges(
-                encoded_len,
-                transcript_len,
-                self.predictor_window_size + 1,
-                self.predictor_step_size,
-                encoder_outputs.size(1),
-            ).to(device=encoder_outputs.device),
+        self.prepare_pruned_outputs = (
+            lambda encoder_outputs, encoded_len, decoder_outputs, transcript_len: apply_rnnt_prune_ranges(
+                encoder_outputs,
+                decoder_outputs,
+                get_uniform_rnnt_prune_ranges(
+                    encoded_len,
+                    transcript_len,
+                    self.predictor_window_size + 1,
+                    self.predictor_step_size,
+                    encoder_outputs.size(1),
+                ).to(device=encoder_outputs.device),
+            )
         )

         from nemo.collections.asr.parts.k2.classes import GraphModuleConfig
@@ -231,9 +234,9 @@ def _rnnt_joint_pruned(
     def _apply_prob_suppress(self, log_probs: torch.Tensor) -> torch.Tensor:
         """Multiplies probability of an element with index self.prob_suppress_index by self.prob_suppress_value times
         with stochasticity preservation of the log_probs tensor.
-
+
         Often used to suppress <blank> probability of the output of a CTC model.
-
+
         Example:
             For
             - log_probs = torch.log(torch.tensor([0.015, 0.085, 0.9]))
@@ -305,7 +308,7 @@ def _predict_impl_rnnt_argmax(
         # we have no token probabilities for the argmax rnnt setup
         token_prob = [1.0] * len(tokens)
         if self.word_output:
-            words = [w for w in self._model.decoding.decode_tokens_to_str(pred_ids).split(" ") if w != ""]
+            words = [w for w in self._model.decoding.decode_ids_to_str(pred_ids).split(" ") if w != ""]
             words, word_begin, word_len, word_prob = (
                 self._process_tokens_to_words(tokens, token_begin, token_len, token_prob, words)
                 if hasattr(self._model, "tokenizer")
@@ -411,7 +414,7 @@ def _process_char_with_space_to_words(
     def _results_to_ctmUnits(
         self, s_id: int, pred: torch.Tensor, prob: torch.Tensor
     ) -> Tuple[int, List['FrameCtmUnit']]:
-        """Transforms predictions with probabilities to a list of FrameCtmUnit objects,
+        """Transforms predictions with probabilities to a list of FrameCtmUnit objects,
         containing frame-level alignment information (label, start, duration, probability), for a given sample id.

         Alignment information can be either token-based (char, wordpiece, ...) or word-based.
@@ -440,7 +443,7 @@ def _results_to_ctmUnits(
                 for i, j in zip(non_blank_idx.tolist(), non_blank_idx[1:].tolist() + [len(pred)])
             ]
             if self.word_output:
-                words = wer_module.decode_tokens_to_str(pred_ids).split(" ")
+                words = wer_module.decode_ids_to_str(pred_ids).split(" ")
                 words, word_begin, word_len, word_prob = (
                     self._process_tokens_to_words(tokens, token_begin, token_len, token_prob, words)
                     if hasattr(self._model, "tokenizer")
@@ -539,7 +542,11 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0) -> List[Tuple[int, 'F

     @torch.no_grad()
     def transcribe(
-        self, manifest: List[str], batch_size: int = 4, num_workers: int = None, verbose: bool = True,
+        self,
+        manifest: List[str],
+        batch_size: int = 4,
+        num_workers: int = None,
+        verbose: bool = True,
     ) -> List['FrameCtmUnit']:
         """
         Does alignment. Use this method for debugging and prototyping.
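`_results_to_ctmUnits` turns frame-level predictions into (label, start, duration) units. A simplified sketch of that collapse for argmax CTC output — illustrative only; the real method also carries probabilities, token/word modes, and `FrameCtmUnit` objects:

```python
def frames_to_units(pred, blank_id=0):
    # Group consecutive equal frame labels; each run becomes a
    # (label, start_frame, duration) tuple. Blank runs are dropped.
    units = []
    start = 0
    for i in range(1, len(pred) + 1):
        if i == len(pred) or pred[i] != pred[start]:
            if pred[start] != blank_id:
                units.append((pred[start], start, i - start))
            start = i
    return units

print(frames_to_units([0, 0, 3, 3, 0, 5, 5, 5, 0]))
# [(3, 2, 2), (5, 5, 3)]
```

Durations in frames can then be scaled to seconds using the model's frame shift.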

nemo/collections/asr/parts/mixins/mixins.py

Lines changed: 6 additions & 6 deletions

@@ -16,7 +16,6 @@
 import os
 import shutil
 import tarfile
-import unicodedata
 from abc import ABC, abstractmethod
 from typing import List

@@ -29,6 +28,10 @@
 from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder
 from nemo.collections.asr.parts.utils import asr_module_utils
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
+from nemo.collections.asr.parts.utils.tokenizer_utils import (
+    extract_capitalized_tokens_from_vocab,
+    extract_punctuation_from_vocab,
+)
 from nemo.collections.common import tokenizers
 from nemo.utils import app_state, logging

@@ -482,11 +485,8 @@ def _extract_tokenizer_from_config(self, tokenizer_cfg: DictConfig, dir: str):
     def _derive_tokenizer_properties(self):
         vocab = self.tokenizer.tokenizer.get_vocab()

-        capitalized_tokens = {token.strip() for token in vocab if any(char.isupper() for char in token)}
-        self.tokenizer.supports_capitalization = bool(capitalized_tokens)
-
-        punctuation = {char for token in vocab for char in token if unicodedata.category(char).startswith('P')}
-        self.tokenizer.supported_punctuation = punctuation
+        self.tokenizer.supports_capitalization = bool(extract_capitalized_tokens_from_vocab(vocab))
+        self.tokenizer.supported_punctuation = extract_punctuation_from_vocab(vocab)


 class ASRModuleMixin(ASRAdapterModelMixin):
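The two helpers that replace the inline logic can be sketched directly from the code this diff removes; the actual implementations in `tokenizer_utils` may differ in details:

```python
import unicodedata

# Sketches based on the inline logic removed by this diff: capitalization
# support is any token containing an uppercase character, and supported
# punctuation is every Unicode punctuation character found in the vocab.
def extract_capitalized_tokens_from_vocab(vocab):
    return {token.strip() for token in vocab if any(ch.isupper() for ch in token)}

def extract_punctuation_from_vocab(vocab):
    return {ch for token in vocab for ch in token
            if unicodedata.category(ch).startswith('P')}

# Toy sentencepiece-style vocab ('▁' marks a word boundary)
vocab = {'▁Hello': 0, 'world': 1, '!': 2, '▁you': 3, "'s": 4}
print(sorted(extract_capitalized_tokens_from_vocab(vocab)))  # ['▁Hello']
print(sorted(extract_punctuation_from_vocab(vocab)))         # ['!', "'"]
```

Factoring these out lets other tokenizer-owning classes derive the same properties without duplicating the set comprehensions.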
