
Commit 8e9a986

Merge branch 'main' into vbataev/multi_biasing_models
# Conflicts:
#	nemo/collections/asr/inference/streaming/framing/request_options.py
2 parents fad3cad + af95c29 commit 8e9a986


9 files changed, +258 -45 lines changed


nemo/collections/asr/inference/model_wrappers/cache_aware_rnnt_inference_wrapper.py

Lines changed: 5 additions & 0 deletions
@@ -83,6 +83,7 @@ def execute_step(
         keep_all_outputs: bool,
         drop_left_context: int | None = None,
         valid_out_len: int | None = None,
+        prompt_vectors: Tensor | None = None,
     ) -> tuple[list[Hypothesis], CacheAwareContext]:
         """
         Executes a single streaming step.
@@ -95,6 +96,7 @@ def execute_step(
             keep_all_outputs: (bool) whether to keep all outputs or not.
             drop_left_context: (int | None) number of left context frames to drop.
             valid_out_len: (int | None) number of valid output frames.
+            prompt_vectors: (Tensor | None) Optional prompt vectors of shape [B, num_prompts].
         Returns:
             (tuple[list[Hypothesis], CacheAwareContext]) best hypothesis and new context.
         """
@@ -144,6 +146,7 @@ def stream_step(
         keep_all_outputs: bool = False,
         drop_left_context: int | None = None,
         valid_out_len: int | None = None,
+        prompt_vectors: Tensor | None = None,
     ) -> tuple[list[Hypothesis], CacheAwareContext]:
         """
         Executes a single streaming step.
@@ -156,6 +159,7 @@ def stream_step(
             keep_all_outputs: (bool) whether to keep all outputs or not.
             drop_left_context: (int | None) number of left context frames to drop.
             valid_out_len: (int | None) number of valid output frames.
+            prompt_vectors: (Tensor | None) Optional prompt vectors of shape [B, num_prompts].
         Returns:
             (tuple[list[Hypothesis], CacheAwareContext]) best hypothesis and new context.
         """
@@ -185,6 +189,7 @@ def stream_step(
             keep_all_outputs,
             drop_left_context,
             valid_out_len,
+            prompt_vectors,
         )

         return best_hyp, new_context
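For reference, the prompt_vectors tensor threaded through stream_step and execute_step is one one-hot row per stream, matching the [B, num_prompts] shape documented above. A minimal, runnable construction (batch size, prompt count, and dtype are illustrative, not read from any model):

    import torch

    # One one-hot prompt row per stream, as expected by stream_step.
    batch_size, num_prompts = 3, 16           # illustrative sizes
    prompt_indices = torch.tensor([0, 4, 4])  # per-stream prompt ids
    prompt_vectors = torch.nn.functional.one_hot(
        prompt_indices, num_classes=num_prompts
    ).to(torch.float32)                       # shape: [3, 16]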

nemo/collections/asr/inference/model_wrappers/rnnt_inference_wrapper.py

Lines changed: 36 additions & 4 deletions
@@ -71,12 +71,16 @@ def get_subsampling_factor(self) -> int:
         """
         return self.asr_model.encoder.subsampling_factor

-    def encode(self, processed_signal: Tensor, processed_signal_length: Tensor) -> tuple[Tensor, Tensor]:
+    def encode(
+        self, processed_signal: Tensor, processed_signal_length: Tensor, prompt_vectors: Tensor | None = None
+    ) -> tuple[Tensor, Tensor]:
         """
         Get encoder output from the model. It is used for streaming inference.
         Args:
             processed_signal: (Tensor) processed signal. Shape is torch.Size([B, C, T]).
             processed_signal_length: (Tensor) processed signal length. Shape is torch.Size([B]).
+            prompt_vectors: (Tensor | None) Optional prompt vectors for multilingual models.
+                Shape can be torch.Size([B, num_prompts]) or torch.Size([B, T_enc, num_prompts]) if already expanded.
         Returns:
             (tuple[Tensor, Tensor]) encoder output and encoder output length of shape torch.Size([B, T, D]), torch.Size([B]).
         """
@@ -92,9 +96,15 @@ def encode(self, processed_signal: Tensor, processed_signal_length: Tensor) -> t
             torch.no_grad(),
         ):

-            forward_outs = self.asr_model(
-                processed_signal=processed_signal.to(self.cast_dtype), processed_signal_length=processed_signal_length
-            )
+            # Prepare model arguments
+            model_args = {
+                'processed_signal': processed_signal.to(self.cast_dtype),
+                'processed_signal_length': processed_signal_length,
+            }
+            if prompt_vectors is not None:
+                model_args['prompt'] = prompt_vectors
+
+            forward_outs = self.asr_model(**model_args)

         encoded, encoded_len = forward_outs
         return encoded, encoded_len
@@ -113,3 +123,25 @@ def decode(self, encoded: Tensor, encoded_len: Tensor, partial_hypotheses: list)
             encoded.to(self.cast_dtype), encoded_len, return_hypotheses=True, partial_hypotheses=partial_hypotheses
         )
         return best_hyp
+
+    def encode_with_prompts(
+        self, processed_signal: Tensor, processed_signal_length: Tensor, prompt_vectors: Tensor
+    ) -> tuple[Tensor, Tensor]:
+        """
+        Convenience wrapper for prompt-enabled encoding.
+        Expands prompt vectors across the time dimension before calling encode.
+        Args:
+            processed_signal: (Tensor) processed signal. Shape is torch.Size([B, C, T]).
+            processed_signal_length: (Tensor) processed signal length. Shape is torch.Size([B]).
+            prompt_vectors: (Tensor) prompt vectors. Shape is torch.Size([B, num_prompts]).
+        Returns:
+            (tuple[Tensor, Tensor]) encoder output and encoder output length.
+        """
+        encoder_time_steps = processed_signal.shape[2] // self.get_subsampling_factor()
+        # Expand prompts: [B, num_prompts] -> [B, T_enc, num_prompts]
+        prompt_vectors = prompt_vectors.unsqueeze(1).expand(-1, encoder_time_steps, -1)
+        return self.encode(
+            processed_signal=processed_signal,
+            processed_signal_length=processed_signal_length,
+            prompt_vectors=prompt_vectors,
+        )
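The expansion performed by encode_with_prompts can be checked in isolation; the snippet below reproduces the unsqueeze/expand step with made-up sizes (a subsampling factor of 8 is an assumption for illustration, not read from any checkpoint):

    import torch

    B, num_prompts, subsampling_factor, T = 2, 8, 8, 1600  # illustrative sizes
    prompt_vectors = torch.nn.functional.one_hot(
        torch.tensor([0, 3]), num_classes=num_prompts
    ).float()                                  # [B, num_prompts]
    encoder_time_steps = T // subsampling_factor
    # [B, num_prompts] -> [B, T_enc, num_prompts]; expand is a view, no data copy
    expanded = prompt_vectors.unsqueeze(1).expand(-1, encoder_time_steps, -1)
    print(expanded.shape)                      # torch.Size([2, 200, 8])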

nemo/collections/asr/inference/pipelines/base_pipeline.py

Lines changed: 86 additions & 0 deletions
@@ -21,7 +21,9 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Iterable

+import torch
 from omegaconf import DictConfig
+from torch import Tensor

 from nemo.collections.asr.inference.model_wrappers.asr_inference_wrapper import ASRInferenceWrapper
 from nemo.collections.asr.inference.pipelines.pipeline_interface import PipelineInterface
@@ -481,6 +483,90 @@ def init_context_manager(self) -> None:
             cache_aware_model=self.asr_model, num_slots=self.num_slots, use_cache=self.use_cache
         )

+    def init_prompt_support(self) -> None:
+        """Initialize prompt support for multilingual models."""
+        self.prompt_enabled = hasattr(self.asr_model.asr_model, 'concat') and self.asr_model.asr_model.concat
+
+        if self.prompt_enabled:
+            self._prompt_config = self._load_prompt_config()
+
+    def _load_prompt_config(self) -> dict:
+        """
+        Load prompt configuration from model.
+        Returns:
+            (dict) Prompt configuration containing num_prompts, prompt_dict, and compute_dtype.
+        """
+        cfg = self.asr_model.asr_model.cfg
+        if cfg and hasattr(cfg, 'model_defaults'):
+            model_defaults = cfg.model_defaults
+            num_prompts = model_defaults.get('num_prompts', None)
+            prompt_dict = model_defaults.get('prompt_dictionary', None)
+
+            # Validate and convert types once
+            num_prompts_int = int(num_prompts) if num_prompts is not None else 0
+
+            is_dict_like = isinstance(prompt_dict, dict) or (
+                hasattr(prompt_dict, 'get') and hasattr(prompt_dict, '__contains__')
+            )
+
+            if num_prompts_int > 0 and is_dict_like:
+                return {
+                    'num_prompts': num_prompts_int,
+                    'prompt_dict': prompt_dict,
+                    'compute_dtype': getattr(self.asr_model.asr_model, 'dtype', torch.float32),
+                }
+
+        return {}
+
+    def _resolve_prompt_index(self, language_code: str) -> int:
+        """
+        Resolve language_code to a strict prompt index; raise if invalid.
+        Args:
+            language_code: (str) Language code to resolve (e.g., "en-US", "es-ES").
+        Returns:
+            (int) Prompt index corresponding to the language code.
+        Raises:
+            RuntimeError: If prompt configuration is missing.
+            ValueError: If language_code is not found in prompt dictionary.
+        """
+        if not hasattr(self, '_prompt_config') or not self._prompt_config:
+            raise RuntimeError("Prompt configuration is missing for a prompt-enabled model.")
+        prompt_dict = self._prompt_config['prompt_dict']
+        lang_index = prompt_dict.get(language_code, None)
+        if lang_index is None:
+            raise ValueError(
+                f"Language code '{language_code}' not found in prompt dictionary. "
+                f"Available languages: {list(prompt_dict.keys())}"
+            )
+        return lang_index
+
+    def _create_one_hot_prompts(self, indices: Tensor) -> Tensor:
+        """
+        Create one-hot prompt vectors from indices.
+        Args:
+            indices: (Tensor) Prompt indices of shape [B].
+        Returns:
+            (Tensor) One-hot prompt vectors of shape [B, num_prompts].
+        """
+        num_prompts = self._prompt_config['num_prompts']
+        return torch.nn.functional.one_hot(indices, num_classes=num_prompts).to(self._prompt_config['compute_dtype'])
+
+    def _build_prompt_vectors(self, states: list) -> Tensor:
+        """
+        Build prompt vectors for a batch of states using one-hot encoding.
+        Args:
+            states: (list) List of streaming states.
+        Returns:
+            (Tensor) Prompt vectors of shape [B, num_prompts].
+        Raises:
+            ValueError: If any prompt index is out of range.
+        """
+        indices = torch.tensor([getattr(s, 'prompt_idx', 0) for s in states], device=self.device, dtype=torch.long)
+        num_prompts = self._prompt_config['num_prompts']
+        if torch.any((indices < 0) | (indices >= num_prompts)):
+            raise ValueError("Found out-of-range prompt index in batch.")
+        return self._create_one_hot_prompts(indices)
+
     def run(
         self,
         audio_filepaths: list[str],
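Taken together, _resolve_prompt_index, _create_one_hot_prompts, and _build_prompt_vectors implement a resolve, range-check, one-hot path. A self-contained sketch of that path, using an illustrative prompt dictionary (real models ship their own prompt_dictionary in model_defaults):

    import torch

    prompt_dict = {"en-US": 0, "es-ES": 1, "de-DE": 2}  # illustrative mapping
    num_prompts = 3

    def resolve(code: str) -> int:
        # Mirrors _resolve_prompt_index: strict lookup, raise on unknown codes.
        idx = prompt_dict.get(code)
        if idx is None:
            raise ValueError(f"Language code '{code}' not found. Available: {list(prompt_dict)}")
        return idx

    indices = torch.tensor([resolve("es-ES"), resolve("en-US")], dtype=torch.long)
    # Mirrors the guard in _build_prompt_vectors.
    if torch.any((indices < 0) | (indices >= num_prompts)):
        raise ValueError("Found out-of-range prompt index in batch.")
    one_hot = torch.nn.functional.one_hot(indices, num_classes=num_prompts).float()
    print(one_hot)  # tensor([[0., 1., 0.], [1., 0., 0.]])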

nemo/collections/asr/inference/pipelines/buffered_rnnt_pipeline.py

Lines changed: 59 additions & 9 deletions
@@ -67,6 +67,7 @@ def __init__(
         """

         self.copy_asr_model_attributes(asr_model)
+        self.init_prompt_support()
         self.init_parameters(cfg)
         self.init_bufferer_for_buffered_streaming()
         self.conf_func, self.confidence_aggregator = get_confidence_utils(cfg.confidence)
@@ -196,9 +197,24 @@ def init_zero_enc(self) -> Tensor:
             buffer_lens=torch.tensor([zero_buffer.shape[1]], device=self.device),
             expected_feature_buffer_len=self.expected_feature_buffer_len,
         )
-        zero_encoded, _ = self.asr_model.encode(
-            processed_signal=zero_features, processed_signal_length=zero_features_len
-        )
+
+        if self.prompt_enabled:
+            # Use "en-US" as the default prompt for zero encoding
+            # This region is sliced out before decoding, so language choice doesn't matter
+            default_prompt_idx = self._resolve_prompt_index("en-US")
+            prompt_indices = torch.tensor([default_prompt_idx], device=self.device, dtype=torch.long)
+            prompt_vector = self._create_one_hot_prompts(prompt_indices)  # [1, num_prompts]
+
+            zero_encoded, _ = self.asr_model.encode_with_prompts(
+                processed_signal=zero_features,
+                processed_signal_length=zero_features_len,
+                prompt_vectors=prompt_vector,
+            )
+        else:
+            zero_encoded, _ = self.asr_model.encode(
+                processed_signal=zero_features, processed_signal_length=zero_features_len
+            )
+
         return zero_encoded[0]

     def create_state(self, options: ASRRequestOptions) -> RNNTStreamingState:
@@ -219,8 +235,18 @@ def create_state(self, options: ASRRequestOptions) -> RNNTStreamingState:
             default_target_language=self.nmt_model.target_language if self.nmt_enabled else None,
             default_stop_history_eou=self.stop_history_eou_in_milliseconds,
             default_asr_output_granularity=self.asr_output_granularity,
+            default_language_code="en-US" if self.prompt_enabled else None,
         )
         state.set_options(new_options)
+
+        # Create per-stream prompt index for prompt-enabled models
+        if self.prompt_enabled:
+            lang_code = getattr(new_options, "language_code", None)
+            if not isinstance(lang_code, str) or len(lang_code) == 0:
+                raise ValueError("Prompt-enabled model requires a valid language_code in request options.")
+            prompt_idx = self._resolve_prompt_index(lang_code)
+            state.set_prompt_index(prompt_idx)
+
         return state

     def get_sep(self) -> str:
@@ -304,9 +330,21 @@ def encode_raw_signals(
             expected_feature_buffer_len=self.expected_feature_buffer_len,
         )

-        encoded, encoded_len = self.asr_model.encode(
-            processed_signal=feature_buffers, processed_signal_length=feature_buffer_lens
-        )
+        # Build prompt vectors if prompts are enabled
+        if self.prompt_enabled:
+            requests_states = [self.get_state(f.stream_id) for f in frames]
+            prompt_vectors = self._build_prompt_vectors(requests_states)
+
+            # Use encode_with_prompts which handles dimension expansion
+            encoded, encoded_len = self.asr_model.encode_with_prompts(
+                processed_signal=feature_buffers,
+                processed_signal_length=feature_buffer_lens,
+                prompt_vectors=prompt_vectors,
+            )
+        else:
+            encoded, encoded_len = self.asr_model.encode(
+                processed_signal=feature_buffers, processed_signal_length=feature_buffer_lens
+            )
         encoded = encoded.clone()
         encoded_len = encoded_len.clone()

@@ -340,9 +378,21 @@ def encode_processed_signals(
         processed_signals = normalize_features(processed_signals, processed_signal_lengths)
         processed_signal_lengths = processed_signal_lengths.clamp(max=processed_signals.shape[2])

-        encoded, encoded_len = self.asr_model.encode(
-            processed_signal=processed_signals, processed_signal_length=processed_signal_lengths
-        )
+        # Build prompt vectors if prompts are enabled
+        if self.prompt_enabled:
+            requests_states = [self.get_state(f.stream_id) for f in fbuffers]
+            prompt_vectors = self._build_prompt_vectors(requests_states)
+
+            # Use encode_with_prompts which handles dimension expansion
+            encoded, encoded_len = self.asr_model.encode_with_prompts(
+                processed_signal=processed_signals,
+                processed_signal_length=processed_signal_lengths,
+                prompt_vectors=prompt_vectors,
+            )
+        else:
+            encoded, encoded_len = self.asr_model.encode(
+                processed_signal=processed_signals, processed_signal_length=processed_signal_lengths
+            )
        encoded = encoded.clone()
         encoded_len = encoded_len.clone()

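End to end, a caller of the buffered pipeline selects the language per request. A hypothetical usage sketch; only language_code is shown, and the pipeline construction and other option fields are omitted:

    # Hypothetical usage; assumes a constructed, prompt-enabled pipeline.
    options = ASRRequestOptions(language_code="es-ES")
    state = pipeline.create_state(options)   # resolves "es-ES" to a prompt index
    # Later, encode_raw_signals()/encode_processed_signals() rebuild one-hot
    # prompt vectors from the per-stream indices on every call and route them
    # through encode_with_prompts(); requests without an explicit language
    # fall back to the "en-US" default injected above.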

nemo/collections/asr/inference/pipelines/cache_aware_rnnt_pipeline.py

Lines changed: 16 additions & 0 deletions
@@ -64,6 +64,7 @@ def __init__(
             nmt_model: (LLMTranslator | None) LLM based translation model.
         """
         self.copy_asr_model_attributes(asr_model)
+        self.init_prompt_support()
         self.init_parameters(cfg)
         self.init_context_manager()
         self.init_bufferer_for_cache_aware_streaming()
@@ -187,6 +188,7 @@ def create_state(self, options: ASRRequestOptions) -> CacheAwareRNNTStreamingSta
             default_target_language=self.nmt_model.target_language if self.nmt_enabled else None,
             default_stop_history_eou=self.stop_history_eou_in_milliseconds,
             default_asr_output_granularity=self.asr_output_granularity,
+            default_language_code="en-US" if self.prompt_enabled else None,
         )

         eou_label_buffer_size = 0
@@ -198,6 +200,15 @@ def create_state(self, options: ASRRequestOptions) -> CacheAwareRNNTStreamingSta
         state.setup_label_buffer(eou_label_buffer_size, self.blank_id)
         state.set_previous_hypothesis(None)
         state.set_options(new_options)
+
+        # Create per-stream prompt index for prompt-enabled models
+        if self.prompt_enabled:
+            lang_code = getattr(new_options, "language_code", None)
+            if not isinstance(lang_code, str) or len(lang_code) == 0:
+                raise ValueError("Prompt-enabled model requires a valid language_code in request options.")
+            prompt_idx = self._resolve_prompt_index(lang_code)
+            state.set_prompt_index(prompt_idx)
+
         return state

     def get_sep(self) -> str:
@@ -291,6 +302,10 @@ def cache_aware_transcribe_step(
         previous_hypotheses = [state.get_previous_hypothesis() for state in states]
         context, mapping = self.context_manager.get_context(stream_ids)

+        prompt_vectors = None
+        if self.prompt_enabled:
+            prompt_vectors = self._build_prompt_vectors(states)
+
         drop_extra_pre_encoded = 0 if not self.use_cache else self.asr_model.drop_extra_pre_encoded
         best_hyp, new_context = self.asr_model.stream_step(
             processed_signal=feature_buffers,
@@ -301,6 +316,7 @@ def cache_aware_transcribe_step(
             keep_all_outputs=keep_all_outputs,
             drop_left_context=self.drop_left_context,
             valid_out_len=self.valid_out_len,
+            prompt_vectors=prompt_vectors,
        )

         # update the cache and reset the cache slots for the streams that has ended
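Because prompt vectors are rebuilt from per-stream state on every cache-aware step, a single batch can mix languages; each stream contributes its own one-hot row. A runnable illustration (the language set and stream count are made up):

    import torch

    prompt_dict = {"en-US": 0, "es-ES": 1}      # illustrative
    stream_langs = ["en-US", "es-ES", "en-US"]  # one entry per active stream
    indices = torch.tensor([prompt_dict[c] for c in stream_langs])
    prompt_vectors = torch.nn.functional.one_hot(indices, num_classes=2).float()
    print(prompt_vectors)  # rows differ per stream: [[1,0],[0,1],[1,0]]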

nemo/collections/asr/inference/streaming/framing/request_options.py

Lines changed: 5 additions & 0 deletions
@@ -31,6 +31,7 @@ class ASRRequestOptions:
     enable_pnc: bool = None
     stop_history_eou: int = None
     asr_output_granularity: ASROutputGranularity | str = None
+    language_code: str | None = None
     enable_nmt: bool = None
     source_language: str = None
     target_language: str = None
@@ -82,6 +83,7 @@ def augment_with_defaults(
         default_target_language: str,
         default_stop_history_eou: int,
         default_asr_output_granularity: ASROutputGranularity | str,
+        default_language_code: str | None = None,
         biasing_cfg: BiasingRequestItemConfig | None = None,
     ) -> "ASRRequestOptions":
         """
@@ -94,6 +96,7 @@ def augment_with_defaults(
             default_target_language (str): Default target language.
             default_stop_history_eou (int): Default stop history EOU.
             default_asr_output_granularity (ASROutputGranularity | str): Default output granularity.
+            default_language_code (str | None): Default language code for prompt-enabled models.
             biasing_cfg: Default biasing config or None
         Returns:
             ASRRequestOptions: Augmented options.
@@ -113,6 +116,7 @@ def augment_with_defaults(

         stop_history_eou = self._with_default(self.stop_history_eou, default_stop_history_eou)
         granularity = self._with_default(self.asr_output_granularity, default_asr_output_granularity)
+        language_code = self._with_default(self.language_code, default_language_code)

         return ASRRequestOptions(
             enable_itn=enable_itn,
@@ -122,6 +126,7 @@ def augment_with_defaults(
             target_language=target_language,
             stop_history_eou=stop_history_eou,
             asr_output_granularity=granularity,
+            language_code=language_code,
             biasing_cfg=self.biasing_cfg or biasing_cfg,
         )

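The defaulting added here follows the existing _with_default pattern. A stand-in sketch of the assumed semantics (keep the per-request value when set, otherwise fall back to the pipeline default):

    # Stand-in for ASRRequestOptions._with_default; the None-check semantics
    # are an assumption based on how the other option fields are defaulted.
    def with_default(value, default):
        return value if value is not None else default

    print(with_default(None, "en-US"))     # request left language_code unset -> "en-US"
    print(with_default("es-ES", "en-US"))  # explicit request value wins -> "es-ES"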
