Commit e503a6e

Authored by nune-tadevosyan, monica-sekoyan, nithinraok, and chtruong814
Initial Chunking (#14321)
Squashed commit history (repeated "Apply isort and black reformatting" commits and per-commit sign-off trailers collapsed):

* adding nfa to canary; remove comments; modify external model loading
* fix audio padding; handle non-possible alignment; add offset refinement
* Initial Chunking; adding comments and docstrings; changes in docstrings
* updates to the algorithm; update with timestamps; remove join_text; remove pdb
* adjust timestamps; support for long audio; refactoring to keep model clean
* removing changes from mixin; back to main for mixin; fix for hypotheses (reverted and re-applied)
* allowing user to control chunking; doc changes; forcing true for chunking
* handle merge case for timestamps; add timestamp_type; add timestamps support for chunked inference
* refactor CTC timestamps to use utils; correct restore_token_cased with unk_token
* use timestamp utils in rnnt_decoding; change external timestamps ASR model loading
* add forced-alignment method tests; modify nfa to match new setup and utils; update nfa docs
* remove unused imports and errors; address review comments, linting and flake8
* handle decode_ids_to_str change; correct usage of decode_tokens_to_str
* revert jupyter settings; merge and tests; unit tests; tests fix
* doc updates; doc change for speech_to_text_aed_chunked_infer; copyright
* correct description; make private; rewrite restore_timestamps_asr_model
* update timestamps; fix word offset logic; tests update after the fix; cases for monotonicity
* Increase L0_Unit_Tests_GPU_ASR timeout to 30

Signed-off-by: Monica Sekoyan <msekoyan@nvidia.com>
Signed-off-by: Nune <ntadevosyan@nvidia.com>
Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Co-authored-by: Monica Sekoyan <msekoyan@nvidia.com>
Co-authored-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
Co-authored-by: nune-tadevosyan <nune-tadevosyan@users.noreply.github.com>
Co-authored-by: nithinraok <nithinrao.koluguri@gmail.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
1 parent 454fabc commit e503a6e

File tree: 11 files changed (+889 −40 lines)

.github/workflows/cicd-main-speech.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
       include:
         - script: L0_Unit_Tests_GPU_ASR
           runner: self-hosted-azure-gpus-1
-          timeout: 20
+          timeout: 30
         - script: L0_Unit_Tests_CPU_ASR
           runner: self-hosted-azure-cpu
           cpu-only: true

examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py

Lines changed: 9 additions & 6 deletions
@@ -17,17 +17,16 @@
 seconds and performs inference on each
 segment individually. The results are then concatenated to form the final output.
 
-Below is an example of how to run this script with the Canary-1b model.
+Below is an example of how to run this script with the Canary-1b-v2 model.
 It's recommended to use manifest input, otherwise the model will perform English ASR
 with punctuations and capitalizations.
 An example manifest line:
 {
     "audio_filepath": "/path/to/audio.wav",  # path to the audio file
     "duration": 10000.0,  # duration of the audio
     "taskname": "asr",  # use "s2t_translation" for AST
-    "source_lang": "en",  # Set `source_lang`==`target_lang` for ASR, choices=['en','de','es','fr']
-    "target_lang": "de",  # choices=['en','de','es','fr']
-    "pnc": "yes",  # whether to have PnC output, choices=['yes', 'no']
+    "source_lang": "en",  # Set `source_lang`==`target_lang` for ASR. Currently supported for 25 EU languages.
+    "target_lang": "de",  # See https://huggingface.co/nvidia/canary-1b-v2
 }
 
 Example Usage:
@@ -41,8 +40,12 @@
     batch_size=16 \
     decoding.beam.beam_size=1
 
-To return word and segment level timestamps, add `timestamps=True` to the above command,
-and set `chunk_len_in_secs=10.0` for best results.
+To return word and segment level timestamps, add `timestamps=True` to the above command.
+
+Note: Canary-1b-v2 supports long-form inference via the `.transcribe()` method.
+It will use dynamic chunking with overlapping windows for better performance.
+This behavior is enabled automatically for long-form inference when transcribing a single
+audio file or when batch_size is set to 1.
 
 """
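The docstring above describes the script's fixed-length strategy: cut the audio into consecutive segments, run inference on each, and concatenate the results. A minimal sketch of computing such segment boundaries (the function name and the `chunk_len_in_secs` default are illustrative, not the script's actual internals):

```python
def chunk_boundaries(duration_s: float, chunk_len_in_secs: float = 40.0):
    """Return (start_s, end_s) windows covering [0, duration_s] back to back."""
    boundaries = []
    start = 0.0
    while start < duration_s:
        end = min(start + chunk_len_in_secs, duration_s)
        boundaries.append((start, end))
        start = end
    return boundaries

# A 100 s file with 40 s chunks yields two full windows and one 20 s remainder.
print(chunk_boundaries(100.0, 40.0))  # [(0.0, 40.0), (40.0, 80.0), (80.0, 100.0)]
```

The per-segment transcripts are then joined in order to form the final output.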

nemo/collections/asr/data/audio_to_text_lhotse_prompted.py

Lines changed: 110 additions & 2 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Callable, Optional, Union
+from typing import Optional, Union
 
 import torch.utils.data
 from lhotse import CutSet
@@ -21,7 +21,7 @@
 from lhotse.dataset.collation import collate_vectors
 
 from nemo.collections.common.data import apply_prompt_format_fn
-from nemo.collections.common.prompts import CanaryPromptFormatter, PromptFormatter
+from nemo.collections.common.prompts import PromptFormatter
 from nemo.collections.common.tokenizers import TokenizerSpec
 
 
@@ -61,22 +61,43 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset):
     Tokenized utterances will be extended with special prompt tokens according to ``prompt_format_fn`` logic.
     We support cuts with multiple supervision segments -- their tokenized texts will be concatenated before we add the prompt tokens.
     This is useful, for example, in code-switched scenarios where each segment is spoken in a different language.
+
+    Chunking:
+        If `enable_chunking` is True, each audio sample is split into optimally sized chunks
+        (see `find_optimal_chunk_size` and `chunk_waveform`). This is useful for long audio inputs,
+        allowing the model to process them in manageable segments.
     """
 
     def __init__(
         self,
         tokenizer: TokenizerSpec,
         prompt: PromptFormatter,
+        enable_chunking: bool = False,
     ):
         super().__init__()
         self.tokenizer = tokenizer
         self.load_audio = AudioSamples(fault_tolerant=True)
         self.padding_value = self.tokenizer.pad_id
         self.prompt = prompt
+        self.enable_chunking = enable_chunking
 
     def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch:
         audio, audio_lens, cuts = self.load_audio(cuts)
 
+        # Will work if batch_size is set to 1.
+        if self.enable_chunking:
+            # If dynamic chunking is enabled, split each audio sample into chunks.
+            new_audio = []
+            new_audio_lens = []
+            for i in range(audio.shape[0]):
+                waveform = audio[i, : audio_lens[i]]
+                # Split the waveform into chunks and get their lengths.
+                chunks, chunk_lens = self._chunk_waveform(waveform)
+                new_audio.extend(chunks)
+                new_audio_lens.extend(chunk_lens)
+            # Stack all chunks into a batch.
+            audio = torch.stack(new_audio)
+            audio_lens = torch.tensor(new_audio_lens, dtype=torch.long)
         # Fast-path: the tokenization and prompt formatting was already done before sampling.
         attrs = ("input_ids", "context_ids", "answer_ids")
         pre_formatted = all(hasattr(c, a) for c in cuts for a in attrs)
@@ -110,6 +131,93 @@ def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple
         tokens = collate_vectors(tokens, padding_value=self.padding_value)
         return tokens, token_lens
 
+    def _find_optimal_chunk_size(
+        self, total_len: int, min_sec: int = 30, max_sec: int = 40, sample_rate: int = 16000, overlap_sec: float = 1.0
+    ) -> int:
+        """
+        Find the optimal chunk size for audio processing that minimizes padding of the last chunk.
+
+        Args:
+            total_len (int): Total length of the audio waveform in samples
+            min_sec (int, optional): Minimum chunk size in seconds. Defaults to 30.
+            max_sec (int, optional): Maximum chunk size in seconds. Defaults to 40.
+            sample_rate (int, optional): Audio sample rate in Hz. Defaults to 16000.
+            overlap_sec (float, optional): Overlap duration between consecutive chunks in seconds.
+                Defaults to 1.0.
+
+        Returns:
+            int: Optimal chunk size in samples that maximizes the last chunk length
+        """
+        best_chunk_size = min_sec * sample_rate
+        best_last_chunk_len = 0
+        if total_len < max_sec * sample_rate:
+            return total_len
+        # Try each possible chunk duration in the range
+        for sec in range(min_sec, max_sec + 1):
+            chunk_size = sec * sample_rate
+            overlap_size = int(overlap_sec * sample_rate)
+            step_size = chunk_size - overlap_size
+
+            if step_size <= 0:  # Invalid overlap
+                continue
+            if chunk_size > total_len:
+                continue
+
+            # Calculate how many chunks we'd need and the last chunk's length
+            n_chunks = (total_len + step_size - 1) // step_size
+            last_chunk_len = total_len - step_size * (n_chunks - 1)
+
+            if last_chunk_len > best_last_chunk_len:
+                best_last_chunk_len = last_chunk_len
+                best_chunk_size = chunk_size
+
+        return best_chunk_size
+
+    def _chunk_waveform(
+        self, waveform: torch.Tensor, chunk_size: int = None, overlap_sec: float = 1.0, sample_rate: int = 16000
+    ) -> tuple[list[torch.Tensor], list[int]]:
+        """
+        Split a waveform tensor into overlapping chunks.
+
+        Args:
+            waveform (torch.Tensor): Input audio waveform tensor of shape (time_samples,)
+            chunk_size (int, optional): Size of each chunk in samples. If None, automatically
+                determines the optimal chunk size using find_optimal_chunk_size().
+                Defaults to None.
+            overlap_sec (float, optional): Overlap duration between consecutive chunks in seconds.
+                Used to calculate step size. Defaults to 1.0.
+            sample_rate (int, optional): Audio sample rate in Hz. Defaults to 16000.
+
+        Returns:
+            tuple[list[torch.Tensor], list[int]]: A tuple containing:
+                - List of chunk tensors, each of shape (chunk_size,)
+                - List of original lengths for each chunk before padding (useful for masking
+                  padded regions during processing)
+        """
+        # If chunk_size is None, find the optimal chunk size for this waveform
+        total_len = waveform.shape[0]
+        if chunk_size is None:
+            chunk_size = self._find_optimal_chunk_size(total_len, overlap_sec=overlap_sec)
+        if chunk_size >= total_len:
+            return [waveform], [total_len]
+        overlap_size = int(overlap_sec * sample_rate)
+        step_size = chunk_size - overlap_size
+        chunks = []
+        chunk_lens = []
+        start = 0
+        while start + overlap_size < total_len:
+            end = min(start + chunk_size, total_len)
+            chunk = waveform[start:end]
+            length = chunk.shape[0]
+            if length < chunk_size:
+                pad = torch.zeros(chunk_size - length, dtype=chunk.dtype, device=chunk.device)
+                chunk = torch.cat([chunk, pad], dim=0)
+            chunks.append(chunk)
+            chunk_lens.append(length)
+            start += step_size
+
+        return chunks, chunk_lens
+
 
 class ProbablyIncorrectLanguageKeyError(RuntimeError):
     pass
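The two helpers added above search for a chunk duration between `min_sec` and `max_sec` that leaves the last chunk as full as possible, then split the waveform with a one-second overlap. A standalone sketch of the same arithmetic on plain Python lists (parameter names mirror the diff; the single-chunk early return is assumed behavior, and tensors are replaced by lists for illustration):

```python
def find_optimal_chunk_size(total_len, min_sec=30, max_sec=40, sample_rate=16000, overlap_sec=1.0):
    # Short audio: one chunk covering everything.
    if total_len < max_sec * sample_rate:
        return total_len
    best_chunk_size = min_sec * sample_rate
    best_last_chunk_len = 0
    overlap = int(overlap_sec * sample_rate)
    for sec in range(min_sec, max_sec + 1):
        chunk_size = sec * sample_rate
        step = chunk_size - overlap
        if step <= 0 or chunk_size > total_len:
            continue
        n_chunks = (total_len + step - 1) // step           # ceil division
        last_chunk_len = total_len - step * (n_chunks - 1)  # samples left for the final chunk
        if last_chunk_len > best_last_chunk_len:
            best_last_chunk_len = last_chunk_len
            best_chunk_size = chunk_size
    return best_chunk_size


def chunk_waveform(samples, sample_rate=16000, overlap_sec=1.0):
    total_len = len(samples)
    chunk_size = find_optimal_chunk_size(total_len, sample_rate=sample_rate, overlap_sec=overlap_sec)
    if chunk_size >= total_len:  # whole waveform fits in one chunk
        return [list(samples)], [total_len]
    overlap = int(overlap_sec * sample_rate)
    step = chunk_size - overlap
    chunks, chunk_lens = [], []
    start = 0
    while start + overlap < total_len:
        piece = list(samples[start:start + chunk_size])
        chunk_lens.append(len(piece))               # true length before padding
        piece += [0] * (chunk_size - len(piece))    # zero-pad the last chunk
        chunks.append(piece)
        start += step
    return chunks, chunk_lens
```

For a 100 s waveform at a toy 100 Hz rate, the search settles on 35 s chunks: three chunks of 3500 samples, with the last one holding 3200 real samples before padding. Consecutive chunks share a one-second overlap, which the merge step later uses to stitch transcripts together.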

nemo/collections/asr/models/aed_multitask_models.py

Lines changed: 62 additions & 7 deletions
@@ -40,6 +40,7 @@
 from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType
 from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
 from nemo.collections.asr.parts.submodules.token_classifier import TokenClassifier
+from nemo.collections.asr.parts.utils.chunking_utils import merge_all_hypotheses, merge_parallel_chunks
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.timestamp_utils import (
     get_forced_aligned_timestamps_with_external_model,
@@ -110,6 +111,10 @@ class MultiTaskTranscriptionInternalConfig(InternalTranscribeConfig):
 class MultiTaskTranscriptionConfig(TranscribeConfig):
     """
     Configuration for Multi Task Transcription
+
+    enable_chunking: bool = True
+        Whether to enable parallel processing of audio chunks for long-form audio.
+        If enabled, batch_size should be set to 1 or a single audio file should be passed.
     """
 
     prompt: list[dict[str, dict[str, str]]] | None = None
@@ -119,6 +124,7 @@ class MultiTaskTranscriptionConfig(TranscribeConfig):
     _internal: Optional[MultiTaskTranscriptionInternalConfig] = field(
         default_factory=lambda: MultiTaskTranscriptionInternalConfig()
     )
+    enable_chunking: bool = True
 
     def __post_init__(self):
         self.prompt = parse_multitask_prompt(self.prompt)
@@ -495,6 +501,7 @@ def transcribe(
     ) -> Union[List[str], List[Hypothesis]]:
         """
         Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.
+        This allows the model to process long audio in manageable chunks and merge the results.
         Args:
             audio: (a single or list) of paths to audio files or a np.ndarray/tensor audio array or path
                 to a manifest file.
@@ -525,7 +532,7 @@
 
         Returns:
             A list of transcriptions (or raw log probabilities if logprobs is True) in the same order
-            as paths2audio_files
+                as paths2audio_files
         """
         if timestamps is not None:
             if self.timestamps_asr_model is None:
@@ -561,22 +568,43 @@
             trcfg = override_config
         trcfg.timestamps = timestamps
 
-        return super().transcribe(audio=audio, override_config=trcfg)
+        if trcfg.enable_chunking:
+            # Check if only one audio is provided with string
+            is_one_audio = isinstance(audio, str) and not (audio.endswith("json") or audio.endswith("jsonl"))
+            # Check if it is provided as a list of strings
+            is_one_audio = is_one_audio or (isinstance(audio, list) and len(audio) == 1)
+            # Check if chunking will be enabled
+            trcfg.enable_chunking = is_one_audio or (override_config is not None and override_config.batch_size == 1)
+            if not trcfg.enable_chunking:
+                logging.warning("Chunking is disabled. Please pass a single audio file or set batch_size to 1")
+
+        results = super().transcribe(audio=audio, override_config=trcfg)
+        if trcfg.enable_chunking:
+            results = merge_all_hypotheses(results, trcfg.timestamps, self.encoder.subsampling_factor)
+
+        return results
 
     def _setup_dataloader_from_config(self, config: Optional[Dict]):
+
         assert config.get("use_lhotse", False), (
             "Multi-task model only supports dataloading with Lhotse. "
             "Please set config.{train,validation,test}_ds.use_lhotse=True"
         )
         global_rank = config.get("global_rank", self.global_rank)
        world_size = config.get("world_size", self.world_size)
+        enable_chunking = config.get("enable_chunking", False)
+        if enable_chunking:
+            # Adding this to support processing audio files of arbitrary length by chunking them into hour-long segments.
+            config.cut_into_windows_duration = 3600
+            config.cut_into_windows_hop = 3600
         return get_lhotse_dataloader_from_config(
             config,
             global_rank=global_rank,
             world_size=world_size,
             dataset=PromptedAudioToTextLhotseDataset(
                 tokenizer=self.tokenizer,
                 prompt=self.prompt,
+                enable_chunking=enable_chunking,  # <-- enables chunking
             ),
             tokenizer=self.tokenizer,
         )
@@ -889,10 +917,12 @@ def _transcribe_input_manifest_processing(
             A config dict that is used to setup the dataloader for transcription.
         """
         manifest_filepath = trcfg._internal.manifest_filepath
-
         audio_files = self._may_be_make_dict_and_fix_paths(audio_files, manifest_filepath, trcfg)
 
-        return super()._transcribe_input_manifest_processing(audio_files, temp_dir, trcfg)
+        ds_config = super()._transcribe_input_manifest_processing(audio_files, temp_dir, trcfg)
+        if trcfg.enable_chunking:
+            ds_config['enable_chunking'] = True
+        return ds_config
 
     def _transcribe_forward(
         self, batch: PromptedAudioToTextMiniBatch | tuple[torch.Tensor, ...], trcfg: MultiTaskTranscriptionConfig
@@ -979,6 +1009,8 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo
         """
         Internal function to process the model's outputs to return the results to the user. This function is called by
         `transcribe()` and `transcribe_generator()` to process the model's outputs.
+        If parallel chunking was used (enable_chunking=True), merges the hypotheses from each chunk
+        into a single hypothesis, joining text, token sequences, and timestamps.
 
         Args:
             outputs: The model's outputs that are processed by `_transcribe_forward()`.
@@ -988,6 +1020,7 @@
             The output can be a list of
             objects, list of list of objects.
             Its type is defined in `TranscriptionReturnType`.
+
         """
         log_probs = outputs.pop('log_probs')
         encoded_len = outputs.pop('encoded_lengths')
@@ -996,14 +1029,18 @@
         decoder_input_ids = outputs.pop('decoder_input_ids')
         batch = outputs.pop('batch')
 
-        del log_probs, encoded_len
-
+        del log_probs
+        num_chunks = enc_states.shape[0]
+        # Repeat decoder_input_ids to match the number of chunks
+        if trcfg.enable_chunking and num_chunks > decoder_input_ids.shape[0]:
+            decoder_input_ids = decoder_input_ids.repeat(num_chunks, 1)
         hypotheses = self.decoding.decode_predictions_tensor(
             encoder_hidden_states=enc_states,
             encoder_input_mask=enc_mask,
             decoder_input_ids=decoder_input_ids,
             return_hypotheses=trcfg.return_hypotheses,
         )
+        merge_to_be_done = trcfg.enable_chunking and len(hypotheses) > 1
 
         del enc_states, enc_mask, decoder_input_ids
 
@@ -1013,13 +1050,29 @@
                 batch_size=len(batch.audio),
                 external_ctc_model=self.timestamps_asr_model,
                 main_model_predictions=hypotheses,
-                timestamp_type=['word', 'segment'],
+                timestamp_type='char' if merge_to_be_done else ['word', 'segment'],
                 viterbi_device=trcfg._internal.device,
             )
         elif trcfg.timestamps:
             hypotheses = process_aed_timestamp_outputs(
                 hypotheses, self.encoder.subsampling_factor, self.cfg['preprocessor']['window_stride']
             )
+        if merge_to_be_done:
+            merged_hypotheses = merge_parallel_chunks(
+                hypotheses=hypotheses,
+                encoded_len=encoded_len,
+                model=self,
+                timestamps=trcfg.timestamps,
+                subsampling_factor=self.encoder.subsampling_factor,
+                window_stride=self.cfg['preprocessor']['window_stride'],
+                decoding=self.decoding,
+            )
+            # Inject the id of the cut into the hypothesis, to later be used for separating batches
+            setattr(merged_hypotheses, 'id', batch.cuts[0].id.split("-", 1)[0])
+            return [merged_hypotheses]
+
+        if trcfg.enable_chunking and len(hypotheses) == 1:
+            setattr(hypotheses[0], 'id', batch.cuts[0].id.split("-", 1)[0])
         return hypotheses
 
     def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
@@ -1035,6 +1088,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo
             stored.
         Returns:
             A pytorch DataLoader for the given audio file(s).
+
         """
         if 'manifest_filepath' in config:
             manifest_filepath = config['manifest_filepath']
@@ -1059,6 +1113,7 @@
             'channel_selector': config.get('channel_selector', None),
             'pad_min_duration': config.get('pad_min_duration', 1.0),
             'pad_direction': config.get('pad_direction', 'both'),
+            'enable_chunking': config.get('enable_chunking', False),
         }
 
         temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))