NVIDIA-NeMo
diff --git a/nemo/collections/asr/data/audio_to_diar_label_lhotse.py b/nemo/collections/asr/data/audio_to_diar_label_lhotse.py
Lines changed: 18 additions & 3 deletions
diff --git a/nemo/collections/asr/models/sortformer_diar_models.py b/nemo/collections/asr/models/sortformer_diar_models.py
Lines changed: 2 additions & 0 deletions
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 from typing import Dict, Optional, Tuple
 
 import torch.utils.data
@@ -24,6 +23,7 @@
     speaker_to_target,
 )
 from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType
+from nemo.utils import logging
 
 
 class LhotseAudioToSpeechE2ESpkDiarDataset(torch.utils.data.Dataset):
@@ -58,11 +58,23 @@ def __init__(self, cfg):
         self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8))
 
     def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]:
-        audio, audio_lens, cuts = self.load_audio(cuts)
+        # NOTE: This end-to-end diarization dataset uses only the first channel (index 0) of each cut.
+        # Each cut is handled in a single loop: restrict it to channel 0, then compute its speaker activity.
+        mono_cuts = []
         speaker_activities = []
         for cut in cuts:
+            if cut.num_channels is not None and cut.num_channels > 1:
+                logging.warning(
+                    "Multiple channels detected in cut '%s' (%d channels). "
+                    "Only the first channel will be used; remaining channels are ignored.",
+                    cut.id,
+                    cut.num_channels,
+                )
+            mono_cut = cut.with_channels(channels=[0])
+            mono_cuts.append(mono_cut)
+
             speaker_activity = speaker_to_target(
-                a_cut=cut,
+                a_cut=mono_cut,
                 num_speakers=self.num_speakers,
                 num_sample_per_mel_frame=self.num_sample_per_mel_frame,
                 num_mel_frame_per_asr_frame=self.num_mel_frame_per_target_frame,
@@ -79,6 +91,9 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]:
                 )
                 speaker_activity = speaker_activity[:, : self.num_speakers]
             speaker_activities.append(speaker_activity)
+
+        cuts = type(cuts).from_cuts(mono_cuts)
+        audio, audio_lens, cuts = self.load_audio(cuts)
         targets = collate_matrices(speaker_activities).to(audio.dtype)  # (B, T, N)
 
         if targets.shape[2] > self.num_speakers:
 
@@ -425,6 +425,7 @@ def _setup_diarize_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoade
             'session_len_sec': config['session_len_sec'],
             'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)),
             'pin_memory': True,
+            'use_lhotse': config.get('use_lhotse', False),
         }
         temporary_datalayer = self.__setup_dataloader_from_config(config=DictConfig(dl_config))
         return temporary_datalayer
@@ -1112,6 +1113,7 @@ def on_validation_epoch_end(self) -> Optional[dict[str, dict[str, torch.Tensor]]
     def diarize(
         self,
         audio: Union[str, List[str], np.ndarray, DataLoader],
+        sample_rate: Optional[int] = None,
         batch_size: int = 1,
         include_tensor_outputs: bool = False,
         postprocessing_yaml: Optional[str] = None,