[Fix] Fix the notebook errors on multispeaker data simulation and end to end diarization training (#15149)

tango4j · web-flow · commit 66ffb38338aa · 2025-12-05T14:38:16.000-08:00
* Fixed the notebook errors

Signed-off-by: taejinp <tango4j@gmail.com>

* Apply isort and black reformatting

Signed-off-by: tango4j <tango4j@users.noreply.github.com>

---------

Signed-off-by: taejinp <tango4j@gmail.com>
Signed-off-by: tango4j <tango4j@users.noreply.github.com>
Co-authored-by: tango4j <tango4j@users.noreply.github.com>
diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py
@@ -1148,7 +1148,7 @@ def _generate_session(
         if self._params.data_simulator.background_noise.add_bg:
             if len(self._noise_samples) > 0:
                 avg_power_array = torch.mean(array[is_speech == 1] ** 2)
-                bg, snr = get_background_noise(
+                bg, snr, _ = get_background_noise(
                     len_array=len(array),
                     power_array=avg_power_array,
                     noise_samples=self._noise_samples,
@@ -1466,6 +1466,8 @@ def _generate_rir_pyroomacoustics(self) -> Tuple[torch.Tensor, int]:
         if self._params.data_simulator.rir_generation.mic_config.mic_pattern == 'omni':
             mic_pattern = DirectivityPattern.OMNI
             dir_vec = DirectionVector(azimuth=0, colatitude=90, degrees=True)
+        else:
+            raise Exception("Currently, microphone pattern must be omni. Aborting RIR generation.")
         dir_obj = CardioidFamily(
             orientation=dir_vec,
             pattern_enum=mic_pattern,
@@ -1509,6 +1511,8 @@ def _convolve_rir(self, input, speaker_turn: int, RIR: torch.Tensor) -> Tuple[li
                 out_channel = convolve(input, RIR[speaker_turn, channel, : len(input)]).tolist()
             elif self._params.data_simulator.rir_generation.toolkit == 'pyroomacoustics':
                 out_channel = convolve(input, RIR[channel][speaker_turn][: len(input)]).tolist()
+            else:
+                raise Exception("Toolkit must be pyroomacoustics or gpuRIR. Aborting RIR convolution.")
             if len(out_channel) > length:
                 length = len(out_channel)
             output_sound.append(torch.tensor(out_channel))
@@ -1644,7 +1648,11 @@ def _generate_session(
             self.annotator.annote_lists['json'].append(new_json_entry)
 
             new_ctm_entries, _ = self.annotator.create_new_ctm_entry(
-                filename, speaker_ids[speaker_turn], start / self._params.data_simulator.sr
+                words=self._text,
+                alignments=self._alignments,
+                session_name=filename,
+                speaker_id=speaker_ids[speaker_turn],
+                start=start / self._params.data_simulator.sr,
             )
             self.annotator.annote_lists['ctm'].extend(new_ctm_entries)
 
@@ -1659,23 +1667,21 @@ def _generate_session(
             array = perturb_audio(array, self._params.data_simulator.sr, self.session_augmentor)
 
         # Step 7-2: Additive background noise from noise manifest files
-        if self._params.data_simulator.background_noise.add_bg:
-            if len(self._noise_samples) > 0:
-                avg_power_array = torch.mean(array[is_speech == 1] ** 2)
-                bg, snr = get_background_noise(
-                    len_array=len(array),
-                    power_array=avg_power_array,
-                    noise_samples=self._noise_samples,
-                    audio_read_buffer_dict=self._audio_read_buffer_dict,
-                    snr_min=self._params.data_simulator.background_noise.snr_min,
-                    snr_max=self._params.data_simulator.background_noise.snr_max,
-                    background_noise_snr=self._params.data_simulator.background_noise.snr,
-                    seed=(random_seed + idx),
-                    device=self._device,
-                )
-                array += bg
+        if self._params.data_simulator.background_noise.add_bg and len(self._noise_samples) > 0:
+            avg_power_array = torch.mean(array[is_speech == 1] ** 2)
+            bg, snr, _ = get_background_noise(
+                len_array=len(array),
+                power_array=avg_power_array,
+                noise_samples=self._noise_samples,
+                audio_read_buffer_dict=self._audio_read_buffer_dict,
+                snr_min=self._params.data_simulator.background_noise.snr_min,
+                snr_max=self._params.data_simulator.background_noise.snr_max,
+                background_noise_snr=self._params.data_simulator.background_noise.snr,
+                seed=(random_seed + idx),
+                device=self._device,
+            )
+            array += bg
             length = array.shape[0]
-            bg, snr = self._get_background(length, avg_power_array)
             augmented_bg, _ = self._convolve_rir(bg, -1, RIR)
             for channel in range(self._params.data_simulator.rir_generation.mic_config.num_channels):
                 array[:, channel] += augmented_bg[channel][:length]
diff --git a/tutorials/speaker_tasks/End_to_End_Diarization_Training.ipynb b/tutorials/speaker_tasks/End_to_End_Diarization_Training.ipynb
@@ -153,7 +153,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In mathmatical terms, Sort-Loss can be expressed as follows:\n",
+    "In mathematical terms, Sort-Loss can be expressed as follows:\n",
     "\n",
     "* **Arrival Time Sorting Function with $\\Psi$ function**   \n",
     "\n",
@@ -200,7 +200,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now that we learn the concept of Sort Loss and Sortformer, we can now calculate Sort Loss based target matrix and PIL-based target matrix to compare the difference in target-value setting atrix and loss calculation.\n",
+    "Now that we have learned the concepts of Sort Loss and Sortformer, we can calculate the Sort Loss-based target matrix and the PIL-based target matrix to compare the difference in target-value setting matrix and loss calculation.\n",
     "\n",
     "- raw target matrix $\\mathbf{Y}$: `raw_targets`\n",
     "- prediction matrix $\\mathbf{P}$: `preds`\n",
@@ -297,7 +297,6 @@
     "from nemo.collections.asr.losses.bce_loss import BCELoss \n",
     "\n",
     "bce_loss = BCELoss()\n",
-    "# reduction='mean', class_normalization=False)\n",
     "\n",
     "def plot_diarout(preds, title_text, cmap_str):\n",
     "\n",
@@ -825,7 +824,6 @@
    "source": [
     "curr_dir = os.getcwd() + \"/\"\n",
     "config.model.train_ds.manifest_filepath = f'{curr_dir}simulated_train/sortformer_train.json'\n",
-    "# config.model.test_ds.manifest_filepath = f'{curr_dir}simulated_valid/sortformer_valid.json'\n",
     "config.model.validation_ds.manifest_filepath = f'{curr_dir}simulated_valid/sortformer_valid.json'\n",
     "config.trainer.strategy = \"ddp_notebook\"\n",
     "config.batch_size = 3\n",
diff --git a/tutorials/tools/Multispeaker_Simulator.ipynb b/tutorials/tools/Multispeaker_Simulator.ipynb
@@ -122,7 +122,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python NeMo/scripts/speaker_tasks/create_alignment_manifest.py \\\n",
+    "!python {NEMO_DIR_PATH}/scripts/speaker_tasks/create_alignment_manifest.py \\\n",
     "  --input_manifest_filepath LibriSpeech/dev_clean.json \\\n",
     "  --base_alignment_path LibriSpeech_Alignments \\\n",
     "  --output_manifest_filepath ./dev-clean-align.json \\\n",
@@ -218,7 +218,7 @@
    "source": [
     "# Step 5: Generate Simulated Audio Session\n",
     "\n",
-    "A single 4-speaker session of 60 seconds is generated as an example. "
+    "A single 4-speaker session of 30 seconds is generated as an example. "
    ]
   },
   {
@@ -250,7 +250,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Step 5: Listen to and Visualize Session\n",
+    "# Step 6: Listen to and Visualize Session\n",
     "\n",
     "Listen to the audio and visualize the corresponding speaker timestamps (recorded in a RTTM file for each session)"
    ]
@@ -264,7 +264,6 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import wget\n",
     "import IPython\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
@@ -316,7 +315,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Step 6: Get Simulated Data Statistics "
+    "# Step 7: Get Simulated Data Statistics "
    ]
   },
   {
@@ -325,6 +324,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import wget\n",
     "if not os.path.exists(\"multispeaker_data_analysis.py\"):\n",
     "  !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speaker_tasks/multispeaker_data_analysis.py\n",
     "\n",
@@ -365,7 +365,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "nemo093025",
    "language": "python",
    "name": "python3"
   },
@@ -379,7 +379,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.10.12"
   },
   "pycharm": {
    "stem_cell": {