Merged

Commits (40)
- 89c7400 Ipl processors (nune-tadevosyan, May 14, 2025)
- b9471e3 remove (nune-tadevosyan, May 14, 2025)
- b4dc91c some commits (nune-tadevosyan, May 15, 2025)
- abc2240 IPL Processors (nune-tadevosyan, May 28, 2025)
- bfdc49c IPL Processors (nune-tadevosyan, May 28, 2025)
- aae3a02 IPL Processors (nune-tadevosyan, May 28, 2025)
- 125699a Remove unneseccary files (nune-tadevosyan, May 28, 2025)
- f5227e1 IPL dependencies (nune-tadevosyan, May 29, 2025)
- c4ed0ca Small changes (nune-tadevosyan, May 29, 2025)
- e99bbae Small changes (nune-tadevosyan, May 29, 2025)
- fd64c04 Small changes (nune-tadevosyan, May 29, 2025)
- 26c7fb8 Config changes (nune-tadevosyan, May 30, 2025)
- d0e4180 Config place change (nune-tadevosyan, May 30, 2025)
- 7920663 Moving configs (nune-tadevosyan, Jun 4, 2025)
- d5fe869 Readme file (nune-tadevosyan, Jun 4, 2025)
- c6e0cbc Fix test (nune-tadevosyan, Jun 7, 2025)
- 96bef79 Update nemo_run_config.yaml (nune-tadevosyan, Jun 9, 2025)
- 3c4bda2 Update nemo_run_config.yaml (nune-tadevosyan, Jun 9, 2025)
- 4a12139 Adding copyrights (nune-tadevosyan, Jun 12, 2025)
- a40f89c Adding imports from main (nune-tadevosyan, Jun 12, 2025)
- 38a09d6 Merge remote-tracking branch 'origin/main' into sdp_ipl (nune-tadevosyan, Jun 12, 2025)
- 87d7912 Adding copyrights (nune-tadevosyan, Jun 12, 2025)
- 9bceadf Doc update (nune-tadevosyan, Jun 14, 2025)
- c6ea89c Doc update (nune-tadevosyan, Jun 14, 2025)
- d2c61ff Doc update (nune-tadevosyan, Jun 14, 2025)
- 8f303d1 Update config (nune-tadevosyan, Jun 14, 2025)
- ecfdaf4 Update nemo_run_config.yaml (nune-tadevosyan, Jun 16, 2025)
- 39c8221 Update ipl.txt (nune-tadevosyan, Jun 16, 2025)
- 4f6c355 update (nune-tadevosyan, Jun 16, 2025)
- a78eb23 Merge branch 'main' of github.com:NVIDIA/NeMo-speech-data-processor i… (nune-tadevosyan, Jun 24, 2025)
- 9cbac0f Small change (nune-tadevosyan, Jun 26, 2025)
- 0b4a9d6 small update (nune-tadevosyan, Jun 26, 2025)
- 6cb8b40 forse jiwer (Jorjeous, Jun 26, 2025)
- 77b64f2 attempt 1 to fix certificates (Jorjeous, Jun 26, 2025)
- a559483 attempt 2 to fix cert (Jorjeous, Jun 26, 2025)
- 3a92ee2 small change (nune-tadevosyan, Jun 27, 2025)
- 9e07c9b Merge branch 'sdp_ipl' of github.com:NVIDIA/NeMo-speech-data-processo… (nune-tadevosyan, Jun 27, 2025)
- c8ba85a Doc changes (nune-tadevosyan, Jul 2, 2025)
- 4392ef2 Doc changes (nune-tadevosyan, Jul 2, 2025)
- b5a0637 Merge branch 'main' into sdp_ipl (Jorjeous, Jul 2, 2025)
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
@@ -86,6 +86,7 @@ jobs:
AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
CLEAN_UP_TMP_PATH: 1
run: |

wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL]
sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL]
sudo update-ca-certificates # [cert for CORAL]
49 changes: 49 additions & 0 deletions dataset_configs/ipl/config.yaml
Review discussion:

Collaborator: We have descriptions in YAML files. Add it to this YAML file.

Author: Descriptions are mostly about datasets and processors. We added a README for the same purpose.

Author: Done.
@@ -0,0 +1,49 @@
documentation: |
TopIPL
######

This config is used to run the `TopIPL: Iterative Pseudo-Labeling for ASR <https://arxiv.org/abs/2506.07659>`_ training algorithm using NeMo-Run.

TopIPL is a **semi-supervised training method** for automatic speech recognition (ASR) that iteratively alternates between model training and pseudo-label generation for unlabeled data. It uses a **top-N checkpoint averaging strategy** to create a strong teacher model and maintains a **dynamic cache** of pseudo-labels throughout the process.
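
The top-N averaging step can be pictured with a short sketch. This is illustrative only; it assumes Lightning-style ``.ckpt`` files with a ``state_dict`` key and is not the utility NeMo itself ships:

.. code-block:: python

    import torch

    def average_checkpoints(paths):
        """Average weights from several checkpoints (e.g. the top-N by
        validation WER) to build a stronger teacher model."""
        avg = None
        for path in paths:
            state = torch.load(path, map_location="cpu")["state_dict"]
            if avg is None:
                avg = {k: v.clone().float() for k, v in state.items()}
            else:
                for k in avg:
                    avg[k] += state[k].float()
        # Integer buffers are averaged as floats in this simplified sketch.
        return {k: v / len(paths) for k, v in avg.items()}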

The pipeline is implemented as a processor compatible with the `nemo_run` framework. It generates an output manifest containing updated labels based on pseudo-labeling iterations.

This config performs the following steps:

1. Runs training and inference commands using NeMo-Run.
2. Periodically stops training to generate pseudo-labels with a top-N checkpoint ensemble.
3. Maintains a dynamic cache of pseudo-labels for unlabeled data (sketched below).
4. Produces a new output manifest after each iteration.
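
The dynamic cache in step 3 can be sketched as follows. This is a minimal illustration of how ``p_cache`` from the run config is applied, not the actual SDP implementation:

.. code-block:: python

    import random

    def update_cache(cache, new_labels, p_cache):
        """Refresh cached pseudo-labels: new utterances are always added,
        and existing entries are replaced with probability p_cache."""
        for utt_id, text in new_labels.items():
            if utt_id not in cache or random.random() < p_cache:
                cache[utt_id] = text
        return cache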

**Required arguments**

- **output_manifest_file**: path where the final manifest with pseudo-labels will be saved.
- **nemo_run_config**: YAML config file specifying the training, inference, and IPL parameters.

**Training config requirements**

Your training config must include the following setting to enable IPL:

.. code-block:: yaml

exp_manager:
  create_ipl_epoch_stopper_callback: True

If you're not using Lhotse, also include:

.. code-block:: yaml

ipl_epoch_stopper_callback_params:
  stop_every_n_epochs: 2

**Prerequisites**

- nemo_run
- ``pip install -r requirements/ipl.txt``

processors_to_run: all

processors:
- _target_: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
config_path: ./nemo_run_config.yaml
output_manifest_file: ???
80 changes: 80 additions & 0 deletions dataset_configs/ipl/nemo_run_config.yaml
@@ -0,0 +1,80 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The script to be run.
script: # Path of the script to run, relative to the NeMo directory (see nemo_directory below)
script_config: # Training config file for the script; ipl_epoch_stopper_callback must be provided in the config
inference_config: # Inference config file for transcribing unlabeled data with transcribe_speech_parallel

exp_name: null # populated by exp_manager.name if not provided
results_dir: # Where to store the results of the run

# Path to the local NeMo repository. This is used to locate scripts and configs from NeMo.
# To set this up:
# 1. Clone the NeMo repository:
# git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
# 2. Set the path here:
# Make sure this path is valid and NeMo is up to date if you're using its scripts.
nemo_directory: # NeMo directory path
do_average: # Whether to average checkpoints for pseudo-label generation
p_cache: # Probability with which to update the pseudo-labeled set
num_ipl_epochs: # How many epochs to run pseudo-labeling for

# Optional arguments
num_runs:
num_gpus:
num_tasks_per_node:
max_runtime: # Specify for clusters

########################################################################################################################

executor: slurm # or local

USER:

# Fields for cluster run
ssh_tunnel:
host:
# ------------------------------- Fill this up! -------------------------------
user: "${USER}" # your username; can be null, in which case it is resolved from the ${USER} environment variable
job_dir: "" # Job directory to keep created files
identity: ""
# -----------------------------------------------------------------------------

account:
partition:
job_name_prefix:

containers:
asr: # Container image


env_vars:
- 'TOKENIZERS_PARALLELISM='
- 'AIS_ENDPOINT='
- 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE='
- 'TORCH_CUDNN_V8_API_ENABLED='
- 'PYTORCH_CUDA_ALLOC_CONF='
- 'HYDRA_FULL_ERROR=1'

required_env_vars:
- 'HF_TOKEN='
- 'WANDB_KEY='

mounts:
# Replace with your own paths in your cluster config
- /path/to/mount:/where/to/mount/

timeouts:
partition_name: # Specify the time limit for this partition
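
A minimally filled-in sketch of this config for a local run might look like the following; every value below is a hypothetical placeholder, not a recommended setting:

```yaml
# Hypothetical example values; replace every path with your own setup.
script: examples/asr/asr_ctc/speech_to_text_ctc.py  # assumed script path inside NeMo
script_config: /path/to/train_config.yaml
inference_config: /path/to/inference_config.yaml
exp_name: ipl_demo
results_dir: /path/to/results
nemo_directory: /path/to/nemo
do_average: true
p_cache: 0.5
num_ipl_epochs: 5
executor: local
```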
8 changes: 8 additions & 0 deletions docs/src/sdp/api.rst
@@ -379,6 +379,14 @@ Miscellaneous
.. autodata:: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
:annotation:

.. autodata:: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
:annotation:

.. autodata:: sdp.processors.ipl.ipl_processors.TrainingCommandGenerator
:annotation:

.. autodata:: sdp.processors.ipl.ipl_processors.InferenceCommandGenerator
:annotation:

.. _sdp-base-classes:

18 changes: 18 additions & 0 deletions docs/src/sdp/existing_configs.rst
@@ -407,3 +407,21 @@ HiFiTTS-2
config-docs/english/hifitts2/config_22khz
config-docs/english/hifitts2/config_44khz
config-docs/english/hifitts2/config_bandwidth

NemoRunIPL
~~~~~~~~~~

**Supported configs**.

* **IPL**:
`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/config.yaml>`__ |
:doc:`documentation <config-docs/ipl/config>`
* **NeMoRun**:
`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/nemo_run_config.yaml>`__ |
:doc:`documentation <config-docs/ipl/nemo_run_config>`

.. toctree::
:hidden:

config-docs/ipl/config
config-docs/ipl/nemo_run_config
11 changes: 11 additions & 0 deletions requirements/ipl.txt
@@ -0,0 +1,11 @@
nemo_run

# The NeMo repository path is also required; it is used to locate scripts and configs from NeMo.
#
# To set this up:
# 1. Clone the NeMo repository:
# git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
# 2. Set the path in nemo_run_config.yaml:
# nemo_directory: /your/desired/path/to/nemo
#
# Make sure this path is valid and NeMo is up to date if you're using its scripts.
47 changes: 47 additions & 0 deletions sdp/processors/ipl/README.md
@@ -0,0 +1,47 @@
# 🧠 TopIPL: Iterative Pseudo-Labeling for ASR

TopIPL is an **iterative pseudo-labeling algorithm** designed for training ASR models using both labeled and unlabeled data. It maintains a **dynamic pseudo-label cache** and leverages **top-N averaged checkpoints** as a teacher model to generate high-quality pseudo-labels across training iterations.

## 📦 Contents

- `NemoRunIPLProcessor` — Command generator and job submitter for IPL runs, compatible with local and cluster environments.
- `nemo_run_config.yaml` — Main configuration file. Users should define all required paths and parameters here.

## 🚀 Getting Started

TopIPL runs like any other processor in the `nemo_run` framework. To use it, you must pass:

- `output_manifest_file`: Path where the resulting manifest will be saved.
- `nemo_run_config`: YAML file containing IPL setup, training/inference configs, and NeMo-Run settings.
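
For reference, this is how the processor is wired up in `dataset_configs/ipl/config.yaml`:

```yaml
processors_to_run: all

processors:
  - _target_: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
    config_path: ./nemo_run_config.yaml
    output_manifest_file: ???  # to be filled in by the user
```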

### 🔧 Training Config Requirements

Your training config must include the following setting to enable IPL:

```yaml
exp_manager:
  create_ipl_epoch_stopper_callback: True
```
If you're not using Lhotse, also include:

```yaml
ipl_epoch_stopper_callback_params:
  stop_every_n_epochs: 2
```

### Prerequisites

Before using TopIPL, make sure the following are set up:

- Clone the NeMo repository:

  ```bash
  git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
  ```
- Set the path to NeMo in your `nemo_run_config.yaml`: `nemo_directory: /your/desired/path/to/nemo`
- `pip install -r requirements/ipl.txt`

### Running the Code

```bash
python main.py --config-path=/path/to/directory/config --config-name=config.yaml
```
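
Here `--config-path` points to the directory containing the config and `--config-name` to the YAML file itself; for this pipeline that would presumably be `dataset_configs/ipl` and `config.yaml`.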
Empty file added sdp/processors/ipl/__init__.py