NVIDIA
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 7 additions & 0 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎dataset_configs/english/coraal/config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎dataset_configs/english/coraal/config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dataset_configs/ipl/config.yaml‎
Lines changed: 49 additions & 0 deletions b/‎dataset_configs/ipl/config.yaml‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎dataset_configs/ipl/nemo_run_config.yaml‎
Lines changed: 80 additions & 0 deletions b/‎dataset_configs/ipl/nemo_run_config.yaml‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎docker/Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/src/conf.py‎
Lines changed: 5 additions & 1 deletion b/‎docs/src/conf.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/src/sdp/api.rst‎
Lines changed: 8 additions & 0 deletions b/‎docs/src/sdp/api.rst‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/src/sdp/existing_configs.rst‎
Lines changed: 18 additions & 0 deletions b/‎docs/src/sdp/existing_configs.rst‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎requirements/ipl.txt‎
Lines changed: 11 additions & 0 deletions b/‎requirements/ipl.txt‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎requirements/main.txt‎
Lines changed: 2 additions & 2 deletions b/‎requirements/main.txt‎
Lines changed: 2 additions & 2 deletions
@@ -75,14 +75,21 @@ jobs:
         pip install nemo-toolkit[asr,nlp]==1.23.0
         pip install nemo_text_processing
         pip install -r requirements/huggingface.txt
+        pip install certifi # needed to avoid certificate problems [CORAAL]
+        export SSL_CERT_FILE=$(python -m certifi) # NOTE(review): an export only lasts for this step; append to "$GITHUB_ENV" if later steps need it
         python -m pip cache purge
+        
 
     - name: Run all tests
       env:
         AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
         AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
         CLEAN_UP_TMP_PATH: 1
       run: |
+
+        wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem # download the InCommon CA certificate manually [for CORAAL]
+        sudo cp incommon-rsa-ca2.pem     /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # copy it into the system CA directory [cert for CORAAL]
+        sudo update-ca-certificates # refresh the system CA store so the new certificate is picked up [cert for CORAAL]
         set -o pipefail # make the pipeline below return a non-zero exit code if the tests fail (tee would otherwise mask it)
         python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
 
 
@@ -18,7 +18,7 @@ documentation: |
   This config performs the following data processing.
 
   1. Downloads CORAAL data based on the
-     `official file list <http://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_.
+     `official file list <https://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_ (official mirror link).
      There are a couple of errors in the links there, which are fixed in our code.
   2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo.
   3. Groups all consecutive segments from the same speaker until 20 seconds duration
 
@@ -0,0 +1,49 @@
+documentation: |
+  TopIPL
+  ######
+
+  This config is used to run the `TopIPL: Iterative Pseudo-Labeling for ASR <https://arxiv.org/abs/2506.07659>`_ training algorithm using NeMo-Run.
+
+  TopIPL is a **semi-supervised training method** for automatic speech recognition (ASR) that iteratively alternates between model training and pseudo-label generation for unlabeled data. It uses a **top-N checkpoint averaging strategy** to create a strong teacher model and maintains a **dynamic cache** of pseudo-labels throughout the process.
+
+  The pipeline is implemented as a processor compatible with the `nemo_run` framework. It generates an output manifest containing updated labels based on pseudo-labeling iterations.
+
+  This config performs the following steps:
+
+  1. Runs training and inference commands using NeMo-Run.
+  2. Periodically stops training to generate pseudo-labels with a top-N checkpoint ensemble.
+  3. Maintains a dynamic cache of pseudo-labels for unlabeled data.
+  4. Produces a new output manifest after each iteration.
+
+  **Required arguments**
+
+  - **output_manifest_file**: path where the final manifest with pseudo-labels will be saved.
+  - **config_path**: path to the NeMo-Run YAML config file (``nemo_run_config.yaml``) specifying the training, inference, and IPL parameters.
+
+  **Training config requirements**
+
+  Your training config must include the following setting to enable IPL:
+
+  .. code-block:: yaml
+
+    exp_manager:
+      create_ipl_epoch_stopper_callback: True
+
+  If you're not using Lhotse, also include:
+
+  .. code-block:: yaml
+
+    ipl_epoch_stopper_callback_params:
+      stop_every_n_epochs: 2
+
+  **Prerequisites**
+
+  - nemo_run
+  - ``pip install -r requirements/ipl.txt``
+
+processors_to_run: all
+
+processors:
+  # Runs the TopIPL training / pseudo-labeling loop through NeMo-Run.
+  # Python module paths are case-sensitive: keep this in sync with the
+  # ``sdp.processors.ipl`` package name used in docs/src/sdp/api.rst.
+  - _target_: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
+    # NeMo-Run settings (script, configs, cluster options) live in the sibling file.
+    config_path: ./nemo_run_config.yaml
+    # Mandatory value (``???``): must be supplied by the user, e.g. on the command line.
+    output_manifest_file: ???
@@ -0,0 +1,80 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The script to be run.
+script: # Path of the script to run, relative to the NeMo directory (presumably nemo_directory below; confirm)
+script_config: # Training config file for the script; it must provide ipl_epoch_stopper_callback
+inference_config: # Inference config file for the unlabeled data, used by transcribe_speech_parallel
+
+exp_name: null  # populated by exp_manager.name if not provided
+results_dir: # Directory where the results of the run are stored
+
+# Path to the local NeMo repository. This is used to locate scripts and configs from NeMo.
+# To set this up:
+#   1. Clone the NeMo repository:
+#        git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
+#   2. Set the path here:
+# Make sure this path is valid and NeMo is up to date if you're using its scripts.
+nemo_directory: # NeMo directory path
+do_average: # Boolean: whether to average checkpoints for pseudo-label generation
+p_cache: # Probability with which the pseudo-labeled set is updated
+num_ipl_epochs: # Number of epochs for which pseudo-labeling is performed
+
+# Optional arguments
+num_runs: 
+num_gpus: 
+num_tasks_per_node: 
+max_runtime: # Specify when running on a cluster
+
+########################################################################################################################
+
+executor: slurm # Execution backend: slurm or local
+
+USER:
+
+# Fields for cluster run
+ssh_tunnel:
+  host: 
+  # ------------------------------- Fill these in! ------------------------------
+  user: "${USER}"  # your username; if left as ${USER} or set to null, it is resolved from the USER environment variable
+  job_dir: "" # Job directory in which created files are kept
+  identity: ""
+  # -----------------------------------------------------------------------------
+
+account: 
+partition:
+job_name_prefix: 
+
+containers:
+  asr: # Container image
+
+
+# Environment variables, written as NAME=value strings; fill in the empty values as needed.
+env_vars:
+  - 'TOKENIZERS_PARALLELISM='
+  - 'AIS_ENDPOINT='
+  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE='
+  - 'TORCH_CUDNN_V8_API_ENABLED='
+  - 'PYTORCH_CUDA_ALLOC_CONF='
+  - 'HYDRA_FULL_ERROR=1'
+
+# NOTE(review): presumably these variables must be set for the run to start; confirm against the processor.
+required_env_vars:
+  - 'HF_TOKEN='
+  - 'WANDB_KEY=' 
+
+mounts:
+  # Replace with your own paths in your cluster config
+  - /path/to/mount:/where/to/mount/
+
+timeouts:
+  partition_name: # Specify the timeout for this partition
@@ -21,6 +21,9 @@ RUN apt-get update \
 # Update pip
 RUN pip install --upgrade pip
 
+# Install typing-extensions manually
+RUN pip install typing-extensions
+
 # Clone the NeMo SDP repository
 COPY . /src/NeMo-speech-data-processor
 RUN rm -rf /src/NeMo-speech-data-processor/.git
@@ -34,4 +37,4 @@ RUN find requirements/ -name "*.txt" -exec pip install -r {} \;
 WORKDIR /src/NeMo-speech-data-processor
 
 # Set up entrypoint
-CMD ["bash"]
+CMD ["bash"]
@@ -45,7 +45,6 @@
     "numpy",
     "tqdm",
     "soundfile",
-    "ndjson",
     "boto3",
     "webvtt_py",
     "python_docx",
@@ -189,3 +188,8 @@ def setup(app):
 ]
 # nitpick_ignore_regex = [('py:class', '*')]
 
+# Temporary: exclude the CORAAL download list from link checking.
+linkcheck_ignore = [
+    r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt',
+]
+# Ignored URL: https://lingtools.uoregon.edu/coraal/coraal_download_list.txt
@@ -379,6 +379,14 @@ Miscellaneous
 .. autodata:: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
    :annotation:
 
+.. autodata:: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
+   :annotation:
+
+.. autodata:: sdp.processors.ipl.ipl_processors.TrainingCommandGenerator
+   :annotation:
+
+.. autodata:: sdp.processors.ipl.ipl_processors.InferenceCommandGenerator
+   :annotation:
 
 .. _sdp-base-classes:
 
 
@@ -408,6 +408,7 @@ HiFiTTS-2
    config-docs/english/hifitts2/config_44khz
    config-docs/english/hifitts2/config_bandwidth
 
+
 Unlabeled Portuguese Data
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -418,3 +419,20 @@ Unlabeled Portuguese Data
    :hidden:
 
    config-docs/portuguese/unlabeled/config
+
+NemoRunIPL
+~~~~~~~~~~
+
+**Supported configs**.
+
+* **IPL**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/config.yaml>`__ |
+  :doc:`documentation <config-docs/ipl/config>`
+* **NeMoRun**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/nemo_run_config.yaml>`__ |
+  :doc:`documentation <config-docs/ipl/nemo_run_config>`
+  
+.. toctree::
+   :hidden:
+
+   config-docs/ipl/config
+   config-docs/ipl/nemo_run_config
@@ -0,0 +1,11 @@
+nemo_run
+
+# The NeMo repository path is also required; it is used to locate scripts and configs from NeMo.
+#
+# To set this up:
+#   1. Clone the NeMo repository:
+#        git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
+#   2. Set the path in nemo_run_config.yaml:
+#        nemo_directory: /your/desired/path/to/nemo
+#
+# Make sure this path is valid and NeMo is up to date if you're using its scripts.
@@ -4,7 +4,7 @@ ffmpeg
 hydra-core
 joblib
 librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
-numpy==1.26
+numpy>=1.26, <2.0 # the code was written against numpy 1.x and may crash with 2.x
 omegaconf
 pandas
 rarfile
@@ -18,7 +18,7 @@ python-docx
 pydub
 dask
 distributed
-
+jiwer>=3.1.0,<4.0.0
 # toloka-kit  # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support
 # for some processors, additionally https://github.com/NVIDIA/NeMo is required
 # for some processors, additionally nemo_text_processing is required