NVIDIA · karpnv · Jul 22, 2025 · Jan 17, 2024 · Jan 22, 2024 · Mar 16, 2024
diff --git a/dataset_configs/portuguese/unlabeled/config.yaml b/dataset_configs/portuguese/unlabeled/config.yaml
@@ -0,0 +1,108 @@
+documentation: |
+  Unlabeled Data Processing Pipeline
+  ##################################
+
+  This pipeline processes unlabeled data for iterative pseudo-labeling training.
+
+  The pipeline performs the following steps:
+
+  1. Creates an initial manifest by searching for all WAV files in the `raw_data_dir` folder.
+  2. Counts the duration of each WAV file.
+  3. Identifies the language using the `langid_ambernet` NeMo model.
+  4. Filters out audios that are tagged with a different language.
+  5. Filters out audios that are too long to be processed.
+  6. Applies the VAD algorithm from the NeMo repository.
+  7. Forms segments by joining adjacent segments up to a duration threshold.
+  8. Splits long audios into shorter segments.
+  9. Removes empty files and extra fields from the manifest.
+
+  **Required inputs**:
+
+  - `workspace_dir`: Directory for intermediate files, containing the following subfolders:
+
+    - `${workspace_dir}/wavs/` - Folder with the long source WAV files.
+    - `${workspace_dir}/sdp/` - Folder to store manifests.
+    - `${workspace_dir}/sdp/vad/` - Folder to store temporary files from the VAD algorithm.
+    - `${workspace_dir}/splited_wavs/` - Folder to store the short files produced by splitting.
+
+  - `language_short`: Two-letter language code.
+  - `nemo_path`: Path to NeMo installation.
+  - `final_manifest`: Path to the final output manifest.
+
+# Top-level pipeline parameters. Values written as `???` have no default here and,
+# per the documentation above, must be supplied by the user.
+# "0:" presumably selects every processor starting from the first one — confirm
+# against the SDP runner.
+processors_to_run: "0:"
+workspace_dir: ???
+manifest_dir: ${workspace_dir}/sdp
+language_short: pt
+# NOTE(review): nemo_path is declared as required, but no processor in this
+# config references ${nemo_path}; the VAD command below uses relative script paths.
+nemo_path: ??? 
+final_manifest: ${manifest_dir}/final_manifest.json
+
+processors:
+  # Step 1: build the initial manifest from every .wav file found under ${workspace_dir}/wavs.
+  - _target_: sdp.processors.CreateInitialManifestByExt
+    raw_data_dir: ${workspace_dir}/wavs
+    extension: wav
+    output_file_key: audio_filepath
+    output_manifest_file: ${manifest_dir}/manifest0.json
+
+  # Step 2: store the duration of each audio file under the `duration` key.
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    output_manifest_file: ${manifest_dir}/manifest1.json
+
+  # Step 3: language identification with the `langid_ambernet` model; the result
+  # is written to `audio_lang`.
+  # NOTE(review): should_run is False, so this step appears to be disabled by
+  # default even though the documentation lists it — confirm this is intended.
+  - _target_: sdp.processors.AudioLid
+    output_manifest_file: ${manifest_dir}/manifest2.json
+    input_audio_key: audio_filepath
+    output_lang_key: audio_lang
+    should_run: False
+    device: cuda
+    pretrained_model: "langid_ambernet"
+    segment_duration: 20
+    num_segments: 3
+
+  # Step 4: language filter on `audio_lang` against ${language_short}.
+  # NOTE(review): disabled together with the AudioLid step above (should_run: False).
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${manifest_dir}/manifest3.json
+    input_value_key: audio_lang
+    should_run: False
+    target_value: ${language_short}
+
+  # Step 5: duration filter with operator `le` and threshold 20000.0
+  # (presumably seconds — confirm), dropping audios that are too long to process.
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${manifest_dir}/manifest4.json
+    input_value_key: duration
+    operator: le
+    target_value: 20000.0
+
+  # Remove the contents of the VAD folder left over from a previous run.
+  - _target_: sdp.processors.Subprocess
+    cmd: 'rm -rf ${manifest_dir}/vad/*'
+
+  # Step 6: run the VAD script. The Subprocess processor appends
+  # `manifest_filepath=<input_manifest_file>` and `output_dir=<output_manifest_file>`
+  # to `cmd`, so here output_manifest_file is the VAD output directory, not a file.
+  # NOTE(review): the script and config paths in `cmd` are relative, so the pipeline
+  # presumably has to be launched from the repository root — confirm.
+  - _target_: sdp.processors.Subprocess
+    input_manifest_file: ${manifest_dir}/manifest4.json
+    output_manifest_file: ${manifest_dir}/vad
+    input_manifest_arg: "manifest_filepath"
+    output_manifest_arg: "output_dir"
+    cmd: 'python sdp/processors/nemo/speech_to_text_with_vad.py audio_type=wav vad_model=vad_multilingual_frame_marblenet  vad_config=sdp/processors/nemo/frame_vad_infer_postprocess.yaml'
+
+  # Read the manifest written by the VAD script and move the path of the long
+  # source file to `source_filepath`; `audio_filepath` is reused for the split
+  # segments by SplitAudioFile below.
+  # NOTE(review): the input file name looks like it encodes the VAD postprocessing
+  # parameters; it presumably has to be updated if frame_vad_infer_postprocess.yaml
+  # changes — confirm.
+  - _target_: sdp.processors.RenameFields
+    input_manifest_file: ${manifest_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
+    output_manifest_file: ${manifest_dir}/manifest7.json
+    rename_fields: {"audio_filepath":"source_filepath"}
+
+  # Step 7: form segments from the RTTM file referenced by `rttm_file`, joining
+  # adjacent segments up to duration_threshold (20.0, presumably seconds).
+  - _target_: sdp.processors.nemo.rttm.GetRttmSegments
+    output_manifest_file: ${manifest_dir}/manifest8.json
+    rttm_key: rttm_file
+    output_file_key: audio_segments
+    duration_key: duration
+    duration_threshold: 20.0
+
+  # Step 8: split each long file (`source_filepath`) into shorter files stored in
+  # ${workspace_dir}/splited_wavs/; the new paths go to `audio_filepath`.
+  - _target_: sdp.processors.nemo.rttm.SplitAudioFile
+    output_manifest_file: ${manifest_dir}/manifest9.json
+    splited_audio_dir: ${workspace_dir}/splited_wavs/
+    segments_key: audio_segments
+    duration_key: duration
+    input_file_key: source_filepath
+    output_file_key: audio_filepath
+
+  # Step 9a: remove empty files — duration filter with operator `gt` and threshold 0.0.
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${manifest_dir}/manifest10.json
+    input_value_key: duration
+    operator: gt
+    target_value: 0.0
+
+  # Step 9b: write the final manifest with only `audio_filepath` and `duration`.
+  - _target_: sdp.processors.KeepOnlySpecifiedFields
+    output_manifest_file: ${final_manifest}
+    fields_to_keep: ["audio_filepath", "duration"]
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -408,8 +408,19 @@ HiFiTTS-2
    config-docs/english/hifitts2/config_44khz
    config-docs/english/hifitts2/config_bandwidth
 
+
+Unlabeled Portuguese Data
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/portuguese/unlabeled/config.yaml>`__ |
+:doc:`documentation <config-docs/portuguese/unlabeled/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/portuguese/unlabeled/config
+
 NemoRunIPL
 ~~~~~~~~~~
 
 **Supported configs**.
 
@@ -419,13 +430,13 @@ NemoRunIPL
 * **NeMoRun**:
   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/nemo_run_config.yaml>`__ |
   :doc:`documentation <config-docs/ipl/nemo_run_config>`
-
+  
 .. toctree::
    :hidden:
 
    config-docs/ipl/config
    config-docs/ipl/nemo_run_config
-
+   
 Earnings21/22
 ~~~~~~~~~~~~~
 
@@ -438,4 +449,4 @@ Earnings21/22
 .. toctree::
    :hidden:
 
-   config-docs/english/earnings/config
+   config-docs/english/earnings/config
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -32,9 +32,8 @@
     CreateInitialManifestFleurs,
 )
 from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2
-from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters
-from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
-    CreateInitialManifestUzbekvoice,
+from sdp.processors.datasets.hifitts2.remove_failed_chapters import (
+    RemovedFailedChapters,
 )
 from sdp.processors.datasets.ksc2.create_initial_manifest import (
     CreateInitialManifestKSC2,
@@ -44,13 +43,15 @@
     CreateInitialManifestLibrispeech,
 )
 from sdp.processors.datasets.masc import (
-    CreateInitialManifestMASC,
     AggregateSegments,
+    CreateInitialManifestMASC,
+    GetCaptionFileSegments,
     RegExpVttEntries,
-    GetCaptionFileSegments
 )
-from sdp.processors.datasets.mediaspeech.create_initial_manifest import CreateInitialManifestMediaSpeech
 from sdp.processors.datasets.mcv.create_initial_manifest import CreateInitialManifestMCV
+from sdp.processors.datasets.mediaspeech.create_initial_manifest import (
+    CreateInitialManifestMediaSpeech,
+)
 from sdp.processors.datasets.mls.create_initial_manifest import CreateInitialManifestMLS
 from sdp.processors.datasets.mls.restore_pc import RestorePCForMLS
 from sdp.processors.datasets.mtedx.create_initial_manifest import (
@@ -67,18 +68,20 @@
     CreateInitialManifestSLR140,
     CustomDataSplitSLR140,
 )
+from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
+    CreateInitialManifestUzbekvoice,
+)
 from sdp.processors.datasets.voxpopuli.create_initial_manifest import (
     CreateInitialManifestVoxpopuli,
 )
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
-from sdp.processors.datasets.ytc.create_initial_manifest import (
-    CreateInitialManifestYTC,
+from sdp.processors.datasets.ytc.create_initial_manifest import CreateInitialManifestYTC
+from sdp.processors.huggingface.create_initial_manifest import (
+    CreateInitialManifestHuggingFace,
 )
 from sdp.processors.huggingface.speech_recognition import ASRTransformers
-from sdp.processors.huggingface.create_initial_manifest import CreateInitialManifestHuggingFace
-
 from sdp.processors.modify_manifest.common import (
     AddConstantFields,
     ApplyInnerJoin,
@@ -89,7 +92,9 @@
     RenameFields,
     SortManifest,
     SplitOnFixedDuration,
+    Subprocess,
     DropSpecifiedFields,
+
 )
 from sdp.processors.modify_manifest.create_manifest import (
     CreateCombinedManifests,
@@ -104,8 +109,8 @@
     GetWER,
     InsIfASRInsertion,
     InverseNormalizeText,
-    NormalizeText,
     MakeSentence,
+    NormalizeText,
     ReadDocxLines,
     ReadTxtLines,
     SplitLineBySentence,
@@ -130,8 +135,8 @@
     DropLowWordMatchRate,
     DropNonAlphabet,
     DropOnAttribute,
-    PreserveByValue,
     DropRepeatedFields,
+    PreserveByValue,
 )
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,
@@ -148,6 +153,7 @@
 )
 from sdp.processors.nemo.asr_inference import ASRInference
 from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
+from sdp.processors.nemo.lid_inference import AudioLid
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool

diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py
@@ -14,12 +14,14 @@
 
 import json
 import os
+import subprocess
 from pathlib import Path
-from typing import Dict, List, Union, Optional
+from typing import Dict, List, Optional, Union
 
 import pandas as pd
 from tqdm import tqdm
 
+from sdp.logging import logger
 from sdp.processors.base_processor import (
     BaseParallelProcessor,
     BaseProcessor,
@@ -28,6 +30,71 @@
 )
 from sdp.utils.common import load_manifest
 
+
+class Subprocess(BaseProcessor):
+    """
+    Processor for handling subprocess execution with additional features for managing input and output manifests.
+
+    Args:
+        cmd (str): The command to be executed as a subprocess.
+        input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string.
+        output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string.
+        arg_separator (str, optional): The separator used between argument and value. Defaults to "=".
+        **kwargs: Additional keyword arguments to be passed to the base class.
+
+    Example:
+
+        _target_: sdp.processors.datasets.commoncrawl.Subprocess
+        output_manifest_file: /workspace/manifest.json
+        input_manifest_arg: "--manifest"
+        output_manifest_arg: "--output_filename"
+        arg_separator: "="
+        cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \
+            --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \
+            --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv"
+
+    """
+
+    def __init__(
+        self,
+        cmd: str,
+        input_manifest_arg: str = "",
+        output_manifest_arg: str = "",
+        arg_separator: str = "=",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_manifest_arg = input_manifest_arg
+        self.output_manifest_arg = output_manifest_arg
+        self.arg_separator = arg_separator
+        self.cmd = cmd
+
+    def process(self):
+        os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True)
+        if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1:
+            logger.error(
+                "input_manifest_file "
+                + self.input_manifest_file
+                + " and output_manifest_file "
+                + self.output_manifest_file
+                + " should be exluded from cmd line!"
+            )
+            raise ValueError
+        process_args = [x for x in self.cmd.split(" ") if x]
+        if self.arg_separator == " ":
+            if self.input_manifest_arg:
+                process_args.extend([self.input_manifest_arg, self.input_manifest_file])
+            if self.output_manifest_arg:
+                process_args.extend([self.output_manifest_arg, self.output_manifest_file])
+        else:
+            if self.input_manifest_arg:
+                process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file])
+            if self.output_manifest_arg:
+                process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file])
+        subprocess.run(" ".join(process_args), shell=True)
+
+
+
 class CombineSources(BaseParallelProcessor):
     """Can be used to create a single field from two alternative sources.
 
@@ -104,24 +171,24 @@ class AddConstantFields(BaseParallelProcessor):
     This processor adds constant fields to all manifest entries using Dask BaseParallelProcessor.
     It is useful when you want to attach fixed information (e.g., a language label or metadata)
     to each entry for downstream tasks such as language identification model training.
-    
+
     Args:
         fields (dict): A dictionary containing key-value pairs of fields to add to each manifest entry.
             For example::
-    
+
                 {
                     "label": "en",
                     "metadata": "mcv-11.0-2022-09-21"
                 }
-    
+
     Returns:
         dict: The same data as in the input manifest with the added constant fields as specified in
         the ``fields`` dictionary.
-    
+
     Example:
-    
+
         .. code-block:: yaml
-    
+
             - _target_: sdp.processors.modify_manifest.common.AddConstantFields
               input_manifest_file: ${workspace_dir}/input_manifest.json
               output_manifest_file: ${workspace_dir}/output_manifest.json
@@ -139,7 +206,6 @@ def process_dataset_entry(self, data_entry: Dict):
         return [DataEntry(data=data_entry)]
 
 
-
 class DuplicateFields(BaseParallelProcessor):
     """This processor duplicates fields in all manifest entries.
 
@@ -154,8 +220,8 @@ class DuplicateFields(BaseParallelProcessor):
 
     Returns:
         The same data as in the input manifest with duplicated fields
-        as specified in the ``duplicate_fields`` input dictionary. 
-    
+        as specified in the ``duplicate_fields`` input dictionary.
+
     Example:
         .. code-block:: yaml
 
@@ -165,6 +231,7 @@ class DuplicateFields(BaseParallelProcessor):
               duplicate_fields: {"text":"answer"}
 
     """
+
     def __init__(
         self,
         duplicate_fields: Dict,