Add RemoveFiles and ExtractTar, reorganize audio converters (#139)

ssh-meister · root · web-flow · commit c61366e797bb · 2025-07-21T15:33:19.000+04:00
* Group file management processors

Signed-off-by: Sasha Meister <117230141+ssh-meister@users.noreply.github.com>

* Changes addressing the reviewer’s comments

Signed-off-by: root <root@cpu-00167.cm.cluster>

* Fix docs build issue

Signed-off-by: Sasha Meister <ameister@nvidia.com>

* Earnings21/22 added to docs

Signed-off-by: Sasha Meister <ameister@nvidia.com>

* Fix doc header

Signed-off-by: Sasha Meister <ameister@nvidia.com>

---------

Signed-off-by: Sasha Meister <117230141+ssh-meister@users.noreply.github.com>
Signed-off-by: root <root@cpu-00167.cm.cluster>
Signed-off-by: Sasha Meister <ameister@nvidia.com>
Co-authored-by: root <root@cpu-00167.cm.cluster>
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
@@ -99,22 +99,22 @@ UzbekVoice
 Earnings21/22
 '''''''''''''
 
-.. autodata:: sdp.processors.datasets.earnings21.CreateInitialAudioAndManifest
+.. autodata:: sdp.processors.datasets.earnings.CreateInitialAudioAndManifest
    :annotation:
 
-.. autodata:: sdp.processors.datasets.earnings21.CreateFullAudioManifestEarnings21
+.. autodata:: sdp.processors.datasets.earnings.CreateFullAudioManifestEarnings21
    :annotation:
 
-.. autodata:: sdp.processors.datasets.earnings21.SpeakerSegmentedManifest
+.. autodata:: sdp.processors.datasets.earnings.SpeakerSegmentedManifest
    :annotation:
 
-.. autodata:: sdp.processors.datasets.earnings21.CreateSentenceSegmentedManifest
+.. autodata:: sdp.processors.datasets.earnings.CreateSentenceSegmentedManifest
    :annotation:
 
-.. autodata:: sdp.processors.datasets.earnings21.NeMoForcedAligner
+.. autodata:: sdp.processors.datasets.earnings.NeMoForcedAligner
    :annotation:
 
-.. autodata:: sdp.processors.datasets.earnings21.ApplyEarnings21Normalizations
+.. autodata:: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
    :annotation:
 
 
@@ -278,13 +278,25 @@ ASR-based processors
 Data modifications
 ''''''''''''''''''
 
+.. autodata:: sdp.processors.InsIfASRInsertion
+   :annotation:
+
+.. autodata:: sdp.processors.SubIfASRSubstitution
+   :annotation:
+
+File management
+'''''''''''''''
+
 .. autodata:: sdp.processors.SoxConvert
    :annotation:
 
-.. autodata:: sdp.processors.InsIfASRInsertion
+.. autodata:: sdp.processors.FfmpegConvert
    :annotation:
 
-.. autodata:: sdp.processors.SubIfASRSubstitution
+.. autodata:: sdp.processors.ExtractTar
+   :annotation:
+
+.. autodata:: sdp.processors.RemoveFiles
    :annotation:
 
 Data filtering
@@ -379,9 +391,6 @@ Miscellaneous
 .. autodata:: sdp.processors.GetAudioDuration
    :annotation:
 
-.. autodata:: sdp.processors.FfmpegConvert
-   :annotation:
-
 .. autodata:: sdp.processors.CreateInitialManifestByExt
    :annotation:
 
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -424,4 +424,18 @@ NemoRunIPL
    :hidden:
 
    config-docs/ipl/config
-   config-docs/ipl/nemo_run_config
+   config-docs/ipl/nemo_run_config
+
+Earnings21/22
+~~~~~~~~~~~~~
+
+**Supported configs**.
+
+* **English**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/earnings/config.yaml>`__ |
+  :doc:`documentation <config-docs/english/earnings/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/english/earnings/config
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -99,7 +99,6 @@
     CopyManifestData,
     CountNumWords,
     ExtractFromBrackets,
-    FfmpegConvert,
     GetAudioDuration,
     GetWER,
     InsIfASRInsertion,
@@ -108,7 +107,6 @@
     MakeSentence,
     ReadDocxLines,
     ReadTxtLines,
-    SoxConvert,
     SplitLineBySentence,
     SubIfASRSubstitution,
     SubMakeLowercase,
@@ -136,6 +134,16 @@
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,
 )
+from sdp.processors.manage_files.convert_audio import (
+    FfmpegConvert,
+    SoxConvert,
+)
+from sdp.processors.manage_files.extract import (
+    ExtractTar,
+)
+from sdp.processors.manage_files.remove import (
+    RemoveFiles,
+)
 from sdp.processors.nemo.asr_inference import ASRInference
 from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
 from sdp.processors.nemo.pc_inference import PCInference
diff --git a/sdp/processors/base_processor.py b/sdp/processors/base_processor.py
@@ -19,7 +19,6 @@
 import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from itertools import chain
 from typing import Any, Dict, List, Optional, Union
 
 from tqdm import tqdm
diff --git a/sdp/processors/manage_files/__init__.py b/sdp/processors/manage_files/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/sdp/processors/manage_files/convert_audio.py b/sdp/processors/manage_files/convert_audio.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Optional
+from sox import Transformer
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+from sdp.utils.common import ffmpeg_convert
+
+
+class FfmpegConvert(BaseParallelProcessor):
+    """
+    Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the converted audio.
+    If ``id_key`` is not None, the output file path will be ``<converted_audio_dir>/<value of id_key>.wav``.
+    If ``id_key`` is None, the output file path will be ``<converted_audio_dir>/<input file name without extension>.wav``.
+    An existing output file is reused instead of being converted again.
+
+    .. note:: ``id_key`` can be used to create subdirectories inside ``converted_audio_dir`` (by using forward slashes ``/``).
+        e.g. if ``id_key`` takes the form ``dir_name1/dir_name2/filename``, the output file path will be
+
+        ``<converted_audio_dir>/dir_name1/dir_name2/filename.wav``.
+
+    Args:
+        converted_audio_dir (str): The directory to store the converted audio files.
+        input_file_key (str): The field in the dataset representing the path to the input video or audio files.
+        output_file_key (str): The field in the dataset that receives the path to the converted audio file.
+        id_key (str): (Optional) The field in the dataset representing the unique ID or identifier for each entry; its value is used as the output file name. Defaults to None.
+        output_format (str): (Optional) Format of the output audio files. Only ``wav`` is currently supported. Defaults to ``wav``.
+        base_dir (str): (Optional) If set, the input file's directory relative to ``base_dir`` is mirrored inside ``converted_audio_dir``. Defaults to None.
+        target_samplerate (int): (Optional) The target sampling rate for the converted audio. Defaults to 16000.
+        target_nchannels (int): (Optional) The target number of channels for the converted audio. Defaults to 1.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Raises:
+        ValueError: Raised by ``prepare`` if ``output_format`` is not ``wav``.
+    """
+
+    def __init__(
+        self,
+        converted_audio_dir: str,
+        input_file_key: str,
+        output_file_key: str,
+        id_key: Optional[str] = None,
+        output_format: str = "wav",
+        base_dir: Optional[str] = None,
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.converted_audio_dir = converted_audio_dir
+        self.input_file_key = input_file_key
+        self.output_file_key = output_file_key
+        self.output_format = output_format
+        self.id_key = id_key
+        self.base_dir = base_dir
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+
+    def prepare(self):
+        """Validate the output format and create the output directory."""
+        if self.output_format != "wav":  # explicit raise: asserts are stripped under ``python -O``
+            raise ValueError("Currently only wav format is supported")
+        os.makedirs(self.converted_audio_dir, exist_ok=True)
+
+    def process_dataset_entry(self, data_entry):
+        """Convert the entry's input file and store the output path under ``output_file_key``."""
+        input_file = data_entry[self.input_file_key]
+        # Output name: the entry's ID when ``id_key`` is set, else the input file name without extension.
+        key = data_entry[self.id_key] if self.id_key else os.path.splitext(os.path.basename(input_file))[0]
+        if self.base_dir:
+            # Mirror the input's directory layout (relative to ``base_dir``) under the output directory.
+            key = os.path.join(os.path.dirname(os.path.relpath(input_file, self.base_dir)), key)
+        audio_file = os.path.join(self.converted_audio_dir, key) + "." + self.output_format
+        # Create the final parent directory so a nested ``id_key`` also works together with ``base_dir``.
+        os.makedirs(os.path.dirname(audio_file), exist_ok=True)
+        if not os.path.isfile(audio_file):
+            ffmpeg_convert(input_file, audio_file, self.target_samplerate, self.target_nchannels)
+        data_entry[self.output_file_key] = audio_file
+        return [DataEntry(data=data_entry)]
+
+
+class SoxConvert(BaseParallelProcessor):
+    """Processor for Sox to convert audio files to specified format.
+
+    The converted file is stored as ``<converted_audio_dir>/<input file name without extension>.<output_format>``.
+    An existing converted file is reused instead of being converted again.
+
+    Args:
+        converted_audio_dir (str): Path to the directory where the converted audio files will be stored.
+        input_audio_file_key (str): Key in the manifest file that contains the path to the input audio file. Defaults to ``audio_filepath``.
+        output_audio_file_key (str): Key in the manifest file that receives the path to the output audio file. Defaults to ``audio_filepath``.
+        output_format (str): Format (file extension) of the output audio file. Defaults to ``wav``.
+        rate (int): Sample rate of the output audio file. Defaults to 16000.
+        channels (int): Number of channels of the output audio file. Defaults to 1.
+        workspace_dir (str, Optional): If set, input audio paths are joined onto this directory. Defaults to None.
+        **kwargs: Additional keyword arguments (e.g. ``output_manifest_file``) passed to the base class `BaseParallelProcessor`.
+    """
+
+    def __init__(
+        self,
+        converted_audio_dir: str,
+        input_audio_file_key: str = "audio_filepath",
+        output_audio_file_key: str = "audio_filepath",
+        output_format: str = "wav",
+        rate: int = 16000,
+        channels: int = 1,
+        workspace_dir: Optional[str] = None,
+        **kwargs,
+    ):
+        # ``workspace_dir`` is an explicit parameter, so Python never puts it in ``kwargs`` and it is
+        # not forwarded to the base class; no manual ``kwargs.pop`` is needed.
+        super().__init__(**kwargs)
+        self.input_audio_file_key = input_audio_file_key
+        self.output_audio_file_key = output_audio_file_key
+        self.converted_audio_dir = converted_audio_dir
+        self.output_format = output_format
+        self.workspace_dir = workspace_dir
+
+        # Target sample rate and channel count applied to every converted file.
+        self.rate = rate
+        self.channels = channels
+
+    def prepare(self):
+        """Log the configured workspace directory and create the output directory."""
+        logger.info(f"SoxConvert workspace_dir: {self.workspace_dir}")
+        os.makedirs(self.converted_audio_dir, exist_ok=True)
+
+    def process_dataset_entry(self, data_entry):
+        """Convert the entry's audio file and store the output path under ``output_audio_file_key``."""
+        audio_path = data_entry[self.input_audio_file_key]
+
+        # Resolve the manifest path against ``workspace_dir`` when one is configured.
+        if self.workspace_dir is not None:
+            full_audio_path = os.path.join(self.workspace_dir, audio_path)
+        else:
+            full_audio_path = audio_path
+
+        # Log the first resolved path once per processor instance to help diagnose path problems.
+        if not hasattr(self, '_debug_printed'):
+            logger.info(f"First audio_path from manifest: {audio_path}")
+            logger.info(f"First full_audio_path: {full_audio_path}")
+            logger.info(f"Path exists: {os.path.exists(full_audio_path)}")
+            self._debug_printed = True
+
+        key = os.path.splitext(os.path.basename(audio_path))[0]
+        converted_file = os.path.join(self.converted_audio_dir, key) + f".{self.output_format}"
+
+        if not os.path.isfile(converted_file):
+            transformer = Transformer()
+            transformer.rate(self.rate)
+            transformer.channels(self.channels)
+            transformer.build(full_audio_path, converted_file)
+
+        data_entry[self.output_audio_file_key] = converted_file
+        return [DataEntry(data=data_entry)]
diff --git a/sdp/processors/manage_files/extract.py b/sdp/processors/manage_files/extract.py
diff --git a/sdp/processors/manage_files/remove.py b/sdp/processors/manage_files/remove.py
diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py