Skip to content

Commit c61366e

Browse files
ssh-meisterroot
andauthored
Add RemoveFiles and ExtractTar, reorganize audio converters (#139)
* Group file management processors Signed-off-by: Sasha Meister <[email protected]> * Changes addressing the reviewer’s comments Signed-off-by: root <[email protected]> * Fix docs build issue Signed-off-by: Sasha Meister <[email protected]> * Earnings21/22 added to docs Signed-off-by: Sasha Meister <[email protected]> * Fix doc header Signed-off-by: Sasha Meister <[email protected]> --------- Signed-off-by: Sasha Meister <[email protected]> Signed-off-by: root <[email protected]> Signed-off-by: Sasha Meister <[email protected]> Co-authored-by: root <[email protected]>
1 parent 64f9d13 commit c61366e

File tree

9 files changed

+481
-164
lines changed

9 files changed

+481
-164
lines changed

docs/src/sdp/api.rst

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -99,22 +99,22 @@ UzbekVoice
9999
Earnings21/22
100100
'''''''''''''
101101

102-
.. autodata:: sdp.processors.datasets.earnings21.CreateInitialAudioAndManifest
102+
.. autodata:: sdp.processors.datasets.earnings.CreateInitialAudioAndManifest
103103
:annotation:
104104

105-
.. autodata:: sdp.processors.datasets.earnings21.CreateFullAudioManifestEarnings21
105+
.. autodata:: sdp.processors.datasets.earnings.CreateFullAudioManifestEarnings21
106106
:annotation:
107107

108-
.. autodata:: sdp.processors.datasets.earnings21.SpeakerSegmentedManifest
108+
.. autodata:: sdp.processors.datasets.earnings.SpeakerSegmentedManifest
109109
:annotation:
110110

111-
.. autodata:: sdp.processors.datasets.earnings21.CreateSentenceSegmentedManifest
111+
.. autodata:: sdp.processors.datasets.earnings.CreateSentenceSegmentedManifest
112112
:annotation:
113113

114-
.. autodata:: sdp.processors.datasets.earnings21.NeMoForcedAligner
114+
.. autodata:: sdp.processors.datasets.earnings.NeMoForcedAligner
115115
:annotation:
116116

117-
.. autodata:: sdp.processors.datasets.earnings21.ApplyEarnings21Normalizations
117+
.. autodata:: sdp.processors.datasets.earnings.ApplyEarnings21Normalizations
118118
:annotation:
119119

120120

@@ -278,13 +278,25 @@ ASR-based processors
278278
Data modifications
279279
''''''''''''''''''
280280

281+
.. autodata:: sdp.processors.InsIfASRInsertion
282+
:annotation:
283+
284+
.. autodata:: sdp.processors.SubIfASRSubstitution
285+
:annotation:
286+
287+
Files management
288+
''''''''''''''''
289+
281290
.. autodata:: sdp.processors.SoxConvert
282291
:annotation:
283292

284-
.. autodata:: sdp.processors.InsIfASRInsertion
293+
.. autodata:: sdp.processors.FfmpegConvert
285294
:annotation:
286295

287-
.. autodata:: sdp.processors.SubIfASRSubstitution
296+
.. autodata:: sdp.processors.ExtractTar
297+
:annotation:
298+
299+
.. autodata:: sdp.processors.RemoveFiles
288300
:annotation:
289301

290302
Data filtering
@@ -379,9 +391,6 @@ Miscellaneous
379391
.. autodata:: sdp.processors.GetAudioDuration
380392
:annotation:
381393

382-
.. autodata:: sdp.processors.FfmpegConvert
383-
:annotation:
384-
385394
.. autodata:: sdp.processors.CreateInitialManifestByExt
386395
:annotation:
387396

docs/src/sdp/existing_configs.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,4 +424,18 @@ NemoRunIPL
424424
:hidden:
425425

426426
config-docs/ipl/config
427-
config-docs/ipl/nemo_run_config
427+
config-docs/ipl/nemo_run_config
428+
429+
Earnings21/22
430+
~~~~~~~~~~~~~
431+
432+
**Supported configs**.
433+
434+
* **English**:
435+
`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/earnings/config.yaml>`__ |
436+
:doc:`documentation <config-docs/english/earnings/config>`
437+
438+
.. toctree::
439+
:hidden:
440+
441+
config-docs/english/earnings/config

sdp/processors/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@
9999
CopyManifestData,
100100
CountNumWords,
101101
ExtractFromBrackets,
102-
FfmpegConvert,
103102
GetAudioDuration,
104103
GetWER,
105104
InsIfASRInsertion,
@@ -108,7 +107,6 @@
108107
MakeSentence,
109108
ReadDocxLines,
110109
ReadTxtLines,
111-
SoxConvert,
112110
SplitLineBySentence,
113111
SubIfASRSubstitution,
114112
SubMakeLowercase,
@@ -136,6 +134,16 @@
136134
from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
137135
MakeLettersUppercaseAfterPeriod,
138136
)
137+
from sdp.processors.manage_files.convert_audio import (
138+
FfmpegConvert,
139+
SoxConvert,
140+
)
141+
from sdp.processors.manage_files.extract import (
142+
ExtractTar,
143+
)
144+
from sdp.processors.manage_files.remove import (
145+
RemoveFiles,
146+
)
139147
from sdp.processors.nemo.asr_inference import ASRInference
140148
from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
141149
from sdp.processors.nemo.pc_inference import PCInference

sdp/processors/base_processor.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import time
2020
from abc import ABC, abstractmethod
2121
from dataclasses import dataclass
22-
from itertools import chain
2322
from typing import Any, Dict, List, Optional, Union
2423

2524
from tqdm import tqdm
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
from typing import Optional
17+
from sox import Transformer
18+
19+
from sdp.logging import logger
20+
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
21+
22+
from sdp.utils.common import ffmpeg_convert
23+
24+
25+
class FfmpegConvert(BaseParallelProcessor):
    """
    Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the converted audio.

    The output path is built from three parts:

    * the root directory ``converted_audio_dir``;
    * (only if ``base_dir`` is set) the directory of the input file relative to ``base_dir``;
    * the file stem: the value of ``id_key`` if it is set, otherwise the input file name without extension.

    So, without ``base_dir``:

    * if ``id_key`` is not None, the output file path will be ``<converted_audio_dir>/<id_key>.wav``;
    * if ``id_key`` is None, the output file path will be ``<converted_audio_dir>/<input file name without extension>.wav``.

    .. note:: ``id_key`` can be used to create subdirectories inside ``converted_audio_dir`` (by using forward slashes ``/``).
        e.g. if ``id_key`` takes the form ``dir_name1/dir_name2/filename``, the output file path will be

        ``<converted_audio_dir>/dir_name1/dir_name2/filename.wav``.

    Files that already exist at the output path are not converted again.

    Args:
        converted_audio_dir (str): The directory to store the converted audio files.
        input_file_key (str): The field in the dataset representing the path to the input video or audio files.
        output_file_key (str): The field in the dataset that will be set to the path of the converted audio file.
        id_key (str): (Optional) The field in the dataset representing the unique ID or identifier for each entry.
            If set, its value is used as the output file stem (and may contain ``/`` to create subdirectories).
            Defaults to None.
        output_format (str): (Optional) Format of the output audio files. Only ``wav`` is currently accepted.
            Defaults to ``wav``.
        base_dir (str): (Optional) If set, the directory of each input file relative to ``base_dir`` is
            reproduced under ``converted_audio_dir``. Defaults to None.
        target_samplerate (int): (Optional) The target sampling rate for the converted audio. Defaults to 16000.
        target_nchannels (int): (Optional) The target number of channels for the converted audio. Defaults to 1.
        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.

    """

    def __init__(
        self,
        converted_audio_dir: str,
        input_file_key: str,
        output_file_key: str,
        id_key: Optional[str] = None,
        output_format: str = "wav",
        base_dir: Optional[str] = None,
        target_samplerate: int = 16000,
        target_nchannels: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.converted_audio_dir = converted_audio_dir
        self.input_file_key = input_file_key
        self.output_file_key = output_file_key
        self.output_format = output_format
        self.id_key = id_key
        self.base_dir = base_dir
        self.target_samplerate = target_samplerate
        self.target_nchannels = target_nchannels

    def prepare(self):
        """Validate the requested output format and create the output root directory."""
        assert self.output_format == "wav", "Currently only wav format is supported"
        os.makedirs(self.converted_audio_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        """Convert one entry's input file and store the output path under ``output_file_key``.

        The entry dict is updated in place and returned wrapped in a single-element
        list of ``DataEntry``.
        """
        input_file = data_entry[self.input_file_key]

        if self.id_key:
            key = data_entry[self.id_key]
        else:
            # File name without its extension.
            key = os.path.splitext(input_file)[0].split("/")[-1]

        if self.base_dir:
            # Mirror the input's location relative to base_dir under the output root.
            relative_dir = os.path.dirname(os.path.relpath(input_file, self.base_dir))
            key = os.path.join(relative_dir, key)

        audio_file = os.path.join(self.converted_audio_dir, key) + "." + self.output_format

        # Create the directory of the *final* path. This also covers the case where
        # both base_dir and a slash-containing id_key contribute path components.
        output_dir = os.path.dirname(audio_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Skip files converted by a previous run.
        if not os.path.isfile(audio_file):
            ffmpeg_convert(input_file, audio_file, self.target_samplerate, self.target_nchannels)

        data_entry[self.output_file_key] = audio_file
        return [DataEntry(data=data_entry)]
95+
96+
97+
class SoxConvert(BaseParallelProcessor):
    """Processor for Sox to convert audio files to specified format.

    The converted file is written to ``<converted_audio_dir>/<input file name without extension>.<output_format>``.
    Only the base name of the input path is used, so inputs from different directories
    that share a file name map to the same output file. Files that already exist at the
    output path are not converted again.

    Args:
        converted_audio_dir (str): Path to the directory where the converted audio files will be stored.
        input_audio_file_key (str): Key in the manifest file that contains the path to the input audio file.
            Defaults to ``audio_filepath``.
        output_audio_file_key (str): Key in the manifest file that will be set to the path of the converted
            audio file. Defaults to ``audio_filepath``.
        output_format (str): Format (file extension) of the output audio file. Defaults to ``wav``.
        rate (int): Sample rate of the output audio file. Defaults to 16000.
        channels (int): Number of channels of the output audio file. Defaults to 1.
        workspace_dir (str, Optional): If set, it is joined with the input path from the manifest
            (``os.path.join``) to locate the input file; an absolute input path is therefore used as is.
            Defaults to None.
        **kwargs: Additional keyword arguments (e.g. ``output_manifest_file``) passed to the base class
            `BaseParallelProcessor`.
    """

    def __init__(
        self,
        converted_audio_dir: str,
        input_audio_file_key: str = "audio_filepath",
        output_audio_file_key: str = "audio_filepath",
        output_format: str = "wav",
        rate: int = 16000,
        channels: int = 1,
        workspace_dir: Optional[str] = None,
        **kwargs,
    ):
        # ``workspace_dir`` is a named parameter, so Python binds it here and it can
        # never appear in ``kwargs``; nothing needs to be stripped before calling the base class.
        super().__init__(**kwargs)
        self.input_audio_file_key = input_audio_file_key
        self.output_audio_file_key = output_audio_file_key
        self.converted_audio_dir = converted_audio_dir
        self.output_format = output_format
        self.workspace_dir = workspace_dir
        self.rate = rate
        self.channels = channels

    def prepare(self):
        """Log the configured workspace directory and create the output directory."""
        logger.info(f"SoxConvert workspace_dir: {self.workspace_dir}")
        os.makedirs(self.converted_audio_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        """Convert one entry's audio file and store the output path under ``output_audio_file_key``.

        The entry dict is updated in place and returned wrapped in a single-element
        list of ``DataEntry``.
        """
        audio_path = data_entry[self.input_audio_file_key]

        # Resolve the manifest path against the workspace directory when one is configured.
        if self.workspace_dir is not None:
            full_audio_path = os.path.join(self.workspace_dir, audio_path)
        else:
            full_audio_path = audio_path

        # Log path resolution details once per processor instance to help diagnose bad paths.
        if not getattr(self, "_debug_printed", False):
            logger.info(f"First audio_path from manifest: {audio_path}")
            logger.info(f"First full_audio_path: {full_audio_path}")
            logger.info(f"Path exists: {os.path.exists(full_audio_path)}")
            self._debug_printed = True

        # Output name is the input's base name without extension.
        key = os.path.splitext(audio_path)[0].split("/")[-1]
        converted_file = os.path.join(self.converted_audio_dir, key) + f".{self.output_format}"

        # Skip files converted by a previous run.
        if not os.path.isfile(converted_file):
            transformer = Transformer()
            transformer.rate(self.rate)
            transformer.channels(self.channels)
            transformer.build(full_audio_path, converted_file)

        data_entry[self.output_audio_file_key] = converted_file
        return [DataEntry(data=data_entry)]

0 commit comments

Comments
 (0)