diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml
index d5b570a2..7b3fb240 100644
--- a/dataset_configs/english/coraal/config.yaml
+++ b/dataset_configs/english/coraal/config.yaml
@@ -93,8 +93,8 @@ processors:
- {"pattern": '\baksing\b', "repl": "asking"}
- {"pattern": '\baksed\b', "repl": "asked"}
# removing unintelligible/redacted flags
- - {"pattern": '/(?i)unintelligible/', "repl": ""}
- - {"pattern": '/(?i)inaudible/', "repl": ""}
+ - {"pattern": '(?i)/unintelligible/', "repl": ""}  # (?i) must lead the regex; both slashes of the flag are removed
+ - {"pattern": '(?i)/inaudible/', "repl": ""}  # same: strip the whole /inaudible/ marker, any case
- {"pattern": '/RD(.*?)/', "repl": ""}
- {"pattern": '/(\?)\1*/', "repl": ""}
# removing non-linguistic markers
diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml
new file mode 100644
index 00000000..ade44a4e
--- /dev/null
+++ b/dataset_configs/multilingual/babel/config.yaml
@@ -0,0 +1,53 @@
+documentation: |
+ IARPA Babel Dataset
+ ###################
+
+ This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu.
+
+ It creates an initial manifest for the specified data type and data split.
+ Further data processing steps should be performed based on the specific language.
+
+ **Required arguments**.
+
+ * **raw_data_dir**: specify path of the directory downloaded from LDC.
+ * **data_type**: should be "conversational" or "scripted".
+ * **resampled_audio_dir**: specify the directory path, where new processed audios should be located.
+ * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval".
+ * **output_manifest_file**: specify output manifest filepath.
+
+ **Output format**.
+
+ This config dumps the final manifest at ``${final_manifest}``.
+ The output manifest contains the following fields:
+
+ * **outputFn (str)**: initial audio filename.
+ * **sessID (str)**: session ID of the recording.
+ * **date (str)**: date of the recording.
+ * **time (str)**: time of the recording.
+ * **spkrCode (str)**: speaker ID.
+ * **lineType (str)**: type of the line (inline or outline).
+ * **dialect (str)**: dialect of the speaker.
+ * **gen (str)**: gender of the speaker.
+ * **envType (str)**: environment (i.e., home, office, etc.).
+ * **age (str)**: age of the speaker.
+ * **network (str)**: name of the telecommunications network.
+ * **phoneModel (str)**: model of the phone.
+ * **sampleCount (str)**: count of the sample.
+ * **sampleRate (str)**: original sample rate of the recording.
+ * **audio_filepath (str)**: path to the processed audio file.
+ * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???  # mandatory value (OmegaConf `???`); also passed below as raw_data_dir
+data_type: scripted  # documented values: "conversational" or "scripted"
+resampled_audio_dir: ${workspace_dir}/processed/${data_type}/${data_split}
+data_split: training  # documented values: "training", "untranscribed-training", "sub-train", "dev", "eval"
+final_manifest: ${workspace_dir}/processed/${data_type}/${data_split}_manifest.json
+
+processors:
+ - _target_: sdp.processors.CreateInitialManifestBabel
+ raw_data_dir: ${workspace_dir}  # the workspace doubles as the raw data location
+ data_type: ${data_type}
+ data_split: ${data_split}
+ resampled_audio_dir: ${resampled_audio_dir}
+ output_manifest_file: ${final_manifest}  # the only manifest this config writes
\ No newline at end of file
diff --git a/dataset_configs/multilingual/voxpopuli/config_un.yaml b/dataset_configs/multilingual/voxpopuli/config_un.yaml
new file mode 100644
index 00000000..4480ad6e
--- /dev/null
+++ b/dataset_configs/multilingual/voxpopuli/config_un.yaml
@@ -0,0 +1,39 @@
+documentation: |
+ Voxpopuli unlabelled subset
+ ###########################
+
+ This config can be used to prepare
+ `Voxpopuli dataset unlabelled subset <https://github.com/facebookresearch/voxpopuli>`_
+ dataset in the NeMo format.
+
+ It creates initial manifest for the specified language.
+
+ **Required arguments**.
+
+ * **raw_data_dir**: specify the directory where the downloaded data will be/is saved.
+ * **language_id**: specify the language of the data you wish to be downloaded and/or processed.
+ * **resampled_data_dir**: specify the directory path, where new processed audios should be located.
+ * **delete_raw_file**: specify if the initial raw audio files should be deleted or not.
+
+
+ **Output format**.
+
+ This config dumps the final manifest at ``${resampled_data_dir}/${language_id}/manifest.json``.
+ The output manifest contains the following fields:
+
+ * **audio_filepath (str)**: path to the processed audio file.
+ * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???  # mandatory value (OmegaConf `???`); also passed below as raw_data_dir
+language_id: hu_v2
+resampled_data_dir: ${workspace_dir}/unlabelled_processed/
+final_manifest: ${workspace_dir}/unlabelled_processed/${language_id}/manifest.json
+
+processors:
+ - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+ raw_data_dir: ${workspace_dir}  # the workspace doubles as the raw data location
+ language_id: ${language_id}
+ resampled_data_dir: ${resampled_data_dir}
+ delete_raw_file: false  # canonical lowercase YAML boolean; same value as before
+ output_manifest_file: ${final_manifest}
diff --git a/dataset_configs/multilingual/yodas/config.yaml b/dataset_configs/multilingual/yodas/config.yaml
new file mode 100644
index 00000000..16fffefc
--- /dev/null
+++ b/dataset_configs/multilingual/yodas/config.yaml
@@ -0,0 +1,35 @@
+processors_to_run: all
+manifest: ???  # mandatory value (OmegaConf `???`); input manifest of GetAudioDuration
+resampled_audio_dir: ???  # mandatory; forwarded to RandomSegment
+out_manifest: ???  # mandatory; output manifest of the final DropCorrupted step
+char_rate: 10  # used as low_charrate_threshold in DropHighLowCharrate
+min_duration: 1.5  # forwarded to RandomSegment
+max_duration: 40.1  # forwarded to RandomSegment
+
+processors:
+ - _target_: sdp.processors.GetAudioDuration
+ audio_filepath_key: audio_filepath
+ duration_key: duration
+ input_manifest_file: ${manifest}
+
+ - _target_: sdp.processors.RandomSegment
+ min_duration: ${min_duration}
+ max_duration: ${max_duration}
+ resampled_audio_dir: ${resampled_audio_dir}
+ audio_format: flac
+
+ - _target_: sdp.processors.ASRInference
+ pretrained_model: nvidia/parakeet-ctc-0.6b
+
+ - _target_: sdp.processors.DropHighLowCharrate
+ low_charrate_threshold: ${char_rate}
+ text_key: pred_text  # NOTE(review): presumably the field written by ASRInference - confirm
+ high_charrate_threshold: 10000
+
+ - _target_: sdp.processors.KeepOnlySpecifiedFields
+ fields_to_keep:
+ - audio_filepath
+ - duration
+
+ - _target_: sdp.processors.DropCorrupted
+ output_manifest_file: ${out_manifest}
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 6d85e83d..496b6154 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -40,6 +40,9 @@ VoxPopuli
.. autodata:: sdp.processors.CreateInitialManifestVoxpopuli
:annotation:
+.. autodata:: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+ :annotation:
+
.. autodata:: sdp.processors.NormalizeFromNonPCTextVoxpopuli
:annotation:
@@ -58,8 +61,13 @@ Librispeech
.. autodata:: sdp.processors.CreateInitialManifestLibrispeech
:annotation:
-
+Babel
+'''''''''''
+
+.. autodata:: sdp.processors.CreateInitialManifestBabel
+ :annotation:
+
SLR83
'''''
@@ -158,6 +166,18 @@ Data modifications
.. autodata:: sdp.processors.InverseNormalizeText
:annotation:
+.. autodata:: sdp.processors.RandomSegment
+ :annotation:
+
+.. autodata:: sdp.processors.UntarAudios
+ :annotation:
+
+.. autodata:: sdp.processors.ExtractFilesFromTar
+ :annotation:
+
+.. autodata:: sdp.processors.RemoveEmojis
+ :annotation:
+
Data filtering
''''''''''''''
@@ -237,6 +257,9 @@ Data filtering
.. autodata:: sdp.processors.DropRepeatedFields
:annotation:
+.. autodata:: sdp.processors.DropCorrupted
+ :annotation:
+
Miscellaneous
#############
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 32c52ec1..271a938c 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -92,12 +92,16 @@ VoxPopuli
* **Spanish**:
`config `__ |
:doc:`documentation `
+* **Multilingual**:
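+ `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/voxpopuli/config_un.yaml>`__ |
+ :doc:`documentation <config-docs/multilingual/voxpopuli/config_un>`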
.. toctree::
:hidden:
config-docs/italian/voxpopuli/config
config-docs/spanish_pc/voxpopuli/config
+ config-docs/multilingual/voxpopuli/config_un
Fisher
~~~~~~
@@ -237,6 +241,22 @@ MTEDx
config-docs/portuguese/mtedx/config
+Babel
+~~~~~~
+
+**Dataset link:** https://www.ldc.upenn.edu
+
+**Supported configs**.
+
+* **Multilingual**:
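+ `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/babel/config.yaml>`__ |
+ :doc:`documentation <config-docs/multilingual/babel/config>`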
+
+.. toctree::
+ :hidden:
+
+ config-docs/multilingual/babel/config
+
Kazakh Speech Dataset (SLR140)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pytest.ini b/pytest.ini
index 2bed0f3a..ae4de828 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,5 @@
[pytest]
-addopts = --doctest-modules
\ No newline at end of file
+addopts = --doctest-modules
+markers =
+ dependency: mark a test as dependent on another named test.
+ slow: marks tests as slow (deselect with '-m "not slow"').
\ No newline at end of file
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
index e603c631..4f8696e1 100644
--- a/requirements/huggingface.txt
+++ b/requirements/huggingface.txt
@@ -1,3 +1,3 @@
-accelerate
-transformers>=0.2.1
+accelerate==0.34.2
+transformers==4.39
huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793
diff --git a/requirements/main.txt b/requirements/main.txt
index c39b2844..7cb1ee43 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -4,7 +4,7 @@ ffmpeg
hydra-core
joblib
librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
-numpy
+numpy==1.26.4
omegaconf
pandas
rarfile
@@ -13,7 +13,7 @@ sox
tqdm
webvtt-py
wget
-
+pydub
# for some processers, additionally https://github.com/NVIDIA/NeMo is required
# for some processers, additionally nemo_text_processing is required
# for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index fdafb521..7f0c0d71 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -50,7 +50,9 @@
)
from sdp.processors.datasets.voxpopuli.create_initial_manifest import (
CreateInitialManifestVoxpopuli,
+ CreateInitialManifestVoxpopuliUnlabelled,
)
+from sdp.processors.datasets.babel.create_initial_manifest import CreateInitialManifestBabel
from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
NormalizeFromNonPCTextVoxpopuli,
)
@@ -80,6 +82,10 @@
SubIfASRSubstitution,
SubMakeLowercase,
SubRegex,
+ ExtractFilesFromTar,
+ RandomSegment,
+ UntarAudios,
+ RemoveEmojis
)
from sdp.processors.modify_manifest.data_to_dropbool import (
DropASRError,
@@ -97,6 +103,7 @@
DropOnAttribute,
PreserveByValue,
DropRepeatedFields,
+ DropCorrupted,
)
from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
MakeLettersUppercaseAfterPeriod,
diff --git a/sdp/processors/datasets/babel/create_initial_manifest.py b/sdp/processors/datasets/babel/create_initial_manifest.py
new file mode 100644
index 00000000..4bf0eaa6
--- /dev/null
+++ b/sdp/processors/datasets/babel/create_initial_manifest.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+from pathlib import Path
+
+from pydub import AudioSegment
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class CreateInitialManifestBabel(BaseParallelProcessor):
+ """Processor to create initial manifest for the Babel dataset.
+
+ Dataset is available for 25 underserved languages on https://catalog.ldc.upenn.edu
+
+ Segments the raw audio based on transcription files
+ (each segment contains an utterance from the transcription file for which start and end timestamps are provided)
+ and creates manifest for the resampled data.
+
+ .. note::
+ The dataset should be downloaded manually from LDC.
+
+ Args:
+ raw_data_dir (str): the directory where the downloaded data is saved.
+ data_type (str): "conversational" or "scripted".
+ data_split (str): "training", "untranscribed-training", "sub-train", "dev" or "eval".
+ resampled_audio_dir (str): the directory where the resampled audio
+ files will be stored.
+ audio_format (str): format in which new audio files will be stored.
+ target_samplerate (int): sample rate (Hz) to use for resampling.
+ Defaults to 16000.
+ target_nchannels (int): number of channels to create during resampling process.
+ Defaults to 1.
+
+ Returns:
+ This processor generates an initial manifest file with the following fields::
+
+ {
+ "outputFn": ,
+ "sessID": ,
+ "date": ,
+ "time":