diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml index d5b570a2..7b3fb240 100644 --- a/dataset_configs/english/coraal/config.yaml +++ b/dataset_configs/english/coraal/config.yaml @@ -93,8 +93,8 @@ processors: - {"pattern": '\baksing\b', "repl": "asking"} - {"pattern": '\baksed\b', "repl": "asked"} # removing unintelligible/redacted flags - - {"pattern": '/(?i)unintelligible/', "repl": ""} - - {"pattern": '/(?i)inaudible/', "repl": ""} + - {"pattern": '(?i)unintelligible/', "repl": ""} + - {"pattern": '(?i)inaudible/', "repl": ""} - {"pattern": '/RD(.*?)/', "repl": ""} - {"pattern": '/(\?)\1*/', "repl": ""} # removing non-linguistic markers diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml new file mode 100644 index 00000000..ade44a4e --- /dev/null +++ b/dataset_configs/multilingual/babel/config.yaml @@ -0,0 +1,53 @@ +documentation: | + IARPA Babel Dataset + ################### + + This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu. + + It creates an initial manifest for the specified data type and data split. + Further data processing steps should be performed based on the specific language. + + **Required arguments**. + + * **raw_data_dir**: specify path of the directory downloaded from LDC. + * **data_type**: should be "conversational" or "scripted". + * **resampled_audio_dir**: specify the directory path, where new processed audios should be located. + * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval". + * **output_manifest_file**: specify output manifest filepath. + + **Output format**. + + This config dumps the final manifest at ``${output_manifest_file}``. + The output manifest contains the following fields: + + * **outputFn (str)**: initial audio filename. + * **sessID (str)**: session ID of the recording. + * **date (str)**: date of the recording. 
+ * **time (str)**: time of the recording. + * **spkrCode (str)**: speaker ID. + * **lineType (str)**: type of the line (inline or outline). + * **dialect (str)**: dialect of the speaker. + * **gen (str)**: gender of the speaker. + * **envType (str)**: environment (i.e., home, office, etc.). + * **age (str)**: age of the speaker. + * **network (str)**: name of the telecommunications network. + * **phoneModel (str)**: model of the phone. + * **sampleCount (str)**: count of the sample. + * **sampleRate (str)**: original sample rate of the recording. + * **audio_filepath (str)**: path to the processed audio file. + * **duration (float)**: duration of the audio in seconds. + +processors_to_run: all +workspace_dir: ??? +data_type: scripted +resampled_audio_dir: ${workspace_dir}/processed/${data_type}/${data_split} +data_split: training +final_manifest: ${workspace_dir}/processed/${data_type}/${data_split}_manifest.json + +processors: + - _target_: sdp.processors.CreateInitialManifestBabel + raw_data_dir: ${workspace_dir} + data_type: ${data_type} + data_split: ${data_split} + resampled_audio_dir: ${resampled_audio_dir} + output_manifest_file: ${final_manifest} \ No newline at end of file diff --git a/dataset_configs/multilingual/voxpopuli/config_un.yaml b/dataset_configs/multilingual/voxpopuli/config_un.yaml new file mode 100644 index 00000000..4480ad6e --- /dev/null +++ b/dataset_configs/multilingual/voxpopuli/config_un.yaml @@ -0,0 +1,39 @@ +documentation: | + Voxpopuli unlabelled subset + ########################### + + This config can be used to prepare + `Voxpopuli dataset unlabelled subset `_ + dataset in the NeMo format. + + It creates initial manifest for the specified language. + + **Required arguments**. + + * **raw_data_dir**: specify the directory where the downloaded data will be/is saved. + * **language_id**: specify the language of the data you wish to be downloaded and/or processed. 
+ * **resampled_data_dir**: specify the directory path, where new processed audios should be located. + * **delete_raw_file**: specify if the initial raw audio files should be deleted or not. + + + **Output format**. + + This config dumps the final manifest at ``${resampled_data_dir}/${language_id}/manifest.json``. + The output manifest contains the following fields: + + * **audio_filepath (str)**: path to the processed audio file. + * **duration (float)**: duration of the audio in seconds. + +processors_to_run: all +workspace_dir: ??? +language_id: hu_v2 +resampled_data_dir: ${workspace_dir}/unlabelled_processed/ +final_manifest: ${workspace_dir}/unlabelled_processed/${language_id}/manifest.json + +processors: + - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled + raw_data_dir: ${workspace_dir} + language_id: ${language_id} + resampled_data_dir: ${resampled_data_dir} + delete_raw_file: False + output_manifest_file: ${final_manifest} diff --git a/dataset_configs/multilingual/yodas/config.yaml b/dataset_configs/multilingual/yodas/config.yaml new file mode 100644 index 00000000..16fffefc --- /dev/null +++ b/dataset_configs/multilingual/yodas/config.yaml @@ -0,0 +1,35 @@ +processors_to_run: all +manifest: ??? +resampled_audio_dir: ??? +out_manifest: ??? 
+char_rate: 10 +min_duration: 1.5 +max_duration: 40.1 + +processors: + - _target_: sdp.processors.GetAudioDuration + audio_filepath_key: audio_filepath + duration_key: duration + input_manifest_file: ${manifest} + + - _target_: sdp.processors.RandomSegment + min_duration: ${min_duration} + max_duration: ${max_duration} + resampled_audio_dir: ${resampled_audio_dir} + audio_format: flac + + - _target_: sdp.processors.ASRInference + pretrained_model: nvidia/parakeet-ctc-0.6b + + - _target_: sdp.processors.DropHighLowCharrate + low_charrate_threshold: ${char_rate} + text_key: pred_text + high_charrate_threshold: 10000 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: + - audio_filepath + - duration + + - _target_: sdp.processors.DropCorrupted + output_manifest_file: ${out_manifest} diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 6d85e83d..496b6154 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -40,6 +40,9 @@ VoxPopuli .. autodata:: sdp.processors.CreateInitialManifestVoxpopuli :annotation: +.. autodata:: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled + :annotation: + .. autodata:: sdp.processors.NormalizeFromNonPCTextVoxpopuli :annotation: @@ -58,8 +61,13 @@ Librispeech .. autodata:: sdp.processors.CreateInitialManifestLibrispeech :annotation: - +Babel +''''''''''' + +.. autodata:: sdp.processors.CreateInitialManifestBabel + :annotation: + SLR83 ''''' @@ -158,6 +166,18 @@ Data modifications .. autodata:: sdp.processors.InverseNormalizeText :annotation: +.. autodata:: sdp.processors.RandomSegment + :annotation: + +.. autodata:: sdp.processors.UntarAudios + :annotation: + +.. autodata:: sdp.processors.ExtractFilesFromTar + :annotation: + +.. autodata:: sdp.processors.RemoveEmojis + :annotation: + Data filtering '''''''''''''' @@ -237,6 +257,9 @@ Data filtering .. autodata:: sdp.processors.DropRepeatedFields :annotation: +.. 
autodata:: sdp.processors.DropCorrupted + :annotation: + Miscellaneous ############# diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 32c52ec1..271a938c 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -92,12 +92,16 @@ VoxPopuli * **Spanish**: `config `__ | :doc:`documentation ` +* **Multilingual**: + `config `__ | + :doc:`documentation ` .. toctree:: :hidden: config-docs/italian/voxpopuli/config config-docs/spanish_pc/voxpopuli/config + config-docs/multilingual/voxpopuli/config_un Fisher ~~~~~~ @@ -237,6 +241,22 @@ MTEDx config-docs/portuguese/mtedx/config +Babel +~~~~~~ + +**Dataset link:** https://www.ldc.upenn.edu + +**Supported configs**. + +* **Multilingual**: + `config `__ | + :doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/multilingual/babel/config + Kazakh Speech Dataset (SLR140) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pytest.ini b/pytest.ini index 2bed0f3a..ae4de828 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,5 @@ [pytest] -addopts = --doctest-modules \ No newline at end of file +addopts = --doctest-modules +markers = + dependency: mark a test as a dependent on the other mentioned test. + slow: marks tests as slow (deselect with '-m "not slow"'). \ No newline at end of file diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index e603c631..4f8696e1 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,3 +1,3 @@ -accelerate -transformers>=0.2.1 +accelerate==0.34.2 +transformers==4.39 huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793 diff --git a/requirements/main.txt b/requirements/main.txt index c39b2844..7cb1ee43 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -4,7 +4,7 @@ ffmpeg hydra-core joblib librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work -numpy +numpy==1.26.4 omegaconf pandas rarfile @@ -13,7 +13,7 @@ sox tqdm webvtt-py wget - +pydub # for some processers, additionally https://github.com/NVIDIA/NeMo is required # for some processers, additionally nemo_text_processing is required # for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index fdafb521..7f0c0d71 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -50,7 +50,9 @@ ) from sdp.processors.datasets.voxpopuli.create_initial_manifest import ( CreateInitialManifestVoxpopuli, + CreateInitialManifestVoxpopuliUnlabelled, ) +from sdp.processors.datasets.babel.create_initial_manifest import CreateInitialManifestBabel from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) @@ -80,6 +82,10 @@ SubIfASRSubstitution, SubMakeLowercase, SubRegex, + ExtractFilesFromTar, + RandomSegment, + UntarAudios, + RemoveEmojis ) from sdp.processors.modify_manifest.data_to_dropbool import ( DropASRError, @@ -97,6 +103,7 @@ DropOnAttribute, PreserveByValue, DropRepeatedFields, + DropCorrupted, ) from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( MakeLettersUppercaseAfterPeriod, diff --git a/sdp/processors/datasets/babel/create_initial_manifest.py b/sdp/processors/datasets/babel/create_initial_manifest.py new file mode 100644 index 00000000..4bf0eaa6 --- /dev/null +++ b/sdp/processors/datasets/babel/create_initial_manifest.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pathlib +from pathlib import Path + +from pydub import AudioSegment + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class CreateInitialManifestBabel(BaseParallelProcessor): + """Processor to create initial manifest for the Babel dataset. + + Dataset is available for 25 underserved languages on https://catalog.ldc.upenn.edu + + Segments the raw audio based on transcriptions files + (each segment contains an utterance from the transcription file for which start and end timestamps are procided) + and creates manifest for the resampled data. + + .. note:: + The dataset should be downloaded manually from LDC. + + Args: + raw_data_dir (str): the directory where the downloaded data is saved. + data_type (str): "conversational" or "scripted". + data_split (str): "training", "untranscribed-training", "sub-train", "dev" or "eval". + resampled_audio_dir (str): the directory where the resampled audio + files will be stored. + audio_format (str): format in which new audio files will be stored. + target_samplerate (int): sample rate (Hz) to use for resampling. + Defaults to 16000. + target_nchannels (int): number of channels to create during resampling process. + Defaults to 1. + + Returns: + This processor generates an initial manifest file with the following fields:: + + { + "outputFn": , + "sessID": , + "date": , + "time":