diff --git a/dataset_configs/armenian/youtube_audio_tmp/config.yaml b/dataset_configs/armenian/youtube_audio_tmp/config.yaml new file mode 100644 index 00000000..47b1e496 --- /dev/null +++ b/dataset_configs/armenian/youtube_audio_tmp/config.yaml @@ -0,0 +1,71 @@ +processors_to_run: "0:" +workspace_dir: /workspace/nemo_capstone +final_manifest: ${workspace_dir}/final_manifest.json + +processors: + - _target_: sdp.processors.CreateInitialManifestArmData + raw_data_dir: /workspace/nemo_capstone + output_field: audio_filepath + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.GetYoutubeAudio + links_filepath_field: audio_filepath + output_audio_path: ${workspace_dir}/audio_files/ + output_manifest_file: ${workspace_dir}/manifest1.json + + - _target_: sdp.processors.AudioLid + output_manifest_file: ${workspace_dir}/manifest2.json + input_audio_field: audio_filepath + output_lang_field: audio_lang + device: gpu + pretrained_model: "langid_ambernet" + segment_duration: 20 + num_segments: 3 + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest3.json + input_field: audio_lang + target_value: hy + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest4.json + input_field: duration + operator: le + target_value: 20000.0 + + - _target_: sdp.processors.Subprocess + output_manifest_file: ${workspace_dir}/vad + input_manifest_arg: "manifest_filepath" + output_manifest_arg: "output_dir" + cmd: "python /workspace/nemo/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav \ + vad_model=vad_multilingual_frame_marblenet vad_config=/workspace/nemo/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml" + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json + output_manifest_file: 
${workspace_dir}/manifest6.json + rename_fields: {"audio_filepath":"source_filepath"} + + - _target_: sdp.processors.nemo.rttm.GetRttmSegments + output_manifest_file: ${workspace_dir}/manifest7.json + rttm_field: rttm_file + output_file_field: audio_segments + duration_threshold: 20.0 + duration_field: duration + + - _target_: sdp.processors.nemo.rttm.SplitFile + output_manifest_file: ${workspace_dir}/manifest8.json + splited_audio_dir: ${workspace_dir}/armenian/yt/splited_wavs/ + segments_field: audio_segments + input_file_field: source_filepath + output_file_field: audio_filepath + duration_field: duration + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest9.json + input_field: duration + operator: gt + target_value: 0.0 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest10.json + fields_to_keep: ["audio_filepath", "duration"] diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 00000000..a6d183ea --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,22 @@ +version: '3.9' +services: + frontend: + image: nvcr.io/nvidia/nemo:24.03 + build: . 
+ volumes: + - type: tmpfs + target: /dev/shm + tmpfs: + size: 9000000000 # ~9gb + - .:/workspace/nemo_capstone + - /raid/asds/:/workspace/nemo_capstone/data + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] + count: 'all' # or 'all' to use all GPUs + entrypoint: tail -f + environment: + NVIDIA_VISIBLE_DEVICES: all \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 19694290..39c5e604 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -14,6 +14,14 @@ # let's import all supported processors here to simplify target specification +from sdp.processors.datasets.ytdlp.downlaod_youtube_audio import ( + GetYoutubeAudio, +) + +from sdp.processors.datasets.ytdlp.create_initial_manifest import ( + CreateInitialManifestytdlp, +) + from sdp.processors.datasets.coraa.create_initial_manifest import ( CreateInitialManifestCORAA, ) diff --git a/sdp/processors/datasets/ytdlp/__init__.py b/sdp/processors/datasets/ytdlp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sdp/processors/datasets/ytdlp/create_initial_manifest.py b/sdp/processors/datasets/ytdlp/create_initial_manifest.py new file mode 100644 index 00000000..9bec8a13 --- /dev/null +++ b/sdp/processors/datasets/ytdlp/create_initial_manifest.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import subprocess +from pathlib import Path +import json +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class CreateInitialManifestytdlp(BaseParallelProcessor): + """ + Processor for creating an initial dataset manifest by saving youtube links. + Make sure to install yt-dlp tool before funning this code. + + Tool link: https://github.com/yt-dlp/yt-dlp + + Args: + raw_data_dir (str): Root directory of the files to be added to the manifest. Recursively searches for files with the given 'extension'. + output_field (str): Field to store the file paths in the dataset. Default is "audio_filepath". + extension (str): Extension of the files to include in the dataset. Default is "wav". + **kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`. + """ + + def __init__( + self, + raw_data_dir: str, + output_field: str = "audio_filepath", + # extension: str = "wav", + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.output_field = output_field + file_path = "sdp/processors/datasets/ytdlp/search_terms.json" + + with open(file_path, "r") as f: + channels = json.load(f) + + self.channel_tuples = [(channel["search_term"], channel["audio_count"]) for channel in channels["channels"]] + + + def read_manifest(self): + channels_data = [] + for search_term, audio_count in self.channel_tuples: + if search_term is not None: + command = [ + 'yt-dlp', + f'ytsearch{audio_count}:{search_term}', + '--match-filter', "license = 'Creative Commons Attribution license (reuse allowed)'", + '--get-id', + ] + try: + process = subprocess.run(command, stdout=subprocess.PIPE, text=True) + output = process.stdout.strip() + # Each video ID will be on a new line, so split the output into a list of IDs + video_ids = output.split('\n') + while("" in video_ids): + video_ids.remove("") + # Construct the full YouTube page URL for each video ID + youtube_base_url = 
"https://www.youtube.com/watch?v=" + # Append the data to the channels_data dictionary + logger.info("Got youtube links :", video_ids) + channels_data.extend( + [(youtube_base_url + video_id, video_id) for video_id in video_ids] + ) + except subprocess.CalledProcessError as e: + print(f"Error fetching URLs for {search_term}: {e}") + else: + continue + + return channels_data + + def process_dataset_entry(self, data_entry): + data = {self.output_field: data_entry[0],'youtube_id':data_entry[1]} + return [DataEntry(data=data)] diff --git a/sdp/processors/datasets/ytdlp/downlaod_youtube_audio.py b/sdp/processors/datasets/ytdlp/downlaod_youtube_audio.py new file mode 100644 index 00000000..f05ad736 --- /dev/null +++ b/sdp/processors/datasets/ytdlp/downlaod_youtube_audio.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import os +from pathlib import Path +import json +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + + +class GetYoutubeAudio(BaseParallelProcessor): + """ + Processor to download audio from YouTube links and calculate the duration of the audio. + + Args: + links_filepath_field (str): Field to get the YouTube video link. + output_audio_path (str): Path to save the downloaded audio files. + **kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`. 
class GetYoutubeAudio(BaseParallelProcessor):
    """
    Processor to download audio from YouTube links and calculate the duration of the audio.

    Args:
        links_filepath_field (str): Field to get the YouTube video link.
        output_audio_path (str): Path to save the downloaded audio files.
        **kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`.

    Returns:
        All the same fields as in the input manifest plus the audio duration
        (``duration`` is ``None`` when download or probing fails).
    """

    def __init__(
        self,
        links_filepath_field: str,
        output_audio_path: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.links_filepath_field = links_filepath_field
        self.output_audio_path = output_audio_path
        # parents=True: the old mkdir(exist_ok=True) failed whenever an
        # intermediate directory of output_audio_path did not exist yet.
        Path(output_audio_path).mkdir(parents=True, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        audio_link = data_entry[self.links_filepath_field]
        logger.info(f"Processing audio link: {audio_link}")
        output_path = os.path.join(self.output_audio_path, data_entry['youtube_id'] + '.wav')

        if not os.path.exists(output_path):
            # Download audio with postprocessor sample rate = 16k, mono.
            # argv list + shell=False: the URL comes from the manifest, so the old
            # shell=True f-string command was vulnerable to shell injection.
            command = [
                'yt-dlp', '-x', '--audio-format', 'wav',
                '--postprocessor-args', '-ac 1 -ar 16000',
                '-o', output_path,
                audio_link,
            ]
            try:
                subprocess.run(command, check=True)
                logger.info(f"Audio downloaded successfully: {output_path}")
            except subprocess.CalledProcessError as e:
                logger.warning(f"Failed to download audio: {e}")
        else:
            logger.info(f"Output file already exists: {output_path}")

        duration = None
        # Only probe when the file is actually present: previously ffprobe ran
        # even after a failed download and errored on the missing file.
        if os.path.exists(output_path):
            ffprobe_cmd = [
                'ffprobe', '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1',
                output_path,
            ]
            try:
                result = subprocess.run(ffprobe_cmd, check=True, stdout=subprocess.PIPE, text=True)
                duration = float(result.stdout.strip())
                logger.info(f"Audio length: {duration} seconds")
            except (subprocess.CalledProcessError, ValueError) as e:
                # ValueError covers unparsable ffprobe output, not just a non-zero exit.
                logger.warning(f"Failed to get audio duration: {e}")

        data = {
            self.links_filepath_field: output_path,
            'youtube_id': data_entry['youtube_id'],
            'duration': duration,
        }
        return [DataEntry(data=data)]
b/sdp/processors/datasets/ytdlp/search_terms.json @@ -0,0 +1,8 @@ +{ + "channels": [ + { + "search_term": "Narine Kirakosyan", + "audio_count": 1 + } + ] +}