71 changes: 71 additions & 0 deletions dataset_configs/armenian/youtube_audio_tmp/config.yaml
@@ -0,0 +1,71 @@
processors_to_run: "0:"
workspace_dir: /workspace/nemo_capstone
final_manifest: ${workspace_dir}/final_manifest.json

processors:
Comment on lines +1 to +5
Collaborator comment:

Please add

  1. Nvidia copyright text
  2. config documentation text

- _target_: sdp.processors.CreateInitialManifestArmData
raw_data_dir: /workspace/nemo_capstone
output_field: audio_filepath
output_manifest_file: ${workspace_dir}/manifest0.json

- _target_: sdp.processors.GetYoutubeAudio
links_filepath_field: audio_filepath
output_audio_path: ${workspace_dir}/audio_files/
output_manifest_file: ${workspace_dir}/manifest1.json

- _target_: sdp.processors.AudioLid
output_manifest_file: ${workspace_dir}/manifest2.json
input_audio_field: audio_filepath
output_lang_field: audio_lang
device: gpu
pretrained_model: "langid_ambernet"
segment_duration: 20
num_segments: 3

- _target_: sdp.processors.PreserveByValue
output_manifest_file: ${workspace_dir}/manifest3.json
input_field: audio_lang
target_value: hy

- _target_: sdp.processors.PreserveByValue
output_manifest_file: ${workspace_dir}/manifest4.json
input_field: duration
operator: le
target_value: 20000.0

- _target_: sdp.processors.Subprocess
output_manifest_file: ${workspace_dir}/vad
input_manifest_arg: "manifest_filepath"
output_manifest_arg: "output_dir"
cmd: "python /workspace/nemo/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav \
vad_model=vad_multilingual_frame_marblenet vad_config=/workspace/nemo/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml"

- _target_: sdp.processors.RenameFields
input_manifest_file: ${workspace_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json
output_manifest_file: ${workspace_dir}/manifest6.json
rename_fields: {"audio_filepath":"source_filepath"}

- _target_: sdp.processors.nemo.rttm.GetRttmSegments
output_manifest_file: ${workspace_dir}/manifest7.json
rttm_field: rttm_file
output_file_field: audio_segments
duration_threshold: 20.0
duration_field: duration

- _target_: sdp.processors.nemo.rttm.SplitFile
output_manifest_file: ${workspace_dir}/manifest8.json
splited_audio_dir: ${workspace_dir}/armenian/yt/splited_wavs/
segments_field: audio_segments
input_file_field: source_filepath
output_file_field: audio_filepath
duration_field: duration

- _target_: sdp.processors.PreserveByValue
output_manifest_file: ${workspace_dir}/manifest9.json
input_field: duration
operator: gt
target_value: 0.0

- _target_: sdp.processors.KeepOnlySpecifiedFields
output_manifest_file: ${workspace_dir}/manifest10.json
fields_to_keep: ["audio_filepath", "duration"]
22 changes: 22 additions & 0 deletions docker-compose.yaml
Collaborator comment:

Where do you use this docker compose file? Is it possible to run the scripts without it?

@@ -0,0 +1,22 @@
version: '3.9'
services:
frontend:
image: nvcr.io/nvidia/nemo:24.03
build: .
volumes:
- type: tmpfs
target: /dev/shm
tmpfs:
size: 9000000000 # ~9 GB
- .:/workspace/nemo_capstone
- /raid/asds/:/workspace/nemo_capstone/data
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
count: 'all' # use all GPUs, or set an integer to limit the count
entrypoint: tail -f
environment:
NVIDIA_VISIBLE_DEVICES: all
8 changes: 8 additions & 0 deletions sdp/processors/__init__.py
@@ -14,6 +14,14 @@

# let's import all supported processors here to simplify target specification

from sdp.processors.datasets.ytdlp.downlaod_youtube_audio import (
GetYoutubeAudio,
)

from sdp.processors.datasets.ytdlp.create_initial_manifest import (
CreateInitialManifestytdlp,
)

from sdp.processors.datasets.coraa.create_initial_manifest import (
CreateInitialManifestCORAA,
)
Empty file.
87 changes: 87 additions & 0 deletions sdp/processors/datasets/ytdlp/create_initial_manifest.py
@@ -0,0 +1,87 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
from pathlib import Path
import json
from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry


class CreateInitialManifestytdlp(BaseParallelProcessor):
"""
Processor that creates an initial dataset manifest from collected YouTube links.
Make sure to install the yt-dlp tool before running this code.

Tool link: https://github.com/yt-dlp/yt-dlp
Comment on lines +25 to +27
Collaborator comment:

Since you use a 3rd party tool here, could you add a specific check in the script whether the tool is installed or not, and log a message for the user to install it?

Args:
raw_data_dir (str): Root directory of the files to be added to the manifest. Recursively searches for files with the given 'extension'.
output_field (str): Field to store the file paths in the dataset. Default is "audio_filepath".
extension (str): Extension of the files to include in the dataset. Default is "wav".
**kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`.
"""

def __init__(
self,
raw_data_dir: str,
output_field: str = "audio_filepath",
# extension: str = "wav",
**kwargs,
):
super().__init__(**kwargs)
self.raw_data_dir = Path(raw_data_dir)
self.output_field = output_field
file_path = "sdp/processors/datasets/ytdlp/search_terms.json"
Comment on lines +29 to +46
Collaborator comment:

  1. If you don't use an "extension", remove it from both the commented version and from the documentation of the function
  2. For convenience we use "key" instead of "field". Replace output_field with output_key
  3. Maybe it would be more convenient to change file_path to a more informative name? Also, could you remove its hardcoding and make it a variable passed from the config file with a default value?

with open(file_path, "r") as f:
channels = json.load(f)

self.channel_tuples = [(channel["search_term"], channel["audio_count"]) for channel in channels["channels"]]


def read_manifest(self):
channels_data = []
for search_term, audio_count in self.channel_tuples:
if search_term is not None:
command = [
'yt-dlp',
f'ytsearch{audio_count}:{search_term}',
'--match-filter', "license = 'Creative Commons Attribution license (reuse allowed)'",
'--get-id',
]
try:
process = subprocess.run(command, stdout=subprocess.PIPE, text=True, check=True)
output = process.stdout.strip()
# Each video ID is on its own line; drop any empty lines
video_ids = [line for line in output.split('\n') if line]
# Construct the full YouTube page URL for each video ID
youtube_base_url = "https://www.youtube.com/watch?v="
logger.info(f"Got youtube links: {video_ids}")
channels_data.extend(
[(youtube_base_url + video_id, video_id) for video_id in video_ids]
)
except subprocess.CalledProcessError as e:
logger.warning(f"Error fetching URLs for {search_term}: {e}")

return channels_data

def process_dataset_entry(self, data_entry):
data = {self.output_field: data_entry[0], 'youtube_id': data_entry[1]}
return [DataEntry(data=data)]
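The reviewer above asks for an explicit check that the yt-dlp tool is installed. One way this could look is a small PATH probe run before any downloads start (a sketch; `check_yt_dlp_installed` is a hypothetical helper name, and the real processor would log via `sdp.logging.logger` rather than `print`):

```python
import shutil


def check_yt_dlp_installed() -> bool:
    """Return True if the yt-dlp executable is on PATH, else print guidance."""
    if shutil.which("yt-dlp") is None:
        # In the processor this message would go through sdp.logging.logger
        print(
            "yt-dlp was not found on PATH. Install it with `pip install yt-dlp` "
            "(https://github.com/yt-dlp/yt-dlp) before running "
            "CreateInitialManifestytdlp."
        )
        return False
    return True
```

Calling this once in `__init__` (and raising if it returns `False`) would fail fast with an actionable message instead of a `FileNotFoundError` deep inside `read_manifest`.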
81 changes: 81 additions & 0 deletions sdp/processors/datasets/ytdlp/downlaod_youtube_audio.py
@@ -0,0 +1,81 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
import os
from pathlib import Path
import json
from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry



class GetYoutubeAudio(BaseParallelProcessor):
"""
Processor to download audio from YouTube links and calculate the duration of the audio.

Args:
links_filepath_field (str): Field to get the YouTube video link.
output_audio_path (str): Path to save the downloaded audio files.
**kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`.
Comment on lines +28 to +31
Collaborator comment:

Please use "key" instead of "field"

Returns:
All the same fields as in the input manifest plus the audio duration.
"""
def __init__(
self,
links_filepath_field: str,
output_audio_path: str,
**kwargs,
):
super().__init__(**kwargs)

self.links_filepath_field = links_filepath_field
self.output_audio_path = output_audio_path
path = Path(output_audio_path)
path.mkdir(parents=True, exist_ok=True)

def process_dataset_entry(self, data_entry):
audio_link = data_entry[self.links_filepath_field]
logger.info(f"Processing audio link: {audio_link}")
output_path = os.path.join(self.output_audio_path, data_entry['youtube_id'] + '.wav')

os.makedirs(self.output_audio_path, exist_ok=True)

if not os.path.exists(output_path):
# Download audio with postprocessor sample rate = 16k
command = f'yt-dlp -x --audio-format wav --postprocessor-args "-ac 1 -ar 16000" -o "{output_path}" "{audio_link}"'
try:
subprocess.run(command, shell=True, check=True)
logger.info(f"Audio downloaded successfully: {output_path}")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to download audio: {e}")
else:
logger.info(f"Output file already exists: {output_path}")

ffprobe_cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "{output_path}"'
try:
duration_str = subprocess.run(ffprobe_cmd, shell=True, check=True, stdout=subprocess.PIPE, text=True).stdout.strip()
duration = float(duration_str)
logger.info(f"Audio length: {duration} seconds")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to get audio duration: {e}")
duration = None

data = {
self.links_filepath_field: output_path,
'youtube_id': data_entry['youtube_id'],
'duration': duration
}
return [DataEntry(data=data)]
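The download step above interpolates the URL and output path into a `shell=True` string, which is fragile if either contains shell metacharacters. A hedged alternative sketch builds the same yt-dlp invocation as an argument list (`build_download_cmd` is an illustrative name, not part of the PR):

```python
def build_download_cmd(audio_link: str, output_path: str) -> list:
    """Argv-list form of the yt-dlp invocation used in GetYoutubeAudio.

    Passing a list to subprocess.run avoids shell=True and the quoting
    pitfalls of interpolating URLs into a shell command string.
    """
    return [
        "yt-dlp",
        "-x",
        "--audio-format", "wav",
        "--postprocessor-args", "-ac 1 -ar 16000",
        "-o", output_path,
        audio_link,
    ]


cmd = build_download_cmd("https://www.youtube.com/watch?v=abc123", "/tmp/abc123.wav")
```

The processor could then call `subprocess.run(cmd, check=True)` with no `shell=True` at all.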
8 changes: 8 additions & 0 deletions sdp/processors/datasets/ytdlp/search_terms.json
Collaborator comment:

If you want to use this as an example, not as a working configuration, please mention somewhere how a user should work with this file, and remove any personal information from here, like the name

@@ -0,0 +1,8 @@
{
"channels": [
{
"search_term": "Narine Kirakosyan",
"audio_count": 1
}
]
}
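To address the reviewer's point about documenting how users work with this file, a sketch of generating a `search_terms.json` in the shape `CreateInitialManifestytdlp` reads (the placeholder term and the temp-file location are illustrative; the processor itself currently hardcodes its own path):

```python
import json
import tempfile
from pathlib import Path

# Shape expected by CreateInitialManifestytdlp: a top-level "channels" list,
# each entry holding a YouTube search term and how many results to fetch.
search_terms = {
    "channels": [
        {"search_term": "<your search term>", "audio_count": 1},
    ]
}

# Write to a temp location for this sketch; users would edit
# sdp/processors/datasets/ytdlp/search_terms.json in place.
path = Path(tempfile.gettempdir()) / "search_terms.json"
path.write_text(json.dumps(search_terms, indent=2))

# Mirror of how the processor consumes the file in its constructor
loaded = json.loads(path.read_text())
tuples = [(c["search_term"], c["audio_count"]) for c in loaded["channels"]]
```

Each `(search_term, audio_count)` tuple then drives one `ytsearchN:<term>` yt-dlp query in `read_manifest`.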