-
Notifications
You must be signed in to change notification settings - Fork 38
yt dlp initial pull request #63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| processors_to_run: "0:" | ||
| workspace_dir: /workspace/nemo_capstone | ||
| final_manifest: ${workspace_dir}/final_manifest.json | ||
|
|
||
| processors: | ||
| - _target_: sdp.processors.CreateInitialManifestArmData | ||
| raw_data_dir: /workspace/nemo_capstone | ||
| output_field: audio_filepath | ||
| output_manifest_file: ${workspace_dir}/manifest0.json | ||
|
|
||
| - _target_: sdp.processors.GetYoutubeAudio | ||
| links_filepath_field: audio_filepath | ||
| output_audio_path: ${workspace_dir}/audio_files/ | ||
| output_manifest_file: ${workspace_dir}/manifest1.json | ||
|
|
||
| - _target_: sdp.processors.AudioLid | ||
| output_manifest_file: ${workspace_dir}/manifest2.json | ||
| input_audio_field: audio_filepath | ||
| output_lang_field: audio_lang | ||
| device: gpu | ||
| pretrained_model: "langid_ambernet" | ||
| segment_duration: 20 | ||
| num_segments: 3 | ||
|
|
||
| - _target_: sdp.processors.PreserveByValue | ||
| output_manifest_file: ${workspace_dir}/manifest3.json | ||
| input_field: audio_lang | ||
| target_value: hy | ||
|
|
||
| - _target_: sdp.processors.PreserveByValue | ||
| output_manifest_file: ${workspace_dir}/manifest4.json | ||
| input_field: duration | ||
| operator: le | ||
| target_value: 20000.0 | ||
|
|
||
| - _target_: sdp.processors.Subprocess | ||
| output_manifest_file: ${workspace_dir}/vad | ||
| input_manifest_arg: "manifest_filepath" | ||
| output_manifest_arg: "output_dir" | ||
| cmd: "python /workspace/nemo/examples/asr/asr_vad/speech_to_text_with_vad.py audio_type=wav \ | ||
| vad_model=vad_multilingual_frame_marblenet vad_config=/workspace/nemo/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml" | ||
|
|
||
| - _target_: sdp.processors.RenameFields | ||
| input_manifest_file: ${workspace_dir}/vad/temp_manifest_vad_rttm-onset0.3-offset0.3-pad_onset0.2-pad_offset0.2-min_duration_on0.2-min_duration_off0.2-filter_speech_firstTrue.json | ||
| output_manifest_file: ${workspace_dir}/manifest6.json | ||
| rename_fields: {"audio_filepath":"source_filepath"} | ||
|
|
||
| - _target_: sdp.processors.nemo.rttm.GetRttmSegments | ||
| output_manifest_file: ${workspace_dir}/manifest7.json | ||
| rttm_field: rttm_file | ||
| output_file_field: audio_segments | ||
| duration_threshold: 20.0 | ||
| duration_field: duration | ||
|
|
||
| - _target_: sdp.processors.nemo.rttm.SplitFile | ||
| output_manifest_file: ${workspace_dir}/manifest8.json | ||
| splited_audio_dir: ${workspace_dir}/armenian/yt/splited_wavs/ | ||
| segments_field: audio_segments | ||
| input_file_field: source_filepath | ||
| output_file_field: audio_filepath | ||
| duration_field: duration | ||
|
|
||
| - _target_: sdp.processors.PreserveByValue | ||
| output_manifest_file: ${workspace_dir}/manifest9.json | ||
| input_field: duration | ||
| operator: gt | ||
| target_value: 0.0 | ||
|
|
||
| - _target_: sdp.processors.KeepOnlySpecifiedFields | ||
| output_manifest_file: ${workspace_dir}/manifest10.json | ||
| fields_to_keep: ["audio_filepath", "duration"] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where do you use this docker compose file? Is it possible to run the scripts without it? |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| version: '3.9' | ||
| services: | ||
| frontend: | ||
| image: nvcr.io/nvidia/nemo:24.03 | ||
| build: . | ||
| volumes: | ||
| - type: tmpfs | ||
| target: /dev/shm | ||
| tmpfs: | ||
| size: 9000000000 # ~9gb | ||
| - .:/workspace/nemo_capstone | ||
| - /raid/asds/:/workspace/nemo_capstone/data | ||
| deploy: | ||
| resources: | ||
| reservations: | ||
| devices: | ||
| - driver: nvidia | ||
| capabilities: [gpu] | ||
| count: 'all' # 'all' reserves every available GPU; use an integer to reserve fewer | ||
| entrypoint: tail -f | ||
| environment: | ||
| NVIDIA_VISIBLE_DEVICES: all |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import subprocess | ||
| from pathlib import Path | ||
| import json | ||
| from sdp.logging import logger | ||
| from sdp.processors.base_processor import BaseParallelProcessor, DataEntry | ||
|
|
||
|
|
||
| class CreateInitialManifestytdlp(BaseParallelProcessor): | ||
| """ | ||
| Processor for creating an initial dataset manifest by saving youtube links. | ||
| Make sure to install the yt-dlp tool before running this code. | ||
|
|
||
| Tool link: https://github.com/yt-dlp/yt-dlp | ||
|
Comment on lines
+25
to
+27
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since you use a third-party tool here, could you add a specific check in the script for whether the tool is installed, and log a message telling the user to install it if it is not? |
||
|
|
||
| Args: | ||
| raw_data_dir (str): Root data directory; stored on the processor as a ``Path``. The YouTube search terms are read from ``sdp/processors/datasets/ytdlp/search_terms.json``, not from this directory. | ||
| output_field (str): Field to store the YouTube link in the manifest. Default is "audio_filepath". | ||
| extension (str): Not currently accepted; the parameter is commented out in ``__init__``. | ||
| **kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`. | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| raw_data_dir: str, | ||
| output_field: str = "audio_filepath", | ||
| # extension: str = "wav", | ||
| **kwargs, | ||
| ): | ||
| super().__init__(**kwargs) | ||
| self.raw_data_dir = Path(raw_data_dir) | ||
| self.output_field = output_field | ||
| file_path = "sdp/processors/datasets/ytdlp/search_terms.json" | ||
|
Comment on lines
+29
to
+46
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| with open(file_path, "r") as f: | ||
| channels = json.load(f) | ||
|
|
||
| self.channel_tuples = [(channel["search_term"], channel["audio_count"]) for channel in channels["channels"]] | ||
|
|
||
|
|
||
def read_manifest(self):
    """Search YouTube with yt-dlp and collect links to matching videos.

    For every ``(search_term, audio_count)`` pair in ``self.channel_tuples``
    whose search term is not ``None``, runs ``yt-dlp ytsearch<count>:<term>``
    with the Creative Commons license filter and ``--get-id``, then turns
    each printed video ID into a watch URL.

    Returns:
        list[tuple[str, str]]: ``(youtube_url, video_id)`` pairs in the
        order yt-dlp printed them. Empty when there is nothing to search.

    Raises:
        FileNotFoundError: If the ``yt-dlp`` executable is not on ``PATH``.
    """
    import shutil  # local import: only needed for the executable lookup below

    searches = [
        (search_term, audio_count)
        for search_term, audio_count in self.channel_tuples
        if search_term is not None
    ]
    if not searches:
        return []

    # Fail early with an actionable message instead of the bare
    # FileNotFoundError that subprocess would raise for a missing binary.
    if shutil.which('yt-dlp') is None:
        message = (
            "yt-dlp executable not found on PATH. "
            "Install it (https://github.com/yt-dlp/yt-dlp) and run again."
        )
        logger.error(message)
        raise FileNotFoundError(message)

    youtube_base_url = "https://www.youtube.com/watch?v="
    channels_data = []
    for search_term, audio_count in searches:
        command = [
            'yt-dlp',
            f'ytsearch{audio_count}:{search_term}',
            '--match-filter', "license = 'Creative Commons Attribution license (reuse allowed)'",
            '--get-id',
        ]
        process = subprocess.run(command, stdout=subprocess.PIPE, text=True)
        if process.returncode != 0:
            # Best effort: keep whatever IDs were printed, but make the
            # failure visible in the log instead of silently ignoring it.
            logger.warning(
                "yt-dlp exited with code %s while fetching URLs for %r",
                process.returncode,
                search_term,
            )

        # yt-dlp prints one video ID per line; drop blank lines.
        video_ids = [line.strip() for line in process.stdout.splitlines() if line.strip()]
        logger.info("Got youtube links : %s", video_ids)
        channels_data.extend((youtube_base_url + video_id, video_id) for video_id in video_ids)

    return channels_data
|
|
||
def process_dataset_entry(self, data_entry):
    """Wrap one ``(youtube_url, video_id)`` pair in a manifest entry.

    The URL is stored under ``self.output_field`` and the ID under
    ``'youtube_id'``.

    Returns:
        A single-element list holding the resulting ``DataEntry``.
    """
    youtube_url = data_entry[0]
    video_id = data_entry[1]
    manifest_row = {self.output_field: youtube_url, 'youtube_id': video_id}
    entry = DataEntry(data=manifest_row)
    return [entry]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import subprocess | ||
| import os | ||
| from pathlib import Path | ||
| import json | ||
| from sdp.logging import logger | ||
| from sdp.processors.base_processor import BaseParallelProcessor, DataEntry | ||
|
|
||
|
|
||
|
|
||
| class GetYoutubeAudio(BaseParallelProcessor): | ||
| """ | ||
| Processor to download audio from YouTube links and calculate the duration of the audio. | ||
|
|
||
| Args: | ||
| links_filepath_field (str): Field to get the YouTube video link. | ||
| output_audio_path (str): Path to save the downloaded audio files. | ||
| **kwargs: Additional keyword arguments for the base class `BaseParallelProcessor`. | ||
|
Comment on lines
+28
to
+31
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use "key" instead of field |
||
|
|
||
| Returns: | ||
| All the same fields as in the input manifest plus the audio duration. | ||
| """ | ||
def __init__(
    self,
    links_filepath_field: str,
    output_audio_path: str,
    **kwargs,
):
    """Store the configuration and make sure the output directory exists.

    Args:
        links_filepath_field (str): Manifest field holding the YouTube link.
        output_audio_path (str): Directory where downloaded audio is saved.
        **kwargs: Forwarded to the base class constructor.
    """
    super().__init__(**kwargs)

    self.links_filepath_field = links_filepath_field
    self.output_audio_path = output_audio_path
    # parents=True: the configured directory may sit under parents that do
    # not exist yet; without it mkdir raises FileNotFoundError.
    Path(output_audio_path).mkdir(parents=True, exist_ok=True)
|
|
||
def process_dataset_entry(self, data_entry):
    """Download the audio for one manifest entry and measure its duration.

    The link is read from ``data_entry[self.links_filepath_field]`` and the
    audio is written to ``<output_audio_path>/<youtube_id>.wav`` unless that
    file already exists. Duration is then read with ``ffprobe``.

    Both external tools are invoked with argument lists (no shell), so a
    link or ID containing shell metacharacters cannot inject commands.

    Returns:
        A single-element list with a ``DataEntry`` whose data holds the
        local audio path (under ``self.links_filepath_field``),
        ``'youtube_id'`` and ``'duration'`` (``None`` when the duration
        could not be determined).
    """
    audio_link = data_entry[self.links_filepath_field]
    logger.info(f"Processing audio link: {audio_link}")
    output_path = os.path.join(self.output_audio_path, data_entry['youtube_id'] + '.wav')

    os.makedirs(self.output_audio_path, exist_ok=True)

    if not os.path.exists(output_path):
        # Download audio; the post-processor converts to mono, 16 kHz.
        download_cmd = [
            'yt-dlp',
            '-x',
            '--audio-format', 'wav',
            '--postprocessor-args', '-ac 1 -ar 16000',
            '-o', output_path,
            audio_link,
        ]
        try:
            subprocess.run(download_cmd, check=True)
            logger.info(f"Audio downloaded successfully: {output_path}")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing yt-dlp binary, which the shell-based
            # call used to surface as a non-zero exit status.
            logger.warning(f"Failed to download audio: {e}")
    else:
        logger.info(f"Output file already exists: {output_path}")

    ffprobe_cmd = [
        'ffprobe',
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        output_path,
    ]
    try:
        probe = subprocess.run(ffprobe_cmd, check=True, stdout=subprocess.PIPE, text=True)
        duration = float(probe.stdout.strip())
        logger.info(f"Audio length: {duration} seconds")
    except (subprocess.CalledProcessError, OSError, ValueError) as e:
        # ValueError: ffprobe succeeded but printed nothing or a
        # non-numeric value such as "N/A".
        logger.warning(f"Failed to get audio duration: {e}")
        duration = None

    data = {
        self.links_filepath_field: output_path,
        'youtube_id': data_entry['youtube_id'],
        'duration': duration,
    }
    return [DataEntry(data=data)]
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to use this as an example, not as a working configuration, please mention somewhere how user should work with this file and remove any personal information from here, like the name |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| { | ||
| "channels": [ | ||
| { | ||
| "search_term": "Narine Kirakosyan", | ||
| "audio_count": 1 | ||
| } | ||
| ] | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add