Skip to content

Commit fc9532a

Browse files
committed
add script to download dataset from huggingface
1 parent f5f0362 commit fc9532a

File tree

1 file changed

+35
-0
lines changed
  • AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Dataset

1 file changed

+35
-0
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import os
2+
import shutil
3+
import argparse
4+
from datasets import load_dataset
5+
from tqdm import tqdm
6+
7+
# Maps a language name to its dataset configuration code. Both values are used
# by download_dataset: the code selects the dataset configuration, and the
# name and code together form the per-language output folder path.
language_to_code = {
    "japanese": "ja",
    "swedish": "sv-SE",
}
11+
12+
def download_dataset(output_dir, languages=None, split="train"):
    """Copy the audio clips of each configured language into ``output_dir``.

    For every ``(language, code)`` pair, the
    ``mozilla-foundation/common_voice_11_0`` dataset is loaded for that code
    and the audio file of each sample is copied into
    ``<output_dir>/<language>/<code>/clips``. The folder is created if it does
    not exist yet.

    Args:
        output_dir: Base directory under which the per-language folders are
            created.
        languages: Optional mapping of language name to dataset configuration
            code. When omitted, the module-level ``language_to_code`` mapping
            is used.
        split: Name of the dataset split to load. Defaults to ``"train"``.
    """
    if languages is None:
        languages = language_to_code

    for lang, lang_code in languages.items():
        print(f"Processing dataset for language: {lang_code}")

        # Load the dataset for the specific language
        dataset = load_dataset("mozilla-foundation/common_voice_11_0", lang_code, split=split)

        # Create a language-specific output folder
        output_folder = os.path.join(output_dir, lang, lang_code, "clips")
        os.makedirs(output_folder, exist_ok=True)

        # Extract and copy MP3 files
        # NOTE(review): assumes sample['audio']['path'] is a readable local
        # file path; shutil.copy raises if it is not -- confirm against the
        # dataset loader in use.
        for sample in tqdm(dataset, desc=f"Extracting and copying MP3 files for {lang}"):
            audio_path = sample['audio']['path']
            shutil.copy(audio_path, output_folder)

    print("Extraction and copy complete.")
29+
30+
if __name__ == "__main__":
    # Command-line entry point: read the base output directory from
    # --output_dir (falling back to /data/commonVoice) and run the download.
    parser = argparse.ArgumentParser(
        description="Extract and copy audio files from a dataset to a specified directory."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="/data/commonVoice",
        help="Base output directory for saving the files. Default is /data/commonVoice",
    )
    args = parser.parse_args()

    download_dataset(args.output_dir)

0 commit comments

Comments
 (0)