11import os
2- import warnings
3- from typing import List , Dict , Tuple , Optional
2+ from typing import List , Dict , Tuple
43
54import torchaudio
6- from torchaudio .datasets .utils import extract_archive , unicode_csv_reader , validate_file
5+ from torchaudio .datasets .utils import download_url , extract_archive , unicode_csv_reader
76from torch import Tensor
87from torch .utils .data import Dataset
98
1615# validated.tsv
1716
1817FOLDER_IN_ARCHIVE = "CommonVoice"
19- LANGUAGE = "english"
20- VERSION = "cv-corpus-5.1-2020-06-22 "
18+ URL = "english"
19+ VERSION = "cv-corpus-4-2019-12-10 "
2120TSV = "train.tsv"
2221_CHECKSUMS = {
23- "cv-corpus-5.1-2020-06-22/tt.tar.gz" : None ,
24- "cv-corpus-5.1-2020-06-22/en.tar.gz" : None ,
25- "cv-corpus-5.1-2020-06-22/de.tar.gz" : None ,
26- "cv-corpus-5.1-2020-06-22/fr.tar.gz" : None ,
27- "cv-corpus-5.1-2020-06-22/cy.tar.gz" : None ,
28- "cv-corpus-5.1-2020-06-22/br.tar.gz" : None ,
29- "cv-corpus-5.1-2020-06-22/cv.tar.gz" : None ,
30- "cv-corpus-5.1-2020-06-22/tr.tar.gz" : None ,
31- "cv-corpus-5.1-2020-06-22/ky.tar.gz" : None ,
32- "cv-corpus-5.1-2020-06-22/ga-IE.tar.gz" : None ,
33- "cv-corpus-5.1-2020-06-22/kab.tar.gz" : None ,
34- "cv-corpus-5.1-2020-06-22/ca.tar.gz" : None ,
35- "cv-corpus-5.1-2020-06-22/zh-TW.tar.gz" : None ,
36- "cv-corpus-5.1-2020-06-22/sl.tar.gz" : None ,
37- "cv-corpus-5.1-2020-06-22/it.tar.gz" : None ,
38- "cv-corpus-5.1-2020-06-22/nl.tar.gz" : None ,
39- "cv-corpus-5.1-2020-06-22/cnh.tar.gz" : None ,
40- "cv-corpus-5.1-2020-06-22/eo.tar.gz" : None ,
41- "cv-corpus-5.1-2020-06-22/et.tar.gz" : None ,
42- "cv-corpus-5.1-2020-06-22/fa.tar.gz" : None ,
43- "cv-corpus-5.1-2020-06-22/eu.tar.gz" : None ,
44- "cv-corpus-5.1-2020-06-22/es.tar.gz" : None ,
45- "cv-corpus-5.1-2020-06-22/zh-CN.tar.gz" : None ,
46- "cv-corpus-5.1-2020-06-22/mn.tar.gz" : None ,
47- "cv-corpus-5.1-2020-06-22/sah.tar.gz" : None ,
48- "cv-corpus-5.1-2020-06-22/dv.tar.gz" : None ,
49- "cv-corpus-5.1-2020-06-22/rw.tar.gz" : None ,
50- "cv-corpus-5.1-2020-06-22/sv-SE.tar.gz" : None ,
51- "cv-corpus-5.1-2020-06-22/ru.tar.gz" : None ,
22+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz" :
23+ None ,
24+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz" :
25+ None ,
26+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz" :
27+ None ,
28+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz" :
29+ None ,
30+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz" :
31+ None ,
32+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz" :
33+ None ,
34+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz" :
35+ None ,
36+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz" :
37+ None ,
38+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz" :
39+ None ,
40+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz" :
41+ None ,
42+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz" :
43+ None ,
44+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz" :
45+ None ,
46+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz" :
47+ None ,
48+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz" :
49+ None ,
50+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz" :
51+ None ,
52+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz" :
53+ None ,
54+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz" :
55+ None ,
56+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz" :
57+ None ,
58+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz" :
59+ None ,
60+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz" :
61+ None ,
62+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz" :
63+ None ,
64+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz" :
65+ None ,
66+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz" :
67+ None ,
68+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz" :
69+ None ,
70+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz" :
71+ None ,
72+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz" :
73+ None ,
74+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz" :
75+ None ,
76+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz" :
77+ None ,
78+ "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz" :
79+ None
5280}
5381
5482
@@ -72,18 +100,15 @@ def load_commonvoice_item(line: List[str],
72100
73101
74102class COMMONVOICE (Dataset ):
75- """Create a Dataset for ` CommonVoice <https://commonvoice.mozilla.org/>`_ .
103+ """Create a Dataset for CommonVoice.
76104
77105 Args:
78106 root (str): Path to the directory where the dataset is found or downloaded.
79107 tsv (str, optional): The name of the tsv file used to construct the metadata.
80108 (default: ``"train.tsv"``)
81- url (str, optional): Deprecated.
82- folder_in_archive (str, optional): The top-level directory of the dataset.
83- version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
84- language (str, optional): Language of the dataset. (default: None)
85- The following values are mapped to their corresponding shortened version:
86- ``"tatar"``, ``"english"``, ``"german"``,
109+ url (str, optional): The URL to download the dataset from, or the language of
110+ the dataset to download. (default: ``"english"``).
111+ Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
87112 ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
88113 ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
89114 ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
@@ -92,8 +117,11 @@ class COMMONVOICE(Dataset):
92117 ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
93118 ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
94119 ``"romansh sursilvan"``.
120+ folder_in_archive (str, optional): The top-level directory of the dataset.
121+ version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
95122 For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
96- download (bool, optional): Deprecated.
123+ download (bool, optional):
124+ Whether to download the dataset if it is not found at root path. (default: ``False``).
97125 """
98126
99127 _ext_txt = ".txt"
@@ -103,30 +131,10 @@ class COMMONVOICE(Dataset):
103131 def __init__ (self ,
104132 root : str ,
105133 tsv : str = TSV ,
106- url : Optional [ str ] = None ,
134+ url : str = URL ,
107135 folder_in_archive : str = FOLDER_IN_ARCHIVE ,
108136 version : str = VERSION ,
109- language : str = LANGUAGE ,
110- download : Optional [bool ] = False ) -> None :
111-
112- if download is True :
113- raise RuntimeError (
114- "The dataset is no longer publicly accessible. You need to "
115- "download the archives externally and place them in the root "
116- "directory."
117- )
118- elif download is False :
119- warnings .warn (
120- "The use of the download flag is deprecated, since the dataset "
121- "is no longer directly accessible." , RuntimeWarning
122- )
123-
124- if url is not None :
125- warnings .warn (
126- "The use of the url flag is deprecated, since the dataset "
127- "is no longer publicly accessible. To specify the language of the dataset, "
128- "please use the language parameter instead." , RuntimeWarning
129- )
137+ download : bool = False ) -> None :
130138
131139 languages = {
132140 "tatar" : "tt" ,
@@ -171,22 +179,12 @@ def __init__(self,
171179 "romansh sursilvan" : "rm-sursilv"
172180 }
173181
174- if language in languages :
182+ if url in languages :
175183 ext_archive = ".tar.gz"
176- language = languages [language ]
177- url = os .path .join (version , language + ext_archive )
178- else :
179- raise ValueError (
180- 'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,'
181- '``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,'
182- '``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,'
183- '``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,'
184- '``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,'
185- '``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,'
186- '``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,'
187- '``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and'
188- '``"romansh sursilvan"``.'
189- )
184+ language = languages [url ]
185+
186+ base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
187+ url = os .path .join (base_url , version , language + ext_archive )
190188
191189 basename = os .path .basename (url )
192190 archive = os .path .join (root , basename )
@@ -196,23 +194,12 @@ def __init__(self,
196194
197195 self ._path = os .path .join (root , folder_in_archive )
198196
199- if not os .path .isdir (self ._path ):
200- if os .path .isfile (archive ):
201- checksum = _CHECKSUMS .get (url , None )
202- if checksum :
203- filepath = os .path .basename (url )
204- with open (filepath , "rb" ) as file_obj :
205- if not validate_file (file_obj , checksum , "sha256" ):
206- raise RuntimeError (
207- f"The hash of { filepath } does not match. Delete the file manually and retry."
208- )
197+ if download :
198+ if not os .path .isdir (self ._path ):
199+ if not os .path .isfile (archive ):
200+ checksum = _CHECKSUMS .get (url , None )
201+ download_url (url , root , hash_value = checksum )
209202 extract_archive (archive )
210- else :
211- raise RuntimeError (
212- "The dataset is no longer publicly accessible. You need to "
213- "download the archives externally and place them in the root "
214- "directory."
215- )
216203
217204 self ._tsv = os .path .join (root , folder_in_archive , tsv )
218205
0 commit comments