Skip to content

Commit fa6e140

Browse files
authored
Revert "[Cherry-pick for 0.7.1] no longer download CommonVoice directly (#1065)" (#1075)
This reverts commit 1ec49fe.
1 parent 27f2089 commit fa6e140

File tree

2 files changed

+84
-97
lines changed

2 files changed

+84
-97
lines changed

test/torchaudio_unittest/datasets/utils_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,15 @@ class TestIterator(TorchaudioTestCase):
5454
path = get_asset_path()
5555

5656
def test_disckcache_iterator(self):
57-
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
57+
data = COMMONVOICE(self.path, url="tatar")
5858
data = dataset_utils.diskcache_iterator(data)
5959
# Save
6060
data[0]
6161
# Load
6262
data[0]
6363

6464
def test_bg_iterator(self):
65-
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
65+
data = COMMONVOICE(self.path, url="tatar")
6666
data = dataset_utils.bg_iterator(data, 5)
6767
for _ in data:
6868
pass

torchaudio/datasets/commonvoice.py

Lines changed: 82 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import os
2-
import warnings
3-
from typing import List, Dict, Tuple, Optional
2+
from typing import List, Dict, Tuple
43

54
import torchaudio
6-
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
5+
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
76
from torch import Tensor
87
from torch.utils.data import Dataset
98

@@ -16,39 +15,68 @@
1615
# validated.tsv
1716

1817
FOLDER_IN_ARCHIVE = "CommonVoice"
19-
LANGUAGE = "english"
20-
VERSION = "cv-corpus-5.1-2020-06-22"
18+
URL = "english"
19+
VERSION = "cv-corpus-4-2019-12-10"
2120
TSV = "train.tsv"
2221
_CHECKSUMS = {
23-
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
24-
"cv-corpus-5.1-2020-06-22/en.tar.gz": None,
25-
"cv-corpus-5.1-2020-06-22/de.tar.gz": None,
26-
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
27-
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
28-
"cv-corpus-5.1-2020-06-22/br.tar.gz": None,
29-
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
30-
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
31-
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
32-
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
33-
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
34-
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
35-
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
36-
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
37-
"cv-corpus-5.1-2020-06-22/it.tar.gz": None,
38-
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
39-
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
40-
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
41-
"cv-corpus-5.1-2020-06-22/et.tar.gz": None,
42-
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
43-
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
44-
"cv-corpus-5.1-2020-06-22/es.tar.gz": None,
45-
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
46-
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
47-
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
48-
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
49-
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
50-
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
51-
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
22+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
23+
None,
24+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
25+
None,
26+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
27+
None,
28+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
29+
None,
30+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
31+
None,
32+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
33+
None,
34+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
35+
None,
36+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
37+
None,
38+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
39+
None,
40+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
41+
None,
42+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
43+
None,
44+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
45+
None,
46+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
47+
None,
48+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
49+
None,
50+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
51+
None,
52+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
53+
None,
54+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
55+
None,
56+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
57+
None,
58+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
59+
None,
60+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
61+
None,
62+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
63+
None,
64+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
65+
None,
66+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
67+
None,
68+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
69+
None,
70+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
71+
None,
72+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
73+
None,
74+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
75+
None,
76+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
77+
None,
78+
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
79+
None
5280
}
5381

5482

@@ -72,18 +100,15 @@ def load_commonvoice_item(line: List[str],
72100

73101

74102
class COMMONVOICE(Dataset):
75-
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.
103+
"""Create a Dataset for CommonVoice.
76104
77105
Args:
78106
root (str): Path to the directory where the dataset is found or downloaded.
79107
tsv (str, optional): The name of the tsv file used to construct the metadata.
80108
(default: ``"train.tsv"``)
81-
url (str, optional): Deprecated.
82-
folder_in_archive (str, optional): The top-level directory of the dataset.
83-
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
84-
language (str, optional): Language of the dataset. (default: None)
85-
The following values are mapped to their corresponding shortened version:
86-
``"tatar"``, ``"english"``, ``"german"``,
109+
url (str, optional): The URL to download the dataset from, or the language of
110+
the dataset to download. (default: ``"english"``).
111+
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
87112
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
88113
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
89114
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
@@ -92,8 +117,11 @@ class COMMONVOICE(Dataset):
92117
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
93118
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
94119
``"romansh sursilvan"``.
120+
folder_in_archive (str, optional): The top-level directory of the dataset.
121+
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
95122
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
96-
download (bool, optional): Deprecated.
123+
download (bool, optional):
124+
Whether to download the dataset if it is not found at root path. (default: ``False``).
97125
"""
98126

99127
_ext_txt = ".txt"
@@ -103,30 +131,10 @@ class COMMONVOICE(Dataset):
103131
def __init__(self,
104132
root: str,
105133
tsv: str = TSV,
106-
url: Optional[str] = None,
134+
url: str = URL,
107135
folder_in_archive: str = FOLDER_IN_ARCHIVE,
108136
version: str = VERSION,
109-
language: str = LANGUAGE,
110-
download: Optional[bool] = False) -> None:
111-
112-
if download is True:
113-
raise RuntimeError(
114-
"The dataset is no longer publicly accessible. You need to "
115-
"download the archives externally and place them in the root "
116-
"directory."
117-
)
118-
elif download is False:
119-
warnings.warn(
120-
"The use of the download flag is deprecated, since the dataset "
121-
"is no longer directly accessible.", RuntimeWarning
122-
)
123-
124-
if url is not None:
125-
warnings.warn(
126-
"The use of the url flag is deprecated, since the dataset "
127-
"is no longer publicly accessible. To specify the language of the dataset, "
128-
"please use the language parameter instead.", RuntimeWarning
129-
)
137+
download: bool = False) -> None:
130138

131139
languages = {
132140
"tatar": "tt",
@@ -171,22 +179,12 @@ def __init__(self,
171179
"romansh sursilvan": "rm-sursilv"
172180
}
173181

174-
if language in languages:
182+
if url in languages:
175183
ext_archive = ".tar.gz"
176-
language = languages[language]
177-
url = os.path.join(version, language + ext_archive)
178-
else:
179-
raise ValueError(
180-
'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,'
181-
'``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,'
182-
'``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,'
183-
'``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,'
184-
'``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,'
185-
'``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,'
186-
'``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,'
187-
'``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and'
188-
'``"romansh sursilvan"``.'
189-
)
184+
language = languages[url]
185+
186+
base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
187+
url = os.path.join(base_url, version, language + ext_archive)
190188

191189
basename = os.path.basename(url)
192190
archive = os.path.join(root, basename)
@@ -196,23 +194,12 @@ def __init__(self,
196194

197195
self._path = os.path.join(root, folder_in_archive)
198196

199-
if not os.path.isdir(self._path):
200-
if os.path.isfile(archive):
201-
checksum = _CHECKSUMS.get(url, None)
202-
if checksum:
203-
filepath = os.path.basename(url)
204-
with open(filepath, "rb") as file_obj:
205-
if not validate_file(file_obj, checksum, "sha256"):
206-
raise RuntimeError(
207-
f"The hash of {filepath} does not match. Delete the file manually and retry."
208-
)
197+
if download:
198+
if not os.path.isdir(self._path):
199+
if not os.path.isfile(archive):
200+
checksum = _CHECKSUMS.get(url, None)
201+
download_url(url, root, hash_value=checksum)
209202
extract_archive(archive)
210-
else:
211-
raise RuntimeError(
212-
"The dataset is no longer publicly accessible. You need to "
213-
"download the archives externally and place them in the root "
214-
"directory."
215-
)
216203

217204
self._tsv = os.path.join(root, folder_in_archive, tsv)
218205

0 commit comments

Comments
 (0)