Skip to content

Commit 22a6bfe

Browse files
rlangman and Jorjeous authored
Validate chapter duration in HiFiTTS-2 download (#129)
* Validate chapter duration in HiFiTTS-2 download Signed-off-by: Ryan <[email protected]> * Update Dockerfile Signed-off-by: George Zelenfroind <[email protected]> --------- Signed-off-by: Ryan <[email protected]> Signed-off-by: George Zelenfroind <[email protected]> Co-authored-by: George Zelenfroind <[email protected]>
1 parent 9466a50 commit 22a6bfe

File tree

2 files changed

+33
-8
lines changed

2 files changed

+33
-8
lines changed

docker/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ RUN apt-get update \
2121
# Update pip
2222
RUN pip install --upgrade pip
2323

24+
# Install typing-extensions manually
25+
RUN pip install typing-extensions
26+
2427
# Clone the NeMo SDP repository
2528
COPY . /src/NeMo-speech-data-processor
2629
RUN rm -rf /src/NeMo-speech-data-processor/.git
@@ -34,4 +37,4 @@ RUN find requirements/ -name "*.txt" -exec pip install -r {} \;
3437
WORKDIR /src/NeMo-speech-data-processor
3538

3639
# Set up entrypoint
37-
CMD ["bash"]
40+
CMD ["bash"]

sdp/processors/datasets/hifitts2/download_dataset.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class DownloadHiFiTTS2(BaseParallelProcessor):
4646
4747
Returns:
4848
Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'.
49-
49+
5050
If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to download,
5151
with error information stored under the 'error_reason' field.
5252
@@ -107,29 +107,51 @@ def process_dataset_entry(self, data_entry):
107107
try:
108108
urllib.request.urlretrieve(url=url, filename=chapter_path)
109109
break
110-
except (urllib.error.HTTPError, urllib.error.URLError) as http_error:
111-
error_msg = f"Encountered HTTP error when downloading {url}: {http_error}"
110+
except Exception as ex:
111+
error_msg = f"Encountered exception when downloading {url}: {ex}"
112112
logger.warning(error_msg)
113113

114-
error_code = getattr(http_error, "code", 0)
115-
if (not error_code or str(error_code).startswith("5")) and i < self.num_retries:
114+
if i < self.num_retries:
116115
logger.info(f"Retry {i} for url {url}")
117116
time.sleep(10)
118117
continue
119118

120119
if self.exit_on_error:
121120
raise RuntimeError(error_msg)
122121

122+
if isinstance(ex, urllib.error.URLError):
123+
error_reason = ex.reason
124+
else:
125+
error_reason = repr(ex)
126+
123127
error_data = {
124128
"url": url,
125129
"chapter_filepath": chapter_filepath,
126-
"error_code": error_code,
127-
"error_reason": http_error.reason,
130+
"error_reason": error_reason,
128131
"utterances": utterances,
129132
}
130133
return [DataEntry(data=error_data)]
131134

132135
chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate)
136+
chapter_duration = librosa.get_duration(y=chapter_audio, sr=sr)
137+
138+
original_duration = data_entry["duration"]
139+
duration_diff = abs(chapter_duration - original_duration)
140+
if duration_diff > 0.1:
141+
error_msg = f"Duration mismatch for {url}: original duration={original_duration}; " \
142+
f"downloaded duration={round(chapter_duration, 2)}"
143+
logger.warning(error_msg)
144+
145+
if self.exit_on_error:
146+
raise RuntimeError(error_msg)
147+
148+
error_data = {
149+
"url": url,
150+
"chapter_filepath": chapter_filepath,
151+
"error_reason": error_msg,
152+
"utterances": utterances,
153+
}
154+
return [DataEntry(data=error_data)]
133155

134156
for utt in utterances:
135157
audio_filepath = utt["audio_filepath"]

0 commit comments

Comments
 (0)