19
19
# 1. Text preprocessing
20
20
#
21
21
# First, the input text is encoded into a list of symbols. In this
22
- # tutorial, we will use English characters and phonemes as the symbols.
22
+ # tutorial, we will use English characters as the symbols.
23
23
#
24
24
# 2. Spectrogram generation
25
25
#
47
47
# Preparation
48
48
# -----------
49
49
#
50
- # First, we install the necessary dependencies. In addition to
51
- # ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based
52
- # encoding.
53
- #
54
-
55
- # %%
56
- # .. code-block:: bash
57
- #
58
- # %%bash
59
- # pip3 install deep_phonemizer
60
50
61
51
import torch
62
52
import torchaudio
@@ -140,49 +130,6 @@ def text_to_sequence(text):
140
130
print ([processor .tokens [i ] for i in processed [0 , : lengths [0 ]]])
141
131
142
132
143
- ######################################################################
144
- # Phoneme-based encoding
145
- # ~~~~~~~~~~~~~~~~~~~~~~
146
- #
147
- # Phoneme-based encoding is similar to character-based encoding, but it
148
- # uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
149
- # model.
150
- #
151
- # The details of the G2P model are out of the scope of this tutorial; we will
152
- # just look at what the conversion looks like.
153
- #
154
- # Similar to the case of character-based encoding, the encoding process is
155
- # expected to match what a pretrained Tacotron2 model is trained on.
156
- # ``torchaudio`` has an interface to create the process.
157
- #
158
- # The following code illustrates how to make and use the process. Behind
159
- # the scenes, a G2P model is created using the ``DeepPhonemizer`` package, and
160
- # the pretrained weights published by the author of ``DeepPhonemizer`` are
161
- # fetched.
162
- #
163
-
164
- bundle = torchaudio .pipelines .TACOTRON2_WAVERNN_PHONE_LJSPEECH
165
-
166
- processor = bundle .get_text_processor ()
167
-
168
- text = "Hello world! Text to speech!"
169
- with torch .inference_mode ():
170
- processed , lengths = processor (text )
171
-
172
- print (processed )
173
- print (lengths )
174
-
175
-
176
- ######################################################################
177
- # Notice that the encoded values are different from the example of
178
- # character-based encoding.
179
- #
180
- # The intermediate representation looks like the following.
181
- #
182
-
183
- print ([processor .tokens [i ] for i in processed [0 , : lengths [0 ]]])
184
-
185
-
186
133
######################################################################
187
134
# Spectrogram Generation
188
135
# ----------------------
@@ -202,7 +149,7 @@ def text_to_sequence(text):
202
149
# :py:class:`~torchaudio.pipelines.Tacotron2TTSBundle`.
203
150
#
204
151
205
- bundle = torchaudio .pipelines .TACOTRON2_WAVERNN_PHONE_LJSPEECH
152
+ bundle = torchaudio .pipelines .TACOTRON2_WAVERNN_CHAR_LJSPEECH
206
153
processor = bundle .get_text_processor ()
207
154
tacotron2 = bundle .get_tacotron2 ().to (device )
208
155
@@ -256,7 +203,7 @@ def plot():
256
203
# WaveRNN model from the same bundle.
257
204
#
258
205
259
- bundle = torchaudio .pipelines .TACOTRON2_WAVERNN_PHONE_LJSPEECH
206
+ bundle = torchaudio .pipelines .TACOTRON2_WAVERNN_CHAR_LJSPEECH
260
207
261
208
processor = bundle .get_text_processor ()
262
209
tacotron2 = bundle .get_tacotron2 ().to (device )
@@ -299,7 +246,7 @@ def plot(waveforms, spec, sample_rate):
299
246
# method and pass the spectrogram.
300
247
#
301
248
302
- bundle = torchaudio .pipelines .TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
249
+ bundle = torchaudio .pipelines .TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
303
250
304
251
processor = bundle .get_text_processor ()
305
252
tacotron2 = bundle .get_tacotron2 ().to (device )
0 commit comments