1919# 1. Text preprocessing
2020#
2121# First, the input text is encoded into a list of symbols. In this
22- # tutorial, we will use English characters and phonemes as the symbols.
22+ # tutorial, we will use English characters as the symbols.
2323#
2424# 2. Spectrogram generation
2525#
4747# Preparation
4848# -----------
4949#
50- # First, we install the necessary dependencies. In addition to
51- # ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based
52- # encoding.
53- #
54-
55- # %%
56- # .. code-block:: bash
57- #
58- # %%bash
59- # pip3 install deep_phonemizer
6050
6151import torch
6252import torchaudio
@@ -140,49 +130,6 @@ def text_to_sequence(text):
140130print([processor.tokens[i] for i in processed[0, : lengths[0]]])
141131
142132
143- ######################################################################
144- # Phoneme-based encoding
145- # ~~~~~~~~~~~~~~~~~~~~~~
146- #
147- # Phoneme-based encoding is similar to character-based encoding, but it
148- # uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
149- # model.
150- #
151- # The detail of the G2P model is out of the scope of this tutorial, we will
152- # just look at what the conversion looks like.
153- #
154- # Similar to the case of character-based encoding, the encoding process is
155- # expected to match what a pretrained Tacotron2 model is trained on.
156- # ``torchaudio`` has an interface to create the process.
157- #
158- # The following code illustrates how to make and use the process. Behind
159- # the scene, a G2P model is created using ``DeepPhonemizer`` package, and
160- # the pretrained weights published by the author of ``DeepPhonemizer`` is
161- # fetched.
162- #
163-
164- bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
165-
166- processor = bundle.get_text_processor()
167-
168- text = "Hello world! Text to speech!"
169- with torch.inference_mode():
170-     processed, lengths = processor(text)
171-
172- print(processed)
173- print(lengths)
174-
175-
176- ######################################################################
177- # Notice that the encoded values are different from the example of
178- # character-based encoding.
179- #
180- # The intermediate representation looks like the following.
181- #
182-
183- print([processor.tokens[i] for i in processed[0, : lengths[0]]])
184-
185-
186133######################################################################
187134# Spectrogram Generation
188135# ----------------------
@@ -202,7 +149,7 @@ def text_to_sequence(text):
202149# :py:class:`~torchaudio.pipelines.Tacotron2TTSBundle`.
203150#
204151
205- bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
152+ bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
206153processor = bundle.get_text_processor()
207154tacotron2 = bundle.get_tacotron2().to(device)
208155
@@ -256,7 +203,7 @@ def plot():
256203# WaveRNN model from the same bundle.
257204#
258205
259- bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
206+ bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
260207
261208processor = bundle.get_text_processor()
262209tacotron2 = bundle.get_tacotron2().to(device)
@@ -299,7 +246,7 @@ def plot(waveforms, spec, sample_rate):
299246# method and pass the spectrogram.
300247#
301248
302- bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
249+ bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
303250
304251processor = bundle.get_text_processor()
305252tacotron2 = bundle.get_tacotron2().to(device)
0 commit comments