 #
 # 2. Spectrogram generation
 #
-# From the encoded text, a spectrogram is generated. We use ``Tacotron2``
+# From the encoded text, a spectrogram is generated. We use the ``Tacotron2``
 # model for this.
 #
 # 3. Time-domain conversion
 #
 # The last step is converting the spectrogram into the waveform. The
-# process to generate speech from spectrogram is also called Vocoder.
+# process to generate speech from a spectrogram is also called a vocoder.
 # In this tutorial, three different vocoders are used,
 # :py:class:`~torchaudio.models.WaveRNN`,
 # :py:class:`~torchaudio.transforms.GriffinLim`, and
 # works.
 #
 # Since the pre-trained Tacotron2 model expects a specific set of symbol
-# tables, the same functionalities available in ``torchaudio``. This
-# section is more for the explanation of the basis of encoding.
+# tables, the same functionality is available in ``torchaudio``. However,
+# we will first manually implement the encoding to aid understanding.
 #
-# Firstly, we define the set of symbols. For example, we can use
+# First, we define the set of symbols
 # ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we will map
 # each character of the input text into the index of the corresponding
-# symbol in the table.
-#
-# The following is an example of such processing. In the example, symbols
-# that are not in the table are ignored.
-#
+# symbol in the table. Symbols that are not in the table are ignored.
 
 symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
 look_up = {s: i for i, s in enumerate(symbols)}
@@ -118,8 +114,8 @@ def text_to_sequence(text):
 
 ######################################################################
 # As mentioned above, the symbol table and indices must match
-# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the
-# transform along with the pretrained model. For example, you can
+# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the same
+# transform along with the pretrained model. You can
 # instantiate and use such a transform as follows.
 #
 
@@ -133,12 +129,12 @@ def text_to_sequence(text):
 
 
 ######################################################################
-# The ``processor`` object takes either a text or list of texts as inputs.
+# Note: the output of our manual encoding matches the output of the
+# ``torchaudio`` ``text_processor``, confirming that we correctly
+# re-implemented what the library does internally. The processor takes
+# either a text or a list of texts as inputs.
 # When a list of texts is provided, the returned ``lengths`` variable
 # represents the valid length of each processed token sequence in the
 # output batch.
 #
-# The intermediate representation can be retrieved as follow.
+# The intermediate representation can be retrieved as follows:
 #
 
 print([processor.tokens[i] for i in processed[0, :lengths[0]]])
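The batch semantics described above (right-padded sequences plus a ``lengths`` vector marking the valid part of each row) can be sketched in plain Python using the manual character table, without the pretrained processor; this is an illustration of the idea, not the ``torchaudio`` implementation:

```python
# Encode a batch of texts, pad to a rectangular batch, and report per-row
# valid lengths, mirroring the behavior described for the text processor.
symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
look_up = {s: i for i, s in enumerate(symbols)}

def process_batch(texts, pad=0):
    seqs = [[look_up[c] for c in t.lower() if c in look_up] for t in texts]
    lengths = [len(s) for s in seqs]
    width = max(lengths)
    # Right-pad every sequence to the longest one in the batch.
    batch = [s + [pad] * (width - len(s)) for s in seqs]
    return batch, lengths

batch, lengths = process_batch(["Hello!", "Hi"])
# Only the first ``lengths[0]`` entries of row 0 are valid tokens.
print("".join(symbols[i] for i in batch[0][: lengths[0]]))  # → hello!
```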
@@ -152,7 +148,7 @@ def text_to_sequence(text):
 # uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
 # model.
 #
-# The detail of the G2P model is out of scope of this tutorial, we will
+# The details of the G2P model are out of the scope of this tutorial; we will
 # just look at what the conversion looks like.
 #
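To give a feel for what a grapheme-to-phoneme conversion produces, here is a toy lookup. It is entirely hypothetical: the real bundle uses a trained G2P model, not a word dictionary, and the phoneme symbols below are just illustrative ARPABET-style labels.

```python
# Hypothetical word-to-phoneme table (NOT the real G2P model).
phoneme_table = {
    "hello": ["HH", "AH", "L", "OW"],
    "world": ["W", "ER", "L", "D"],
}

def toy_g2p(text):
    # Look up each word; fall back to spelling out unknown words.
    return [p for w in text.lower().split() for p in phoneme_table.get(w, list(w))]

print(toy_g2p("Hello world"))
# → ['HH', 'AH', 'L', 'OW', 'W', 'ER', 'L', 'D']
```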
 # Similar to the case of character-based encoding, the encoding process is
@@ -195,7 +191,7 @@ def text_to_sequence(text):
 # encoded text. For the details of the model, please refer to `the
 # paper <https://arxiv.org/abs/1712.05884>`__.
 #
-# It is easy to instantiate a Tacotron2 model with pretrained weight,
+# It is easy to instantiate a Tacotron2 model with pretrained weights,
 # however, note that the input to Tacotron2 models needs to be processed
 # by the matching text processor.
 #
@@ -224,7 +220,7 @@ def text_to_sequence(text):
 
 ######################################################################
 # Note that the ``Tacotron2.infer`` method performs multinomial sampling,
-# therefor, the process of generating the spectrogram incurs randomness.
+# therefore, the process of generating the spectrogram incurs randomness.
 #
 
 
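The randomness noted above can be illustrated in plain Python. The sketch below is not the Tacotron2 code (which samples with ``torch``'s RNG); it only shows why multinomial sampling makes the output stochastic, and why fixing the seed makes runs reproducible:

```python
import random

def multinomial_sample(weights, rng):
    # Draw one index with probability proportional to its weight.
    total = sum(weights)
    r = rng.random() * total
    acc = 0.0
    for i, w in enumerate(weights):
        acc += w
        if r < acc:
            return i
    return len(weights) - 1

# Two generators with the same seed produce identical draws; an unseeded
# run would generally differ from one invocation to the next.
rng_a = random.Random(0)
rng_b = random.Random(0)
draws_a = [multinomial_sample([0.1, 0.6, 0.3], rng_a) for _ in range(5)]
draws_b = [multinomial_sample([0.1, 0.6, 0.3], rng_b) for _ in range(5)]
print(draws_a == draws_b)  # → True
```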
@@ -245,16 +241,16 @@ def plot():
 # -------------------
 #
 # Once the spectrogram is generated, the last process is to recover the
-# waveform from the spectrogram.
+# waveform from the spectrogram using a vocoder.
 #
 # ``torchaudio`` provides vocoders based on ``GriffinLim`` and
 # ``WaveRNN``.
 #
 
 
 ######################################################################
-# WaveRNN
-# ~~~~~~~
+# WaveRNN Vocoder
+# ~~~~~~~~~~~~~~~
 #
 # Continuing from the previous section, we can instantiate the matching
 # WaveRNN model from the same bundle.
@@ -294,11 +290,11 @@ def plot(waveforms, spec, sample_rate):
 
 
 ######################################################################
-# Griffin-Lim
-# ~~~~~~~~~~~
+# Griffin-Lim Vocoder
+# ~~~~~~~~~~~~~~~~~~~
 #
 # Using the Griffin-Lim vocoder is the same as WaveRNN. You can instantiate
-# the vocode object with
+# the vocoder object with the
 # :py:func:`~torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder`
 # method and pass the spectrogram.
 #
@@ -323,8 +319,8 @@ def plot(waveforms, spec, sample_rate):
 
 
 ######################################################################
-# Waveglow
-# ~~~~~~~~
+# Waveglow Vocoder
+# ~~~~~~~~~~~~~~~~
 #
 # Waveglow is a vocoder published by Nvidia. The pretrained weights are
 # published on Torch Hub. One can instantiate the model using ``torch.hub``