33# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
44# Apache 2.0
55
6+ # This program converts a transcript file `text` to labels
7+ # used in CTC training.
8+ #
9+ # For example, if we have
10+ #
11+ # the lexicon file `lexicon.txt`
12+ #
13+ # foo f o o
14+ # bar b a r
15+ #
16+ # the phone symbol table `tokens.txt`
17+ #
18+ # <eps> 0
19+ # <blk> 1
20+ # a 2
21+ # b 3
22+ # f 4
23+ # o 5
24+ # r 6
25+ #
26+ # and the transcript file `text`
27+ #
28+ # utt1 foo bar bar
29+ # utt2 bar
30+ #
31+ # Given the above three inputs, this program generates a
32+ # file `labels.ark` containing
33+ #
34+ # utt1 3 4 4 2 1 5 2 1 5
35+ # utt2 2 1 5
36+ #
37+ # where
38+ # - `3 4 4` is from `(4-1) (5-1) (5-1)`, which is from the indices of `f o o`
39+ # - `2 1 5` is from `(3-1) (2-1) (6-1)`, which is from the indices of `b a r`
40+ #
41+ # Note that 1 is subtracted from here since `<eps>` exists only in FSTs
42+ # and the neural network considers index `0` as `<blk>`. Therefore, the integer
43+ # value of every symbol is shifted downwards by 1.
44+
645import argparse
746import os
847
948import kaldi
1049
1150
1251def get_args ():
13- parser = argparse .ArgumentParser (description = 'convert text to labels' )
52+ parser = argparse .ArgumentParser (description = '''
53+ Convert transcript to labels.
54+
55+ It takes the following inputs:
56+
57+ - lexicon.txt, the lexicon file
58+ - tokens.txt, the phone symbol table
59+ - dir, a directory containing the transcript file `text`
60+
61+ It generates `labels.scp` and `labels.ark` in the provided `dir`.
62+
63+ Usage:
64+ python3 ./local/convert_text_to_labels.py \
65+ --lexicon-filename data/lang/lexicon.txt \
66+ --tokens-filename data/lang/tokens.txt \
67+ --dir data/train
68+
69+ It will generate data/train/labels.scp and data/train/labels.ark.
70+ ''' )
71+
72+ parser .add_argument ('--lexicon-filename' ,
73+ dest = 'lexicon_filename' ,
74+ type = str ,
75+ help = 'filename for lexicon.txt' )
76+
77+ parser .add_argument ('--tokens-filename' ,
78+ dest = 'tokens_filename' ,
79+ type = str ,
80+ help = 'filename for the phone symbol table tokens.txt' )
1481
15- parser .add_argument ('--lexicon-filename' , dest = 'lexicon_filename' , type = str )
16- parser .add_argument ('--tokens-filename' , dest = 'tokens_filename' , type = str )
17- parser .add_argument ('--dir' , help = 'input/output dir' , type = str )
82+ parser .add_argument ('--dir' ,
83+ type = str ,
84+ help = '''the dir containing the transcript text;
85+ it will contain the generated labels.scp and labels.ark''' )
1886
1987 args = parser .parse_args ()
2088
@@ -26,14 +94,33 @@ def get_args():
2694
2795
2896def read_lexicon (filename ):
29- '''
97+ '''Read lexicon.txt and save it into a Python dict.
98+
99+ Args:
100+ filename: filename of lexicon.txt.
101+
102+ Every line in lexicon.txt has the following format:
103+
104+ word phone1 phone2 phone3 ... phoneN
105+
106+ That is, fields are separated by spaces. The first
107+ field is the word and the remaining fields are the
108+ phones indicating the pronunciation of the word.
109+
30110 Returns:
31111 a dict whose keys are words and values are phones.
32112 '''
33113 lexicon = dict ()
114+
34115 with open (filename , 'r' , encoding = 'utf-8' ) as f :
35116 for line in f :
117+ # line contains:
118+ # word phone1 phone2 phone3 ... phoneN
36119 word_phones = line .split ()
120+
121+ # It should have at least two fields:
122+ # the first one is the word and
123+ # the second one is the pronunciation
37124 assert len (word_phones ) >= 2
38125
39126 word = word_phones [0 ]
@@ -48,23 +135,43 @@ def read_lexicon(filename):
48135
49136
50137def read_tokens (filename ):
51- '''
138+ '''Read phone symbol table tokens.txt and save it into a Python dict.
139+
140+ Note that we remove the symbol `<eps>` and shift every symbol index
141+ downwards by 1.
142+
143+ Args:
144+ filename: filename of the phone symbol table tokens.txt.
145+
146+ Two integer values have specific meanings in the symbol
147+ table. The first one is 0, which is reserved for `<eps>`.
148+ And the second one is 1, which is reserved for the
149+ blank symbol `<blk>`.
150+ Other integer values do NOT have specific meanings.
151+
52152 Returns:
53153 a dict whose keys are phones and values are phone indices
54154 '''
55155 tokens = dict ()
56156 with open (filename , 'r' , encoding = 'utf-8' ) as f :
57157 for line in f :
158+ # line has the format: phone index
58159 phone_index = line .split ()
160+
161+ # it should have two fields:
162+ # the first field is the phone
163+ # and the second field is its index
59164 assert len (phone_index ) == 2
60165
61166 phone = phone_index [0 ]
62167 index = int (phone_index [1 ])
63168
64169 if phone == '<eps>' :
170+ # <eps> appears only in the FSTs.
65171 continue
66172
67173 # decreased by one since we removed <eps> above
174+ # and every symbol index is shifted downwards by 1
68175 index -= 1
69176
70177 assert phone not in tokens
@@ -82,27 +189,45 @@ def read_tokens(filename):
82189
83190
84191def read_text (filename ):
85- '''
192+ '''Read transcript file `text` and save it into a Python dict.
193+
194+ Args:
195+ filename: filename of the transcript file `text`.
196+
86197 Returns:
87198 a dict whose keys are utterance IDs and values are texts
88199 '''
89200 transcript = dict ()
90201
91202 with open (filename , 'r' , encoding = 'utf-8' ) as f :
92203 for line in f :
93- utt_text = line . split ()
94- assert len ( utt_text ) >= 2
204+ # line has the format: uttid word1 word2 word3 ... wordN
205+ uttid_text = line . split ()
95206
96- utt = utt_text [0 ]
97- text = utt_text [1 :]
207+ # it should have at least 2 fields:
208+ # the first field is the utterance id;
209+ # the remaining fields are the words of the utterance
210+ assert len (uttid_text ) >= 2
98211
99- assert utt not in transcript
100- transcript [utt ] = text
212+ uttid = uttid_text [0 ]
213+ text = uttid_text [1 :]
214+
215+ assert uttid not in transcript
216+ transcript [uttid ] = text
101217
102218 return transcript
103219
104220
105221def phones_to_indices (phone_list , tokens ):
222+ '''Convert a list of phones to a list of indices via a phone symbol table.
223+
224+ Args:
225+ phone_list: a list of phones
226+ tokens: a dict representing a phone symbol table.
227+
228+ Returns:
229+ Return a list of indices corresponding to the given phones
230+ '''
106231 index_list = []
107232
108233 for phone in phone_list :
@@ -125,27 +250,27 @@ def main():
125250
126251 transcript_labels = dict ()
127252
128- for utt , text in transcript .items ():
253+ for uttid , text in transcript .items ():
129254 labels = []
130- for t in text :
255+ for word in text :
131256 # TODO(fangjun): add support for OOV.
132- phones = lexicon [t ]
257+ phones = lexicon [word ]
133258
134259 indices = phones_to_indices (phones , tokens )
135260
136261 labels .extend (indices )
137262
138- assert utt not in transcript_labels
263+ assert uttid not in transcript_labels
139264
140- transcript_labels [utt ] = labels
265+ transcript_labels [uttid ] = labels
141266
142267 wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp' .format (
143268 dir = args .dir )
144269
145270 writer = kaldi .IntVectorWriter (wspecifier )
146271
147- for utt , labels in transcript_labels .items ():
148- writer .Write (utt , labels )
272+ for uttid , labels in transcript_labels .items ():
273+ writer .Write (uttid , labels )
149274
150275 writer .Close ()
151276
0 commit comments