33# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
44# Apache 2.0
55
6+ # This program converts a transcript file `text` to labels
7+ # used in CTC training.
8+ #
9+ # For example, if we have
10+ #
11+ # the lexicon file `lexicon.txt`
12+ #
13+ # foo f o o
14+ # bar b a r
15+ #
16+ # the phone symbol table `tokens.txt`
17+ #
18+ # <eps> 0
19+ # <blk> 1
20+ # a 2
21+ # b 3
22+ # f 4
23+ # o 5
24+ # r 6
25+ #
26+ # and the transcript file `text`
27+ #
28+ # utt1 foo bar bar
29+ # utt2 bar
30+ #
31+ # Given the above three inputs, this program generates a
32+ # file `labels.ark` containing
33+ #
34+ # utt1 3 4 4 2 1 5 2 1 5
35+ # utt2 2 1 5
36+ #
37+ # where
38+ # - `3 4 4` is from `(4-1) (5-1) (5-1)`, which is from the indices of `f o o`
39+ # - `2 1 5` is from `(3-1) (2-1) (6-1)`, which is from the indices of `b a r`
40+ #
41+ # Note that 1 is subtracted from here since `<eps>` exists only in FSTs
42+ # and the neural network considers index `0` as `<blk>`. Therefore, the integer
43+ # value of every symbol is shifted downwards by 1.
44+
645import argparse
746import os
847
948import kaldi
1049
1150
1251def get_args ():
13- parser = argparse .ArgumentParser (description = 'convert text to labels' )
52+ parser = argparse .ArgumentParser (description = '''
53+ Convert transcript to labels.
54+
55+ It takes the following inputs:
56+
57+ - lexicon.txt, the lexicon file
58+ - tokens.txt, the phone symbol table
59+ - dir, a directory containing the transcript file `text`
60+
61+ It generates `labels.scp` and `labels.ark` in the provided `dir`.
62+
63+ Usage:
64+ python3 ./local/convert_text_to_labels.py \
65+ --lexicon-filename data/lang/lexicon.txt \
66+ --tokens-filename data/lang/tokens.txt \
67+ --dir data/train
68+
69+ It will generate data/train/labels.scp and data/train/labels.ark.
70+ ''' )
71+
72+ parser .add_argument ('--lexicon-filename' ,
73+ dest = 'lexicon_filename' ,
74+ type = str ,
75+ help = 'filename for lexicon.txt' )
76+
77+ parser .add_argument ('--tokens-filename' ,
78+ dest = 'tokens_filename' ,
79+ type = str ,
80+ help = 'filename for the phone symbol table tokens.txt' )
1481
15- parser .add_argument ('--lexicon-filename' , dest = 'lexicon_filename' , type = str )
16- parser .add_argument ('--tokens-filename' , dest = 'tokens_filename' , type = str )
17- parser .add_argument ('--dir' , help = 'input/output dir' , type = str )
82+ parser .add_argument ('--dir' ,
83+ type = str ,
84+ help = '''the dir containing the transcript text;
85+ it will contain the generated labels.scp and labels.ark''' )
1886
1987 args = parser .parse_args ()
2088
@@ -26,14 +94,33 @@ def get_args():
2694
2795
2896def read_lexicon (filename ):
29- '''
97+ '''Read lexicon.txt and save it into a Python dict.
98+
99+ Args:
100+ filename: filename of lexicon.txt.
101+
102+ Every line in lexicon.txt has the following format:
103+
104+ word phone1 phone2 phone3 ... phoneN
105+
106+ That is, fields are separated by spaces. The first
107+ field is the word and the remaining fields are the
108+ phones indicating the pronunciation of the word.
109+
30110 Returns:
31111 a dict whose keys are words and values are phones.
32112 '''
33113 lexicon = dict ()
114+
34115 with open (filename , 'r' , encoding = 'utf-8' ) as f :
35116 for line in f :
117+ # line contains:
118+ # word phone1 phone2 phone3 ... phoneN
36119 word_phones = line .split ()
120+
121+ # It should have at least two fields:
122+ # the first one is the word and
123+ # the second one is the pronunciation
37124 assert len (word_phones ) >= 2
38125
39126 word = word_phones [0 ]
@@ -48,23 +135,43 @@ def read_lexicon(filename):
48135
49136
50137def read_tokens (filename ):
51- '''
138+ '''Read phone symbol table tokens.txt and save it into a Python dict.
139+
140+ Note that we remove the symbol `<eps>` and shift every symbol index
141+ downwards by 1.
142+
143+ Args:
144+ filename: filename of the phone symbol table tokens.txt.
145+
146+ Two integer values have specific meanings in the symbol
147+ table. The first one is 0, which is reserved for `<eps>`.
148+ And the second one is 1, which is reserved for the
149+ blank symbol `<blk>`.
150+ Other integer values do NOT have specific meanings.
151+
52152 Returns:
53153 a dict whose keys are phones and values are phone indices
54154 '''
55155 tokens = dict ()
56156 with open (filename , 'r' , encoding = 'utf-8' ) as f :
57157 for line in f :
158+ # line has the format: phone index
58159 phone_index = line .split ()
160+
161+ # it should have two fields:
162+ # the first field is the phone
163+ # and the second field is its index
59164 assert len (phone_index ) == 2
60165
61166 phone = phone_index [0 ]
62167 index = int (phone_index [1 ])
63168
64169 if phone == '<eps>' :
170+ # <eps> appears only in the FSTs.
65171 continue
66172
67173 # decreased by one since we removed <eps> above
174+ # and every symbol index is shifted downwards by 1
68175 index -= 1
69176
70177 assert phone not in tokens
@@ -82,27 +189,45 @@ def read_tokens(filename):
82189
83190
84191def read_text (filename ):
85- '''
192+ '''Read transcript file `text` and save it into a Python dict.
193+
194+ Args:
195+ filename: filename of the transcript file `text`.
196+
86197 Returns:
87198 a dict whose keys are utterance IDs and values are texts
88199 '''
89200 transcript = dict ()
90201
91202 with open (filename , 'r' , encoding = 'utf-8' ) as f :
92203 for line in f :
93- utt_text = line . split ()
94- assert len ( utt_text ) >= 2
204+ # line has the format: uttid word1 word2 word3 ... wordN
205+ uttid_text = line . split ()
95206
96- utt = utt_text [0 ]
97- text = utt_text [1 :]
207+ # it should have at least 2 fields:
208+ # the first field is the utterance id;
209+ # the remaining fields are the words of the utterance
210+ assert len (uttid_text ) >= 2
98211
99- assert utt not in transcript
100- transcript [utt ] = text
212+ uttid = uttid_text [0 ]
213+ text = uttid_text [1 :]
214+
215+ assert uttid not in transcript
216+ transcript [uttid ] = text
101217
102218 return transcript
103219
104220
105221def phones_to_indices (phone_list , tokens ):
222+ '''Convert a list of phones to a list of indices via a phone symbol table.
223+
224+ Args:
225+ phone_list: a list of phones
226+ tokens: a dict representing a phone symbol table.
227+
228+ Returns:
229+ Return a list of indices corresponding to the given phones
230+ '''
106231 index_list = []
107232
108233 for phone in phone_list :
@@ -125,27 +250,27 @@ def main():
125250
126251 transcript_labels = dict ()
127252
128- for utt , text in transcript .items ():
253+ for uttid , text in transcript .items ():
129254 labels = []
130- for t in text :
255+ for word in text :
131256 # TODO(fangjun): add support for OOV.
132- phones = lexicon [t ]
257+ phones = lexicon [word ]
133258
134259 indices = phones_to_indices (phones , tokens )
135260
136261 labels .extend (indices )
137262
138- assert utt not in transcript_labels
263+ assert uttid not in transcript_labels
139264
140- transcript_labels [utt ] = labels
265+ transcript_labels [uttid ] = labels
141266
142267 wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp' .format (
143268 dir = args .dir )
144269
145270 writer = kaldi .IntVectorWriter (wspecifier )
146271
147- for utt , labels in transcript_labels .items ():
148- writer .Write (utt , labels )
272+ for uttid , labels in transcript_labels .items ():
273+ writer .Write (uttid , labels )
149274
150275 writer .Close ()
151276
0 commit comments