Skip to content

Commit b832be4

Browse files
committed
add more documentation.
1 parent e6526be commit b832be4

File tree

2 files changed

+169
-22
lines changed

2 files changed

+169
-22
lines changed

egs/aishell/s10b/local/convert_text_to_labels.py

Lines changed: 145 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,86 @@
33
# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
44
# Apache 2.0
55

6+
# This program converts a transcript file `text` to labels
7+
# used in CTC training.
8+
#
9+
# For example, if we have
10+
#
11+
# the lexicon file `lexicon.txt`
12+
#
13+
# foo f o o
14+
# bar b a r
15+
#
16+
# the phone symbol table `tokens.txt`
17+
#
18+
# <eps> 0
19+
# <blk> 1
20+
# a 2
21+
# b 3
22+
# f 4
23+
# o 5
24+
# r 6
25+
#
26+
# and the transcript file `text`
27+
#
28+
# utt1 foo bar bar
29+
# utt2 bar
30+
#
31+
# Given the above three inputs, this program generates a
32+
# file `labels.ark` containing
33+
#
34+
# utt1 3 4 4 2 1 5 2 1 5
35+
# utt2 2 1 5
36+
#
37+
# where
38+
# - `3 4 4` is from `(4-1) (5-1) (5-1)`, which is from the indices of `f o o`
39+
# - `2 1 5` is from `(3-1) (2-1) (6-1)`, which is from the indices of `b a r`
40+
#
41+
# Note that 1 is subtracted here since `<eps>` exists only in FSTs
42+
# and the neural network considers index `0` as `<blk>`. Therefore, the integer
43+
# value of every symbol is shifted downwards by 1.
44+
645
import argparse
746
import os
847

948
import kaldi
1049

1150

1251
def get_args():
13-
parser = argparse.ArgumentParser(description='convert text to labels')
52+
parser = argparse.ArgumentParser(description='''
53+
Convert transcript to labels.
54+
55+
It takes the following inputs:
56+
57+
- lexicon.txt, the lexicon file
58+
- tokens.txt, the phone symbol table
59+
- dir, a directory containing the transcript file `text`
60+
61+
It generates `labels.scp` and `labels.ark` in the provided `dir`.
62+
63+
Usage:
64+
python3 ./local/convert_text_to_labels.py \
65+
--lexicon-filename data/lang/lexicon.txt \
66+
--tokens-filename data/lang/tokens.txt \
67+
--dir data/train
68+
69+
It will generate data/train/labels.scp and data/train/labels.ark.
70+
''')
71+
72+
parser.add_argument('--lexicon-filename',
73+
dest='lexicon_filename',
74+
type=str,
75+
help='filename for lexicon.txt')
76+
77+
parser.add_argument('--tokens-filename',
78+
dest='tokens_filename',
79+
type=str,
80+
help='filename for the phone symbol table tokens.txt')
1481

15-
parser.add_argument('--lexicon-filename', dest='lexicon_filename', type=str)
16-
parser.add_argument('--tokens-filename', dest='tokens_filename', type=str)
17-
parser.add_argument('--dir', help='input/output dir', type=str)
82+
parser.add_argument('--dir',
83+
type=str,
84+
help='''the dir containing the transcript text;
85+
it will contain the generated labels.scp and labels.ark''')
1886

1987
args = parser.parse_args()
2088

@@ -26,14 +94,33 @@ def get_args():
2694

2795

2896
def read_lexicon(filename):
29-
'''
97+
'''Read lexicon.txt and save it into a Python dict.
98+
99+
Args:
100+
filename: filename of lexicon.txt.
101+
102+
Every line in lexicon.txt has the following format:
103+
104+
word phone1 phone2 phone3 ... phoneN
105+
106+
That is, fields are separated by spaces. The first
107+
field is the word and the remaining fields are the
108+
phones indicating the pronunciation of the word.
109+
30110
Returns:
31111
a dict whose keys are words and values are phones.
32112
'''
33113
lexicon = dict()
114+
34115
with open(filename, 'r', encoding='utf-8') as f:
35116
for line in f:
117+
# line contains:
118+
# word phone1 phone2 phone3 ... phoneN
36119
word_phones = line.split()
120+
121+
# It should have at least two fields:
122+
# the first one is the word and
123+
# the second one is the pronunciation
37124
assert len(word_phones) >= 2
38125

39126
word = word_phones[0]
@@ -48,23 +135,43 @@ def read_lexicon(filename):
48135

49136

50137
def read_tokens(filename):
51-
'''
138+
'''Read phone symbol table tokens.txt and save it into a Python dict.
139+
140+
Note that we remove the symbol `<eps>` and shift every symbol index
141+
downwards by 1.
142+
143+
Args:
144+
filename: filename of the phone symbol table tokens.txt.
145+
146+
Two integer values have specific meanings in the symbol
147+
table. The first one is 0, which is reserved for `<eps>`.
148+
And the second one is 1, which is reserved for the
149+
blank symbol `<blk>`.
150+
Other integer values do NOT have specific meanings.
151+
52152
Returns:
53153
a dict whose keys are phones and values are phone indices
54154
'''
55155
tokens = dict()
56156
with open(filename, 'r', encoding='utf-8') as f:
57157
for line in f:
158+
# line has the format: phone index
58159
phone_index = line.split()
160+
161+
# it should have two fields:
162+
# the first field is the phone
163+
# and the second field is its index
59164
assert len(phone_index) == 2
60165

61166
phone = phone_index[0]
62167
index = int(phone_index[1])
63168

64169
if phone == '<eps>':
170+
# <eps> appears only in the FSTs.
65171
continue
66172

67173
# decreased by one since we removed <eps> above
174+
# and every symbol index is shifted downwards by 1
68175
index -= 1
69176

70177
assert phone not in tokens
@@ -82,27 +189,45 @@ def read_tokens(filename):
82189

83190

84191
def read_text(filename):
85-
'''
192+
'''Read transcript file `text` and save it into a Python dict.
193+
194+
Args:
195+
filename: filename of the transcript file `text`.
196+
86197
Returns:
87198
a dict whose keys are utterance IDs and values are texts
88199
'''
89200
transcript = dict()
90201

91202
with open(filename, 'r', encoding='utf-8') as f:
92203
for line in f:
93-
utt_text = line.split()
94-
assert len(utt_text) >= 2
204+
# line has the format: uttid word1 word2 word3 ... wordN
205+
uttid_text = line.split()
95206

96-
utt = utt_text[0]
97-
text = utt_text[1:]
207+
# it should have at least 2 fields:
208+
# the first field is the utterance id;
209+
# the remaining fields are the words of the utterance
210+
assert len(uttid_text) >= 2
98211

99-
assert utt not in transcript
100-
transcript[utt] = text
212+
uttid = uttid_text[0]
213+
text = uttid_text[1:]
214+
215+
assert uttid not in transcript
216+
transcript[uttid] = text
101217

102218
return transcript
103219

104220

105221
def phones_to_indices(phone_list, tokens):
222+
'''Convert a list of phones to a list of indices via a phone symbol table.
223+
224+
Args:
225+
phone_list: a list of phones
226+
tokens: a dict representing a phone symbol table.
227+
228+
Returns:
229+
Return a list of indices corresponding to the given phones
230+
'''
106231
index_list = []
107232

108233
for phone in phone_list:
@@ -125,27 +250,27 @@ def main():
125250

126251
transcript_labels = dict()
127252

128-
for utt, text in transcript.items():
253+
for uttid, text in transcript.items():
129254
labels = []
130-
for t in text:
255+
for word in text:
131256
# TODO(fangjun): add support for OOV.
132-
phones = lexicon[t]
257+
phones = lexicon[word]
133258

134259
indices = phones_to_indices(phones, tokens)
135260

136261
labels.extend(indices)
137262

138-
assert utt not in transcript_labels
263+
assert uttid not in transcript_labels
139264

140-
transcript_labels[utt] = labels
265+
transcript_labels[uttid] = labels
141266

142267
wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp'.format(
143268
dir=args.dir)
144269

145270
writer = kaldi.IntVectorWriter(wspecifier)
146271

147-
for utt, labels in transcript_labels.items():
148-
writer.Write(utt, labels)
272+
for uttid, labels in transcript_labels.items():
273+
writer.Write(uttid, labels)
149274

150275
writer.Close()
151276

egs/aishell/s10b/local/token_to_fst.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,38 @@
33
# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
44
# Apache 2.0
55

6+
# This program takes as input a phone symbol table
7+
# `tokens.txt` and prints a text fst to the console.
8+
#
9+
# You can use `fstcompile` to convert the printed text fst
10+
# to a binary fst.
11+
#
12+
# Two integer values in the symbol table have particular meaning:
13+
# - 0 for `<eps>`
14+
# - 1 for the blank symbol `<blk>`
15+
616
import argparse
717
import os
818

919

1020
def get_args():
11-
parser = argparse.ArgumentParser(
12-
description='convert tokens.txt to tokens.fst')
21+
parser = argparse.ArgumentParser(description='''
22+
Convert tokens.txt to tokens.fst.
23+
24+
Usage:
25+
python3 ./local/token_to_fst.py \
26+
--tokens-txt-filename data/lang/tokens.txt |
27+
fstcompile \
28+
--isymbols=data/lang/tokens.txt \
29+
--osymbols=data/lang/tokens.txt \
30+
--keep_isymbols=false \
31+
--keep_osymbols=false |
32+
fstarcsort --sort_type=olabel > data/lang/T.fst || exit 1
33+
''')
1334

1435
parser.add_argument('--tokens-txt-filename',
1536
dest='tokens_txt_filename',
37+
help="a phone symbol table",
1638
type=str)
1739

1840
args = parser.parse_args()

0 commit comments

Comments
 (0)