Skip to content

Commit e6526be

Browse files
committed
begin to add CTC training with kaldi pybind and PyTorch.
1 parent f5875be commit e6526be

File tree

14 files changed

+782
-0
lines changed

14 files changed

+782
-0
lines changed

egs/aishell/s10b/cmd.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# Select the command runners used by the recipe scripts.
#
# With no queueing system, run everything on the local machine via run.pl
# (be careful to run commands one by one: most recipes will exhaust the
# memory on your machine otherwise).  If you have GridEngine (qsub), switch
# these to queue.pl; for Slurm use slurm.pl.  Different queues are configured
# differently, with different queue names and ways of specifying things like
# memory; create and edit conf/queue.conf to match your queue's configuration.
# See http://kaldi-asr.org/doc/queue.html (search for conf/queue.conf), or
# search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="run.pl"
export decode_cmd="run.pl"
export mkgraph_cmd="run.pl"
export cuda_cmd="run.pl"

egs/aishell/s10b/conf/fbank.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
--num-mel-bins=40
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

# Prepare the AISHELL-1 corpus: locate the wav files, split them into
# train/dev/test sets, and generate the standard Kaldi data files
# (wav.scp, text, utt2spk, spk2utt) under data/{train,dev,test}.

. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <audio-path> <text-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
  exit 1;
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt

train_dir=data/local/train
dev_dir=data/local/dev
test_dir=data/local/test
tmp_dir=data/local/tmp

mkdir -p $train_dir $dev_dir $test_dir $tmp_dir

# data directory check
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0 requires two directory arguments"
  exit 1;
fi

# find wav audio file for train, dev and test resp.
find "$aishell_audio_dir" -iname "*.wav" > $tmp_dir/wav.flist
n=$(wc -l < $tmp_dir/wav.flist)
[ "$n" -ne 141925 ] && \
  echo "Warning: expected 141925 data files, found $n"

# The wav files live under wav/{train,dev,test}/ in the corpus layout.
grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;

rm -r $tmp_dir

# Transcriptions preparation
for dir in $train_dir $dev_dir $test_dir; do
  echo "Preparing $dir transcriptions"
  # utterance id = wav basename without extension; speaker id = parent dir name
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  # keep only utterances that actually have a transcription
  utils/filter_scp.pl -f 1 $dir/utt.list "$aishell_text" > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p data/train data/dev data/test

for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f data/train/$f || exit 1;
  cp $dev_dir/$f data/dev/$f || exit 1;
  cp $test_dir/$f data/test/$f || exit 1;
done

echo "$0: AISHELL data preparation succeeded"
exit 0;
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

# prepare dict resources: copy the lexicon and derive the phone sets
# (silence/nonsilence phones, optional silence, extra questions)
# under data/local/dict.

. ./path.sh

[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;

res_dir=$1
dict_dir=data/local/dict
mkdir -p $dict_dir
# abort if the lexicon is missing; everything below depends on it
cp "$res_dir/lexicon.txt" $dict_dir || exit 1;

# Collect the phone inventory from the lexicon (fields 2..NF of every entry)
# and group phones that differ only in their trailing tone digit onto one
# line of nonsilence_phones.txt.
awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' $dict_dir/lexicon.txt | \
  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
    foreach $l (values %q) {print "$l\n";}
  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;

echo sil > $dict_dir/silence_phones.txt

echo sil > $dict_dir/optional_silence.txt

# No "extra questions" in the input to this setup, as we don't
# have stress or tone

awk '{printf("%s ", $1);} END{printf "\n";}' $dict_dir/silence_phones.txt > $dict_dir/extra_questions.txt || exit 1;
# group nonsilence phones by tone digit for the extra questions
perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
  $dict_dir/nonsilence_phones.txt >> $dict_dir/extra_questions.txt || exit 1;

echo "$0: AISHELL dict preparation succeeded"
exit 0;
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
#!/bin/bash

# Train a 3-gram language model with kaldi_lm on the AISHELL training
# transcripts, mapping OOV words to <SPOKEN_NOISE>.

# To be run from one directory above this script.
. ./path.sh

text=data/local/train/text
lexicon=data/local/dict/lexicon.txt

for f in "$text" "$lexicon"; do
  # BUG FIX: this used to test $x (always unset), so missing inputs
  # were never detected; test the loop variable $f instead.
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/train/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir

kaldi_lm=$(command -v train_lm.sh)
if [ -z "$kaldi_lm" ]; then
  echo "$0: train_lm.sh is not found. That might mean it's not installed"
  echo "$0: or it is not added to PATH"
  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
  exit 1
fi

cleantext=$dir/text.no_oov

# Replace every word not in the lexicon with <SPOKEN_NOISE>.
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <SPOKEN_NOISE> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<SPOKEN_NOISE>" > $dir/word_map \
  || exit 1;

# note: ignore 1st field of train.txt, it's the utterance-id.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
  || exit 1;

train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;

# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
# Perplexity over 128254.000000 words is 90.446690

# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz

exit 0


# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).  This section is never executed (it is after
# "exit 0"); it is kept for reference only.
heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
# NOTE(review): tail -n +$heldout_sent overlaps the heldout set by one line;
# arguably should be +$((heldout_sent+1)) — confirm before using this section.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist


ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<SPOKEN_NOISE>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482

# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <SPOKEN_NOISE>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#!/usr/bin/env python3
2+
3+
# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
4+
# Apache 2.0
5+
6+
import argparse
7+
import os
8+
9+
import kaldi
10+
11+
12+
def get_args():
    '''Parse and validate the command line arguments.

    Returns:
        an argparse.Namespace with attributes ``lexicon_filename``,
        ``tokens_filename`` and ``dir``.  All three referenced input
        files (the lexicon, the tokens file, and ``<dir>/text``) must
        exist, otherwise an AssertionError is raised.
    '''
    parser = argparse.ArgumentParser(description='convert text to labels')

    parser.add_argument('--lexicon-filename', dest='lexicon_filename', type=str)
    parser.add_argument('--tokens-filename', dest='tokens_filename', type=str)
    parser.add_argument('--dir', help='input/output dir', type=str)

    args = parser.parse_args()

    # Fail fast if any required input file is missing.
    for required_file in (args.lexicon_filename, args.tokens_filename,
                          os.path.join(args.dir, 'text')):
        assert os.path.isfile(required_file)

    return args
26+
27+
28+
def read_lexicon(filename):
    '''Read a lexicon file.

    Every non-empty line has the format "<word> <phone1> <phone2> ...".

    Args:
        filename: path to the lexicon file (utf-8 encoded).

    Returns:
        a dict whose keys are words and values are phones.
    '''
    lexicon = dict()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            word_phones = line.split()
            if not word_phones:
                # skip blank lines instead of crashing on them
                continue
            assert len(word_phones) >= 2

            word = word_phones[0]
            phones = word_phones[1:]

            if word not in lexicon:
                # if there are multiple pronunciations for a word,
                # we choose only the first one and drop other alternatives
                lexicon[word] = phones

    return lexicon
48+
49+
50+
def read_tokens(filename):
    '''Read a tokens file mapping phones to indices.

    Every non-empty line has the format "<phone> <index>".  The special
    symbol `<eps>` is dropped and every remaining index is decreased by
    one, so that `<blk>` ends up with index 0.

    Args:
        filename: path to the tokens file (utf-8 encoded).

    Returns:
        a dict whose keys are phones and values are phone indices
    '''
    tokens = dict()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            phone_index = line.split()
            if not phone_index:
                # skip blank lines instead of crashing on them
                continue
            assert len(phone_index) == 2

            phone = phone_index[0]
            index = int(phone_index[1])

            if phone == '<eps>':
                continue

            # decreased by one since we removed <eps> above
            index -= 1

            assert phone not in tokens

            tokens[phone] = index

    assert '<blk>' in tokens

    # WARNING(fangjun): we assume that the blank symbol has index 0
    # in the neural network output.
    # Do NOT confuse it with `<eps>` in fst.
    assert tokens['<blk>'] == 0

    return tokens
82+
83+
84+
def read_text(filename):
    '''Read a Kaldi `text` file.

    Every non-empty line has the format "<utt-id> <word1> <word2> ...".
    Utterance IDs must be unique.

    Args:
        filename: path to the text file (utf-8 encoded).

    Returns:
        a dict whose keys are utterance IDs and values are texts
    '''
    transcript = dict()

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            utt_text = line.split()
            if not utt_text:
                # skip blank lines instead of crashing on them
                continue
            assert len(utt_text) >= 2

            utt = utt_text[0]
            text = utt_text[1:]

            assert utt not in transcript
            transcript[utt] = text

    return transcript
103+
104+
105+
def phones_to_indices(phone_list, tokens):
    '''Map a sequence of phones to their integer indices.

    Args:
        phone_list: a list of phone symbols.
        tokens: a dict mapping phone symbols to integer indices;
            every phone in `phone_list` must be present in it.

    Returns:
        a list of indices, one per input phone, in order.
    '''
    # Validate every phone up front, then map in one pass.
    for symbol in phone_list:
        assert symbol in tokens

    return [tokens[symbol] for symbol in phone_list]
115+
116+
117+
def main():
    '''Convert every transcript in `<dir>/text` to a sequence of phone
    indices and write the result to `<dir>/labels.ark` / `labels.scp`
    via a kaldi integer-vector writer.
    '''
    args = get_args()

    lexicon = read_lexicon(args.lexicon_filename)
    tokens = read_tokens(args.tokens_filename)
    transcript = read_text(os.path.join(args.dir, 'text'))

    transcript_labels = dict()

    for utt, words in transcript.items():
        label_seq = []
        for word in words:
            # TODO(fangjun): add support for OOV.
            label_seq.extend(phones_to_indices(lexicon[word], tokens))

        assert utt not in transcript_labels
        transcript_labels[utt] = label_seq

    wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp'.format(
        dir=args.dir)

    writer = kaldi.IntVectorWriter(wspecifier)
    for utt, label_seq in transcript_labels.items():
        writer.Write(utt, label_seq)
    writer.Close()

    print('Generated label file {}/labels.scp successfully'.format(args.dir))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)