Skip to content

Commit 523f9a4

Browse files
committed
Replace the data processing pipeline with Haowen's.
1 parent 793f572 commit 523f9a4

File tree

3 files changed

+189
-116
lines changed

3 files changed

+189
-116
lines changed

egs/aishell/s10/local/run_chain.sh

Lines changed: 104 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,16 @@
55

66
set -e
77

8-
stage=0
8+
stage=10
99

1010
# GPU device id to use (count from 0).
1111
# you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0`
12-
device_id=6
12+
device_id=3
1313

1414
nj=10
1515

16-
lang=data/lang_chain # output lang dir
17-
ali_dir=exp/tri5a_ali # input alignment dir
18-
lat_dir=exp/tri5a_lats # input lat dir
19-
treedir=exp/chain/tri5_tree # output tree dir
16+
train_set=train_cleaned
17+
gmm_dir=exp/tri3_cleaned
2018

2119
# You should know how to calculate your model's left/right context **manually**
2220
model_left_context=28
@@ -47,67 +45,107 @@ save_nn_output_as_compressed=false
4745

4846
. parse_options.sh
4947

48+
ali_dir=${gmm_dir}_ali_${train_set}_sp # output ali dir
49+
lat_dir=${gmm_dir}_lat_${train_set}_sp # output lat dir
50+
tree_dir=${gmm_dir}_tree_${train_set}_sp # output tree dir
51+
train_data_dir=data/${train_set}_sp_hires
52+
lores_train_data_dir=data/${train_set}_sp
53+
5054
if [[ $stage -le 0 ]]; then
51-
for datadir in train dev test; do
52-
dst_dir=data/mfcc_hires/$datadir
53-
if [[ ! -f $dst_dir/feats.scp ]]; then
54-
echo "making mfcc features for LF-MMI training"
55-
utils/copy_data_dir.sh data/$datadir $dst_dir
56-
steps/make_mfcc.sh \
57-
--mfcc-config conf/mfcc_hires.conf \
58-
--cmd "$train_cmd" \
59-
--nj $nj \
60-
$dst_dir || exit 1
61-
steps/compute_cmvn_stats.sh $dst_dir || exit 1
62-
utils/fix_data_dir.sh $dst_dir
63-
else
64-
echo "$dst_dir/feats.scp already exists."
65-
echo "kaldi (local/run_tdnn_1b.sh) LF-MMI may have generated it."
66-
echo "skip $dst_dir"
67-
fi
55+
echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
56+
utils/data/perturb_data_dir_speed_3way.sh data/$train_set data/${train_set}_sp
57+
58+
for x in ${train_set}_sp dev test; do
59+
utils/copy_data_dir.sh data/$x data/${x}_hires
6860
done
6961
fi
7062

7163
if [[ $stage -le 1 ]]; then
64+
echo "$0: making MFCC features for low-resolution speed-perturbed data"
65+
steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/${train_set}_sp
66+
steps/compute_cmvn_stats.sh data/${train_set}_sp
67+
echo "fixing input data-dir to remove nonexistent features, in case some "
68+
echo ".. speed-perturbed segments were too short."
69+
utils/fix_data_dir.sh data/${train_set}_sp
70+
fi
71+
72+
if [[ $stage -le 2 ]]; then
73+
echo "$0: aligning with the perturbed low-resolution data"
74+
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
75+
data/${train_set}_sp data/lang $gmm_dir $ali_dir
76+
fi
77+
78+
if [[ $stage -le 3 ]]; then
79+
echo "$0: creating high-resolution MFCC features"
80+
81+
# do volume-perturbation on the training data prior to extracting hires
82+
# features; this helps make trained nnets more invariant to test data volume.
83+
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
84+
85+
for x in ${train_set}_sp dev test; do
86+
steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
87+
--cmd "$train_cmd" data/${x}_hires
88+
steps/compute_cmvn_stats.sh data/${x}_hires
89+
utils/fix_data_dir.sh data/${x}_hires
90+
done
91+
fi
92+
93+
if [[ $stage -le 4 ]]; then
94+
for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \
95+
$lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
96+
[ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
97+
done
98+
fi
99+
100+
if [[ $stage -le 5 ]]; then
101+
echo "$0: creating lang directory with one state per phone."
72102
# Create a version of the lang/ directory that has one state per phone in the
73103
# topo file. [note, it really has two states.. the first one is only repeated
74104
# once, the second one has zero or more repeats.]
75-
rm -rf $lang
76-
cp -r data/lang $lang
77-
silphonelist=$(cat $lang/phones/silence.csl) || exit 1
78-
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1
105+
cp -r data/lang data/lang_chain
106+
silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
107+
nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
79108
# Use our special topology... note that later on may have to tune this
80109
# topology.
81-
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
110+
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
82111
fi
83112

84-
if [[ $stage -le 2 ]]; then
85-
# Build a tree using our new topology. This is the critically different
86-
# step compared with other recipes.
113+
if [[ $stage -le 6 ]]; then
114+
# Get the alignments as lattices (gives the chain training more freedom).
115+
# use the same num-jobs as the alignments
116+
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
117+
data/lang $gmm_dir $lat_dir
118+
rm $lat_dir/fsts.*.gz # save space
119+
fi
120+
121+
if [[ $stage -le 7 ]]; then
122+
# Build a tree using our new topology. We know we have alignments for the
123+
# speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
124+
# those.
87125
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
88126
--context-opts "--context-width=2 --central-position=1" \
89-
--cmd "$train_cmd" 5000 data/mfcc/train $lang $ali_dir $treedir
127+
--cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
90128
fi
91129

92-
if [[ $stage -le 3 ]]; then
93-
echo "creating phone language-model"
130+
if [[ $stage -le 8 ]]; then
131+
echo "$0: creating phone language-model"
94132
"$train_cmd" exp/chain/log/make_phone_lm.log \
95133
chain-est-phone-lm \
96-
"ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \
134+
"ark:gunzip -c $tree_dir/ali.*.gz | ali-to-phones $tree_dir/final.mdl ark:- ark:- |" \
97135
exp/chain/phone_lm.fst || exit 1
98136
fi
99137

100-
if [[ $stage -le 4 ]]; then
138+
if [[ $stage -le 9 ]]; then
101139
echo "creating denominator FST"
102-
copy-transition-model $treedir/final.mdl exp/chain/0.trans_mdl
103-
cp $treedir/tree exp/chain
140+
copy-transition-model $tree_dir/final.mdl exp/chain/0.trans_mdl
141+
cp $tree_dir/tree exp/chain
104142
"$train_cmd" exp/chain/log/make_den_fst.log \
105143
chain-make-den-fst exp/chain/tree exp/chain/0.trans_mdl exp/chain/phone_lm.fst \
106144
exp/chain/den.fst exp/chain/normalization.fst || exit 1
107145
fi
108146

109-
if [[ $stage -le 5 ]]; then
110-
echo "generating egs"
147+
if [[ $stage -le 10 ]]; then
148+
echo "$0: generating egs"
111149
steps/nnet3/chain/get_egs.sh \
112150
--alignment-subsampling-factor 3 \
113151
--cmd "$train_cmd" \
@@ -125,15 +163,15 @@ if [[ $stage -le 5 ]]; then
125163
--right-tolerance 5 \
126164
--srand 0 \
127165
--stage -10 \
128-
data/mfcc_hires/train \
166+
$train_data_dir \
129167
exp/chain $lat_dir exp/chain/egs
130168
fi
131169

132170
feat_dim=$(cat exp/chain/egs/info/feat_dim)
133171
output_dim=$(cat exp/chain/egs/info/num_pdfs)
134172

135-
if [[ $stage -le 6 ]]; then
136-
echo "merging egs"
173+
if [[ $stage -le 11 ]]; then
174+
echo "$0: merging egs"
137175
mkdir -p exp/chain/merged_egs
138176
num_egs=$(ls -1 exp/chain/egs/cegs*.ark | wc -l)
139177

@@ -145,15 +183,15 @@ if [[ $stage -le 6 ]]; then
145183
rm exp/chain/egs/cegs.*.ark
146184
fi
147185

148-
if [[ $stage -le 7 ]]; then
186+
if [[ $stage -le 12 ]]; then
149187
# Note: it might appear that this $lang directory is mismatched, and it is as
150188
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
151189
# the lang directory.
152190
local/mkgraph.sh --self-loop-scale 1.0 data/lang_test exp/chain exp/chain/graph
153191
fi
154192

155-
if [[ $stage -le 8 ]]; then
156-
echo "training..."
193+
if [[ $stage -le 13 ]]; then
194+
echo "$0: training..."
157195

158196
mkdir -p exp/chain/train/tensorboard
159197
train_checkpoint=
@@ -187,22 +225,22 @@ if [[ $stage -le 8 ]]; then
187225
--train.xent-regularize 0.1
188226
fi
189227

190-
if [[ $stage -le 9 ]]; then
191-
echo "inference: computing likelihood"
228+
if [[ $stage -le 14 ]]; then
229+
echo "$0: inference: computing likelihood"
192230
for x in test dev; do
193-
mkdir -p exp/chain/inference/$x
194-
if [[ -f exp/chain/inference/$x/nnet_output.scp ]]; then
195-
echo "exp/chain/inference/$x/nnet_output.scp already exists! Skip"
231+
mkdir -p exp/chain/inference/${x}_hires
232+
if [[ -f exp/chain/inference/${x}_hires/nnet_output.scp ]]; then
233+
echo "$0: exp/chain/inference/${x}_hires/nnet_output.scp already exists! Skip"
196234
else
197235
best_epoch=$(cat exp/chain/train/best-epoch-info | grep 'best epoch' | awk '{print $NF}')
198236
inference_checkpoint=exp/chain/train/epoch-${best_epoch}.pt
199237
python3 ./chain/inference.py \
200238
--bottleneck-dim $bottleneck_dim \
201239
--checkpoint $inference_checkpoint \
202240
--device-id $device_id \
203-
--dir exp/chain/inference/$x \
241+
--dir exp/chain/inference/${x}_hires \
204242
--feat-dim $feat_dim \
205-
--feats-scp data/mfcc_hires/$x/feats.scp \
243+
--feats-scp data/${x}_hires/feats.scp \
206244
--hidden-dim $hidden_dim \
207245
--is-training false \
208246
--kernel-size-list "$kernel_size_list" \
@@ -217,36 +255,36 @@ if [[ $stage -le 9 ]]; then
217255
done
218256
fi
219257

220-
if [[ $stage -le 10 ]]; then
221-
echo "decoding"
258+
if [[ $stage -le 15 ]]; then
259+
echo "$0: decoding"
222260
for x in test dev; do
223-
if [[ ! -f exp/chain/inference/$x/nnet_output.scp ]]; then
224-
echo "exp/chain/inference/$x/nnet_output.scp does not exist!"
225-
echo "Please run inference.py first"
261+
if [[ ! -f exp/chain/inference/${x}_hires/nnet_output.scp ]]; then
262+
echo "$0: exp/chain/inference/${x}_hires/nnet_output.scp does not exist!"
263+
echo "$0: Please run inference.py first"
226264
exit 1
227265
fi
228-
echo "decoding $x"
266+
echo "$0: decoding ${x}_hires"
229267

230268
./local/decode.sh \
231269
--nj $nj \
232270
exp/chain/graph \
233271
exp/chain/0.trans_mdl \
234-
exp/chain/inference/$x/nnet_output.scp \
235-
exp/chain/decode_res/$x
272+
exp/chain/inference/${x}_hires/nnet_output.scp \
273+
exp/chain/decode_res/${x}_hires
236274
done
237275
fi
238276

239-
if [[ $stage -le 11 ]]; then
240-
echo "scoring"
277+
if [[ $stage -le 16 ]]; then
278+
echo "$0: scoring"
241279

242280
for x in test dev; do
243281
./local/score.sh --cmd "$decode_cmd" \
244-
data/mfcc_hires/$x \
282+
data/${x}_hires \
245283
exp/chain/graph \
246-
exp/chain/decode_res/$x || exit 1
284+
exp/chain/decode_res/${x}_hires || exit 1
247285
done
248286

249287
for x in test dev; do
250-
head exp/chain/decode_res/$x/scoring_kaldi/best_*
288+
head exp/chain/decode_res/${x}_hires/scoring_kaldi/best_*
251289
done
252290
fi
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2016 Vimal Manohar
#           2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data, selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm-like file.

# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh

# GMM results for speaker-independent (SI) and speaker adaptive training (SAT)
# systems on dev and test sets: [will add these later].

set -e
set -o pipefail
set -u

# Configuration (all overridable via command-line options, parsed below).
stage=0                 # skip stages whose number is below this
cleanup_stage=0         # stage to pass through to clean_and_segment_data.sh
data=data/train         # input data dir to clean
cleanup_affix=cleaned   # suffix for all cleaned output dirs
srcdir=exp/tri3         # source GMM dir used for decoding/alignment
nj=100                  # jobs for cleanup and alignment
decode_nj=16            # jobs for decoding
decode_num_threads=4    # threads per decoding job

. ./path.sh
. ./cmd.sh
. utils/parse_options.sh

cleaned_data=${data}_${cleanup_affix}

dir=${srcdir}_${cleanup_affix}_work          # work dir for the cleanup process
cleaned_dir=${srcdir}_${cleanup_affix}       # output dir for the re-trained SAT model

if [ $stage -le 1 ]; then
  # This does the actual data cleanup.
  steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \
    $data data/lang $srcdir $dir $cleaned_data
fi

if [ $stage -le 2 ]; then
  # Re-align the cleaned data with the source SAT model.
  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
    $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix}
fi

if [ $stage -le 3 ]; then
  # Train a new SAT model on the cleaned data.
  steps/train_sat.sh --cmd "$train_cmd" \
    5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
fi

if [ $stage -le 4 ]; then
  # Test with the models trained on cleaned-up data.
  utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph

  for dset in dev test; do
    # Fix: --num-threads was previously passed twice (once as
    # $decode_num_threads, then hard-coded as 4), so the configurable
    # value was silently overridden. Keep only the configurable option.
    steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \
      --cmd "$decode_cmd" \
      ${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
      data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore
  done
fi

0 commit comments

Comments
 (0)