45 changes: 45 additions & 0 deletions egs/aishell/s10/RESULTS
@@ -0,0 +1,45 @@
# If you execute `run.sh`, you should get results similar to the following:

# Results for kaldi pybind LF-MMI training with PyTorch
## head exp/chain/decode_res/*/scoring_kaldi/best_* > RESULTS
#
==> exp/chain/decode_res/dev/scoring_kaldi/best_cer <==
Contributor:
The naming scheme is not obvious from this file... what is "res"? Please clarify this, and also chain_nnet3. And can you please make sure that these results (and where appropriate, the output of chain_dir_info.pl) are in a comment at the top of the script that generated them?

Contributor (author):

Thanks, I will change it to follow the current style of egs/swbd/s5c/RESULTS.

%WER 8.22 [ 16888 / 205341, 774 ins, 1007 del, 15107 sub ] exp/chain/decode_res/dev/cer_10_1.0

==> exp/chain/decode_res/dev/scoring_kaldi/best_wer <==
%WER 16.66 [ 21278 / 127698, 1690 ins, 3543 del, 16045 sub ] exp/chain/decode_res/dev/wer_11_0.5

==> exp/chain/decode_res/test/scoring_kaldi/best_cer <==
%WER 9.98 [ 10454 / 104765, 693 ins, 802 del, 8959 sub ] exp/chain/decode_res/test/cer_11_1.0

==> exp/chain/decode_res/test/scoring_kaldi/best_wer <==
%WER 18.89 [ 12170 / 64428, 1112 ins, 1950 del, 9108 sub ] exp/chain/decode_res/test/wer_12_0.5

# Results for kaldi nnet3 LF-MMI training
## head exp/chain_nnet3/tdnn_1b/decode_*/scoring_kaldi/best_*
#
==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_cer <==
%WER 7.06 [ 14494 / 205341, 466 ins, 726 del, 13302 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/cer_10_0.5

==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_wer <==
%WER 15.11 [ 19296 / 127698, 1800 ins, 2778 del, 14718 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/wer_11_0.0

==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_cer <==
%WER 8.63 [ 9041 / 104765, 367 ins, 668 del, 8006 sub ] exp/chain_nnet3/tdnn_1b/decode_test/cer_11_1.0

==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_wer <==
%WER 17.40 [ 11210 / 64428, 1059 ins, 1654 del, 8497 sub ] exp/chain_nnet3/tdnn_1b/decode_test/wer_11_0.5

# kaldi pybind LF-MMI training currently uses batchnorm to replace the LDA layer
# since it is not easy to get lda.mat without constructing a nnet3 network.

# Training Time comparison between kaldi pybind with PyTorch and nnet3
# on single GPU is as follows:
#
# training time for 6 epochs:
# - kaldi pybind with PyTorch: about 45 minutes
# - kaldi nnet3: about 4 hours 37 minutes

# You can find the training log in the following:
# - kaldi pybind with PyTorch: ./kaldi-pybind-with-pytorch-training-log.txt
# - kaldi nnet3: ./nnet3-training-log.txt
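
As an aside on the batchnorm-vs-LDA note above, here is a minimal PyTorch sketch of what replacing an LDA-style fixed affine input transform with batch normalization can look like; the feature dimension and variable names are hypothetical and not taken from this recipe:

```python
import torch
import torch.nn as nn

# Hedged sketch: normalize the input features with BatchNorm1d instead of
# applying a precomputed LDA-like affine transform (lda.mat).
feat_dim = 43  # hypothetical fbank+pitch dimension, an assumption only

input_norm = nn.BatchNorm1d(num_features=feat_dim, affine=False)

# BatchNorm1d expects input of shape (batch, feat_dim, num_frames)
x = torch.randn(8, feat_dim, 150)
y = input_norm(x)  # per-dimension normalization stands in for lda.mat
```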
8 changes: 6 additions & 2 deletions egs/aishell/s10/chain/train.py
@@ -1,11 +1,15 @@
#!/usr/bin/env python3

# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Apache 2.0

import logging
import os
import sys
import warnings

# disable warnings when loading tensorboard
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch
import torch.optim as optim
@@ -159,7 +163,7 @@ def main():
lr=learning_rate,
weight_decay=args.l2_regularize)

scheduler = MultiStepLR(optimizer, milestones=[2, 6, 8, 9], gamma=0.5)
scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)
criterion = KaldiChainObjfFunction.apply

tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir))
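
For reference, a minimal sketch of what the updated schedule does over the six training epochs used in this recipe (the optimizer and the single-parameter "model" below are placeholders, not the recipe's actual setup):

```python
import torch
from torch.optim.lr_scheduler import MultiStepLR

# Hedged sketch: with milestones=[1, 2, 3, 4, 5] and gamma=0.5, the learning
# rate is halved after each of the first five epochs.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1e-3)  # lr matches run_chain.sh below
scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)

for epoch in range(6):
    print(epoch, optimizer.param_groups[0]['lr'])
    # ... one epoch of training would go here ...
    scheduler.step()
# lr per epoch: 1e-3, 5e-4, 2.5e-4, 1.25e-4, 6.25e-5, 3.125e-5
```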
1 change: 1 addition & 0 deletions egs/aishell/s10/cmd.sh
@@ -13,3 +13,4 @@
export train_cmd="run.pl"
export decode_cmd="run.pl"
export mkgraph_cmd="run.pl"
export cuda_cmd="run.pl"
28 changes: 17 additions & 11 deletions egs/aishell/s10/local/run_chain.sh
@@ -9,14 +9,14 @@ stage=0

# GPU device id to use (count from 0).
# you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0`
device_id=7
device_id=0

nj=10

lang=data/lang_chain # output lang dir
ali_dir=exp/tri3a_ali # input alignment dir
lat_dir=exp/tri3a_lats # input lat dir
treedir=exp/chain/tri3_tree # output tree dir
ali_dir=exp/tri5a_ali # input alignment dir
lat_dir=exp/tri5a_lats # input lat dir
treedir=exp/chain/tri5_tree # output tree dir

# You should know how to calculate your model's left/right context **manually**
model_left_context=12
@@ -27,8 +27,8 @@ frames_per_eg=150,110,90
frames_per_iter=1500000
minibatch_size=128

num_epochs=10
lr=2e-3
num_epochs=6
lr=1e-3

hidden_dim=625
kernel_size_list="1, 3, 3, 3, 3, 3" # comma separated list
@@ -48,11 +48,17 @@ save_nn_output_as_compressed=false
if [[ $stage -le 0 ]]; then
for datadir in train dev test; do
dst_dir=data/fbank_pitch/$datadir
utils/copy_data_dir.sh data/$datadir $dst_dir
echo "making fbank-pitch features for LF-MMI training"
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj $nj $dst_dir || exit 1
steps/compute_cmvn_stats.sh $dst_dir || exit 1
utils/fix_data_dir.sh $dst_dir
if [[ ! -f $dst_dir/feats.scp ]]; then
utils/copy_data_dir.sh data/$datadir $dst_dir
echo "making fbank-pitch features for LF-MMI training"
steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1
steps/compute_cmvn_stats.sh $dst_dir || exit 1
utils/fix_data_dir.sh $dst_dir
else
echo "$dst_dir/feats.scp already exists."
echo "kaldi (local/run_tdnn_1b.sh) LF-MMI may have generated it."
echo "skip $dst_dir"
fi
done
fi

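
Regarding the "calculate your model's left/right context manually" comment above, the arithmetic is essentially a sum of each layer's one-sided receptive field. A hedged sketch follows; the dilation list is a made-up example (the recipe above only lists the kernel sizes), so the result here need not equal the model_left_context=12 used above:

```python
# Hedged sketch: one-sided context of a stack of stride-1 1-D convolutions.
# kernel_sizes mirrors kernel_size_list above; dilations is an assumption.
kernel_sizes = [1, 3, 3, 3, 3, 3]
dilations = [1, 1, 1, 3, 3, 3]  # hypothetical, for illustration only

left_context = sum((k - 1) // 2 * d for k, d in zip(kernel_sizes, dilations))
right_context = left_context  # symmetric, centered kernels
print(left_context, right_context)  # 11 11 with these assumed dilations
```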
179 changes: 179 additions & 0 deletions egs/aishell/s10/local/run_tdnn_1b.sh
@@ -0,0 +1,179 @@
#!/bin/bash

# This script is based on run_tdnn_7h.sh in swbd chain recipe.

set -e

nj=10
# configs for 'chain'
affix=
stage=0
train_stage=-10
get_egs_stage=-10
dir=exp/chain_nnet3/tdnn_1b
decode_iter=

# training options
num_epochs=6
initial_effective_lrate=0.001
final_effective_lrate=0.0001
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=2
num_jobs_final=12
minibatch_size=128
frames_per_eg=150,110,90
remove_egs=true
common_egs_dir=
xent_regularize=0.1

# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
echo "This script is intended to be used with GPUs"
echo "but you have not compiled Kaldi with CUDA"
echo "If you want to use GPUs (and have them), go to src/,"
echo "and configure and make on a machine where "nvcc" is installed."
exit 1
fi

dir=${dir}${affix:+_$affix}
train_set=train
ali_dir=exp/tri5a_ali
lat_dir=exp/tri5a_lats
treedir=exp/chain_nnet3/tri5_tree
lang=data/lang_chain_nnet3


if [[ $stage -le 0 ]]; then
for datadir in train dev test; do
dst_dir=data/fbank_pitch/$datadir
if [[ ! -f $dst_dir/feats.scp ]]; then
utils/copy_data_dir.sh data/$datadir $dst_dir
echo "making fbank-pitch features for LF-MMI training"
steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1
steps/compute_cmvn_stats.sh $dst_dir || exit 1
utils/fix_data_dir.sh $dst_dir
else
echo "$dst_dir/feats.scp already exists."
echo "kaldi pybind (local/run_chain.sh) LF-MMI may have generated it."
echo "skip $dst_dir"
fi
done
fi

if [[ $stage -le 1 ]]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1
# Use our special topology... note that later on may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi

if [[ $stage -le 2 ]]; then
# Build a tree using our new topology. This is the critically different
# step compared with other recipes.
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--context-opts "--context-width=2 --central-position=1" \
--cmd $train_cmd 5000 data/train $lang $ali_dir $treedir
fi

if [[ $stage -le 3 ]]; then
echo "creating neural net configs using the xconfig parser"

num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
feat_dim=$(feat-to-dim scp:data/fbank_pitch/train/feats.scp -)

mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
input dim=$feat_dim name=input

# please note that it is important to have input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer to enable
# the use of short notation for the descriptor
fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=$dir/configs/lda.mat
Contributor:
How much do we lose from removing i-vectors? If you could make a comparison with run_tdnn_1a.sh via compare_wer.sh and put it in a comment at the top, that would be ideal. (If there is no compare_wer.sh, please see if someone over there can make one for this setup!)

Contributor (author):

I did not use i-vectors since I have not figured out how to integrate them into PyTorch. I will try to add i-vectors and compare the results with and without them.


# the first splicing is moved before the lda layer, so no splicing here
relu-batchnorm-layer name=tdnn1 dim=625
relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625

## adding the layers for chain branch
relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5

# adding the layers for xent branch
# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use
# 0.5 / args.xent_regularize as the learning rate factor- the factor of
# 0.5 / args.xent_regularize is suitable as it means the xent
# final-layer learns at a rate independent of the regularization
# constant; and the 0.5 was tuned so as to make the relative progress
# similar in the xent and regular final layers.
relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5

EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi

if [[ $stage -le 4 ]]; then
steps/nnet3/chain/train.py --stage $train_stage \
--cmd $cuda_cmd \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize $xent_regularize \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
Contributor:
BTW, these days we tend to set chain.l2-regularize to zero and instead rely on l2 regularization in the TDNN or TDNN-F layers. This reminds me that this recipe is super old! Does someone at mobvoi have time to test out a more recent recipe? E.g. you could try out the current Swbd recipe (I don't remember how much data is in aishell). We need to make sure that we are comparing against a recent baseline, or we won't be aiming for the right place!!

Contributor (author):

No problem, I will switch to follow the recipe in swbd.

--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--egs.dir "$common_egs_dir" \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $frames_per_eg \
--trainer.num-chunk-per-minibatch $minibatch_size \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.num-jobs-initial $num_jobs_initial \
--trainer.optimization.num-jobs-final $num_jobs_final \
--trainer.optimization.initial-effective-lrate $initial_effective_lrate \
--trainer.optimization.final-effective-lrate $final_effective_lrate \
--trainer.max-param-change $max_param_change \
--cleanup.remove-egs $remove_egs \
--cleanup.preserve-model-interval=1 \
--feat-dir data/fbank_pitch/train \
--tree-dir $treedir \
--use-gpu "wait" \
--lat-dir $lat_dir \
--dir $dir || exit 1
fi

if [[ $stage -le 5 ]]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
fi

graph_dir=$dir/graph
if [[ $stage -le 6 ]]; then
for test_set in dev test; do
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj $nj --cmd $decode_cmd \
$graph_dir data/fbank_pitch/${test_set} $dir/decode_${test_set} || exit 1
done
fi
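
(For the xent-branch comment inside the xconfig above: with xent_regularize=0.1, the learning-rate factor computed by the python3 one-liner in stage 3 works out to 0.5 / 0.1 = 5.0.)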
34 changes: 30 additions & 4 deletions egs/aishell/s10/run.sh
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Apache 2.0

# This file demonstrates how to run LF-MMI training in PyTorch
@@ -105,11 +105,37 @@ if [[ $stage -le 13 ]]; then
fi

if [[ $stage -le 14 ]]; then
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \
data/lang exp/tri3a exp/tri3a_lats
rm exp/tri3a_lats/fsts.*.gz # save space
steps/train_sat.sh --cmd $train_cmd \
2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1
fi

if [[ $stage -le 15 ]]; then
steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
data/train data/lang exp/tri4a exp/tri4a_ali
fi

if [[ $stage -le 16 ]]; then
steps/train_sat.sh --cmd $train_cmd \
3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1
fi

if [[ $stage -le 17 ]]; then
steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
data/train data/lang exp/tri5a exp/tri5a_ali || exit 1
fi

if [[ $stage -le 18 ]]; then
steps/align_fmllr_lats.sh --nj $nj --cmd $train_cmd data/train \
data/lang exp/tri5a exp/tri5a_lats
rm exp/tri5a_lats/fsts.*.gz # save space
fi

if [[ $stage -le 19 ]]; then
# kaldi pybind LF-MMI training with PyTorch
./local/run_chain.sh --nj $nj
fi

if [[ $stage -le 20 ]]; then
# kaldi nnet3 LF-MMI training
./local/run_tdnn_1b.sh --nj $nj
fi