diff --git a/egs/aishell/s10/RESULTS b/egs/aishell/s10/RESULTS
new file mode 100644
index 00000000000..a160433fc5e
--- /dev/null
+++ b/egs/aishell/s10/RESULTS
@@ -0,0 +1,45 @@
+# If you execute `run.sh`, you should get results similar to the following:
+
+# Results for kaldi pybind LF-MMI training with PyTorch
+## head exp/chain/decode_res/*/scoring_kaldi/best_* > RESULTS
+#
+==> exp/chain/decode_res/dev/scoring_kaldi/best_cer <==
+%WER 8.22 [ 16888 / 205341, 774 ins, 1007 del, 15107 sub ] exp/chain/decode_res/dev/cer_10_1.0
+
+==> exp/chain/decode_res/dev/scoring_kaldi/best_wer <==
+%WER 16.66 [ 21278 / 127698, 1690 ins, 3543 del, 16045 sub ] exp/chain/decode_res/dev/wer_11_0.5
+
+==> exp/chain/decode_res/test/scoring_kaldi/best_cer <==
+%WER 9.98 [ 10454 / 104765, 693 ins, 802 del, 8959 sub ] exp/chain/decode_res/test/cer_11_1.0
+
+==> exp/chain/decode_res/test/scoring_kaldi/best_wer <==
+%WER 18.89 [ 12170 / 64428, 1112 ins, 1950 del, 9108 sub ] exp/chain/decode_res/test/wer_12_0.5
+
+# Results for kaldi nnet3 LF-MMI training
+## head exp/chain_nnet3/tdnn_1b/decode_*/scoring_kaldi/best_*
+#
+==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_cer <==
+%WER 7.06 [ 14494 / 205341, 466 ins, 726 del, 13302 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/cer_10_0.5
+
+==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_wer <==
+%WER 15.11 [ 19296 / 127698, 1800 ins, 2778 del, 14718 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/wer_11_0.0
+
+==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_cer <==
+%WER 8.63 [ 9041 / 104765, 367 ins, 668 del, 8006 sub ] exp/chain_nnet3/tdnn_1b/decode_test/cer_11_1.0
+
+==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_wer <==
+%WER 17.40 [ 11210 / 64428, 1059 ins, 1654 del, 8497 sub ] exp/chain_nnet3/tdnn_1b/decode_test/wer_11_0.5
+
+# kaldi pybind LF-MMI training currently uses a batchnorm layer in place of the LDA layer,
+# since it is not easy to obtain lda.mat without constructing an nnet3 network.
+
+# Training time comparison between kaldi pybind with PyTorch and nnet3
+# on a single GPU:
+#
+# training time for 6 epochs:
+# - kaldi pybind with PyTorch: about 45 minutes
+# - kaldi nnet3: about 4 hours 37 minutes
+
+# You can find the training logs in the following files:
+# - kaldi pybind with PyTorch: ./kaldi-pybind-with-pytorch-training-log.txt
+# - kaldi nnet3: ./nnet3-training-log.txt
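# [Illustration, not part of the patch] Each %WER / %CER line above reports
# errors / total tokens, where errors = insertions + deletions + substitutions.
# The percentages can be reproduced from the bracketed counts; the helper
# error_rate below is ours, introduced only for this sketch.

def error_rate(num_ins, num_del, num_sub, num_tokens):
    """Return the error rate in percent, as printed by Kaldi scoring."""
    return 100.0 * (num_ins + num_del + num_sub) / num_tokens

# dev CER, kaldi pybind with PyTorch: 774 ins, 1007 del, 15107 sub over 205341 chars
print(round(error_rate(774, 1007, 15107, 205341), 2))  # -> 8.22
# dev CER, kaldi nnet3: 466 ins, 726 del, 13302 sub over 205341 chars
print(round(error_rate(466, 726, 13302, 205341), 2))   # -> 7.06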
diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py
index 88d6d26e076..829202fbe94 100644
--- a/egs/aishell/s10/chain/train.py
+++ b/egs/aishell/s10/chain/train.py
@@ -1,11 +1,15 @@
 #!/usr/bin/env python3
 
-# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
 # Apache 2.0
 
 import logging
 import os
 import sys
+import warnings
+
+# suppress the FutureWarning messages emitted when tensorboard is loaded
+warnings.simplefilter(action='ignore', category=FutureWarning)
 
 import torch
 import torch.optim as optim
@@ -159,7 +163,7 @@ def main():
                           lr=learning_rate,
                           weight_decay=args.l2_regularize)
 
-    scheduler = MultiStepLR(optimizer, milestones=[2, 6, 8, 9], gamma=0.5)
+    scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)
     criterion = KaldiChainObjfFunction.apply
 
     tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir))
diff --git a/egs/aishell/s10/cmd.sh b/egs/aishell/s10/cmd.sh
index 37b7c2c7568..82b1d114e08 100644
--- a/egs/aishell/s10/cmd.sh
+++ b/egs/aishell/s10/cmd.sh
@@ -13,3 +13,4 @@
 export train_cmd="run.pl"
 export decode_cmd="run.pl"
 export mkgraph_cmd="run.pl"
+export cuda_cmd="run.pl"
diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh
index f0301b9ffed..8ce22d3364b 100755
--- a/egs/aishell/s10/local/run_chain.sh
+++ b/egs/aishell/s10/local/run_chain.sh
@@ -9,14 +9,14 @@ stage=0
 
 # GPU device id to use (count from 0).
 # you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0`
-device_id=7
+device_id=0
 
 nj=10
 
 lang=data/lang_chain # output lang dir
-ali_dir=exp/tri3a_ali # input alignment dir
-lat_dir=exp/tri3a_lats # input lat dir
-treedir=exp/chain/tri3_tree # output tree dir
+ali_dir=exp/tri5a_ali # input alignment dir
+lat_dir=exp/tri5a_lats # input lat dir
+treedir=exp/chain/tri5_tree # output tree dir
 
 # You should know how to calculate your model's left/right context **manually**
 model_left_context=12
@@ -27,8 +27,8 @@ frames_per_eg=150,110,90
 frames_per_iter=1500000
 minibatch_size=128
 
-num_epochs=10
-lr=2e-3
+num_epochs=6
+lr=1e-3
 
 hidden_dim=625
 kernel_size_list="1, 3, 3, 3, 3, 3" # comma separated list
@@ -48,11 +48,17 @@ save_nn_output_as_compressed=false
 
 if [[ $stage -le 0 ]]; then
   for datadir in train dev test; do
     dst_dir=data/fbank_pitch/$datadir
-    utils/copy_data_dir.sh data/$datadir $dst_dir
-    echo "making fbank-pitch features for LF-MMI training"
-    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj $nj $dst_dir || exit 1
-    steps/compute_cmvn_stats.sh $dst_dir || exit 1
-    utils/fix_data_dir.sh $dst_dir
+    if [[ ! -f $dst_dir/feats.scp ]]; then
+      utils/copy_data_dir.sh data/$datadir $dst_dir
+      echo "making fbank-pitch features for LF-MMI training"
+      steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1
+      steps/compute_cmvn_stats.sh $dst_dir || exit 1
+      utils/fix_data_dir.sh $dst_dir
+    else
+      echo "$dst_dir/feats.scp already exists."
+      echo "kaldi nnet3 LF-MMI (local/run_tdnn_1b.sh) may have generated it."
+      echo "skipping $dst_dir"
+    fi
   done
 fi
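# [Illustration, not part of the patch] The scheduler change in chain/train.py
# above, combined with lr=1e-3 and num_epochs=6 from local/run_chain.sh above,
# halves the learning rate after each of the first five epochs.  A minimal
# PyTorch sketch (the Linear module is just a stand-in for the real network):

import torch
from torch.optim.lr_scheduler import MultiStepLR

model = torch.nn.Linear(4, 4)  # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)

for epoch in range(6):
    # ... one epoch of LF-MMI training would run here ...
    print(epoch, optimizer.param_groups[0]['lr'])
    scheduler.step()
# prints 1e-3, 5e-4, 2.5e-4, 1.25e-4, 6.25e-5, 3.125e-5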
+ echo "skip $dst_dir" + fi done fi diff --git a/egs/aishell/s10/local/run_tdnn_1b.sh b/egs/aishell/s10/local/run_tdnn_1b.sh new file mode 100755 index 00000000000..34aa7fc3fee --- /dev/null +++ b/egs/aishell/s10/local/run_tdnn_1b.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +nj=10 +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain_nnet3/tdnn_1b +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + echo "This script is intended to be used with GPUs" + echo "but you have not compiled Kaldi with CUDA" + echo "If you want to use GPUs (and have them), go to src/," + echo "and configure and make on a machine where "nvcc" is installed." + exit 1 +fi + +dir=${dir}${affix:+_$affix} +train_set=train +ali_dir=exp/tri5a_ali +lat_dir=exp/tri5a_lats +treedir=exp/chain_nnet3/tri5_tree +lang=data/lang_chain_nnet3 + + +if [[ $stage -le 0 ]]; then + for datadir in train dev test; do + dst_dir=data/fbank_pitch/$datadir + if [[ ! -f $dst_dir/feats.scp ]]; then + utils/copy_data_dir.sh data/$datadir $dst_dir + echo "making fbank-pitch features for LF-MMI training" + steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1 + steps/compute_cmvn_stats.sh $dst_dir || exit 1 + utils/fix_data_dir.sh $dst_dir + else + echo "$dst_dir/feats.scp already exists." + echo "kaldi pybind (local/run_chain.sh) LF-MMI may have generated it." + echo "skip $dst_dir" + fi + done +fi + +if [[ $stage -le 1 ]]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1 + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1 + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [[ $stage -le 2 ]]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+if [[ $stage -le 3 ]]; then
+  echo "creating neural net configs using the xconfig parser"
+
+  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
+  feat_dim=$(feat-to-dim scp:data/fbank_pitch/train/feats.scp -)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=$feat_dim name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor; the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
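# [Illustration, not part of the patch] The learning-rate factor for the xent
# branch is computed exactly as in the script above
# (echo "print(0.5/$xent_regularize)" | python3):

xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize
print(learning_rate_factor)  # -> 5.0, so the xent output layer learns 5x faster,
                             # compensating for the 0.1 weight on its objective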
+if [[ $stage -le 4 ]]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd $cuda_cmd \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval=1 \
+    --feat-dir data/fbank_pitch/train \
+    --tree-dir $treedir \
+    --use-gpu "wait" \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1
+fi
+
+if [[ $stage -le 5 ]]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [[ $stage -le 6 ]]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj $nj --cmd $decode_cmd \
+      $graph_dir data/fbank_pitch/${test_set} $dir/decode_${test_set} || exit 1
+  done
+fi
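# [Illustration, not part of the patch] A rough sketch of how the nnet3 options
# above shape the learning rate: to our understanding, the effective rate decays
# exponentially from initial_effective_lrate to final_effective_lrate over the
# training iterations, and the actual rate is the effective rate times the
# current number of parallel jobs, which ramps from num_jobs_initial=2 to
# num_jobs_final=12.  num_iters below is hypothetical; the real value depends on
# the amount of data, frames_per_iter and num_epochs.

import math

initial_lrate, final_lrate = 0.001, 0.0001
num_jobs_initial, num_jobs_final = 2, 12
num_iters = 100  # hypothetical

for it in (0, 25, 50, 75, 100):
    frac = it / num_iters
    effective = initial_lrate * math.exp(frac * math.log(final_lrate / initial_lrate))
    num_jobs = round(num_jobs_initial + frac * (num_jobs_final - num_jobs_initial))
    print('iter {:3d}: effective lr {:.6f}, num jobs {:2d}, actual lr {:.6f}'.format(
        it, effective, num_jobs, effective * num_jobs))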
diff --git a/egs/aishell/s10/run.sh b/egs/aishell/s10/run.sh
index 4f583859665..50c87d7e94a 100755
--- a/egs/aishell/s10/run.sh
+++ b/egs/aishell/s10/run.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
 # Apache 2.0
 
 # This file demonstrates how to run LF-MMI training in PyTorch
@@ -105,11 +105,37 @@ if [[ $stage -le 13 ]]; then
 fi
 
 if [[ $stage -le 14 ]]; then
-  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \
-    data/lang exp/tri3a exp/tri3a_lats
-  rm exp/tri3a_lats/fsts.*.gz # save space
+  steps/train_sat.sh --cmd $train_cmd \
+    2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1
 fi
 
 if [[ $stage -le 15 ]]; then
+  steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
+    data/train data/lang exp/tri4a exp/tri4a_ali
+fi
+
+if [[ $stage -le 16 ]]; then
+  steps/train_sat.sh --cmd $train_cmd \
+    3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1
+fi
+
+if [[ $stage -le 17 ]]; then
+  steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
+    data/train data/lang exp/tri5a exp/tri5a_ali || exit 1
+fi
+
+if [[ $stage -le 18 ]]; then
+  steps/align_fmllr_lats.sh --nj $nj --cmd $train_cmd data/train \
+    data/lang exp/tri5a exp/tri5a_lats
+  rm exp/tri5a_lats/fsts.*.gz # save space
+fi
+
+if [[ $stage -le 19 ]]; then
+  # kaldi pybind LF-MMI training with PyTorch
   ./local/run_chain.sh --nj $nj
 fi
+
+if [[ $stage -le 20 ]]; then
+  # kaldi nnet3 LF-MMI training
+  ./local/run_tdnn_1b.sh --nj $nj
+fi
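# [Illustration, not part of the patch] A small summary of the speed/accuracy
# trade-off reported in RESULTS above:

pybind_minutes = 45            # kaldi pybind with PyTorch, 6 epochs
nnet3_minutes = 4 * 60 + 37    # kaldi nnet3, 6 epochs
print('nnet3 training takes about {:.1f}x longer'.format(nnet3_minutes / pybind_minutes))
# -> about 6.2x

pybind_dev_cer, nnet3_dev_cer = 8.22, 7.06
rel_gain = (pybind_dev_cer - nnet3_dev_cer) / pybind_dev_cer * 100
print('but its dev CER is about {:.1f}% relatively lower'.format(rel_gain))
# -> about 14.1%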