diff --git a/egs/aishell/s10/RESULTS b/egs/aishell/s10/RESULTS
new file mode 100644
index 00000000000..a160433fc5e
--- /dev/null
+++ b/egs/aishell/s10/RESULTS
@@ -0,0 +1,45 @@
+# If you execute `run.sh`, you should get results similar to the following:
+
+# Results for kaldi pybind LF-MMI training with PyTorch
+## head exp/chain/decode_res/*/scoring_kaldi/best_* > RESULTS
+#
+==> exp/chain/decode_res/dev/scoring_kaldi/best_cer <==
+%WER 8.22 [ 16888 / 205341, 774 ins, 1007 del, 15107 sub ] exp/chain/decode_res/dev/cer_10_1.0
+
+==> exp/chain/decode_res/dev/scoring_kaldi/best_wer <==
+%WER 16.66 [ 21278 / 127698, 1690 ins, 3543 del, 16045 sub ] exp/chain/decode_res/dev/wer_11_0.5
+
+==> exp/chain/decode_res/test/scoring_kaldi/best_cer <==
+%WER 9.98 [ 10454 / 104765, 693 ins, 802 del, 8959 sub ] exp/chain/decode_res/test/cer_11_1.0
+
+==> exp/chain/decode_res/test/scoring_kaldi/best_wer <==
+%WER 18.89 [ 12170 / 64428, 1112 ins, 1950 del, 9108 sub ] exp/chain/decode_res/test/wer_12_0.5
+
+# Results for kaldi nnet3 LF-MMI training
+## head exp/chain_nnet3/tdnn_1b/decode_*/scoring_kaldi/best_*
+#
+==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_cer <==
+%WER 7.06 [ 14494 / 205341, 466 ins, 726 del, 13302 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/cer_10_0.5
+
+==> exp/chain_nnet3/tdnn_1b/decode_dev/scoring_kaldi/best_wer <==
+%WER 15.11 [ 19296 / 127698, 1800 ins, 2778 del, 14718 sub ] exp/chain_nnet3/tdnn_1b/decode_dev/wer_11_0.0
+
+==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_cer <==
+%WER 8.63 [ 9041 / 104765, 367 ins, 668 del, 8006 sub ] exp/chain_nnet3/tdnn_1b/decode_test/cer_11_1.0
+
+==> exp/chain_nnet3/tdnn_1b/decode_test/scoring_kaldi/best_wer <==
+%WER 17.40 [ 11210 / 64428, 1059 ins, 1654 del, 8497 sub ] exp/chain_nnet3/tdnn_1b/decode_test/wer_11_0.5
+
+# kaldi pybind LF-MMI training currently uses a batchnorm layer in place of the LDA layer,
+# since it is not easy to obtain lda.mat without constructing an nnet3 network.
+
+# Training time comparison between kaldi pybind with PyTorch and nnet3
+# on a single GPU:
+#
+# training time for 6 epochs:
+# - kaldi pybind with PyTorch: about 45 minutes
+# - kaldi nnet3: about 4 hours 37 minutes
+
+# You can find the training logs in the following files:
+# - kaldi pybind with PyTorch: ./kaldi-pybind-with-pytorch-training-log.txt
+# - kaldi nnet3: ./nnet3-training-log.txt
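# [Illustration, not part of the patch] Each %WER / %CER line above reports
# errors / total tokens, where errors = insertions + deletions + substitutions.
# The percentages can be reproduced from the bracketed counts; the helper
# error_rate below is ours, introduced only for this sketch.

def error_rate(num_ins, num_del, num_sub, num_tokens):
    """Return the error rate in percent, as printed by Kaldi scoring."""
    return 100.0 * (num_ins + num_del + num_sub) / num_tokens

# dev CER, kaldi pybind with PyTorch: 774 ins, 1007 del, 15107 sub over 205341 chars
print(round(error_rate(774, 1007, 15107, 205341), 2))  # -> 8.22
# dev CER, kaldi nnet3: 466 ins, 726 del, 13302 sub over 205341 chars
print(round(error_rate(466, 726, 13302, 205341), 2))   # -> 7.06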
diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py
index 88d6d26e076..829202fbe94 100644
--- a/egs/aishell/s10/chain/train.py
+++ b/egs/aishell/s10/chain/train.py
@@ -1,11 +1,15 @@
 #!/usr/bin/env python3
 
-# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
 # Apache 2.0
 
 import logging
 import os
 import sys
+import warnings
+
+# suppress the FutureWarning messages emitted when tensorboard is loaded
+warnings.simplefilter(action='ignore', category=FutureWarning)
 
 import torch
 import torch.optim as optim
@@ -159,7 +163,7 @@ def main():
                           lr=learning_rate,
                           weight_decay=args.l2_regularize)
 
-    scheduler = MultiStepLR(optimizer, milestones=[2, 6, 8, 9], gamma=0.5)
+    scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)
     criterion = KaldiChainObjfFunction.apply
 
     tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir))
diff --git a/egs/aishell/s10/cmd.sh b/egs/aishell/s10/cmd.sh
index 37b7c2c7568..82b1d114e08 100644
--- a/egs/aishell/s10/cmd.sh
+++ b/egs/aishell/s10/cmd.sh
@@ -13,3 +13,4 @@
 export train_cmd="run.pl"
 export decode_cmd="run.pl"
 export mkgraph_cmd="run.pl"
+export cuda_cmd="run.pl"
diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh
index f0301b9ffed..8ce22d3364b 100755
--- a/egs/aishell/s10/local/run_chain.sh
+++ b/egs/aishell/s10/local/run_chain.sh
@@ -9,14 +9,14 @@ stage=0
 
 # GPU device id to use (count from 0).
 # you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0`
-device_id=7
+device_id=0
 
 nj=10
 
 lang=data/lang_chain # output lang dir
-ali_dir=exp/tri3a_ali # input alignment dir
-lat_dir=exp/tri3a_lats # input lat dir
-treedir=exp/chain/tri3_tree # output tree dir
+ali_dir=exp/tri5a_ali # input alignment dir
+lat_dir=exp/tri5a_lats # input lat dir
+treedir=exp/chain/tri5_tree # output tree dir
 
 # You should know how to calculate your model's left/right context **manually**
 model_left_context=12
@@ -27,8 +27,8 @@ frames_per_eg=150,110,90
 frames_per_iter=1500000
 minibatch_size=128
 
-num_epochs=10
-lr=2e-3
+num_epochs=6
+lr=1e-3
 
 hidden_dim=625
 kernel_size_list="1, 3, 3, 3, 3, 3" # comma separated list
@@ -48,11 +48,17 @@ save_nn_output_as_compressed=false
 
 if [[ $stage -le 0 ]]; then
   for datadir in train dev test; do
     dst_dir=data/fbank_pitch/$datadir
-    utils/copy_data_dir.sh data/$datadir $dst_dir
-    echo "making fbank-pitch features for LF-MMI training"
-    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj $nj $dst_dir || exit 1
-    steps/compute_cmvn_stats.sh $dst_dir || exit 1
-    utils/fix_data_dir.sh $dst_dir
+    if [[ ! -f $dst_dir/feats.scp ]]; then
+      utils/copy_data_dir.sh data/$datadir $dst_dir
+      echo "making fbank-pitch features for LF-MMI training"
+      steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1
+      steps/compute_cmvn_stats.sh $dst_dir || exit 1
+      utils/fix_data_dir.sh $dst_dir
+    else
+      echo "$dst_dir/feats.scp already exists."
+      echo "kaldi nnet3 LF-MMI (local/run_tdnn_1b.sh) may have generated it."
+      echo "skipping $dst_dir"
+    fi
   done
 fi
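# [Illustration, not part of the patch] The scheduler change in chain/train.py
# above, combined with lr=1e-3 and num_epochs=6 from local/run_chain.sh above,
# halves the learning rate after each of the first five epochs.  A minimal
# PyTorch sketch (the Linear module is just a stand-in for the real network):

import torch
from torch.optim.lr_scheduler import MultiStepLR

model = torch.nn.Linear(4, 4)  # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5)

for epoch in range(6):
    # ... one epoch of LF-MMI training would run here ...
    print(epoch, optimizer.param_groups[0]['lr'])
    scheduler.step()
# prints 1e-3, 5e-4, 2.5e-4, 1.25e-4, 6.25e-5, 3.125e-5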
+ echo "skip $dst_dir" + fi done fi diff --git a/egs/aishell/s10/local/run_tdnn_1b.sh b/egs/aishell/s10/local/run_tdnn_1b.sh new file mode 100755 index 00000000000..34aa7fc3fee --- /dev/null +++ b/egs/aishell/s10/local/run_tdnn_1b.sh @@ -0,0 +1,179 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +nj=10 +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain_nnet3/tdnn_1b +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + echo "This script is intended to be used with GPUs" + echo "but you have not compiled Kaldi with CUDA" + echo "If you want to use GPUs (and have them), go to src/," + echo "and configure and make on a machine where "nvcc" is installed." + exit 1 +fi + +dir=${dir}${affix:+_$affix} +train_set=train +ali_dir=exp/tri5a_ali +lat_dir=exp/tri5a_lats +treedir=exp/chain_nnet3/tri5_tree +lang=data/lang_chain_nnet3 + + +if [[ $stage -le 0 ]]; then + for datadir in train dev test; do + dst_dir=data/fbank_pitch/$datadir + if [[ ! -f $dst_dir/feats.scp ]]; then + utils/copy_data_dir.sh data/$datadir $dst_dir + echo "making fbank-pitch features for LF-MMI training" + steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1 + steps/compute_cmvn_stats.sh $dst_dir || exit 1 + utils/fix_data_dir.sh $dst_dir + else + echo "$dst_dir/feats.scp already exists." + echo "kaldi pybind (local/run_chain.sh) LF-MMI may have generated it." + echo "skip $dst_dir" + fi + done +fi + +if [[ $stage -le 1 ]]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1 + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1 + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [[ $stage -le 2 ]]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+if [[ $stage -le 3 ]]; then
+  echo "creating neural net configs using the xconfig parser"
+
+  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
+  feat_dim=$(feat-to-dim scp:data/fbank_pitch/train/feats.scp -)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=$feat_dim name=input
+
+  # please note that it is important to have the input layer with name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for the chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for the xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor; the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
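# [Illustration, not part of the patch] The learning-rate factor for the xent
# branch is computed exactly as in the script above
# (echo "print(0.5/$xent_regularize)" | python3):

xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize
print(learning_rate_factor)  # -> 5.0, so the xent output layer learns 5x faster,
                             # compensating for the 0.1 weight on its objective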
+if [[ $stage -le 4 ]]; then
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd $cuda_cmd \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval=1 \
+    --feat-dir data/fbank_pitch/train \
+    --tree-dir $treedir \
+    --use-gpu "wait" \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1
+fi
+
+if [[ $stage -le 5 ]]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [[ $stage -le 6 ]]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj $nj --cmd $decode_cmd \
+      $graph_dir data/fbank_pitch/${test_set} $dir/decode_${test_set} || exit 1
+  done
+fi
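# [Illustration, not part of the patch] A rough sketch of how the nnet3 options
# above shape the learning rate: to our understanding, the effective rate decays
# exponentially from initial_effective_lrate to final_effective_lrate over the
# training iterations, and the actual rate is the effective rate times the
# current number of parallel jobs, which ramps from num_jobs_initial=2 to
# num_jobs_final=12.  num_iters below is hypothetical; the real value depends on
# the amount of data, frames_per_iter and num_epochs.

import math

initial_lrate, final_lrate = 0.001, 0.0001
num_jobs_initial, num_jobs_final = 2, 12
num_iters = 100  # hypothetical

for it in (0, 25, 50, 75, 100):
    frac = it / num_iters
    effective = initial_lrate * math.exp(frac * math.log(final_lrate / initial_lrate))
    num_jobs = round(num_jobs_initial + frac * (num_jobs_final - num_jobs_initial))
    print('iter {:3d}: effective lr {:.6f}, num jobs {:2d}, actual lr {:.6f}'.format(
        it, effective, num_jobs, effective * num_jobs))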
diff --git a/egs/aishell/s10/run.sh b/egs/aishell/s10/run.sh
index 4f583859665..50c87d7e94a 100755
--- a/egs/aishell/s10/run.sh
+++ b/egs/aishell/s10/run.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
 # Apache 2.0
 
 # This file demonstrates how to run LF-MMI training in PyTorch
@@ -105,11 +105,37 @@ if [[ $stage -le 13 ]]; then
 fi
 
 if [[ $stage -le 14 ]]; then
-  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \
-    data/lang exp/tri3a exp/tri3a_lats
-  rm exp/tri3a_lats/fsts.*.gz # save space
+  steps/train_sat.sh --cmd $train_cmd \
+    2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1
 fi
 
 if [[ $stage -le 15 ]]; then
+  steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
+    data/train data/lang exp/tri4a exp/tri4a_ali
+fi
+
+if [[ $stage -le 16 ]]; then
+  steps/train_sat.sh --cmd $train_cmd \
+    3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1
+fi
+
+if [[ $stage -le 17 ]]; then
+  steps/align_fmllr.sh --cmd $train_cmd --nj $nj \
+    data/train data/lang exp/tri5a exp/tri5a_ali || exit 1
+fi
+
+if [[ $stage -le 18 ]]; then
+  steps/align_fmllr_lats.sh --nj $nj --cmd $train_cmd data/train \
+    data/lang exp/tri5a exp/tri5a_lats
+  rm exp/tri5a_lats/fsts.*.gz # save space
+fi
+
+if [[ $stage -le 19 ]]; then
+  # kaldi pybind LF-MMI training with PyTorch
   ./local/run_chain.sh --nj $nj
 fi
+
+if [[ $stage -le 20 ]]; then
+  # kaldi nnet3 LF-MMI training
+  ./local/run_tdnn_1b.sh --nj $nj
+fi
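# [Illustration, not part of the patch] A small summary of the speed/accuracy
# trade-off reported in RESULTS above:

pybind_minutes = 45            # kaldi pybind with PyTorch, 6 epochs
nnet3_minutes = 4 * 60 + 37    # kaldi nnet3, 6 epochs
print('nnet3 training takes about {:.1f}x longer'.format(nnet3_minutes / pybind_minutes))
# -> about 6.2x

pybind_dev_cer, nnet3_dev_cer = 8.22, 7.06
rel_gain = (pybind_dev_cer - nnet3_dev_cer) / pybind_dev_cer * 100
print('but its dev CER is about {:.1f}% relatively lower'.format(rel_gain))
# -> about 14.1%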