diff --git a/egs/aishell/s10b/cmd.sh b/egs/aishell/s10b/cmd.sh new file mode 100644 index 00000000000..82b1d114e08 --- /dev/null +++ b/egs/aishell/s10b/cmd.sh @@ -0,0 +1,16 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="run.pl" +export decode_cmd="run.pl" +export mkgraph_cmd="run.pl" +export cuda_cmd="run.pl" diff --git a/egs/aishell/s10b/conf/fbank.conf b/egs/aishell/s10b/conf/fbank.conf new file mode 100644 index 00000000000..3dac154706b --- /dev/null +++ b/egs/aishell/s10b/conf/fbank.conf @@ -0,0 +1 @@ +--num-mel-bins=40 diff --git a/egs/aishell/s10b/ctc/add_deltas_layer.py b/egs/aishell/s10b/ctc/add_deltas_layer.py new file mode 100644 index 00000000000..4e1c11d1b9e --- /dev/null +++ b/egs/aishell/s10b/ctc/add_deltas_layer.py @@ -0,0 +1,96 @@ +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def compute_delta_feat(x, weight): + ''' + Args: + x: input feat of shape [batch_size, feat_dim, seq_len] + + weight: coefficients for computing delta features; + it has shape [feat_dim, 1, kernel_size]. + + Returns: + a tensor of shape [batch_size, feat_dim, seq_len] + ''' + + assert x.ndim == 3 + + assert weight.ndim == 3 + assert weight.size(0) == x.size(1) + assert weight.size(1) == 1 + assert weight.size(2) % 2 == 1 + + feat_dim = x.size(1) + + # NOTE(fangjun): we perform a depthwise convolution here by + # setting groups == number of channels + y = F.conv1d(input=x, weight=weight, groups=feat_dim) + + return y + + +class AddDeltasLayer(nn.Module): + ''' + This class implements `add-deltas` with order == 2 and window == 2. + + Note that it has no trainable `nn.Parameter`s. 
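+    Since no padding is performed, two frames are trimmed at each
+    boundary with the default coefficients; e.g. an input of shape
+    [1, 40, 100] yields an output of shape [1, 120, 96].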
+ ''' + + def __init__(self, + first_order_coef=[-1, 0, 1], + second_order_coef=[1, 0, -2, 0, 1]): + ''' + Args: + first_order_coef: coefficient to compute the first order delta feature + + second_order_coef: coefficient to compute the second order delta feature + ''' + super().__init__() + + self.first_order_coef = torch.tensor(first_order_coef).float() + self.second_order_coef = torch.tensor(second_order_coef).float() + + def forward(self, x): + ''' + Args: + x: a tensor of shape [batch_size, feat_dim, seq_len] + + Returns: + a tensor of shape [batch_size, feat_dim * 3, seq_len] + ''' + if self.first_order_coef.ndim != 3: + num_duplicates = x.size(1) + + # yapf: disable + self.first_order_coef = self.first_order_coef.reshape(1, 1, -1) + self.first_order_coef = torch.cat([self.first_order_coef] * num_duplicates, dim=0) + + self.second_order_coef = self.second_order_coef.reshape(1, 1, -1) + self.second_order_coef = torch.cat([self.second_order_coef] * num_duplicates, dim=0) + # yapf: enable + + device = x.device + self.first_order_coef = self.first_order_coef.to(device) + self.second_order_coef = self.second_order_coef.to(device) + + first_order = compute_delta_feat(x, self.first_order_coef) + second_order = compute_delta_feat(x, self.second_order_coef) + + # since we did not perform padding, we have to remove some frames + # from the 0th and 1st order features + zeroth_valid = (x.size(2) - second_order.size(2)) // 2 + first_valid = (first_order.size(2) - second_order.size(2)) // 2 + + y = torch.cat([ + x[:, :, zeroth_valid:-zeroth_valid,], + first_order[:, :, first_valid:-first_valid], + second_order, + ], + dim=1) + + return y diff --git a/egs/aishell/s10b/ctc/add_deltas_layer_test.py b/egs/aishell/s10b/ctc/add_deltas_layer_test.py new file mode 100755 index 00000000000..00832e693b0 --- /dev/null +++ b/egs/aishell/s10b/ctc/add_deltas_layer_test.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import os +import shutil +import tempfile +import unittest + +import numpy as np + +import torch +import torch.nn.functional as F + +import kaldi + +from add_deltas_layer import AddDeltasLayer + + +class AddDeltasLayerTest(unittest.TestCase): + + def test(self): + x = torch.tensor([ + [1, 3], + [5, 10], + [0, 1], + [10, 20], + [3, 1], + [3, 2], + [5, 1], + [10, -2], + [10, 20], + [100, 200], + ]).float() + + x = x.unsqueeze(0) + + transform = AddDeltasLayer(first_order_coef=[-0.2, -0.1, 0, 0.1, 0.2], + second_order_coef=[ + 0.04, 0.04, 0.01, -0.04, -0.1, -0.04, + 0.01, 0.04, 0.04 + ]) + y = transform(x.permute(0, 2, 1)).permute(0, 2, 1) + + # now use kaldi's add-deltas to compute the ground truth + d = tempfile.mkdtemp() + + wspecifier = 'ark:{}/feats.ark'.format(d) + + writer = kaldi.MatrixWriter(wspecifier) + writer.Write('utt1', x.squeeze(0).numpy()) + writer.Close() + + delta_feats_specifier = 'ark:{dir}/delta.ark'.format(dir=d) + + cmd = ''' + add-deltas --print-args=false --delta-order=2 --delta-window=2 {} {} + '''.format(wspecifier, delta_feats_specifier) + + os.system(cmd) + + reader = kaldi.RandomAccessMatrixReader(delta_feats_specifier) + + expected = reader['utt1'] + + y = y.squeeze(0) + + np.testing.assert_array_almost_equal(y.numpy(), + expected.numpy()[4:-4, :], + decimal=5) + + reader.Close() + + shutil.rmtree(d) + + +if __name__ == '__main__': + unittest.main() diff --git a/egs/aishell/s10b/ctc/common.py b/egs/aishell/s10b/ctc/common.py new file mode 100644 index 00000000000..b8a992241ca --- 
/dev/null +++ b/egs/aishell/s10b/ctc/common.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +from datetime import datetime +import logging + +import torch + + +def setup_logger(log_filename, log_level='info'): + now = datetime.now() + date_time = now.strftime('%Y-%m-%d-%H-%M-%S') + log_filename = '{}-{}'.format(log_filename, date_time) + formatter = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s' + if log_level == 'debug': + level = logging.DEBUG + elif log_level == 'info': + level = logging.INFO + elif log_level == 'warning': + level = logging.WARNING + logging.basicConfig(filename=log_filename, + format=formatter, + level=level, + filemode='w') + console = logging.StreamHandler() + console.setLevel(level) + console.setFormatter(logging.Formatter(formatter)) + logging.getLogger('').addHandler(console) + + +def load_checkpoint(filename, model): + logging.info('Loading checkpoint from {}'.format(filename)) + + checkpoint = torch.load(filename, map_location='cpu') + + keys = ['state_dict', 'epoch', 'learning_rate', 'loss'] + for k in keys: + assert k in checkpoint + + if not list(model.state_dict().keys())[0].startswith('module.') \ + and list(checkpoint['state_dict'])[0].startswith('module.'): + # the checkpoint was saved by DDP + logging.info('load checkpoint from DDP') + dst_state_dict = model.state_dict() + src_state_dict = checkpoint['state_dict'] + for key in dst_state_dict.keys(): + src_key = '{}.{}'.format('module', key) + dst_state_dict[key] = src_state_dict.pop(src_key) + assert len(src_state_dict) == 0 + model.load_state_dict(dst_state_dict) + else: + model.load_state_dict(checkpoint['state_dict']) + + epoch = checkpoint['epoch'] + learning_rate = checkpoint['learning_rate'] + loss = checkpoint['loss'] + + return epoch, learning_rate, loss + + +def save_checkpoint(filename, model, epoch, learning_rate, loss, local_rank=0): + if local_rank != 0: + return + logging.info('Saving checkpoint to {filename}: epoch={epoch}, ' + 'learning_rate={learning_rate}, loss={loss}'.format( + filename=filename, + epoch=epoch, + learning_rate=learning_rate, + loss=loss)) + checkpoint = { + 'state_dict': model.state_dict(), + 'epoch': epoch, + 'learning_rate': learning_rate, + 'loss': loss + } + torch.save(checkpoint, filename) + + +def save_training_info(filename, + model_path, + current_epoch, + learning_rate, + loss, + best_loss, + best_epoch, + local_rank=0): + if local_rank != 0: + return + + with open(filename, 'w') as f: + f.write('model_path: {}\n'.format(model_path)) + f.write('epoch: {}\n'.format(current_epoch)) + f.write('learning rate: {}\n'.format(learning_rate)) + f.write('loss: {}\n'.format(loss)) + f.write('best loss: {}\n'.format(best_loss)) + f.write('best epoch: {}\n'.format(best_epoch)) diff --git a/egs/aishell/s10b/ctc/ctc_loss.py b/egs/aishell/s10b/ctc/ctc_loss.py new file mode 100644 index 00000000000..2c703c2af8c --- /dev/null +++ b/egs/aishell/s10b/ctc/ctc_loss.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch +from torch.autograd import Function +from torch.utils.dlpack import to_dlpack +import torch.nn as nn +import torch.nn.functional as F + +import kaldi +from kaldi import ctc + + +class WarpCtcLoss(Function): + + @staticmethod + def forward(ctx, activations, targets, input_lengths, target_lengths, blank, + reduction): + ''' + Args: + activations: `(seq_len, batch_size, 
+                C)`, where `C` is the number of characters
+                in the alphabet, including the blank symbol.
+
+            targets: a 1-D tensor containing the concatenated labels of all
+                utterances in the batch; its length equals the sum of
+                `target_lengths`. Targets cannot contain the blank.
+
+            input_lengths: a tensor of [batch_size] containing the number of input frames
+                for each utterance in the batch.
+
+            target_lengths: a tensor of [batch_size] containing the label lengths
+
+            blank: the index of the blank symbol.
+
+            reduction: specifies the reduction to apply to
+                the output: `none` | `mean` | `sum`.
+
+                `none`: no reduction will be applied;
+
+                `mean`: the output losses will be divided
+                by the target lengths and then the mean
+                over the batch is taken.
+
+                `sum`: the output will be summed.
+        '''
+        device = activations.device
+        assert device.type == 'cuda', 'we only support computing CTCLoss on GPU devices.'
+
+        activations_tensor = activations.float().reshape(-1).contiguous()
+        gradients_tensor = torch.zeros_like(activations_tensor).contiguous()
+
+        # NOTE(fangjun): foobar.cpu() is a no-op if foobar is already on CPU.
+        flat_labels_tensor = targets.int().view(-1).cpu()
+        label_lengths_tensor = target_lengths.int().view(-1).cpu()
+        input_lengths_tensor = input_lengths.int().view(-1).cpu()
+
+        alphabet_size = activations.size(2)
+        minibatch = activations.size(1)
+
+        costs_tensor = torch.zeros(minibatch, dtype=torch.float32).contiguous()
+
+        info = ctc.CtcOptions()
+        info.loc = ctc.CtcComputeLocation.CTC_GPU
+        info.blank_label = blank
+
+        label_lengths = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(label_lengths_tensor))
+
+        input_lengths = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(input_lengths_tensor))
+
+        status, size_in_bytes = ctc.GetWorkspaceSize(
+            label_lengths=label_lengths,
+            input_lengths=input_lengths,
+            alphabet_size=alphabet_size,
+            minibatch=minibatch,
+            info=info)
+
+        assert status == ctc.CtcStatus.CTC_STATUS_SUCCESS
+
+        num_floats = size_in_bytes // 4 + 1
+        workspace_tensor = torch.zeros(
+            num_floats, dtype=torch.float32).contiguous().to(device)
+
+        cu_activations = kaldi.CuSubVectorFromDLPack(
+            to_dlpack(activations_tensor))
+        cu_gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
+        flat_labels = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(flat_labels_tensor))
+        costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
+        workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))
+
+        stream = torch.cuda.default_stream(device)
+        with torch.cuda.stream(stream):
+            status = ctc.ComputeCtcLossGpu(activations=cu_activations,
+                                           gradients=cu_gradients,
+                                           flat_labels=flat_labels,
+                                           label_lengths=label_lengths,
+                                           input_lengths=input_lengths,
+                                           alphabet_size=alphabet_size,
+                                           minibatch=minibatch,
+                                           costs=costs,
+                                           workspace=workspace,
+                                           options=info)
+
+        gradients_tensor = gradients_tensor.reshape(*activations.shape)
+
+        ctx.save_for_backward(gradients_tensor)
+
+        if reduction == 'none':
+            return costs_tensor
+
+        if reduction == 'sum':
+            return torch.sum(costs_tensor)
+
+        # else it is `mean`: divide each loss by its target length,
+        # then take the mean over the batch (matching the documented
+        # semantics above and PyTorch's built-in CTCLoss)
+        return (costs_tensor / label_lengths_tensor.float()).mean()
+
+    @staticmethod
+    def backward(ctx, unused):
+        '''
+        The `forward` method has 6 inputs:
+        `activations`, `targets`, `input_lengths`,
+        `target_lengths`, `blank`, `reduction`
+
+        We have to return 6 values.
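+        Only `activations` is differentiable, so its gradient is returned
+        and `None` is returned for the other five inputs. Note also that
+        the incoming grad_output (`unused`) is not applied to the saved
+        gradients; the result is therefore only correct when `backward()`
+        is triggered directly from the loss returned by `forward`.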
+ ''' + gradients, = ctx.saved_tensors + return gradients, None, None, None, None, None + + +def warp_ctc_loss(activations, targets, input_lengths, target_lengths, blank, + reduction): + ''' + A thin wrapper for WarpCtcLoss. + + We can use keyword arguments with this wrapper + ''' + loss_func = WarpCtcLoss.apply + return loss_func(activations, targets, input_lengths, target_lengths, blank, + reduction) + + +class CTCLoss(nn.Module): + ''' + Note that PyTorch requires the probability to be log prob, + while warp-ctc does not have this requirement. + ''' + + def __init__(self, use_warp_ctc=True, blank=0, reduction='mean'): + ''' + Args: + blank: the index of the blank label + reduction: specifies the reduction to apply to + the output: `none` | `mean` | `sum`. + + `none`: no reduction will be applied; + + `mean`: the output losses will be divided + by the target lengths and then the mean + over the batch is taken. + + `sum`: the output will be summed. + ''' + super().__init__() + assert reduction in ['none', 'mean', 'sum'] + + # if use_warp_ctc: + # self.loss_func = warp_ctc_loss + # else: + # self.loss_func = F.ctc_loss + + self.use_warp_ctc = use_warp_ctc + + self.blank = blank + self.reduction = reduction + + def forward(self, activations, targets, input_lengths, target_lengths): + ''' + Args: + activations: `(seq_len, batch_size, C)`, where `C` is the number + of characters in alphabet including the blank symbol. + + targets: a tensor of [batch size] containing the concatenated labels. + Targets cannot be blank. + + input_lengths: a tensor of [batch_size] containing the number of input frames + for each utterance in the batch. + + target_lengths: a tensor of [batch_size] containing the label lengths + ''' + if self.use_warp_ctc == False: + # move all tensors to GPU + device = activations.device + targets = targets.to(device) + input_lengths = input_lengths.to(device) + target_lengths = target_lengths.to(device) + + log_probs = F.log_softmax(activations, dim=-1) + + return F.ctc_loss(log_probs=log_probs, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths, + blank=self.blank, + reduction=self.reduction) + else: + return warp_ctc_loss(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths, + blank=self.blank, + reduction=self.reduction) diff --git a/egs/aishell/s10b/ctc/ctc_loss_test.py b/egs/aishell/s10b/ctc/ctc_loss_test.py new file mode 100755 index 00000000000..338e816cffa --- /dev/null +++ b/egs/aishell/s10b/ctc/ctc_loss_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch + +import kaldi + +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence + +from ctc_loss import CTCLoss + + +def test_baidu_warp_ctc(): + device_id = 1 + kaldi.SelectGpuDevice(device_id=device_id) + + device = torch.device('cuda', index=device_id) + + ex1 = torch.tensor([[0.2, 0.2, 0.2, 0.2, 0.2]], dtype=torch.float32) + + ex2 = torch.tensor( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=torch.float32) + + ex3 = torch.tensor([[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6], + [-15, -14, -13, -12, -11]], + dtype=torch.float32) + + activations = pad_sequence([ex1, ex2, ex3], batch_first=False) + activations = activations.to(device) + + tmp_activations = activations.clone() + + activations.requires_grad_(True) + tmp_activations.requires_grad_(True) + + targets = 
torch.tensor([1, 3, 3, 2, 3]) + target_lengths = torch.tensor([1, 2, 2]) + input_lengths = torch.tensor([1, 3, 3]) + + loss_func = CTCLoss(use_warp_ctc=True, blank=0, reduction='mean') + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + print('warp ctc loss', loss) + loss.backward() + print('warp ctc activations grad', activations.grad) + + loss_func = CTCLoss(use_warp_ctc=False, blank=0, reduction='mean') + loss = loss_func(activations=tmp_activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + loss.backward() + print('loss', loss) + print('grad', tmp_activations.grad) + print('grad x 6', tmp_activations.grad * 6) + + # It turns out that + # - the loss + # - and the gradients + # computed by warp ctc and PyTorch's built-in CTCLoss are different. + + +def main(): + test_baidu_warp_ctc() + + +if __name__ == '__main__': + torch.manual_seed(20200224) + main() diff --git a/egs/aishell/s10b/ctc/dataset.py b/egs/aishell/s10b/ctc/dataset.py new file mode 100644 index 00000000000..6c3ab0aa7a9 --- /dev/null +++ b/egs/aishell/s10b/ctc/dataset.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import os +import logging + +import numpy as np +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader +from torch.utils.data import Dataset + +import kaldi + + +def get_ctc_dataloader(feats_scp, + labels_scp=None, + batch_size=1, + shuffle=False, + num_workers=0, + model_left_context=0, + model_right_context=0, + world_size=None, + local_rank=None): + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + collate_fn = CtcDatasetCollateFunc(model_left_context=model_left_context, + model_right_context=model_right_context) + + if world_size: + logging.info('world_size: {}'.format(world_size)) + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=local_rank, shuffle=shuffle) + # sampler and shuffle are mutually exclusive; + # it will raise an exception if you set both + shuffle = False + + else: + sampler = None + + dataloader = DataLoader(dataset, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + collate_fn=collate_fn, + sampler=sampler) + + return dataloader + + +def _add_model_left_right_context(x, left_context, right_context): + padded = x + if left_context > 0: + first_frame = x[0, :] + left_padding = [first_frame] * left_context + padded = np.vstack([left_padding, x]) + + if right_context > 0: + last_frame = x[-1, :] + right_padding = [last_frame] * right_context + padded = np.vstack([padded, right_padding]) + + return padded + + +class CtcDataset(Dataset): + + def __init__(self, feats_scp, labels_scp=None): + ''' + Args: + feats_scp: filename for feats.scp + labels_scp: if provided, it is the filename of labels.scp + ''' + assert os.path.isfile(feats_scp) + if labels_scp: + assert os.path.isfile(labels_scp) + logging.info('labels scp: {}'.format(labels_scp)) + else: + logging.warn('No labels scp is given.') + + # items is a dict of [uttid, feat_rxfilename, None] + # or [uttid, feat_rxfilename, label_rxfilename] if labels_scp is not None + items = dict() + + with open(feats_scp, 'r') as f: + for line in f: + # every line has the following format: + # uttid feat_rxfilename + uttid_rxfilename = line.split() + assert len(uttid_rxfilename) == 2 + + uttid, rxfilename = 
uttid_rxfilename + + assert uttid not in items + + items[uttid] = [uttid, rxfilename, None] + + if labels_scp: + expected_count = len(items) + n = 0 + with open(labels_scp, 'r') as f: + for line in f: + # every line has the following format: + # uttid rxfilename + uttid_rxfilename = line.split() + + assert len(uttid_rxfilename) == 2 + + uttid, rxfilename = uttid_rxfilename + + assert uttid in items + + items[uttid][-1] = rxfilename + + n += 1 + + # every utterance should have a label if + # labels_scp is given + assert n == expected_count + + self.items = list(items.values()) + self.num_items = len(self.items) + self.feats_scp = feats_scp + self.labels_scp = labels_scp + + def __len__(self): + return self.num_items + + def __getitem__(self, i): + ''' + Returns: + a list [key, feat_rxfilename, label_rxfilename] + Note that label_rxfilename may be None. + ''' + return self.items[i] + + def __str__(self): + s = 'feats scp: {}\n'.format(self.feats_scp) + + if self.labels_scp: + s += 'labels scp: {}\n'.format(self.labels_scp) + + s += 'num utterances: {}\n'.format(self.num_items) + + return s + + +class CtcDatasetCollateFunc: + + def __init__(self, model_left_context=0, model_right_context=0): + self.model_left_context = model_left_context + self.model_right_context = model_right_context + + def __call__(self, batch): + ''' + Args: + batch: a list of [uttid, feat_rxfilename, label_rxfilename]. + Note that label_rxfilename may be None. + + Returns: + uttid_list: a list of utterance id + + feat: a 3-D float tensor of shape [batch_size, seq_len, feat_dim] + + feat_len_list: number of frames of each utterance before padding + + label_list: a list of labels of each utterance; It may be None. + + label_len_list: label length of each utterance; It is None if label_list is None. 
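+
+        Note that label_list is the concatenation of the labels of all
+        utterances in the batch (as required by CTC losses); use
+        label_len_list to recover utterance boundaries. For example
+        (hypothetical), labels [1, 2] and [3] for two utterances yield
+        label_list == [1, 2, 3] and label_len_list == [2, 1].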
+ ''' + uttid_list = [] # utterance id of each utterance + feat_len_list = [] # number of frames of each utterance + label_list = [] # label of each utterance + label_len_list = [] # label length of each utterance + + feat_list = [] + + for b in batch: + uttid, feat_rxfilename, label_rxfilename = b + + uttid_list.append(uttid) + + feat = kaldi.read_mat(feat_rxfilename).numpy() + + # use the length before padding + feat_len_list.append(feat.shape[0]) + + feat = _add_model_left_right_context(feat, self.model_left_context, + self.model_right_context) + + feat = torch.from_numpy(feat).float() + + feat_list.append(feat) + + if label_rxfilename: + label = kaldi.read_vec_int(label_rxfilename) + assert 0 not in label + + # we will use frame subsampling factor == 3 + assert len(label) < feat_len_list[-1] / 3 + + label_list.extend(label) + label_len_list.append(len(label)) + + feat = pad_sequence(feat_list, batch_first=True) + + if not label_list: + label_list = None + label_len_list = None + + return uttid_list, feat, feat_len_list, label_list, label_len_list + + +def _test_dataset(): + feats_scp = 'data/train_sp/feats.scp' + labels_scp = 'data/train_sp/labels.scp' + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + print(dataset) + + +def _test_dataloader(): + feats_scp = 'data/test/feats.scp' + labels_scp = 'data/test/labels.scp' + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + dataloader = DataLoader(dataset, + batch_size=2, + num_workers=10, + shuffle=True, + collate_fn=CtcDatasetCollateFunc()) + i = 0 + for batch in dataloader: + uttid_list, feat, feat_len_list, label_list, label_len_list = batch + print(uttid_list, feat.shape, feat_len_list, label_len_list) + i += 1 + if i > 10: + break + + +if __name__ == '__main__': + # _test_dataset() + _test_dataloader() diff --git a/egs/aishell/s10b/ctc/ddp_train.py b/egs/aishell/s10b/ctc/ddp_train.py new file mode 100644 index 00000000000..0047d2fa554 --- /dev/null +++ b/egs/aishell/s10b/ctc/ddp_train.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging +import math +import os +import sys +import warnings + +# disable warnings when loading tensorboard +warnings.simplefilter(action='ignore', category=FutureWarning) + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch.optim as optim +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.utils import clip_grad_value_ +from torch.utils.tensorboard import SummaryWriter + +import kaldi + +from common import load_checkpoint +from common import save_checkpoint +from common import save_training_info +from common import setup_logger +from ctc_loss import CTCLoss +from dataset import get_ctc_dataloader +from model import get_ctc_model +from options import get_args + + +def train_one_epoch(dataloader, model, device, optimizer, loss_func, + current_epoch, tf_writer): + total_loss = 0. + num = 0. + + # TODO(fangjun): remove `num_repeat`. It's used only for testing. 
+ num_repeat = 100 + for kk in range(num_repeat): + for batch_idx, batch in enumerate(dataloader): + unused_uttid_list, feat, feat_len_list, label_list, label_len_list = batch + + feat = feat.to(device) + + activations, feat_len_list = model(feat, feat_len_list) + + # at this point activations is of shape: [batch_size, seq_len, output_dim] + # CTCLoss requires a layout: [seq_len, batch_size, output_dim] + + activations = activations.permute(1, 0, 2) + # now activations is of shape [seq_len, batch_size, output_dim] + + targets = torch.tensor(label_list) + + if not isinstance(feat_len_list, torch.Tensor): + input_lengths = torch.tensor(feat_len_list) + else: + input_lengths = feat_len_list + + target_lengths = torch.tensor(label_len_list) + + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + optimizer.zero_grad() + if math.isnan(loss.item()): + print(loss) + logging.warn('loss is nan for batch {} at epoch {}\n' + 'feat_len_list: {}\n' + 'label_len_list: {}\n'.format( + batch_idx, current_epoch, feat_len_list, + label_len_list)) + import sys + sys.exit(1) + + loss.backward() + + # clip_grad_value_(model.parameters(), 5.0) + + optimizer.step() + + total_loss += loss.item() + num += 1 + if batch_idx % 100 == 0: + logging.info( + 'Device ({}) batch {}/{} ({:.2f}%) ({}/{}), loss {:.5f}, average {:.5f}' + .format(device.index, batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100, kk, + num_repeat, loss.item(), total_loss / num)) + + if tf_writer and batch_idx % 100 == 0: + tf_writer.add_scalar( + 'train/current_batch_average_loss', loss.item(), + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + tf_writer.add_scalar( + 'train/global_average_loss', total_loss / num, + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + return total_loss / num + + +def main(): + args = get_args() + setup_logger('{}/log-train-device-{}'.format(args.dir, args.device_id), + args.log_level) + logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.error('No GPU detected!') + sys.exit(-1) + + dist.init_process_group('nccl', + rank=args.device_id, + world_size=args.world_size) + + kaldi.SelectGpuDevice(device_id=args.device_id) + + device = torch.device('cuda', args.device_id) + + model = get_ctc_model(input_dim=args.input_dim, + output_dim=args.output_dim, + num_layers=args.num_layers, + hidden_dim=args.hidden_dim, + proj_dim=args.proj_dim) + + start_epoch = 0 + num_epochs = args.num_epochs + learning_rate = args.learning_rate + best_loss = None + + if args.checkpoint: + start_epoch, learning_rate, best_loss = load_checkpoint( + args.checkpoint, model) + logging.info( + 'loaded from checkpoint: start epoch {start_epoch}, ' + 'learning rate {learning_rate}, best loss {best_loss}'.format( + start_epoch=start_epoch, + learning_rate=learning_rate, + best_loss=best_loss)) + + model.to(device) + + model = DDP(model, device_ids=[args.device_id]) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + labels_scp=args.labels_scp, + batch_size=args.batch_size, + shuffle=True, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context, + world_size=args.world_size, + local_rank=args.device_id) + + lr = learning_rate + optimizer = optim.Adam(model.parameters(), + lr=lr, + weight_decay=args.l2_regularize) + + if device.index == 0: + tf_writer = 
SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) + else: + tf_writer = None + + model.train() + + loss_func = CTCLoss(use_warp_ctc=False, blank=0, reduction='mean') + + best_epoch = 0 + best_model_path = os.path.join(args.dir, 'best_model.pt') + best_epoch_info_filename = os.path.join(args.dir, 'best-epoch-info') + + dist.barrier() + + try: + for epoch in range(start_epoch, num_epochs): + learning_rate = lr * pow(0.8, epoch) + # learning_rate = lr + + if tf_writer: + tf_writer.add_scalar('learning_rate', learning_rate, epoch) + + for param_group in optimizer.param_groups: + param_group['lr'] = learning_rate + + logging.info('Device ({}) epoch {}, learning rate {}'.format( + device.index, epoch, learning_rate)) + + loss = train_one_epoch(dataloader=dataloader, + model=model, + device=device, + optimizer=optimizer, + loss_func=loss_func, + current_epoch=epoch, + tf_writer=tf_writer) + + # the lower, the better + if best_loss is None or best_loss > loss: + best_loss = loss + best_epoch = epoch + save_checkpoint(filename=best_model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + save_training_info(filename=best_epoch_info_filename, + model_path=best_model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + + # we always save the model for every epoch + model_path = os.path.join(args.dir, 'epoch-{}.pt'.format(epoch)) + save_checkpoint(filename=model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + + epoch_info_filename = os.path.join(args.dir, + 'epoch-{}-info'.format(epoch)) + save_training_info(filename=epoch_info_filename, + model_path=model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + except KeyboardInterrupt: + # save the model when ctrl-c is pressed + model_path = os.path.join(args.dir, + 'epoch-{}-interrupted.pt'.format(epoch)) + # use a very large loss for the interrupted model + loss = 100000000 + save_checkpoint(model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + + epoch_info_filename = os.path.join( + args.dir, 'epoch-{}-interrupted-info'.format(epoch)) + save_training_info(filename=epoch_info_filename, + model_path=model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + + if tf_writer: + tf_writer.close() + logging.warning('Device ({}) Training done!'.format(args.device_id)) + + +if __name__ == '__main__': + torch.manual_seed(20200221) + main() diff --git a/egs/aishell/s10b/ctc/inference.py b/egs/aishell/s10b/ctc/inference.py new file mode 100644 index 00000000000..0eb922a0376 --- /dev/null +++ b/egs/aishell/s10b/ctc/inference.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging +import os +import sys + +import torch +import torch.nn.functional as F + +import kaldi + +from common import load_checkpoint +from common import setup_logger +from dataset import get_ctc_dataloader +from model import get_ctc_model +from options import get_args +from tdnnf_model import get_tdnnf_model + + +def main(): + args = get_args() + + setup_logger('{}/log-inference'.format(args.dir), args.log_level) + 
logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.warning('No GPU detected! Use CPU for inference.') + device = torch.device('cpu') + else: + device = torch.device('cuda', args.device_id) + + model = get_tdnnf_model( + input_dim=args.input_dim, + output_dim=args.output_dim, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) + + load_checkpoint(args.checkpoint, model) + + model.to(device) + model.eval() + + wspecifier = 'ark,scp:{filename}.ark,{filename}.scp'.format( + filename=os.path.join(args.dir, 'nnet_output')) + + writer = kaldi.MatrixWriter(wspecifier) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + batch_size=args.batch_size, + shuffle=False, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context) + + for batch_idx, batch in enumerate(dataloader): + uttid_list, feat, feat_len_list, _, _ = batch + + feat = feat.to(device) + + with torch.no_grad(): + activations, feat_len_list = model(feat, feat_len_list) + + log_probs = F.log_softmax(activations, dim=-1) + + num = len(uttid_list) + for i in range(num): + uttid = uttid_list[i] + feat_len = feat_len_list[i] + value = log_probs[i, :feat_len, :] + + value = value.cpu() + + writer.Write(uttid, value.numpy()) + + if batch_idx % 10 == 0: + logging.info('Processed batch {}/{} ({:.3f}%)'.format( + batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100)) + + writer.Close() + logging.info('pseudo-log-likelihood is saved to {}'.format( + os.path.join(args.dir, 'nnet_output.scp'))) + + +if __name__ == '__main__': + main() diff --git a/egs/aishell/s10b/ctc/model.py b/egs/aishell/s10b/ctc/model.py new file mode 100644 index 00000000000..dbda62f6fca --- /dev/null +++ b/egs/aishell/s10b/ctc/model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence +from torch.nn.utils.rnn import pad_packed_sequence + +from add_deltas_layer import AddDeltasLayer + + +# TODO(fangjun): remove proj_dim since we'll use TDNN-F. 
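+# A typical invocation looks like the following (the dimensions here are
+# illustrative only, not values mandated by this recipe):
+#
+#   model = get_ctc_model(input_dim=40, output_dim=218)
+#   activations, out_len_list = model(feat, feat_len_list)
+#
+# where `feat` has shape [batch_size, seq_len, input_dim] and
+# `activations` has shape [batch_size, subsampled_seq_len, output_dim];
+# no log_softmax is applied to `activations`.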
+def get_ctc_model(input_dim,
+                  output_dim,
+                  num_layers=4,
+                  hidden_dim=512,
+                  proj_dim=256,
+                  frame_subsampling_factor=3):
+    model = CtcModel(input_dim=input_dim,
+                     output_dim=output_dim,
+                     num_layers=num_layers,
+                     hidden_dim=hidden_dim,
+                     proj_dim=proj_dim,
+                     frame_subsampling_factor=frame_subsampling_factor)
+
+    return model
+
+
+class CtcModel(nn.Module):
+
+    def __init__(self, input_dim, output_dim, num_layers, hidden_dim, proj_dim,
+                 frame_subsampling_factor):
+        '''
+        Args:
+            input_dim: input dimension of the network
+
+            output_dim: output dimension of the network
+
+            num_layers: number of LSTM layers of the network
+
+            hidden_dim: the dimension of the hidden state of LSTM layers
+
+            proj_dim: dimension of the affine layer after every LSTM layer
+
+            frame_subsampling_factor: either 1 (no subsampling) or 3
+                                      (keep every third frame)
+        '''
+        super().__init__()
+
+        assert frame_subsampling_factor in [1, 3]
+        self.sf = frame_subsampling_factor
+
+        # batchnorm requires input of shape [N, C, L] == [batch_size, dim, seq_len]
+        self.input_batch_norm = nn.BatchNorm1d(num_features=input_dim * 3,
+                                               affine=False)
+
+        self.lstm = nn.LSTM(input_size=input_dim * 3,
+                            hidden_size=hidden_dim,
+                            num_layers=num_layers,
+                            dropout=0.2,
+                            batch_first=True)
+
+        self.prefinal = nn.Linear(in_features=hidden_dim,
+                                  out_features=output_dim)
+
+        self.add_deltas_layer = AddDeltasLayer()
+
+    def forward(self, feat, feat_len_list):
+        '''
+        Args:
+            feat: a 3-D tensor of shape [batch_size, seq_len, feat_dim]
+            feat_len_list: feat length of each utterance before padding
+
+        Returns:
+            a 3-D tensor of shape [batch_size, seq_len, output_dim]
+            It is the output of `nn.Linear`. That is, **NO** log_softmax
+            is applied to the output.
+        '''
+        x = feat
+
+        # at this point, x is of shape [batch_size, seq_len, feat_dim]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is of shape [batch_size, feat_dim, seq_len] == [N, C, L]
+
+        x = self.add_deltas_layer(x)
+
+        if self.sf == 3:
+            x = x[:, :, ::3]
+            feat_len_list = (torch.tensor(feat_len_list).int() + 2) // 3
+            # feat_len_list is still of type int32
+
+        x = self.input_batch_norm(x)
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is of shape [batch_size, seq_len, feat_dim] == [N, L, C]
+
+        x = pack_padded_sequence(input=x,
+                                 lengths=feat_len_list,
+                                 batch_first=True,
+                                 enforce_sorted=False)
+
+        # TODO(fangjun): save intermediate LSTM state to support streaming inference
+        x, _ = self.lstm(x)
+
+        x, _ = pad_packed_sequence(x, batch_first=True)
+
+        x = self.prefinal(x)
+
+        return x, feat_len_list
+
+
+def _test_ctc_model():
+    input_dim = 5
+    output_dim = 20
+    model = CtcModel(input_dim=input_dim,
+                     output_dim=output_dim,
+                     num_layers=2,
+                     hidden_dim=3,
+                     proj_dim=4,
+                     frame_subsampling_factor=1)
+
+    # AddDeltasLayer trims two frames at each boundary, so feed 4 extra
+    # frames of context and report the real lengths (6 and 8) below.
+    feat1 = torch.randn((6 + 4, input_dim))
+    feat2 = torch.randn((8 + 4, input_dim))
+
+    from torch.nn.utils.rnn import pad_sequence
+    feat = pad_sequence([feat1, feat2], batch_first=True)
+    assert feat.shape == torch.Size([2, 12, input_dim])
+
+    feat_len_list = [6, 8]
+    x, _ = model(feat, feat_len_list)
+
+    assert x.shape == torch.Size([2, 8, output_dim])
+
+
+if __name__ == '__main__':
+    _test_ctc_model()
diff --git a/egs/aishell/s10b/ctc/options.py b/egs/aishell/s10b/ctc/options.py
new file mode 100644
index 00000000000..178f34d45fd
--- /dev/null
+++ b/egs/aishell/s10b/ctc/options.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import argparse
+import os
+
+
+def _str2bool(v):
+    '''
+    This function is modified from
https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + ''' + if isinstance(v, bool): + return v + elif v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def _set_training_args(parser): + parser.add_argument('--train.labels-scp', + dest='labels_scp', + help='filename of labels.scp', + type=str) + + parser.add_argument('--train.num-epochs', + dest='num_epochs', + help='number of epochs to train', + type=int) + + parser.add_argument('--train.lr', + dest='learning_rate', + help='learning rate', + type=float) + + parser.add_argument('--train.l2-regularize', + dest='l2_regularize', + help='l2 regularize', + type=float) + + # TODO(fangjun): add validation feats_scp + + # PyTorch DistributedDataParallel (ddp) parameters + parser.add_argument( + '--train.use-ddp', + dest='use_ddp', + help="true to use PyTorch's built-in DistributedDataParallel trainer", + type=_str2bool) + + # note that we use device id as local rank. + + parser.add_argument('--train.ddp.world-size', + dest='world_size', + help='world size in ddp', + default=1, + type=int) + + +def _check_training_args(args): + assert os.path.isfile(args.labels_scp) + + assert args.num_epochs > 0 + assert args.learning_rate > 0 + assert args.l2_regularize >= 0 + + if args.checkpoint: + assert os.path.exists(args.checkpoint) + + if args.use_ddp: + assert args.world_size >= 1 + + +def _check_inference_args(args): + assert args.checkpoint is not None + assert os.path.isfile(args.checkpoint) + + +def _check_args(args): + if args.is_training: + _check_training_args(args) + else: + _check_inference_args(args) + + assert os.path.isdir(args.dir) + assert os.path.isfile(args.feats_scp) + + assert args.batch_size > 0 + assert args.device_id >= 0 + + assert args.input_dim > 0 + assert args.output_dim > 0 + assert args.model_left_context >= 0 + assert args.model_right_context >= 0 + assert args.hidden_dim > 0 + assert args.bottleneck_dim > 0 + assert args.prefinal_bottleneck_dim > 0 + + assert args.kernel_size_list is not None + assert len(args.kernel_size_list) > 0 + + assert args.subsampling_factor_list is not None + assert len(args.subsampling_factor_list) > 0 + + args.kernel_size_list = [int(k) for k in args.kernel_size_list.split(', ')] + + args.subsampling_factor_list = [ + int(k) for k in args.subsampling_factor_list.split(', ') + ] + + assert len(args.kernel_size_list) == len(args.subsampling_factor_list) + + assert args.log_level in ['debug', 'info', 'warning'] + + +def get_args(): + parser = argparse.ArgumentParser( + description='CTC training in PyTorch with kaldi pybind') + + _set_training_args(parser) + + parser.add_argument('--is-training', + dest='is_training', + help='true for training, false for inference', + required=True, + type=_str2bool) + + parser.add_argument('--dir', + help='dir to save results. 
The user has to ' + 'create it before calling this script.', + required=True, + type=str) + + parser.add_argument('--feats-scp', + dest='feats_scp', + help='filename of feats.scp', + required=True, + type=str) + + parser.add_argument('--device-id', + dest='device_id', + help='GPU device id', + required=True, + type=int) + + parser.add_argument('--batch-size', + dest='batch_size', + help='batch size used in training and inference', + required=True, + type=int) + + parser.add_argument('--input-dim', + dest='input_dim', + help='input dimension of the network', + required=True, + type=int) + + parser.add_argument('--output-dim', + dest='output_dim', + help='output dimension of the network', + required=True, + type=int) + + parser.add_argument('--model-left-context', + dest='model_left_context', + help='model left context', + type=int, + default=0) + + parser.add_argument('--model-right-context', + dest='model_right_context', + help='model right context', + type=int, + default=0) + + parser.add_argument('--hidden-dim', + dest='hidden_dim', + help='nn hidden dimension', + required=True, + type=int) + + parser.add_argument('--bottleneck-dim', + dest='bottleneck_dim', + help='nn bottleneck dimension', + required=True, + type=int) + + parser.add_argument('--prefinal-bottleneck-dim', + dest='prefinal_bottleneck_dim', + help='nn prefinal bottleneck dimension', + required=True, + type=int) + + parser.add_argument('--kernel-size-list', + dest='kernel_size_list', + help='kernel_size_list', + required=True, + type=str) + + parser.add_argument('--subsampling-factor-list', + dest='subsampling_factor_list', + help='subsampling_factor_list', + required=True, + type=str) + + parser.add_argument('--log-level', + dest='log_level', + help='log level. valid values: debug, info, warning', + type=str, + default='info') + + parser.add_argument( + '--checkpoint', + dest='checkpoint', + help='filename of the checkpoint, required for inference', + type=str) + + args = parser.parse_args() + + _check_args(args) + + return args diff --git a/egs/aishell/s10b/ctc/tdnnf_layer.py b/egs/aishell/s10b/ctc/tdnnf_layer.py new file mode 120000 index 00000000000..60cac44c091 --- /dev/null +++ b/egs/aishell/s10b/ctc/tdnnf_layer.py @@ -0,0 +1 @@ +../../s10/chain/tdnnf_layer.py \ No newline at end of file diff --git a/egs/aishell/s10b/ctc/tdnnf_model.py b/egs/aishell/s10b/ctc/tdnnf_model.py new file mode 100644 index 00000000000..788952a4505 --- /dev/null +++ b/egs/aishell/s10b/ctc/tdnnf_model.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from add_deltas_layer import AddDeltasLayer +from tdnnf_layer import FactorizedTDNN +from tdnnf_layer import OrthonormalLinear +from tdnnf_layer import PrefinalLayer + + +def get_tdnnf_model(input_dim, output_dim, hidden_dim, bottleneck_dim, + prefinal_bottleneck_dim, kernel_size_list, + subsampling_factor_list): + model = TdnnfModel(input_dim=input_dim, + output_dim=output_dim, + hidden_dim=hidden_dim, + bottleneck_dim=bottleneck_dim, + prefinal_bottleneck_dim=prefinal_bottleneck_dim, + kernel_size_list=kernel_size_list, + subsampling_factor_list=subsampling_factor_list) + return model + + +''' +input dim=43 name=input + +# please note that it is important to have input layer with the name=input +# as the layer immediately preceding the fixed-affine-layer to enable +# the use of short notation for the descriptor 
+fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=exp/chain_cleaned_1c/tdnn1c_sp/configs/lda.mat + +# the first splicing is moved before the lda layer, so no splicing here +relu-batchnorm-dropout-layer name=tdnn1 l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true dim=1024 +tdnnf-layer name=tdnnf2 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf3 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf4 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf5 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=0 +tdnnf-layer name=tdnnf6 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf7 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf8 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf9 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf10 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf11 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf12 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf13 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +linear-component name=prefinal-l dim=256 l2-regularize=0.008 orthonormal-constraint=-1.0 + +prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.008 big-dim=1024 small-dim=256 +output-layer name=output include-log-softmax=false dim=3456 l2-regularize=0.002 + +prefinal-layer name=prefinal-xent input=prefinal-l l2-regularize=0.008 big-dim=1024 small-dim=256 +output-layer name=output-xent dim=3456 learning-rate-factor=5.0 l2-regularize=0.002 +''' + + +# Create a network like the above one +class TdnnfModel(nn.Module): + + def __init__(self, + input_dim, + output_dim, + hidden_dim=1024, + bottleneck_dim=128, + prefinal_bottleneck_dim=256, + kernel_size_list=[3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3], + subsampling_factor_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1]): + super().__init__() + + assert len(kernel_size_list) == len(subsampling_factor_list) + num_layers = len(kernel_size_list) + + # deltas_layer requires [N, C, T] + self.delta_layer = AddDeltasLayer() + + # batch_norm0 requires [N, C, T] + self.batch_norm0 = nn.BatchNorm1d(num_features=input_dim * 3, + affine=False) + + # tdnn1_affine requires [N, T, C] + self.tdnn1_affine = nn.Linear(in_features=input_dim * 3, + out_features=hidden_dim) + + # tdnn1_batchnorm requires [N, C, T] + self.tdnn1_batchnorm = nn.BatchNorm1d(num_features=hidden_dim, + affine=False) + + tdnnfs = [] + for i in range(num_layers): + kernel_size = kernel_size_list[i] + subsampling_factor = subsampling_factor_list[i] + layer = FactorizedTDNN(dim=hidden_dim, + bottleneck_dim=bottleneck_dim, + kernel_size=kernel_size, + subsampling_factor=subsampling_factor) + tdnnfs.append(layer) + + # tdnnfs requires [N, C, T] + self.tdnnfs = 
nn.ModuleList(tdnnfs)
+
+        # prefinal_l affine requires [N, C, T]
+        self.prefinal_l = OrthonormalLinear(
+            dim=hidden_dim,
+            bottleneck_dim=prefinal_bottleneck_dim,
+            kernel_size=1)
+
+        # prefinal_chain requires [N, C, T]
+        self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim,
+                                            small_dim=prefinal_bottleneck_dim)
+
+        # output_affine requires [N, T, C]
+        self.output_affine = nn.Linear(in_features=prefinal_bottleneck_dim,
+                                       out_features=output_dim)
+
+        # prefinal_xent requires [N, C, T]
+        self.prefinal_xent = PrefinalLayer(big_dim=hidden_dim,
+                                           small_dim=prefinal_bottleneck_dim)
+
+        self.output_xent_affine = nn.Linear(in_features=prefinal_bottleneck_dim,
+                                            out_features=output_dim)
+
+    # TODO(fangjun): avoid `permute`.
+    def forward(self, x, feat_len_list):
+        # input x is of shape: [batch_size, seq_len, input_dim] = [N, T, C]
+        assert x.ndim == 3
+
+        # at this point, x is [N, T, C]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, C, T]
+        x = self.delta_layer(x)
+
+        # at this point, x is [N, C, T]
+        x = self.batch_norm0(x)
+
+        # at this point, x is [N, C, T]
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, T, C]
+
+        x = self.tdnn1_affine(x)
+
+        # at this point, x is [N, T, C]
+
+        x = F.relu(x)
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, C, T]
+
+        x = self.tdnn1_batchnorm(x)
+
+        # tdnnf requires input of shape [N, C, T]
+        for i in range(len(self.tdnnfs)):
+            x = self.tdnnfs[i](x)
+
+        # at this point, x is [N, C, T]
+
+        x = self.prefinal_l(x)
+
+        # at this point, x is [N, C, T]
+
+        x = self.prefinal_chain(x)
+
+        # at this point, x is [N, C, T]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, T, C]
+        x = self.output_affine(x)
+
+        # the network subsamples by a factor of 3 in total;
+        # keep the lengths as int32
+        feat_len_list = (torch.tensor(feat_len_list).int() + 2) // 3
+
+        return x, feat_len_list
+
+    def constrain_orthonormal(self):
+        for i in range(len(self.tdnnfs)):
+            self.tdnnfs[i].constrain_orthonormal()
+
+        self.prefinal_l.constrain_orthonormal()
+        self.prefinal_chain.constrain_orthonormal()
+        self.prefinal_xent.constrain_orthonormal()
+
+
+if __name__ == '__main__':
+    input_dim = 40
+    output_dim = 218
+    model = TdnnfModel(input_dim=input_dim, output_dim=output_dim)
+    N = 1
+    T = 150 + 29 + 29
+    C = input_dim
+    x = torch.arange(N * T * C).reshape(N, T, C).float()
+    # 150 real frames plus 29 frames of context on each side
+    nnet_output, out_len_list = model(x, feat_len_list=[150])
+    print(x.shape, nnet_output.shape)
+    model.constrain_orthonormal()
diff --git a/egs/aishell/s10b/ctc/train.py b/egs/aishell/s10b/ctc/train.py
new file mode 100644
index 00000000000..69c99c603ca
--- /dev/null
+++ b/egs/aishell/s10b/ctc/train.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import logging
+import math
+import os
+import sys
+import warnings
+
+# disable warnings when loading tensorboard
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.nn.utils import clip_grad_value_
+from torch.utils.tensorboard import SummaryWriter
+
+import kaldi
+
+from common import load_checkpoint
+from common import save_checkpoint
+from common import save_training_info
+from common import setup_logger
+from ctc_loss import CTCLoss
+from dataset import get_ctc_dataloader
+from options import get_args
+from tdnnf_model import get_tdnnf_model
+
+
+def train_one_epoch(dataloader, model, device, optimizer, loss_func,
+                    current_epoch, tf_writer):
+    total_loss = 0.
+    num = 0.
+ + num_repeat = 1 + for kk in range(num_repeat): + for batch_idx, batch in enumerate(dataloader): + unused_uttid_list, feat, feat_len_list, label_list, label_len_list = batch + + feat = feat.to(device) + + activations, feat_len_list = model(feat, feat_len_list) + + # at this point activations is of shape: [batch_size, seq_len, output_dim] + # CTCLoss requires a layout: [seq_len, batch_size, output_dim] + + activations = activations.permute(1, 0, 2) + # now activations is of shape [seq_len, batch_size, output_dim] + + targets = torch.tensor(label_list) + + if not isinstance(feat_len_list, torch.Tensor): + input_lengths = torch.tensor(feat_len_list) + else: + input_lengths = feat_len_list + + target_lengths = torch.tensor(label_len_list) + + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + optimizer.zero_grad() + if math.isnan(loss.item()): + print(loss) + logging.warn('loss is nan for batch {} at epoch {}\n' + 'feat_len_list: {}\n' + 'label_len_list: {}\n'.format( + batch_idx, current_epoch, feat_len_list, + label_len_list)) + import sys + sys.exit(1) + + loss.backward() + + # clip_grad_value_(model.parameters(), 5.0) + + optimizer.step() + + total_loss += loss.item() + num += 1 + + if np.random.choice(4) == 0: + with torch.no_grad(): + model.constrain_orthonormal() + + if batch_idx % 100 == 0: + logging.info( + 'batch {}/{} ({:.2f}%) ({}/{}), loss {:.5f}, average {:.5f}' + .format(batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100, kk, + num_repeat, loss.item(), total_loss / num)) + + if batch_idx % 100 == 0: + tf_writer.add_scalar( + 'train/current_batch_average_loss', loss.item(), + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + tf_writer.add_scalar( + 'train/global_average_loss', total_loss / num, + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + return total_loss / num + + +def main(): + args = get_args() + setup_logger('{}/log-train'.format(args.dir), args.log_level) + logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.error('No GPU detected!') + sys.exit(-1) + + kaldi.SelectGpuDevice(device_id=args.device_id) + + device = torch.device('cuda', args.device_id) + + model = get_tdnnf_model( + input_dim=args.input_dim, + output_dim=args.output_dim, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) + + start_epoch = 0 + num_epochs = args.num_epochs + learning_rate = args.learning_rate + best_loss = None + + if args.checkpoint: + start_epoch, learning_rate, best_loss = load_checkpoint( + args.checkpoint, model) + logging.info( + 'loaded from checkpoint: start epoch {start_epoch}, ' + 'learning rate {learning_rate}, best loss {best_loss}'.format( + start_epoch=start_epoch, + learning_rate=learning_rate, + best_loss=best_loss)) + + model.to(device) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + labels_scp=args.labels_scp, + batch_size=args.batch_size, + shuffle=True, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context) + + lr = learning_rate + optimizer = optim.Adam(model.parameters(), + lr=lr, + weight_decay=args.l2_regularize) + + tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) + + model.train() + + loss_func = 
CTCLoss(use_warp_ctc=False, blank=0, reduction='mean')
+
+    best_epoch = 0
+    best_model_path = os.path.join(args.dir, 'best_model.pt')
+    best_epoch_info_filename = os.path.join(args.dir, 'best-epoch-info')
+
+    try:
+        for epoch in range(start_epoch, num_epochs):
+            learning_rate = lr * pow(0.8, epoch)
+            # learning_rate = lr
+            tf_writer.add_scalar('learning_rate', learning_rate, epoch)
+
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = learning_rate
+
+            logging.info('epoch {}, learning rate {}'.format(
+                epoch, learning_rate))
+
+            loss = train_one_epoch(dataloader=dataloader,
+                                   model=model,
+                                   device=device,
+                                   optimizer=optimizer,
+                                   loss_func=loss_func,
+                                   current_epoch=epoch,
+                                   tf_writer=tf_writer)
+
+            # the lower the loss, the better
+            if best_loss is None or best_loss > loss:
+                best_loss = loss
+                best_epoch = epoch
+                save_checkpoint(filename=best_model_path,
+                                model=model,
+                                epoch=epoch,
+                                learning_rate=learning_rate,
+                                loss=loss)
+                save_training_info(filename=best_epoch_info_filename,
+                                   model_path=best_model_path,
+                                   current_epoch=epoch,
+                                   learning_rate=learning_rate,
+                                   loss=loss,
+                                   best_loss=best_loss,
+                                   best_epoch=best_epoch)
+
+            # we always save the model for every epoch
+            model_path = os.path.join(args.dir, 'epoch-{}.pt'.format(epoch))
+            save_checkpoint(filename=model_path,
+                            model=model,
+                            epoch=epoch,
+                            learning_rate=learning_rate,
+                            loss=loss)
+
+            epoch_info_filename = os.path.join(args.dir,
+                                               'epoch-{}-info'.format(epoch))
+            save_training_info(filename=epoch_info_filename,
+                               model_path=model_path,
+                               current_epoch=epoch,
+                               learning_rate=learning_rate,
+                               loss=loss,
+                               best_loss=best_loss,
+                               best_epoch=best_epoch)
+    except KeyboardInterrupt:
+        # save the model when ctrl-c is pressed
+        model_path = os.path.join(args.dir,
+                                  'epoch-{}-interrupted.pt'.format(epoch))
+        # use a very large loss for the interrupted model
+        loss = 100000000
+        save_checkpoint(model_path,
+                        model=model,
+                        epoch=epoch,
+                        learning_rate=learning_rate,
+                        loss=loss)
+
+        epoch_info_filename = os.path.join(
+            args.dir, 'epoch-{}-interrupted-info'.format(epoch))
+        save_training_info(filename=epoch_info_filename,
+                           model_path=model_path,
+                           current_epoch=epoch,
+                           learning_rate=learning_rate,
+                           loss=loss,
+                           best_loss=best_loss,
+                           best_epoch=best_epoch)
+
+    tf_writer.close()
+    logging.warning('Training done!')
+
+
+if __name__ == '__main__':
+    np.random.seed(20200221)
+    torch.manual_seed(20200221)
+    main()
diff --git a/egs/aishell/s10b/local/aishell_data_prep.sh b/egs/aishell/s10b/local/aishell_data_prep.sh
new file mode 100755
index 00000000000..4747e4f4d82
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_data_prep.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <audio-path> <text-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires the audio directory and the transcript file to exist"
+  exit 1;
+fi
+
+# find wav audio files for train, dev and test respectively
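+# Each line of wav.flist is an absolute path to one wav file; the layout
+# below is assumed from the greps that follow, e.g. (hypothetical path):
+#   /export/a05/xna/data/data_aishell/wav/train/S0002/BAC009S0002W0122.wav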
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
diff --git a/egs/aishell/s10b/local/aishell_prepare_dict.sh b/egs/aishell/s10b/local/aishell_prepare_dict.sh
new file mode 100755
index 00000000000..c4cabb24de4
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: AISHELL dict preparation succeeded"
+exit 0;
diff --git a/egs/aishell/s10b/local/aishell_train_lms.sh b/egs/aishell/s10b/local/aishell_train_lms.sh
new file mode 100755
index 00000000000..9b6cdad2960
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_train_lms.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments. It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=`which train_lm.sh`
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of <UNK> as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf(" "); } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# Below are some commands for building a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+                   # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: the perplexity SRILM gives to the Kaldi-LM model is the same as
+# kaldi_lm reports above.
+# The difference in WSJ must have been due to different treatment of <UNK>.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
diff --git a/egs/aishell/s10b/local/convert_text_to_labels.py b/egs/aishell/s10b/local/convert_text_to_labels.py
new file mode 100755
index 00000000000..ed1527e1623
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# This program converts a transcript file `text` to labels
+# used in CTC training.
+#
+# For example, if we have
+#
+# the lexicon file `lexicon.txt`
+#
+# foo f o o
+# bar b a r
+#
+# the phone symbol table `tokens.txt`
+#
+# <eps> 0
+# <blk> 1
+# a 2
+# b 3
+# f 4
+# o 5
+# r 6
+#
+# and the transcript file `text`
+#
+# utt1 foo bar bar
+# utt2 bar
+#
+# Given the above three inputs, this program generates a
+# file `labels.ark` containing
+#
+# utt1 3 4 4 2 1 5 2 1 5
+# utt2 2 1 5
+#
+# where
+# - `3 4 4` is from `(4-1) (5-1) (5-1)`, which is from the indices of `f o o`
+# - `2 1 5` is from `(3-1) (2-1) (6-1)`, which is from the indices of `b a r`
+#
+# Note that 1 is subtracted here since `<eps>` exists only in FSTs
+# and the neural network considers index `0` as `<blk>`. Therefore, the
+# integer value of every symbol is shifted downwards by 1.
+
+import argparse
+import os
+
+import kaldi
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='''
+Convert transcript to labels.
+
+It takes the following inputs:
+
+- lexicon.txt, the lexicon file
+- tokens.txt, the phone symbol table
+- dir, a directory containing the transcript file `text`
+
+It generates `labels.scp` and `labels.ark` in the provided `dir`.
+
+Usage:
+    python3 ./local/convert_text_to_labels.py \
+        --lexicon-filename data/lang/lexicon.txt \
+        --tokens-filename data/lang/tokens.txt \
+        --dir data/train
+
+    It will generate data/train/labels.scp and data/train/labels.ark.
+    ''')
+
+    parser.add_argument('--lexicon-filename',
+                        dest='lexicon_filename',
+                        type=str,
+                        help='filename for lexicon.txt')
+
+    parser.add_argument('--tokens-filename',
+                        dest='tokens_filename',
+                        type=str,
+                        help='filename for the phone symbol table tokens.txt')
+
+    parser.add_argument('--dir',
+                        type=str,
+                        help='''the dir containing the transcript text;
+                        it will contain the generated labels.scp and labels.ark''')
+
+    args = parser.parse_args()
+
+    assert os.path.isfile(args.lexicon_filename)
+    assert os.path.isfile(args.tokens_filename)
+    assert os.path.isfile(os.path.join(args.dir, 'text'))
+
+    return args
+
+
+def read_lexicon(filename):
+    '''Read lexicon.txt and save it into a Python dict.
+
+    Args:
+        filename: filename of lexicon.txt.
+
+            Every line in lexicon.txt has the following format:
+
+                word phone1 phone2 phone3 ... phoneN
+
+            That is, fields are separated by spaces. The first
+            field is the word and the remaining fields are the
+            phones indicating the pronunciation of the word.
+
+    Returns:
+        a dict whose keys are words and values are phones.
+    '''
+    lexicon = dict()
+
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line contains:
+            #   word phone1 phone2 phone3 ... phoneN
+            word_phones = line.split()
+
+            # It should have at least two fields:
+            # the first one is the word and
+            # the second one is the pronunciation
+            assert len(word_phones) >= 2
+
+            word = word_phones[0]
+            phones = word_phones[1:]
+
+            if word not in lexicon:
+                # if there are multiple pronunciations for a word,
+                # we choose only the first one and drop other alternatives
+                lexicon[word] = phones
+
+    return lexicon
+
+
+def read_tokens(filename):
+    '''Read phone symbol table tokens.txt and save it into a Python dict.
+
+    Note that we remove the symbol `<eps>` and shift every symbol index
+    downwards by 1.
+
+    Args:
+        filename: filename of the phone symbol table tokens.txt.
+
+            Two integer values have specific meanings in the symbol
+            table. The first one is 0, which is reserved for `<eps>`.
+            And the second one is 1, which is reserved for the
+            blank symbol `<blk>`.
+            Other integer values do NOT have specific meanings.
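+
+            For example, with the `tokens.txt` shown at the top of
+            this file, the returned dict would be (note the removed
+            `<eps>` and the shift by 1):
+
+                {'<blk>': 0, 'a': 1, 'b': 2, 'f': 3, 'o': 4, 'r': 5}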
+
+    Returns:
+        a dict whose keys are phones and values are phone indices
+    '''
+    tokens = dict()
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line has the format: phone index
+            phone_index = line.split()
+
+            # it should have two fields:
+            # the first field is the phone
+            # and the second field is its index
+            assert len(phone_index) == 2
+
+            phone = phone_index[0]
+            index = int(phone_index[1])
+
+            if phone == '<eps>':
+                # <eps> appears only in the FSTs.
+                continue
+
+            # decreased by one since we removed <eps> above
+            # and every symbol index is shifted downwards by 1
+            index -= 1
+
+            assert phone not in tokens
+
+            tokens[phone] = index
+
+    assert '<blk>' in tokens
+
+    # WARNING(fangjun): we assume that the blank symbol has index 0
+    # in the neural network output.
+    # Do NOT confuse it with `<eps>` in the fst.
+    assert tokens['<blk>'] == 0
+
+    return tokens
+
+
+def read_text(filename):
+    '''Read transcript file `text` and save it into a Python dict.
+
+    Args:
+        filename: filename of the transcript file `text`.
+
+    Returns:
+        a dict whose keys are utterance IDs and values are texts
+    '''
+    transcript = dict()
+
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line has the format: uttid word1 word2 word3 ... wordN
+            uttid_text = line.split()
+
+            # it should have at least 2 fields:
+            # the first field is the utterance id;
+            # the remaining fields are the words of the utterance
+            assert len(uttid_text) >= 2
+
+            uttid = uttid_text[0]
+            text = uttid_text[1:]
+
+            assert uttid not in transcript
+            transcript[uttid] = text
+
+    return transcript
+
+
+def phones_to_indices(phone_list, tokens):
+    '''Convert a list of phones to a list of indices via a phone symbol table.
+
+    Args:
+        phone_list: a list of phones
+        tokens: a dict representing a phone symbol table.
+
+    Returns:
+        Return a list of indices corresponding to the given phones
+    '''
+    index_list = []
+
+    for phone in phone_list:
+        assert phone in tokens
+
+        index = tokens[phone]
+        index_list.append(index)
+
+    return index_list
+
+
+def main():
+    args = get_args()
+
+    lexicon = read_lexicon(args.lexicon_filename)
+
+    tokens = read_tokens(args.tokens_filename)
+
+    transcript = read_text(os.path.join(args.dir, 'text'))
+
+    transcript_labels = dict()
+
+    for uttid, text in transcript.items():
+        labels = []
+        for word in text:
+            # TODO(fangjun): add support for OOV.
+            phones = lexicon[word]
+
+            indices = phones_to_indices(phones, tokens)
+
+            labels.extend(indices)
+
+        assert uttid not in transcript_labels
+
+        transcript_labels[uttid] = labels
+
+    wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp'.format(
+        dir=args.dir)
+
+    writer = kaldi.IntVectorWriter(wspecifier)
+
+    for uttid, labels in transcript_labels.items():
+        writer.Write(uttid, labels)
+
+    writer.Close()
+
+    print('Generated label file {}/labels.scp successfully'.format(args.dir))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/local/convert_text_to_labels.sh b/egs/aishell/s10b/local/convert_text_to_labels.sh
new file mode 100755
index 00000000000..ba1cd823116
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+if [ $# != 2 ]; then
+  echo "usage: $0 <dir> <lang>"
+  exit 1
+fi
+
+dir=$1
+lang=$2
+
+[[ ! -f $dir/text ]] && echo "file $dir/text does not exist!" && exit 1
+
+for f in lexicon.txt tokens.txt; do
+  if [[ ! -f $lang/$f ]]; then
+    echo "file $lang/$f does not exist!"
+    exit 1
+  fi
+done
+
+python3 ./local/convert_text_to_labels.py \
+  --lexicon-filename $lang/lexicon.txt \
+  --tokens-filename $lang/tokens.txt \
+  --dir $dir
diff --git a/egs/aishell/s10b/local/convert_text_to_labels_test.py b/egs/aishell/s10b/local/convert_text_to_labels_test.py
new file mode 100644
index 00000000000..2fd86a91c7e
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels_test.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import kaldi
+
+
+def generate_test_lexicon(d):
+    s = 'foo f o o\n'
+    s += 'bar b a r\n'
+
+    filename = os.path.join(d, 'lexicon.txt')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+def generate_test_tokens(d):
+    s = '''<eps> 0
+<blk> 1
+a 2
+b 3
+f 4
+o 5
+r 6
+'''
+    filename = os.path.join(d, 'tokens.txt')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+def generate_test_text(d):
+    s = 'utt1 foo bar bar\n'
+    s += 'utt2 bar\n'
+
+    filename = os.path.join(d, 'text')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+class ConvertTextToLabelsTest(unittest.TestCase):
+
+    def test(self):
+        d = tempfile.mkdtemp()
+
+        generate_test_lexicon(d)
+        generate_test_tokens(d)
+        generate_test_text(d)
+
+        cmd = '''
+        python3 ./local/convert_text_to_labels.py \
+            --lexicon-filename {lexicon} \
+            --tokens-filename {tokens} \
+            --dir {dir}
+        '''.format(lexicon=os.path.join(d, 'lexicon.txt'),
+                   tokens=os.path.join(d, 'tokens.txt'),
+                   dir=d)
+
+        os.system(cmd)
+
+        rspecifier = 'scp:{}/labels.scp'.format(d)
+
+        reader = kaldi.SequentialIntVectorReader(rspecifier)
+
+        expected_labels = dict()
+        expected_labels['utt1'] = [3, 4, 4, 2, 1, 5, 2, 1, 5]
+        expected_labels['utt2'] = [2, 1, 5]
+
+        for key, value in reader:
+            self.assertTrue(key in expected_labels)
+            self.assertEqual(value, expected_labels[key])
+
+        reader.Close()
+
+        shutil.rmtree(d)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/egs/aishell/s10b/local/download_and_untar.sh b/egs/aishell/s10b/local/download_and_untar.sh
new file mode 100755
index 00000000000..58a278241d7
--- /dev/null
+++ b/egs/aishell/s10b/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+#           2017 Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+  exit 1;
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
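+# (presumably 15582913665 bytes is data_aishell.tgz and 1246920 bytes is
+# resource_aishell.tgz; the check below only tests whether an already
+# downloaded archive matches one of these sizes)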
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  cd $data
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/aishell/s10b/local/generate_tlg.sh b/egs/aishell/s10b/local/generate_tlg.sh
new file mode 100755
index 00000000000..ba3ee4c4046
--- /dev/null
+++ b/egs/aishell/s10b/local/generate_tlg.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# References:
+# - https://github.com/srvk/eesen/blob/master/asr_egs/wsj/utils/ctc_compile_dict_token.sh
+#
+# - EESEN: End-to-End Speech Recognition using Deep RNN Models and
+#   WFST-based Decoding (https://arxiv.org/pdf/1507.08240.pdf)
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+if [ $# != 3 ]; then
+  echo "usage: $0 <dict> <lm> <dir>"
+  exit 1
+fi
+
+. ./cmd.sh
+. ./path.sh
+
+dict=$1
+lm=$2
+dir=$3
+
+[ ! -f $dict ] && echo "$dict does not exist!" && exit 1
+[ ! -f $lm ] && echo "$lm does not exist!" && exit 1
+
+mkdir -p $dir
+
+cp $dict $dir/lexicon.txt
+
+cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' | sort | uniq > $dir/phones.list
+
+perl -ape 's/(\S+\s+)(.+)/${1}1.0 $2/;' < $dir/lexicon.txt > $dir/lexiconp.txt || exit 1
+
+ndisambig=$(utils/add_lex_disambig.pl $dir/lexiconp.txt $dir/lexiconp_disambig.txt)
+ndisambig=$[$ndisambig+1]
+
+for ((i=0; i<=$ndisambig; i++)); do
+  echo '#'$i
+done > $dir/disambig.list
+
+(
+  echo '<eps>'
+  echo '<blk>'
+) | cat - $dir/phones.list $dir/disambig.list | awk '{print $1, NR-1}' > $dir/tokens.txt
+
+if [[ ! -f $dir/T.fst ]]; then
+  local/token_to_fst.py --tokens-txt-filename $dir/tokens.txt |
+    fstcompile \
+      --isymbols=$dir/tokens.txt \
+      --osymbols=$dir/tokens.txt \
+      --keep_isymbols=false \
+      --keep_osymbols=false |
+    fstarcsort --sort_type=olabel > $dir/T.fst || exit 1
+fi
+
+cat $dir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
+  BEGIN {
+    print "<eps> 0";
+  }
+  {
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR + 1);
+  }' > $dir/words.txt || exit 1
+
+
+token_disambig_symbol=$(grep \#0 $dir/tokens.txt | awk '{print $2}')
+word_disambig_symbol=$(grep \#0 $dir/words.txt | awk '{print $2}')
+
+silprob=0
+silphone="sil"
+
+if [[ ! -f $dir/L.fst ]]; then
+  utils/make_lexicon_fst.pl \
+    --pron-probs $dir/lexiconp_disambig.txt $silprob $silphone '#'$ndisambig |
+    fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
+      --keep_isymbols=false --keep_osymbols=false |
+    fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" |
+    fstarcsort --sort_type=olabel > $dir/L.fst || exit 1
+fi
+
+
+if [[ ! -f $dir/G.fst ]]; then
+  gunzip -c $lm |
+    grep -v '<s> <s>' |
+    grep -v '</s> <s>' |
+    grep -v '</s> </s>' |
+    arpa2fst - |
+    fstprint |
+    utils/eps2disambig.pl |
+    utils/s2eps.pl |
+    fstcompile \
+      --isymbols=$dir/words.txt \
+      --osymbols=$dir/words.txt \
+      --keep_isymbols=false \
+      --keep_osymbols=false |
+    fstrmepsilon |
+    fstarcsort --sort_type=ilabel > $dir/G.fst
+fi
+
+set +e
+fstisstochastic $dir/G.fst
+set -e
+
+# The output is like:
+#   9.14233e-05 -0.259833
+# we do expect the first of these 2 numbers to be close to zero (the second is
+# nonzero because the backoff weights make the states sum to >1).
+
+if true; then
+  # Everything in this "if" statement is only for diagnostics.
+  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
+  # this might cause determinization failure of CLG.
+  # #0 is treated as an empty word.
+  mkdir -p $dir/tmpdir.g
+  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
+       END{print "0 0 #0 #0"; print "0";}' \
+    < "$dir/lexicon.txt" > $dir/tmpdir.g/select_empty.fst.txt
+
+  fstcompile --isymbols=$dir/words.txt --osymbols=$dir/words.txt \
+    $dir/tmpdir.g/select_empty.fst.txt \
+    | fstarcsort --sort_type=olabel \
+    | fstcompose - $dir/G.fst > $dir/tmpdir.g/empty_words.fst
+
+  fstinfo $dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \
+    && echo "Language model has cycles with empty words" && exit 1
+
+  rm -r $dir/tmpdir.g
+fi
+
+fsttablecompose $dir/L.fst $dir/G.fst |
+  fstdeterminizestar --use-log=true |
+  fstminimizeencoded |
+  fstarcsort --sort_type=ilabel > $dir/LG.fst || exit 1
+
+set +e
+fstisstochastic $dir/LG.fst
+set -e
+
+fsttablecompose $dir/T.fst $dir/LG.fst > $dir/TLG.fst || exit 1
+
+fstconvert --fst_type=const $dir/TLG.fst $dir/const_TLG.fst
+mv $dir/const_TLG.fst $dir/TLG.fst
+
+set +e
+fstisstochastic $dir/TLG.fst
+set -e
+
+# remove files not needed any more
+for f in G.fst L.fst T.fst LG.fst disambig.list \
+    lexiconp.txt lexiconp_disambig.txt; do
+  rm $dir/$f
+done
diff --git a/egs/aishell/s10b/local/latgen-faster.py b/egs/aishell/s10b/local/latgen-faster.py
new file mode 100755
index 00000000000..4dca8720177
--- /dev/null
+++ b/egs/aishell/s10b/local/latgen-faster.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+#
+#
+'''
+This file is adapted from src/bin/latgen-faster-mapped.cc.
+
+Note that there is no **mapped** in the filename since we
+do not use a transition model for mapping pdf ids to transition ids.
+
+Since this Python script is just a thin wrapper around the C++ code,
+there should not be any performance problem.
+
+You can write another `src/bin/latgen-faster.cc` if you are
+still worried about performance.
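+
+An illustrative invocation (the paths here are hypothetical; the actual
+options and specifiers used in this recipe come from local/run_ctc.sh):
+
+    ./local/latgen-faster.py \
+        --acoustic-scale=1.0 \
+        --allow-partial=true \
+        --word-symbol-table=data/lang/words.txt \
+        data/lang/TLG.fst \
+        scp:exp/ctc/inference/test/nnet_output.scp \
+        "ark:|gzip -c > exp/ctc/decode/test/lat.1.gz"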
+'''
+#
+#
+
+# TODO(fangjun): refer to src/bin/latgen-faster-mapped-parallel.cc to
+# implement latgen-faster-parallel.py
+
+import sys
+
+import kaldi
+from kaldi import fst
+
+
+def main():
+    usage = kaldi.StringArg('''\
+Generate lattices, reading log-likelihoods as matrices
+
+Usage: latgen-faster [options] fst-rxfilename loglikes-rspecifier \
+lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]
+''')
+
+    allow_partial = kaldi.BoolArg(False)
+    acoustic_scale = kaldi.FloatArg(0.1)
+    word_syms_filename = kaldi.StringArg()
+
+    config = kaldi.LatticeFasterDecoderConfig()
+
+    po = kaldi.ParseOptions(usage)
+
+    config.Register(po)
+
+    po.Register('acoustic-scale', acoustic_scale,
+                'Scaling factor for acoustic likelihoods')
+
+    po.Register('word-symbol-table', word_syms_filename,
+                'Symbol table for words [for debug output]')
+
+    po.Register('allow-partial', allow_partial,
+                'If true, produce output even if end state was not reached.')
+
+    po.Read(sys.argv)
+
+    if po.NumArgs() < 3 or po.NumArgs() > 5:
+        po.PrintUsage()
+        sys.exit(1)
+
+    fst_in_str = po.GetArg(1)
+    log_likes_rspecifier = po.GetArg(2)
+    lattice_wspecifier = po.GetArg(3)
+    words_wspecifier = po.GetOptArg(4)
+    alignment_wspecifier = po.GetOptArg(5)
+
+    determinize = config.determinize_lattice
+    compact_lattice_writer = kaldi.CompactLatticeWriter()
+    lattice_writer = kaldi.LatticeWriter()
+
+    if determinize:
+        assert compact_lattice_writer.Open(lattice_wspecifier)
+    else:
+        assert lattice_writer.Open(lattice_wspecifier)
+
+    words_writer = kaldi.IntVectorWriter(words_wspecifier)
+    alignments_writer = kaldi.IntVectorWriter(alignment_wspecifier)
+
+    word_syms = fst.SymbolTable()
+
+    if word_syms_filename:
+        word_syms = fst.SymbolTable.ReadText(word_syms_filename.value)
+
+    # TODO(fangjun): support a table of FSTs
+
+    tot_like = 0.0
+    frame_count = 0
+    num_success = 0
+    num_fail = 0
+
+    loglike_reader = kaldi.SequentialMatrixReader(log_likes_rspecifier)
+
+    # WARNING(fangjun): fst_in_str has to be a **const** fst.
+    # If it is a vector fst, you will get an error
+    # while creating the subsequent LatticeFasterDecoder.
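+    # (generate_tlg.sh in this recipe already runs
+    # `fstconvert --fst_type=const` on TLG.fst, so the graph read here
+    # should be a const fst; if you bring your own graph, convert it the
+    # same way first.)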
+    tlg_fst = fst.ReadFstKaldiGeneric(fst_in_str)
+
+    decoder = kaldi.LatticeFasterDecoder(tlg_fst, config)
+
+    trans_model = kaldi.TransitionModel()  # a dummy transition model
+
+    for key, value in loglike_reader:
+        if value.NumRows() == 0:
+            print('zero-length utterance: {}'.format(key))
+            num_fail += 1
+            continue
+
+        decodable = kaldi.DecodableMatrixScaled(likes=value,
+                                                scale=acoustic_scale.value)
+
+        is_succeeded, likelihood = kaldi.DecodeUtteranceLatticeFaster(
+            decoder=decoder,
+            decodable=decodable,
+            trans_model=trans_model,
+            word_syms=word_syms,
+            utt=key,
+            acoustic_scale=acoustic_scale.value,
+            determinize=determinize,
+            allow_partial=allow_partial.value,
+            alignments_writer=alignments_writer,
+            words_writer=words_writer,
+            compact_lattice_writer=compact_lattice_writer,
+            lattice_writer=lattice_writer)
+
+        if is_succeeded:
+            tot_like += likelihood
+            frame_count += value.NumRows()
+            num_success += 1
+        else:
+            num_fail += 1
+
+    print('Done {num_success} utterances, failed for {num_fail}'.format(
+        num_success=num_success, num_fail=num_fail))
+
+    print('Overall log-likelihood per frame is {} over {} frames'.format(
+        tot_like / frame_count, frame_count))
+
+    if num_success != 0:
+        sys.exit(0)
+    else:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/local/run_ctc.sh b/egs/aishell/s10b/local/run_ctc.sh
new file mode 100755
index 00000000000..d0f0c6fb029
--- /dev/null
+++ b/egs/aishell/s10b/local/run_ctc.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+stage=0
+nj=30
+
+export CUDA_VISIBLE_DEVICES="0"
+device_id=0
+
+train_data_dir=data/train_sp
+dev_data_dir=data/dev_sp
+test_data_dir=data/test
+lang_dir=data/lang
+
+lr=1e-3
+num_epochs=6
+l2_regularize=1e-5
+batch_size=64
+
+# WARNING(fangjun): You should know how to calculate your
+# model's left/right context **manually**
+model_left_context=29
+model_right_context=29
+
+hidden_dim=1024
+bottleneck_dim=128
+prefinal_bottleneck_dim=256
+kernel_size_list="3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3" # comma separated list
+subsampling_factor_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list
+
+log_level=info # valid values: debug, info, warning
+
+post_decode_acwt=1
+
+dir=exp/ctc
+
+. ./path.sh
+. ./cmd.sh
+
+. parse_options.sh
+
+feat_dim=$(feat-to-dim --print-args=false scp:$train_data_dir/feats.scp -)
+output_dim=$(cat $lang_dir/phones.list | wc -l)
+# add one since we have an extra blank symbol
+output_dim=$[$output_dim+1]
+
+pids=()
+function kill_trainer() { echo "kill training processes" && kill "${pids[@]}"; }
+
+if [[ $stage -le 0 ]]; then
+  mkdir -p $dir/train/tensorboard
+  train_checkpoint=
+  if [[ -f $dir/train/best_model.pt ]]; then
+    train_checkpoint=$dir/train/best_model.pt
+  fi
+
+  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+  if [[ $num_gpus -gt 1 ]]; then
+    echo "$0: training with ddp..."
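+    # (MASTER_ADDR and MASTER_PORT below are the rendezvous settings that
+    # torch.distributed reads for its default env:// initialization;
+    # presumably ddp_train.py relies on them when --train.use-ddp is true.)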
+    echo "$0: number of gpus: $num_gpus"
+
+    export MASTER_ADDR=localhost
+    export MASTER_PORT=6666
+
+    for ((i = 0; i < $num_gpus; ++i)); do
+      # sort options alphabetically
+      python3 ./ctc/ddp_train.py \
+        --batch-size $batch_size \
+        --checkpoint=${train_checkpoint:-} \
+        --device-id $i \
+        --dir $dir/train \
+        --feats-scp $train_data_dir/feats.scp \
+        --hidden-dim $hidden_dim \
+        --input-dim $feat_dim \
+        --is-training true \
+        --model-left-context $model_left_context \
+        --model-right-context $model_right_context \
+        --num-layers $num_layers \
+        --output-dim $output_dim \
+        --proj-dim $proj_dim \
+        --train.ddp.world-size $num_gpus \
+        --train.l2-regularize $l2_regularize \
+        --train.labels-scp $train_data_dir/labels.scp \
+        --train.lr $lr \
+        --train.num-epochs $num_epochs \
+        --train.use-ddp true &
+      pids+=("$!")
+    done
+    trap kill_trainer SIGINT SIGTERM
+    wait
+  else
+    echo "$0: training with single gpu..."
+    # sort options alphabetically
+    python3 ./ctc/train.py \
+      --batch-size $batch_size \
+      --bottleneck-dim $bottleneck_dim \
+      --checkpoint=${train_checkpoint:-} \
+      --device-id $device_id \
+      --dir $dir/train \
+      --feats-scp $train_data_dir/feats.scp \
+      --hidden-dim $hidden_dim \
+      --input-dim $feat_dim \
+      --is-training true \
+      --kernel-size-list "$kernel_size_list" \
+      --log-level $log_level \
+      --model-left-context $model_left_context \
+      --model-right-context $model_right_context \
+      --output-dim $output_dim \
+      --prefinal-bottleneck-dim $prefinal_bottleneck_dim \
+      --subsampling-factor-list "$subsampling_factor_list" \
+      --train.l2-regularize $l2_regularize \
+      --train.labels-scp $train_data_dir/labels.scp \
+      --train.lr $lr \
+      --train.num-epochs $num_epochs \
+      --train.use-ddp false
+  fi
+fi
+
+if [[ $stage -le 1 ]]; then
+  echo "$0: inference: computing likelihood"
+  mkdir -p $dir/inference
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    mkdir -p $dir/inference/$basename
+    if [[ -f $dir/inference/$basename/nnet_output.scp ]]; then
+      echo "$0: $dir/inference/$basename/nnet_output.scp already exists! Skip"
+    else
+      best_epoch=$(cat $dir/train/best-epoch-info | grep 'best epoch' | awk '{print $NF}')
+      [[ -z $best_epoch ]] && echo "$dir/train/best-epoch-info is not available!" && exit 1
+      inference_checkpoint=$dir/train/epoch-${best_epoch}.pt
+      echo "$0: using inference checkpoint: $inference_checkpoint"
+      # sort options alphabetically
+      python3 ./ctc/inference.py \
+        --batch-size $batch_size \
+        --bottleneck-dim $bottleneck_dim \
+        --checkpoint ${inference_checkpoint:-} \
+        --device-id $device_id \
+        --dir $dir/inference/$basename \
+        --feats-scp $x/feats.scp \
+        --hidden-dim $hidden_dim \
+        --input-dim $feat_dim \
+        --is-training false \
+        --kernel-size-list "$kernel_size_list" \
+        --log-level $log_level \
+        --model-left-context $model_left_context \
+        --model-right-context $model_right_context \
+        --output-dim $output_dim \
+        --prefinal-bottleneck-dim $prefinal_bottleneck_dim \
+        --subsampling-factor-list "$subsampling_factor_list"
+    fi
+  done
+fi
+
+if [[ $stage -le 2 ]]; then
+  echo "$0: decoding"
+  mkdir -p $dir/decode
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    mkdir -p $dir/decode/$basename
+
+    if [[ ! -f $dir/inference/$basename/nnet_output.scp ]]; then
+      echo "$0: $dir/inference/$basename/nnet_output.scp does not exist!"
+      echo "$0: Please run inference.py first"
+      exit 1
+    fi
+
+    echo "$0: decoding $x"
+
+    for i in $(seq $nj); do
+      utils/split_scp.pl -j $nj $[$i - 1] $dir/inference/$basename/nnet_output.scp $dir/decode/$basename/nnet_output.$i.scp
+    done
+
+    lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/decode/$basename/lat.JOB.gz"
+
+    # sort options alphabetically
+    $decode_cmd JOB=1:$nj $dir/decode/$basename/log/decode.JOB.log \
+      ./local/latgen-faster.py \
+        --acoustic-scale=1.0 \
+        --allow-partial=true \
+        --beam=17.0 \
+        --determinize-lattice=false \
+        --lattice-beam=8.0 \
+        --max-active=7000 \
+        --max-mem=200000000 \
+        --min-active=200 \
+        --minimize=false \
+        --word-symbol-table=$lang_dir/words.txt \
+        $lang_dir/TLG.fst \
+        scp:$dir/decode/$basename/nnet_output.JOB.scp \
+        "$lat_wspecifier"
+  done
+fi
+
+if [[ $stage -le 3 ]]; then
+  echo "$0: scoring"
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+
+    ./local/score.sh --cmd "$decode_cmd" \
+      $x \
+      $lang_dir \
+      $dir/decode/$basename || exit 1
+  done
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    head $dir/decode/$basename/scoring_kaldi/best_*
+  done
+fi
diff --git a/egs/aishell/s10b/local/score.sh b/egs/aishell/s10b/local/score.sh
new file mode 100755
index 00000000000..a9786169973
--- /dev/null
+++ b/egs/aishell/s10b/local/score.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/aishell/s10b/local/token_to_fst.py b/egs/aishell/s10b/local/token_to_fst.py
new file mode 100755
index 00000000000..66660f5d886
--- /dev/null
+++ b/egs/aishell/s10b/local/token_to_fst.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# This program takes as input a phone symbol table
+# `tokens.txt` and prints a text fst to the console.
+#
+# You can use `fstcompile` to convert the printed text fst
+# to a binary fst.
+#
+# Two integer values in the symbol table have particular meaning:
+# - 0 for `<eps>`
+# - 1 for the blank symbol `<blk>`
+
+import argparse
+import os
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='''
+Convert tokens.txt to tokens.fst.
+
+Usage:
+    python3 ./local/token_to_fst.py \
+        --tokens-txt-filename data/lang/tokens.txt |
+    fstcompile \
+        --isymbols=data/lang/tokens.txt \
+        --osymbols=data/lang/tokens.txt \
+        --keep_isymbols=false \
+        --keep_osymbols=false |
+    fstarcsort --sort_type=olabel > data/lang/T.fst || exit 1
+''')
+
+    parser.add_argument('--tokens-txt-filename',
+                        dest='tokens_txt_filename',
+                        help="a phone symbol table",
+                        type=str)
+
+    args = parser.parse_args()
+    assert os.path.isfile(args.tokens_txt_filename)
+
+    return args
+
+
+def main():
+    args = get_args()
+
+    s = '0 1 <eps> <eps>\n'
+    s += '1 1 <blk> <eps>\n'
+    s += '2 2 <blk> <eps>\n'
+    s += '2 0 <eps> <eps>\n'
+
+    next_state = 3
+    with open(args.tokens_txt_filename, 'r') as f:
+        for line in f:
+            phone_index = line.split()
+            assert len(phone_index) == 2
+            phone, _ = phone_index
+
+            if phone in ['<eps>', '<blk>']:
+                continue
+
+            if '#' in phone:
+                # disambiguation symbols become self-loops at the start state
+                s += '0 0 <eps> {}\n'.format(phone)
+                continue
+
+            s += '1 {next_state} {phone} {phone}\n'.format(
+                next_state=next_state, phone=phone)
+
+            s += '{next_state} {next_state} {phone} <eps>\n'.format(
+                next_state=next_state, phone=phone)
+
+            s += '{next_state} 2 <eps> <eps>\n'.format(next_state=next_state)
+
+            next_state += 1
+
+    s += '0'
+    print(s)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/path.sh b/egs/aishell/s10b/path.sh
new file mode 100755
index 00000000000..d3525eedd82
--- /dev/null
+++ b/egs/aishell/s10b/path.sh
@@ -0,0 +1,8 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
+export PYTHONPATH=$KALDI_ROOT/src/pybind:$PYTHONPATH
diff --git a/egs/aishell/s10b/run.sh b/egs/aishell/s10b/run.sh
new file mode 100755
index 00000000000..6ffcc2e4a5b
--- /dev/null
+++ b/egs/aishell/s10b/run.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+. ./cmd.sh
+. ./path.sh
+
+data=/home/fangjun/data/aishell
+data_url=www.openslr.org/resources/33
+
+nj=30
+
+stage=8
+
+if [[ $stage -le 0 ]]; then
+  local/download_and_untar.sh $data $data_url data_aishell || exit 1
+  local/download_and_untar.sh $data $data_url resource_aishell || exit 1
+fi
+
+if [[ $stage -le 1 ]]; then
+  local/aishell_prepare_dict.sh $data/resource_aishell || exit 1
+  # generated in data/local/dict
+fi
+
+if [[ $stage -le 2 ]]; then
+  local/aishell_data_prep.sh $data/data_aishell/wav \
+    $data/data_aishell/transcript || exit 1
+  # generated in data/{train,test,dev}/{spk2utt text utt2spk wav.scp}
+fi
+
+if [[ $stage -le 3 ]]; then
+  local/aishell_train_lms.sh || exit 1
+fi
+
+if [[ $stage -le 4 ]]; then
+  echo "$0: generating TLG.fst"
+  ./local/generate_tlg.sh \
+    data/local/dict/lexicon.txt \
+    data/local/lm/3gram-mincount/lm_unpruned.gz \
+    data/lang
+fi
+
+if [[ $stage -le 5 ]]; then
+  echo "$0: generating fbank features (40-dim)"
+
+  for x in train dev; do
+    utils/data/perturb_data_dir_speed_3way.sh data/$x data/${x}_sp
+  done
+
+  for x in train_sp dev_sp test; do
+    steps/make_fbank.sh --cmd "$train_cmd" --nj $nj data/$x || exit 1
+    steps/compute_cmvn_stats.sh data/$x || exit 1
+    utils/fix_data_dir.sh data/$x || exit 1
+  done
+fi
+
+if [[ $stage -le 6 ]]; then
+  echo "$0: convert text to labels"
+  for x in train_sp dev_sp test; do
+    ./local/convert_text_to_labels.sh data/$x data/lang
+  done
+fi
+
+# n=1024
+# # n=
+# if [[ $stage -le 7 ]]; then
+#   if true; then
+#     utils/subset_data_dir.sh data/train_sp $n data/train_sp$n || exit 1
+#     utils/subset_data_dir.sh data/dev_sp $n data/dev_sp$n || exit 1
+#   else
+#     utils/subset_data_dir.sh --first data/train_sp $n data/train_sp$n || exit 1
+#     utils/subset_data_dir.sh --first data/dev_sp $n data/dev_sp$n || exit 1
+#   fi
+#
+#   for x in train_sp dev_sp; do
+#     ./local/convert_text_to_labels.sh data/${x}$n data/lang
+#   done
# fi
+
+if [[ $stage -le 8 ]]; then
+  ./local/run_ctc.sh \
+    --train-data-dir data/train_sp$n \
+    --dev-data-dir data/dev_sp$n \
+    --test-data-dir data/test \
+    --lang-dir data/lang \
+    --nj $nj
+fi
diff --git a/egs/aishell/s10b/steps b/egs/aishell/s10b/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/aishell/s10b/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/aishell/s10b/utils b/egs/aishell/s10b/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/aishell/s10b/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/src/pybind/decoder/lattice_faster_decoder_pybind_test.py b/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
index 6842e2d7afa..010c2d77920 100755
--- a/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
+++ b/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
@@ -20,7 +20,7 @@ def test_lattice_faster_decoder_config(self):
         print(opts)
 
     def test_lattice_faster_decoder_config_parse_options(self):
-        usage = 'testing'
+        usage = kaldi.StringArg('testing')
         parse_options = kaldi.ParseOptions(usage)
         argv = [
             'a.out', '--print-args=false', '--beam=20', '--max-active=7000',
diff --git a/src/pybind/fst/symbol_table_pybind.cc b/src/pybind/fst/symbol_table_pybind.cc
index 96b351a6497..714f60233f6 100644
--- a/src/pybind/fst/symbol_table_pybind.cc
+++ b/src/pybind/fst/symbol_table_pybind.cc
@@ -59,21 +59,14 @@ void pybind_symbol_table(py::module& m) {
       .def(py::init<const std::string&>(),
           "Constructs symbol table with an optional name.",
           py::arg("name") = "")
-      .def_static("ReadText",
-                  overload_cast_<std::istream&, const std::string&,
-                                 const fst::SymbolTableTextOptions&>()(
-                      &PyClass::ReadText),
-                  "Reads a text representation of the symbol table from an "
-                  "istream. Pass a name to give the resulting SymbolTable.",
-                  py::arg("strm"), py::arg("name"),
-                  py::arg("opts") = fst::SymbolTableTextOptions())
       .def_static("ReadText",
                   overload_cast_<const std::string&,
                                  const fst::SymbolTableTextOptions&>()(
                       &PyClass::ReadText),
                   "Reads a text representation of the symbol table",
                   py::arg("filename"),
-                  py::arg("opts") = fst::SymbolTableTextOptions())
+                  py::arg("opts") = fst::SymbolTableTextOptions(),
+                  py::return_value_policy::take_ownership)
       .def_static(
           "Read", overload_cast_<std::istream&, const std::string&>()(
diff --git a/src/pybind/fstext/kaldi_fst_io_pybind.cc b/src/pybind/fstext/kaldi_fst_io_pybind.cc
index c7a9e7616f2..9beef523909 100644
--- a/src/pybind/fstext/kaldi_fst_io_pybind.cc
+++ b/src/pybind/fstext/kaldi_fst_io_pybind.cc
@@ -25,7 +25,7 @@ void pybind_kaldi_fst_io(py::module& m) {
         "Read a binary FST using Kaldi I/O mechanisms (pipes, etc.) On error, "
         "throws using KALDI_ERR. Note: this doesn't support the text-mode "
         "option that we generally like to support.",
-        py::arg("rxfilename"), py::return_value_policy::reference);
+        py::arg("rxfilename"), py::return_value_policy::take_ownership);
 
   m.def("ReadFstKaldiGeneric", fst::ReadFstKaldiGeneric,
         "Read a binary FST using Kaldi I/O mechanisms (pipes, etc.) If it "
        "can't read the FST, if throw_on_err == true it throws using KALDI_ERR; "
        "otherwise it prints a warning and returns. Return value may be a "
        "VectorFst (const-fst can give better performance for "
        "decoding).",
        py::arg("rxfilename"), py::arg("throw_on_err") = true,
-       py::return_value_policy::reference);
+       py::return_value_policy::take_ownership);
 
-  m.def("CastOrConvertToVectorFst", &fst::CastOrConvertToVectorFst,
-        "This function attempts to dynamic_cast the pointer 'fst' (which will "
-        "likely have been returned by ReadFstGeneric()), to the more derived "
-        "type VectorFst<StdArc>. If this succeeds, it returns the same "
-        "pointer; if it fails, it converts the FST type (by creating a new "
-        "VectorFst<StdArc> initialized by 'fst'), prints a warning, and "
-        "deletes 'fst'.",
-        py::arg("fst"), py::return_value_policy::reference);
-
-  m.def("ReadFstKaldi",
-        (void (*)(std::string, fst::StdVectorFst*)) & fst::ReadFstKaldi,
-        "Version of ReadFstKaldi() that writes to a pointer. Assumes the FST "
-        "is binary with no binary marker. Crashes on error.",
-        py::arg("rxfilename"), py::arg("ofst"));
+  // CastOrConvertToVectorFst may return either an existing pointer
+  // or a newly created one. There may be a memory leak
+  // if it is wrapped for Python.
 
   m.def("WriteFstKaldi",
        (void (*)(const fst::StdVectorFst&, std::string)) & fst::WriteFstKaldi,
        "Write an FST using Kaldi I/O mechanisms (pipes, etc.) On error, "
        "throws using KALDI_ERR. Note: this "
        "doesn't support the text-mode option.",
        py::arg("fst"), py::arg("wxfilename"));
 
-  m.def("WriteFstKaldi",
-        (void (*)(std::ostream&, bool, const fst::StdVectorFst&)) &
-            fst::WriteFstKaldi,
-        "This is a more general Kaldi-type-IO mechanism of writing FSTs to "
-        "streams, supporting binary or text-mode writing. (note: we just "
-        "write the integers, symbol tables are not supported). On error, "
-        "throws using KALDI_ERR.",
-        py::arg("os"), py::arg("binary"), py::arg("fst"));
-
-  m.def("ReadFstKaldi",
-        (void (*)(std::istream&, bool, fst::StdVectorFst*)) & fst::ReadFstKaldi,
-        "A generic Kaldi-type-IO mechanism of reading FSTs from streams, "
-        "supporting binary or text-mode reading/writing.",
-        py::arg("is"), py::arg("binary"), py::arg("fst"));
 
   m.def("ReadAndPrepareLmFst", &fst::ReadAndPrepareLmFst,
        "Read an FST file for LM (G.fst) and make it an acceptor, and make "
        "sure it is sorted on labels",
-       py::arg("rxfilename"), py::return_value_policy::reference);
+       py::arg("rxfilename"), py::return_value_policy::take_ownership);
 
   {
     // fangjun: it should be called StdVectorFstHolder to match the naming
diff --git a/src/pybind/tests/test_latgen_faster_mapped.py b/src/pybind/tests/test_latgen_faster_mapped.py
index 5b2b9315419..d9a0620ed98 100755
--- a/src/pybind/tests/test_latgen_faster_mapped.py
+++ b/src/pybind/tests/test_latgen_faster_mapped.py
@@ -28,9 +28,11 @@ class TestLatGenFasterMapped(unittest.TestCase):
 
     def test(self):
-        usage = 'Generate lattices, reading log-likelihoods as matrices\n'
-        ' (model is needed only for the integer mappings in its transition-model)\n'
-        po = kaldi.ParseOptions(usage)
+        usage = kaldi.StringArg(
+            'Generate lattices, reading log-likelihoods as matrices\n'
+            ' (model is needed only for the integer mappings in its transition-model)\n'
+        )
+        po = kaldi.ParseOptions(usage=usage)
 
         allow_partial = kaldi.BoolArg(False)
         acoustic_scale = kaldi.FloatArg(0.1)
@@ -39,7 +41,7 @@ def test(self):
         if not os.path.exists(
                 '../../../egs/aishell/s10/exp/chain/graph/HCLG.fst'):
             print('Please execute kaldi/egs/aishell/s10/run.sh first')
-            print('and souce path.sh in it before running this script')
+            print('and source path.sh in it before running this script')
             print('Or replace relevant files in this test with your own')
             print('Skip this test')
             return
diff --git a/src/pybind/util/parse_options_pybind.cc b/src/pybind/util/parse_options_pybind.cc
index a2764dc3b5f..3d03f106d06 100644
--- a/src/pybind/util/parse_options_pybind.cc
+++ b/src/pybind/util/parse_options_pybind.cc
@@ -75,7 +75,6 @@ void pybind_parse_options(py::module& m) {
   auto opt =
       py::class_<PyClass>(m, "ParseOptions")
-          .def(py::init<const char*>(), py::arg("usage"))
           .def("Read",
               [](PyClass* opts, const std::vector<std::string>& args) {
                 int argc = static_cast<int>(args.size());
@@ -130,4 +129,12 @@ void pybind_parse_options(py::module& m) {
   pybind_arg<float>(m, opt);
   pybind_arg<double>(m, opt);
   pybind_arg<std::string>(m, opt);
+
+  opt.def(py::init([](const Arg<std::string>& usage) {
+            // NOTE(fangjun): no memory leak here using `new`.
+            // Refer to
+            // https://pybind11.readthedocs.io/en/stable/upgrade.html#new-api-for-defining-custom-constructors-and-pickling-functions
+            return new PyClass(usage.value.c_str());
+          }),
+          py::arg("usage"));
 }
diff --git a/src/pybind/util/parse_options_pybind_test.py b/src/pybind/util/parse_options_pybind_test.py
index ff68d0774bb..55b04ae46b5 100755
--- a/src/pybind/util/parse_options_pybind_test.py
+++ b/src/pybind/util/parse_options_pybind_test.py
@@ -41,7 +41,8 @@ def test_parse_args(self):
         d = kaldi.DoubleArg()
         s = kaldi.StringArg()
 
-        parse_options = kaldi.ParseOptions(usage='test args')
+        usage = kaldi.StringArg('test args')
+        parse_options = kaldi.ParseOptions(usage=usage)
         parse_options.Register(name='b', arg=b, doc='bool args')
         parse_options.Register('i', i, 'int32 args')
        parse_options.Register('u', u, 'uint32 args')