From b3e6623d0cb21d628aa30bf947a88abb1a2e6c12 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 12 Feb 2020 09:03:00 +0800 Subject: [PATCH 1/6] show L2 norm of parameters during training. In addition, set affine to false for batchnorm layers and switch to SGD optimizer. --- egs/aishell/s10/chain/inference.py | 16 ++++++----- egs/aishell/s10/chain/model.py | 22 +++++++++------ egs/aishell/s10/chain/options.py | 7 +++++ egs/aishell/s10/chain/tdnnf_layer.py | 6 ++--- egs/aishell/s10/chain/train.py | 40 ++++++++++++++++++++-------- egs/aishell/s10/local/run_chain.sh | 3 +++ 6 files changed, 65 insertions(+), 29 deletions(-) diff --git a/egs/aishell/s10/chain/inference.py b/egs/aishell/s10/chain/inference.py index c8ef809ae61..e360038b204 100644 --- a/egs/aishell/s10/chain/inference.py +++ b/egs/aishell/s10/chain/inference.py @@ -30,13 +30,15 @@ def main(): else: device = torch.device('cuda', args.device_id) - model = get_chain_model(feat_dim=args.feat_dim, - output_dim=args.output_dim, - lda_mat_filename=args.lda_mat_filename, - hidden_dim=args.hidden_dim, - bottleneck_dim=args.bottleneck_dim, - time_stride_list=args.time_stride_list, - conv_stride_list=args.conv_stride_list) + model = get_chain_model( + feat_dim=args.feat_dim, + output_dim=args.output_dim, + lda_mat_filename=args.lda_mat_filename, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + time_stride_list=args.time_stride_list, + conv_stride_list=args.conv_stride_list) load_checkpoint(args.checkpoint, model) diff --git a/egs/aishell/s10/chain/model.py b/egs/aishell/s10/chain/model.py index 39d7acb765a..e360b4ab053 100644 --- a/egs/aishell/s10/chain/model.py +++ b/egs/aishell/s10/chain/model.py @@ -19,6 +19,7 @@ def get_chain_model(feat_dim, output_dim, hidden_dim, bottleneck_dim, + prefinal_bottleneck_dim, time_stride_list, conv_stride_list, lda_mat_filename=None): @@ -26,6 +27,8 @@ def get_chain_model(feat_dim, output_dim=output_dim, lda_mat_filename=lda_mat_filename, hidden_dim=hidden_dim, + bottleneck_dim=bottleneck_dim, + prefinal_bottleneck_dim=prefinal_bottleneck_dim, time_stride_list=time_stride_list, conv_stride_list=conv_stride_list) return model @@ -72,6 +75,7 @@ def __init__(self, lda_mat_filename=None, hidden_dim=1024, bottleneck_dim=128, + prefinal_bottleneck_dim=256, time_stride_list=[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1], conv_stride_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1], frame_subsampling_factor=3): @@ -88,7 +92,8 @@ def __init__(self, out_features=hidden_dim) # tdnn1_batchnorm requires [N, C, T] - self.tdnn1_batchnorm = nn.BatchNorm1d(num_features=hidden_dim) + self.tdnn1_batchnorm = nn.BatchNorm1d(num_features=hidden_dim, + affine=False) tdnnfs = [] for i in range(num_layers): @@ -104,23 +109,24 @@ def __init__(self, self.tdnnfs = nn.ModuleList(tdnnfs) # prefinal_l affine requires [N, C, T] - self.prefinal_l = OrthonormalLinear(dim=hidden_dim, - bottleneck_dim=bottleneck_dim * 2, - time_stride=0) + self.prefinal_l = OrthonormalLinear( + dim=hidden_dim, + bottleneck_dim=prefinal_bottleneck_dim, + time_stride=0) # prefinal_chain requires [N, C, T] self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim, - small_dim=bottleneck_dim * 2) + small_dim=prefinal_bottleneck_dim) # output_affine requires [N, T, C] - self.output_affine = nn.Linear(in_features=bottleneck_dim * 2, + self.output_affine = nn.Linear(in_features=prefinal_bottleneck_dim, out_features=output_dim) # prefinal_xent requires [N, C, T] self.prefinal_xent = 
PrefinalLayer(big_dim=hidden_dim, - small_dim=bottleneck_dim * 2) + small_dim=prefinal_bottleneck_dim) - self.output_xent_affine = nn.Linear(in_features=bottleneck_dim * 2, + self.output_xent_affine = nn.Linear(in_features=prefinal_bottleneck_dim, out_features=output_dim) if lda_mat_filename: diff --git a/egs/aishell/s10/chain/options.py b/egs/aishell/s10/chain/options.py index 5a6e04f9ba7..fae430a0519 100644 --- a/egs/aishell/s10/chain/options.py +++ b/egs/aishell/s10/chain/options.py @@ -130,6 +130,7 @@ def _check_args(args): assert args.output_dim > 0 assert args.hidden_dim > 0 assert args.bottleneck_dim > 0 + assert args.prefinal_bottleneck_dim > 0 assert args.time_stride_list is not None assert len(args.time_stride_list) > 0 @@ -202,6 +203,12 @@ def get_args(): required=True, type=int) + parser.add_argument('--prefinal-bottleneck-dim', + dest='prefinal_bottleneck_dim', + help='nn prefinal bottleneck dimension', + required=True, + type=int) + parser.add_argument('--time-stride-list', dest='time_stride_list', help='time stride list', diff --git a/egs/aishell/s10/chain/tdnnf_layer.py b/egs/aishell/s10/chain/tdnnf_layer.py index cf3c5a11862..72fbbd90463 100644 --- a/egs/aishell/s10/chain/tdnnf_layer.py +++ b/egs/aishell/s10/chain/tdnnf_layer.py @@ -113,11 +113,11 @@ class PrefinalLayer(nn.Module): def __init__(self, big_dim, small_dim): super().__init__() self.affine = nn.Linear(in_features=small_dim, out_features=big_dim) - self.batchnorm1 = nn.BatchNorm1d(num_features=big_dim) + self.batchnorm1 = nn.BatchNorm1d(num_features=big_dim, affine=False) self.linear = OrthonormalLinear(dim=big_dim, bottleneck_dim=small_dim, time_stride=0) - self.batchnorm2 = nn.BatchNorm1d(num_features=small_dim) + self.batchnorm2 = nn.BatchNorm1d(num_features=small_dim, affine=False) def forward(self, x): # x is [N, C, T] @@ -186,7 +186,7 @@ def __init__(self, stride=conv_stride) # batchnorm requires [N, C, T] - self.batchnorm = nn.BatchNorm1d(num_features=dim) + self.batchnorm = nn.BatchNorm1d(num_features=dim, affine=False) def forward(self, x): # input x is of shape: [batch_size, feat_dim, seq_len] = [N, C, T] diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py index 1f5c6824c97..72c58153510 100644 --- a/egs/aishell/s10/chain/train.py +++ b/egs/aishell/s10/chain/train.py @@ -110,6 +110,21 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion, objf_l2_term_weight[0].item() / objf_l2_term_weight[2].item(), batch_idx + current_epoch * len(dataloader)) + state_dict = model.state_dict() + for key, value in state_dict.items(): + # skip batchnorm parameters + if value.dtype != torch.float32: + continue + if 'running_mean' in key or 'running_var' in key: + continue + + with torch.no_grad(): + frobenius_norm = torch.norm(value, p='fro') + + tf_writer.add_scalar( + 'train/parameters/{}'.format(key), frobenius_norm, + batch_idx + current_epoch * len(dataloader)) + return total_objf / total_weight @@ -137,13 +152,15 @@ def main(): den_graph = chain.DenominatorGraph(fst=den_fst, num_pdfs=args.output_dim) - model = get_chain_model(feat_dim=args.feat_dim, - output_dim=args.output_dim, - lda_mat_filename=args.lda_mat_filename, - hidden_dim=args.hidden_dim, - bottleneck_dim=args.bottleneck_dim, - time_stride_list=args.time_stride_list, - conv_stride_list=args.conv_stride_list) + model = get_chain_model( + feat_dim=args.feat_dim, + output_dim=args.output_dim, + lda_mat_filename=args.lda_mat_filename, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + 
prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + time_stride_list=args.time_stride_list, + conv_stride_list=args.conv_stride_list) start_epoch = 0 num_epochs = args.num_epochs @@ -166,11 +183,12 @@ def main(): egs_left_context=args.egs_left_context, egs_right_context=args.egs_right_context) - optimizer = optim.Adam(model.parameters(), - lr=learning_rate, - weight_decay=args.l2_regularize) + optimizer = optim.SGD(model.parameters(), + lr=learning_rate, + momentum=0.9, + weight_decay=args.l2_regularize) - scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5) + scheduler = MultiStepLR(optimizer, milestones=[1, 3, 5], gamma=0.5) criterion = KaldiChainObjfFunction.apply tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh index 06b5d47e89f..81e4baea5b6 100755 --- a/egs/aishell/s10/local/run_chain.sh +++ b/egs/aishell/s10/local/run_chain.sh @@ -32,6 +32,7 @@ lr=1e-3 hidden_dim=1024 bottleneck_dim=128 +prefinal_bottleneck_dim=256 time_stride_list="1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list conv_stride_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list @@ -172,6 +173,7 @@ if [[ $stage -le 8 ]]; then --is-training true \ --log-level $log_level \ --output-dim $output_dim \ + --prefinal-bottleneck-dim $prefinal_bottleneck_dim \ --time-stride-list "$time_stride_list" \ --train.cegs-dir exp/chain/merged_egs \ --train.den-fst exp/chain/den.fst \ @@ -205,6 +207,7 @@ if [[ $stage -le 9 ]]; then --model-left-context $model_left_context \ --model-right-context $model_right_context \ --output-dim $output_dim \ + --prefinal-bottleneck-dim $prefinal_bottleneck_dim \ --save-as-compressed $save_nn_output_as_compressed \ --time-stride-list "$time_stride_list" || exit 1 fi From 926f7d1098c619c3874d0ce07a7091a4dace13ca Mon Sep 17 00:00:00 2001 From: fanlu Date: Wed, 12 Feb 2020 21:09:36 +0800 Subject: [PATCH 2/6] change stride kernel(3,1) to stride kernel(2,2) (#1) * change stride kernel(3,1) to stride kernel(2,2) * make subsampling readable * make model trainable --- egs/aishell/s10/chain/inference.py | 4 +- egs/aishell/s10/chain/model.py | 31 +++++++----- egs/aishell/s10/chain/options.py | 26 +++++----- egs/aishell/s10/chain/tdnnf_layer.py | 72 +++++++++++----------------- egs/aishell/s10/chain/train.py | 4 +- egs/aishell/s10/local/run_chain.sh | 12 ++--- 6 files changed, 70 insertions(+), 79 deletions(-) diff --git a/egs/aishell/s10/chain/inference.py b/egs/aishell/s10/chain/inference.py index e360038b204..a15de5ef5ec 100644 --- a/egs/aishell/s10/chain/inference.py +++ b/egs/aishell/s10/chain/inference.py @@ -37,8 +37,8 @@ def main(): hidden_dim=args.hidden_dim, bottleneck_dim=args.bottleneck_dim, prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, - time_stride_list=args.time_stride_list, - conv_stride_list=args.conv_stride_list) + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) load_checkpoint(args.checkpoint, model) diff --git a/egs/aishell/s10/chain/model.py b/egs/aishell/s10/chain/model.py index e360b4ab053..e05a45a2b38 100644 --- a/egs/aishell/s10/chain/model.py +++ b/egs/aishell/s10/chain/model.py @@ -20,8 +20,8 @@ def get_chain_model(feat_dim, hidden_dim, bottleneck_dim, prefinal_bottleneck_dim, - time_stride_list, - conv_stride_list, + kernel_size_list, + subsampling_factor_list, lda_mat_filename=None): model = ChainModel(feat_dim=feat_dim, output_dim=output_dim, @@ -29,8 +29,8 @@ def 
get_chain_model(feat_dim, hidden_dim=hidden_dim, bottleneck_dim=bottleneck_dim, prefinal_bottleneck_dim=prefinal_bottleneck_dim, - time_stride_list=time_stride_list, - conv_stride_list=conv_stride_list) + kernel_size_list=kernel_size_list, + subsampling_factor_list=subsampling_factor_list) return model @@ -76,16 +76,16 @@ def __init__(self, hidden_dim=1024, bottleneck_dim=128, prefinal_bottleneck_dim=256, - time_stride_list=[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1], - conv_stride_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1], + kernel_size_list=[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2], + subsampling_factor_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1], frame_subsampling_factor=3): super().__init__() # at present, we support only frame_subsampling_factor to be 3 assert frame_subsampling_factor == 3 - assert len(time_stride_list) == len(conv_stride_list) - num_layers = len(time_stride_list) + assert len(kernel_size_list) == len(subsampling_factor_list) + num_layers = len(kernel_size_list) # tdnn1_affine requires [N, T, C] self.tdnn1_affine = nn.Linear(in_features=feat_dim * 3, @@ -97,12 +97,12 @@ def __init__(self, tdnnfs = [] for i in range(num_layers): - time_stride = time_stride_list[i] - conv_stride = conv_stride_list[i] + kernel_size = kernel_size_list[i] + subsampling_factor = subsampling_factor_list[i] layer = FactorizedTDNN(dim=hidden_dim, bottleneck_dim=bottleneck_dim, - time_stride=time_stride, - conv_stride=conv_stride) + kernel_size=kernel_size, + subsampling_factor=subsampling_factor) tdnnfs.append(layer) # tdnnfs requires [N, C, T] @@ -112,7 +112,7 @@ def __init__(self, self.prefinal_l = OrthonormalLinear( dim=hidden_dim, bottleneck_dim=prefinal_bottleneck_dim, - time_stride=0) + kernel_size=1) # prefinal_chain requires [N, C, T] self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim, @@ -217,9 +217,14 @@ def constrain_orthonormal(self): if __name__ == '__main__': + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) feat_dim = 43 output_dim = 4344 model = ChainModel(feat_dim=feat_dim, output_dim=output_dim) + logging.info(model) N = 1 T = 150 + 27 + 27 C = feat_dim * 3 diff --git a/egs/aishell/s10/chain/options.py b/egs/aishell/s10/chain/options.py index fae430a0519..98246c436df 100644 --- a/egs/aishell/s10/chain/options.py +++ b/egs/aishell/s10/chain/options.py @@ -132,17 +132,17 @@ def _check_args(args): assert args.bottleneck_dim > 0 assert args.prefinal_bottleneck_dim > 0 - assert args.time_stride_list is not None - assert len(args.time_stride_list) > 0 + assert args.kernel_size_list is not None + assert len(args.kernel_size_list) > 0 - assert args.conv_stride_list is not None - assert len(args.conv_stride_list) > 0 + assert args.subsampling_factor_list is not None + assert len(args.subsampling_factor_list) > 0 - args.time_stride_list = [int(k) for k in args.time_stride_list.split(', ')] + args.kernel_size_list = [int(k) for k in args.kernel_size_list.split(', ')] - args.conv_stride_list = [int(k) for k in args.conv_stride_list.split(', ')] + args.subsampling_factor_list = [int(k) for k in args.subsampling_factor_list.split(', ')] - assert len(args.time_stride_list) == len(args.conv_stride_list) + assert len(args.kernel_size_list) == len(args.subsampling_factor_list) assert args.log_level in ['debug', 'info', 'warning'] @@ -209,15 +209,15 @@ def get_args(): required=True, type=int) - parser.add_argument('--time-stride-list', - dest='time_stride_list', - help='time stride list', + 
parser.add_argument('--kernel-size-list', + dest='kernel_size_list', + help='kernel_size_list', required=True, type=str) - parser.add_argument('--conv-stride-list', - dest='conv_stride_list', - help='conv stride list', + parser.add_argument('--subsampling-factor-list', + dest='subsampling_factor_list', + help='subsampling_factor_list', required=True, type=str) diff --git a/egs/aishell/s10/chain/tdnnf_layer.py b/egs/aishell/s10/chain/tdnnf_layer.py index 72fbbd90463..262f89e86fd 100644 --- a/egs/aishell/s10/chain/tdnnf_layer.py +++ b/egs/aishell/s10/chain/tdnnf_layer.py @@ -2,7 +2,7 @@ # Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) # Apache 2.0 - +import logging import torch import torch.nn as nn import torch.nn.functional as F @@ -53,18 +53,12 @@ def _constrain_orthonormal_internal(M): class OrthonormalLinear(nn.Module): - def __init__(self, dim, bottleneck_dim, time_stride): + def __init__(self, dim, bottleneck_dim, kernel_size): super().__init__() - assert time_stride in [0, 1] # WARNING(fangjun): kaldi uses [-1, 0] for the first linear layer # and [0, 1] for the second affine layer; # we use [-1, 0, 1] for the first linear layer if time_stride == 1 - if time_stride == 0: - kernel_size = 1 - else: - kernel_size = 3 - self.kernel_size = kernel_size # conv requires [N, C, T] @@ -116,7 +110,7 @@ def __init__(self, big_dim, small_dim): self.batchnorm1 = nn.BatchNorm1d(num_features=big_dim, affine=False) self.linear = OrthonormalLinear(dim=big_dim, bottleneck_dim=small_dim, - time_stride=0) + kernel_size=1) self.batchnorm2 = nn.BatchNorm1d(num_features=small_dim, affine=False) def forward(self, x): @@ -160,30 +154,29 @@ class FactorizedTDNN(nn.Module): def __init__(self, dim, bottleneck_dim, - time_stride, - conv_stride, + kernel_size, + subsampling_factor, bypass_scale=0.66): super().__init__() - assert conv_stride in [1, 3] assert abs(bypass_scale) <= 1 self.bypass_scale = bypass_scale - self.conv_stride = conv_stride + self.s = subsampling_factor # linear requires [N, C, T] self.linear = OrthonormalLinear(dim=dim, bottleneck_dim=bottleneck_dim, - time_stride=time_stride) + kernel_size=kernel_size) # affine requires [N, C, T] # WARNING(fangjun): we do not use nn.Linear here # since we want to use `stride` self.affine = nn.Conv1d(in_channels=bottleneck_dim, out_channels=dim, - kernel_size=1, - stride=conv_stride) + kernel_size=kernel_size, + stride=subsampling_factor) # batchnorm requires [N, C, T] self.batchnorm = nn.BatchNorm1d(num_features=dim, affine=False) @@ -194,13 +187,13 @@ def forward(self, x): # save it for skip connection input_x = x - + logging.debug(f"input_x shape is {input_x.shape}") x = self.linear(x) - + logging.debug(f"x shape after linear is {x.shape}") # at this point, x is [N, C, T] x = self.affine(x) - + logging.debug(f"x shape after affine is {x.shape}") # at this point, x is [N, C, T] x = F.relu(x) @@ -213,10 +206,10 @@ def forward(self, x): # TODO(fangjun): implement GeneralDropoutComponent in PyTorch - if self.linear.kernel_size == 3: - x = self.bypass_scale * input_x[:, :, 1:-1:self.conv_stride] + x + if self.linear.kernel_size == 2: + x = self.bypass_scale * input_x[:, :, self.s:-self.s:self.s] + x else: - x = self.bypass_scale * input_x[:, :, ::self.conv_stride] + x + x = self.bypass_scale * input_x[:, :, ::self.s] + x return x def constrain_orthonormal(self): @@ -257,8 +250,8 @@ def compute_loss(M): model = FactorizedTDNN(dim=1024, bottleneck_dim=128, - time_stride=1, - conv_stride=3) + kernel_size=2, + subsampling_factor=1) loss = [] 
model.constrain_orthonormal() loss.append( @@ -279,38 +272,31 @@ def _test_factorized_tdnn(): T = 10 C = 4 - # case 0: time_stride == 1, conv_stride == 1 + # case 0: kernel_size == 1, subsampling_factor == 1 model = FactorizedTDNN(dim=C, bottleneck_dim=2, - time_stride=1, - conv_stride=1) + kernel_size=1, + subsampling_factor=1) x = torch.arange(N * T * C).reshape(N, C, T).float() y = model(x) - assert y.size(2) == T - 2 - - # case 1: time_stride == 0, conv_stride == 1 - model = FactorizedTDNN(dim=C, - bottleneck_dim=2, - time_stride=0, - conv_stride=1) - y = model(x) assert y.size(2) == T - # case 2: time_stride == 1, conv_stride == 3 + # case 1: kernel_size == 2, subsampling_factor == 1 model = FactorizedTDNN(dim=C, bottleneck_dim=2, - time_stride=1, - conv_stride=3) + kernel_size=2, + subsampling_factor=1) y = model(x) - assert y.size(2) == math.ceil((T - 2) / 3) + assert y.size(2) == T - 2 - # case 3: time_stride == 0, conv_stride == 3 + # case 2: kernel_size == 1, subsampling_factor == 3 model = FactorizedTDNN(dim=C, bottleneck_dim=2, - time_stride=0, - conv_stride=3) + kernel_size=1, + subsampling_factor=3) y = model(x) - assert y.size(2) == math.ceil(T / 3) + assert y.size(2) == math.ceil(math.ceil((T - 3)) - 3) + if __name__ == '__main__': diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py index 72c58153510..034fde00b9c 100644 --- a/egs/aishell/s10/chain/train.py +++ b/egs/aishell/s10/chain/train.py @@ -159,8 +159,8 @@ def main(): hidden_dim=args.hidden_dim, bottleneck_dim=args.bottleneck_dim, prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, - time_stride_list=args.time_stride_list, - conv_stride_list=args.conv_stride_list) + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) start_epoch = 0 num_epochs = args.num_epochs diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh index 81e4baea5b6..5be698139b0 100755 --- a/egs/aishell/s10/local/run_chain.sh +++ b/egs/aishell/s10/local/run_chain.sh @@ -33,8 +33,8 @@ lr=1e-3 hidden_dim=1024 bottleneck_dim=128 prefinal_bottleneck_dim=256 -time_stride_list="1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list -conv_stride_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list +kernel_size_list="2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2" # comma separated list +subsampling_factor_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list log_level=info # valid values: debug, info, warning @@ -165,16 +165,16 @@ if [[ $stage -le 8 ]]; then python3 ./chain/train.py \ --bottleneck-dim $bottleneck_dim \ --checkpoint=${train_checkpoint:-} \ - --conv-stride-list "$conv_stride_list" \ --device-id $device_id \ --dir exp/chain/train \ --feat-dim $feat_dim \ --hidden-dim $hidden_dim \ --is-training true \ + --kernel-size-list "$kernel_size_list" \ --log-level $log_level \ --output-dim $output_dim \ --prefinal-bottleneck-dim $prefinal_bottleneck_dim \ - --time-stride-list "$time_stride_list" \ + --subsampling-factor-list "$subsampling_factor_list" \ --train.cegs-dir exp/chain/merged_egs \ --train.den-fst exp/chain/den.fst \ --train.egs-left-context $egs_left_context \ @@ -196,20 +196,20 @@ if [[ $stage -le 9 ]]; then python3 ./chain/inference.py \ --bottleneck-dim $bottleneck_dim \ --checkpoint $inference_checkpoint \ - --conv-stride-list "$conv_stride_list" \ --device-id $device_id \ --dir exp/chain/inference/$x \ --feat-dim $feat_dim \ --feats-scp data/mfcc_hires/$x/feats.scp \ --hidden-dim $hidden_dim \ --is-training false \ + 
--kernel-size-list "$kernel_size_list" \ --log-level $log_level \ --model-left-context $model_left_context \ --model-right-context $model_right_context \ --output-dim $output_dim \ --prefinal-bottleneck-dim $prefinal_bottleneck_dim \ --save-as-compressed $save_nn_output_as_compressed \ - --time-stride-list "$time_stride_list" || exit 1 + --subsampling-factor-list "$subsampling_factor_list" || exit 1 fi done fi From 55e97503922622216f614495fcedae0a8c02c2c8 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Feb 2020 08:07:10 +0800 Subject: [PATCH 3/6] switch to adam and set affine to false for the input batchnorm layer. --- egs/aishell/s10/chain/model.py | 9 +++++---- egs/aishell/s10/chain/tdnnf_layer.py | 5 +---- egs/aishell/s10/chain/train.py | 9 ++++----- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/egs/aishell/s10/chain/model.py b/egs/aishell/s10/chain/model.py index e05a45a2b38..ff27b2d9cd8 100644 --- a/egs/aishell/s10/chain/model.py +++ b/egs/aishell/s10/chain/model.py @@ -136,7 +136,8 @@ def __init__(self, self.has_LDA = True else: logging.info('replace LDA with BatchNorm') - self.input_batch_norm = nn.BatchNorm1d(num_features=feat_dim * 3) + self.input_batch_norm = nn.BatchNorm1d(num_features=feat_dim * 3, + affine=False) self.has_LDA = False def forward(self, x): @@ -218,9 +219,9 @@ def constrain_orthonormal(self): if __name__ == '__main__': logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) feat_dim = 43 output_dim = 4344 model = ChainModel(feat_dim=feat_dim, output_dim=output_dim) diff --git a/egs/aishell/s10/chain/tdnnf_layer.py b/egs/aishell/s10/chain/tdnnf_layer.py index 262f89e86fd..1efd6bc8e2e 100644 --- a/egs/aishell/s10/chain/tdnnf_layer.py +++ b/egs/aishell/s10/chain/tdnnf_layer.py @@ -187,13 +187,11 @@ def forward(self, x): # save it for skip connection input_x = x - logging.debug(f"input_x shape is {input_x.shape}") + x = self.linear(x) - logging.debug(f"x shape after linear is {x.shape}") # at this point, x is [N, C, T] x = self.affine(x) - logging.debug(f"x shape after affine is {x.shape}") # at this point, x is [N, C, T] x = F.relu(x) @@ -298,7 +296,6 @@ def _test_factorized_tdnn(): assert y.size(2) == math.ceil(math.ceil((T - 3)) - 3) - if __name__ == '__main__': torch.manual_seed(20200130) _test_factorized_tdnn() diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py index 034fde00b9c..47a9f48b106 100644 --- a/egs/aishell/s10/chain/train.py +++ b/egs/aishell/s10/chain/train.py @@ -183,12 +183,11 @@ def main(): egs_left_context=args.egs_left_context, egs_right_context=args.egs_right_context) - optimizer = optim.SGD(model.parameters(), - lr=learning_rate, - momentum=0.9, - weight_decay=args.l2_regularize) + optimizer = optim.Adam(model.parameters(), + lr=learning_rate, + weight_decay=args.l2_regularize) - scheduler = MultiStepLR(optimizer, milestones=[1, 3, 5], gamma=0.5) + scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5) criterion = KaldiChainObjfFunction.apply tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) From 8c3234981b79966cb5b5536baaeb35ad8e0f872a Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Feb 2020 09:56:53 +0800 Subject: [PATCH 4/6] enable shuffle in dataloader. 
--- egs/aishell/s10/chain/egs_dataset.py | 6 +++++- egs/aishell/s10/chain/train.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/egs/aishell/s10/chain/egs_dataset.py b/egs/aishell/s10/chain/egs_dataset.py index 6bd36cf7cb2..98afa8fca26 100755 --- a/egs/aishell/s10/chain/egs_dataset.py +++ b/egs/aishell/s10/chain/egs_dataset.py @@ -17,7 +17,10 @@ from common import splice_feats -def get_egs_dataloader(egs_dir, egs_left_context, egs_right_context): +def get_egs_dataloader(egs_dir, + egs_left_context, + egs_right_context, + shuffle=True): dataset = NnetChainExampleDataset(egs_dir=egs_dir) frame_subsampling_factor = 3 @@ -32,6 +35,7 @@ def get_egs_dataloader(egs_dir, egs_left_context, egs_right_context): dataloader = DataLoader(dataset, batch_size=batch_size, + shuffle=shuffle, num_workers=0, collate_fn=collate_fn) return dataloader diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py index 47a9f48b106..8abb988943c 100644 --- a/egs/aishell/s10/chain/train.py +++ b/egs/aishell/s10/chain/train.py @@ -181,7 +181,8 @@ def main(): dataloader = get_egs_dataloader(egs_dir=args.cegs_dir, egs_left_context=args.egs_left_context, - egs_right_context=args.egs_right_context) + egs_right_context=args.egs_right_context, + shuffle=True) optimizer = optim.Adam(model.parameters(), lr=learning_rate, From 793f57281ca26af9db0c901215716065580f6c24 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Feb 2020 16:08:57 +0800 Subject: [PATCH 5/6] compute validation objf and switch back to [-1, 0, 1] for the linear layer. --- egs/aishell/s10/chain/egs_dataset.py | 18 +++-- egs/aishell/s10/chain/options.py | 24 +++++- egs/aishell/s10/chain/tdnnf_layer.py | 10 +-- egs/aishell/s10/chain/train.py | 106 ++++++++++++++++++++++++--- egs/aishell/s10/local/run_chain.sh | 9 ++- 5 files changed, 139 insertions(+), 28 deletions(-) diff --git a/egs/aishell/s10/chain/egs_dataset.py b/egs/aishell/s10/chain/egs_dataset.py index 98afa8fca26..e081880a3c4 100755 --- a/egs/aishell/s10/chain/egs_dataset.py +++ b/egs/aishell/s10/chain/egs_dataset.py @@ -4,6 +4,7 @@ # Apache 2.0 import glob +import os import numpy as np import torch @@ -17,12 +18,12 @@ from common import splice_feats -def get_egs_dataloader(egs_dir, +def get_egs_dataloader(egs_dir_or_scp, egs_left_context, egs_right_context, shuffle=True): - dataset = NnetChainExampleDataset(egs_dir=egs_dir) + dataset = NnetChainExampleDataset(egs_dir_or_scp=egs_dir_or_scp) frame_subsampling_factor = 3 # we have merged egs offline, so batch size is 1 @@ -54,11 +55,16 @@ def read_nnet_chain_example(rxfilename): class NnetChainExampleDataset(Dataset): - def __init__(self, egs_dir): + def __init__(self, egs_dir_or_scp): ''' - We assume that there exist many cegs.*.scp files inside egs_dir + If egs_dir_or_scp is a directory, we assume that there exist many cegs.*.scp files + inside egs_dir_or_scp. 
''' - self.scps = glob.glob('{}/cegs.*.scp'.format(egs_dir)) + if os.path.isdir(egs_dir_or_scp): + self.scps = glob.glob('{}/cegs.*.scp'.format(egs_dir_or_scp)) + else: + self.scps = [egs_dir_or_scp] + assert len(self.scps) > 0 self.items = list() for scp in self.scps: @@ -175,7 +181,7 @@ def __call__(self, batch): def _test_nnet_chain_example_dataset(): egs_dir = 'exp/chain/merged_egs' - dataset = NnetChainExampleDataset(egs_dir=egs_dir) + dataset = NnetChainExampleDataset(egs_dir_or_scp=egs_dir) egs_left_context = 29 egs_right_context = 29 frame_subsampling_factor = 3 diff --git a/egs/aishell/s10/chain/options.py b/egs/aishell/s10/chain/options.py index 98246c436df..4cc5fa59168 100644 --- a/egs/aishell/s10/chain/options.py +++ b/egs/aishell/s10/chain/options.py @@ -54,6 +54,11 @@ def _set_training_args(parser): help='cegs dir containing comibined cegs.*.scp', type=str) + parser.add_argument('--train.valid-cegs-scp', + dest='valid_cegs_scp', + help='validation cegs scp', + type=str) + parser.add_argument('--train.den-fst', dest='den_fst_filename', help='denominator fst filename', @@ -84,9 +89,20 @@ def _set_training_args(parser): help='l2 regularize', type=float) + parser.add_argument('--train.xent-regularize', + dest='xent_regularize', + help='xent regularize', + type=float) + + parser.add_argument('--train.leaky-hmm-coefficient', + dest='leaky_hmm_coefficient', + help='leaky hmm coefficient', + type=float) + def _check_training_args(args): assert os.path.isdir(args.cegs_dir) + assert os.path.isfile(args.valid_cegs_scp) assert os.path.isfile(args.den_fst_filename) @@ -95,7 +111,9 @@ def _check_training_args(args): assert args.num_epochs > 0 assert args.learning_rate > 0 - assert args.l2_regularize > 0 + assert args.l2_regularize >= 0 + assert args.xent_regularize >= 0 + assert args.leaky_hmm_coefficient >= 0 if args.checkpoint: assert os.path.exists(args.checkpoint) @@ -140,7 +158,9 @@ def _check_args(args): args.kernel_size_list = [int(k) for k in args.kernel_size_list.split(', ')] - args.subsampling_factor_list = [int(k) for k in args.subsampling_factor_list.split(', ')] + args.subsampling_factor_list = [ + int(k) for k in args.subsampling_factor_list.split(', ') + ] assert len(args.kernel_size_list) == len(args.subsampling_factor_list) diff --git a/egs/aishell/s10/chain/tdnnf_layer.py b/egs/aishell/s10/chain/tdnnf_layer.py index 1efd6bc8e2e..bcc5a5f6e56 100644 --- a/egs/aishell/s10/chain/tdnnf_layer.py +++ b/egs/aishell/s10/chain/tdnnf_layer.py @@ -175,7 +175,7 @@ def __init__(self, # since we want to use `stride` self.affine = nn.Conv1d(in_channels=bottleneck_dim, out_channels=dim, - kernel_size=kernel_size, + kernel_size=1, stride=subsampling_factor) # batchnorm requires [N, C, T] @@ -204,7 +204,7 @@ def forward(self, x): # TODO(fangjun): implement GeneralDropoutComponent in PyTorch - if self.linear.kernel_size == 2: + if self.linear.kernel_size == 3: x = self.bypass_scale * input_x[:, :, self.s:-self.s:self.s] + x else: x = self.bypass_scale * input_x[:, :, ::self.s] + x @@ -248,7 +248,7 @@ def compute_loss(M): model = FactorizedTDNN(dim=1024, bottleneck_dim=128, - kernel_size=2, + kernel_size=3, subsampling_factor=1) loss = [] model.constrain_orthonormal() @@ -279,10 +279,10 @@ def _test_factorized_tdnn(): y = model(x) assert y.size(2) == T - # case 1: kernel_size == 2, subsampling_factor == 1 + # case 1: kernel_size == 3, subsampling_factor == 1 model = FactorizedTDNN(dim=C, bottleneck_dim=2, - kernel_size=2, + kernel_size=3, subsampling_factor=1) y = model(x) assert y.size(2) == 
T - 2 diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py index 8abb988943c..84bcb4f12b8 100644 --- a/egs/aishell/s10/chain/train.py +++ b/egs/aishell/s10/chain/train.py @@ -15,7 +15,6 @@ import torch import torch.optim as optim from torch.nn.utils import clip_grad_value_ -from torch.optim.lr_scheduler import MultiStepLR from torch.utils.tensorboard import SummaryWriter import kaldi @@ -32,8 +31,63 @@ from options import get_args -def train_one_epoch(dataloader, model, device, optimizer, criterion, - current_epoch, opts, den_graph, tf_writer): +def get_validation_objf(dataloader, model, device, criterion, opts, den_graph): + total_objf = 0. + total_weight = 0. + total_frames = 0. # for display only + + model.eval() + + for batch_idx, batch in enumerate(dataloader): + key_list, feature_list, supervision_list = batch + + assert len(key_list) == len(feature_list) == len(supervision_list) + batch_size = len(key_list) + + for n in range(batch_size): + feats = feature_list[n] + assert feats.ndim == 3 + + # at this point, feats is [N, T, C] + feats = feats.to(device) + + with torch.no_grad(): + nnet_output, xent_output = model(feats) + + # at this point, nnet_output is: [N, T, C] + # refer to kaldi/src/chain/chain-training.h + # the output should be organized as + # [all sequences for frame 0] + # [all sequences for frame 1] + # [etc.] + nnet_output = nnet_output.permute(1, 0, 2) + # at this point, nnet_output is: [T, N, C] + nnet_output = nnet_output.contiguous().view(-1, + nnet_output.shape[-1]) + + # at this point, xent_output is: [N, T, C] + xent_output = xent_output.permute(1, 0, 2) + # at this point, xent_output is: [T, N, C] + xent_output = xent_output.contiguous().view(-1, + xent_output.shape[-1]) + objf_l2_term_weight = criterion(opts, den_graph, + supervision_list[n], nnet_output, + xent_output) + objf = objf_l2_term_weight[0] + + objf_l2_term_weight = objf_l2_term_weight.cpu() + + total_objf += objf_l2_term_weight[0].item() + total_weight += objf_l2_term_weight[2].item() + + num_frames = nnet_output.shape[0] + total_frames += num_frames + + return total_objf, total_weight, total_frames + + +def train_one_epoch(dataloader, valid_dataloader, model, device, optimizer, + criterion, current_epoch, opts, den_graph, tf_writer): model.train() total_objf = 0. @@ -75,8 +129,8 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion, optimizer.zero_grad() objf.backward() - # TODO(fangjun): how to choose this value or do we need this ? 
clip_grad_value_(model.parameters(), 5.0) + optimizer.step() objf_l2_term_weight = objf_l2_term_weight.detach().cpu() @@ -101,6 +155,25 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion, objf_l2_term_weight[0].item() / objf_l2_term_weight[2].item(), num_frames, current_epoch)) + if batch_idx % 500 == 0: + total_valid_objf, total_valid_weight, total_valid_frames = get_validation_objf( + dataloader=valid_dataloader, + model=model, + device=device, + criterion=criterion, + opts=opts, + den_graph=den_graph) + + model.train() + + logging.info( + 'Validation average objf: {:.6f} over {} frames'.format( + total_valid_objf / total_valid_weight, total_valid_frames)) + + tf_writer.add_scalar('train/global_valid_average_objf', + total_valid_objf / total_valid_weight, + batch_idx + current_epoch * len(dataloader)) + if batch_idx % 100 == 0: tf_writer.add_scalar('train/global_average_objf', total_objf / total_weight, @@ -145,10 +218,10 @@ def main(): den_fst = fst.StdVectorFst.Read(args.den_fst_filename) - # TODO(fangjun): pass these options from commandline opts = chain.ChainTrainingOptions() - opts.l2_regularize = 5e-4 - opts.leaky_hmm_coefficient = 0.1 + opts.l2_regularize = args.l2_regularize + opts.xent_regularize = args.xent_regularize + opts.leaky_hmm_coefficient = args.leaky_hmm_coefficient den_graph = chain.DenominatorGraph(fst=den_fst, num_pdfs=args.output_dim) @@ -179,16 +252,21 @@ def main(): model.to(device) - dataloader = get_egs_dataloader(egs_dir=args.cegs_dir, + dataloader = get_egs_dataloader(egs_dir_or_scp=args.cegs_dir, egs_left_context=args.egs_left_context, egs_right_context=args.egs_right_context, shuffle=True) + valid_dataloader = get_egs_dataloader( + egs_dir_or_scp=args.valid_cegs_scp, + egs_left_context=args.egs_left_context, + egs_right_context=args.egs_right_context, + shuffle=False) + optimizer = optim.Adam(model.parameters(), lr=learning_rate, - weight_decay=args.l2_regularize) + weight_decay=5e-4) - scheduler = MultiStepLR(optimizer, milestones=[1, 2, 3, 4, 5], gamma=0.5) criterion = KaldiChainObjfFunction.apply tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) @@ -198,12 +276,17 @@ def main(): best_epoch_info_filename = os.path.join(args.dir, 'best-epoch-info') try: for epoch in range(start_epoch, args.num_epochs): - learning_rate = scheduler.get_lr()[0] + learning_rate = 1e-3 * pow(0.4, epoch) + for param_group in optimizer.param_groups: + param_group['lr'] = learning_rate + logging.info('epoch {}, learning rate {}'.format( epoch, learning_rate)) + tf_writer.add_scalar('learning_rate', learning_rate, epoch) objf = train_one_epoch(dataloader=dataloader, + valid_dataloader=valid_dataloader, model=model, device=device, optimizer=optimizer, @@ -212,7 +295,6 @@ def main(): opts=opts, den_graph=den_graph, tf_writer=tf_writer) - scheduler.step() if best_objf is None: best_objf = objf diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh index 5be698139b0..738747fdb84 100755 --- a/egs/aishell/s10/local/run_chain.sh +++ b/egs/aishell/s10/local/run_chain.sh @@ -33,7 +33,7 @@ lr=1e-3 hidden_dim=1024 bottleneck_dim=128 prefinal_bottleneck_dim=256 -kernel_size_list="2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2" # comma separated list +kernel_size_list="3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3" # comma separated list subsampling_factor_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list log_level=info # valid values: debug, info, warning @@ -179,9 +179,12 @@ if [[ $stage -le 8 ]]; then --train.den-fst 
exp/chain/den.fst \ --train.egs-left-context $egs_left_context \ --train.egs-right-context $egs_right_context \ - --train.l2-regularize 5e-4 \ + --train.l2-regularize 5e-5 \ + --train.leaky-hmm-coefficient 0.1 \ --train.lr $lr \ - --train.num-epochs $num_epochs + --train.num-epochs $num_epochs \ + --train.valid-cegs-scp exp/chain/egs/valid_diagnostic.scp \ + --train.xent-regularize 0.1 fi if [[ $stage -le 9 ]]; then From 523f9a46b71e11d0e39f004e8416e26f4177225f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Feb 2020 23:03:18 +0800 Subject: [PATCH 6/6] replace the data processing pipeline with haowen's. --- egs/aishell/s10/local/run_chain.sh | 170 +++++++++++------- .../s10/local/run_cleanup_segmentation.sh | 68 +++++++ egs/aishell/s10/run.sh | 67 ++----- 3 files changed, 189 insertions(+), 116 deletions(-) create mode 100755 egs/aishell/s10/local/run_cleanup_segmentation.sh diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh index 738747fdb84..25b944fde3d 100755 --- a/egs/aishell/s10/local/run_chain.sh +++ b/egs/aishell/s10/local/run_chain.sh @@ -5,18 +5,16 @@ set -e -stage=0 +stage=10 # GPU device id to use (count from 0). # you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0` -device_id=6 +device_id=3 nj=10 -lang=data/lang_chain # output lang dir -ali_dir=exp/tri5a_ali # input alignment dir -lat_dir=exp/tri5a_lats # input lat dir -treedir=exp/chain/tri5_tree # output tree dir +train_set=train_cleaned +gmm_dir=exp/tri3_cleaned # You should know how to calculate your model's left/right context **manually** model_left_context=28 @@ -47,67 +45,107 @@ save_nn_output_as_compressed=false . parse_options.sh +ali_dir=${gmm_dir}_ali_${train_set}_sp # output ali dir +lat_dir=${gmm_dir}_lat_${train_set}_sp # output lat dir +tree_dir=${gmm_dir}_tree_${train_set}_sp # output tree dir +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + if [[ $stage -le 0 ]]; then - for datadir in train dev test; do - dst_dir=data/mfcc_hires/$datadir - if [[ ! -f $dst_dir/feats.scp ]]; then - echo "making mfcc features for LF-MMI training" - utils/copy_data_dir.sh data/$datadir $dst_dir - steps/make_mfcc.sh \ - --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" \ - --nj $nj \ - $dst_dir || exit 1 - steps/compute_cmvn_stats.sh $dst_dir || exit 1 - utils/fix_data_dir.sh $dst_dir - else - echo "$dst_dir/feats.scp already exists." - echo "kaldi (local/run_tdnn_1b.sh) LF-MMI may have generated it." - echo "skip $dst_dir" - fi + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/$train_set data/${train_set}_sp + + for x in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$x data/${x}_hires done fi if [[ $stage -le 1 ]]; then + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/${train_set}_sp + steps/compute_cmvn_stats.sh data/${train_set}_sp + echo "fixing input data-dir to remove nonexistent features, in case some " + echo ".. speed-perturbed segments were too short." 
+ utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [[ $stage -le 2 ]]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir +fi + +if [[ $stage -le 3 ]]; then + echo "$0: creating high-resolution MFCC features" + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for x in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${x}_hires + steps/compute_cmvn_stats.sh data/${x}_hires + utils/fix_data_dir.sh data/${x}_hires + done +fi + +if [[ $stage -le 4 ]]; then + for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \ + $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 + done +fi + +if [[ $stage -le 5 ]]; then + echo "$0: creating lang directory with one state per phone." # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] - rm -rf $lang - cp -r data/lang $lang - silphonelist=$(cat $lang/phones/silence.csl) || exit 1 - nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1 + cp -r data/lang data/lang_chain + silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo fi -if [[ $stage -le 2 ]]; then - # Build a tree using our new topology. This is the critically different - # step compared with other recipes. +if [[ $stage -le 6 ]]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [[ $stage -le 7 ]]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 5000 data/mfcc/train $lang $ali_dir $treedir + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir fi -if [[ $stage -le 3 ]]; then - echo "creating phone language-model" +if [[ $stage -le 8 ]]; then + echo "$0: creating phone language-model" "$train_cmd" exp/chain/log/make_phone_lm.log \ chain-est-phone-lm \ - "ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \ + "ark:gunzip -c $tree_dir/ali.*.gz | ali-to-phones $tree_dir/final.mdl ark:- ark:- |" \ exp/chain/phone_lm.fst || exit 1 fi -if [[ $stage -le 4 ]]; then +if [[ $stage -le 9 ]]; then echo "creating denominator FST" - copy-transition-model $treedir/final.mdl exp/chain/0.trans_mdl - cp $treedir/tree exp/chain + copy-transition-model $tree_dir/final.mdl exp/chain/0.trans_mdl + cp $tree_dir/tree exp/chain "$train_cmd" exp/chain/log/make_den_fst.log \ chain-make-den-fst exp/chain/tree exp/chain/0.trans_mdl exp/chain/phone_lm.fst \ exp/chain/den.fst exp/chain/normalization.fst || exit 1 fi -if [[ $stage -le 5 ]]; then - echo "generating egs" +if [[ $stage -le 10 ]]; then + echo "$0: generating egs" steps/nnet3/chain/get_egs.sh \ --alignment-subsampling-factor 3 \ --cmd "$train_cmd" \ @@ -125,15 +163,15 @@ if [[ $stage -le 5 ]]; then --right-tolerance 5 \ --srand 0 \ --stage -10 \ - data/mfcc_hires/train \ + $train_data_dir \ exp/chain $lat_dir exp/chain/egs fi feat_dim=$(cat exp/chain/egs/info/feat_dim) output_dim=$(cat exp/chain/egs/info/num_pdfs) -if [[ $stage -le 6 ]]; then - echo "merging egs" +if [[ $stage -le 11 ]]; then + echo "$0: merging egs" mkdir -p exp/chain/merged_egs num_egs=$(ls -1 exp/chain/egs/cegs*.ark | wc -l) @@ -145,15 +183,15 @@ if [[ $stage -le 6 ]]; then rm exp/chain/egs/cegs.*.ark fi -if [[ $stage -le 7 ]]; then +if [[ $stage -le 12 ]]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. local/mkgraph.sh --self-loop-scale 1.0 data/lang_test exp/chain exp/chain/graph fi -if [[ $stage -le 8 ]]; then - echo "training..." +if [[ $stage -le 13 ]]; then + echo "$0: training..." mkdir -p exp/chain/train/tensorboard train_checkpoint= @@ -187,12 +225,12 @@ if [[ $stage -le 8 ]]; then --train.xent-regularize 0.1 fi -if [[ $stage -le 9 ]]; then - echo "inference: computing likelihood" +if [[ $stage -le 14 ]]; then + echo "$0: inference: computing likelihood" for x in test dev; do - mkdir -p exp/chain/inference/$x - if [[ -f exp/chain/inference/$x/nnet_output.scp ]]; then - echo "exp/chain/inference/$x/nnet_output.scp already exists! Skip" + mkdir -p exp/chain/inference/${x}_hires + if [[ -f exp/chain/inference/${x}_hires/nnet_output.scp ]]; then + echo "$0: exp/chain/inference/${x}_hires/nnet_output.scp already exists! 
Skip" else best_epoch=$(cat exp/chain/train/best-epoch-info | grep 'best epoch' | awk '{print $NF}') inference_checkpoint=exp/chain/train/epoch-${best_epoch}.pt @@ -200,9 +238,9 @@ if [[ $stage -le 9 ]]; then --bottleneck-dim $bottleneck_dim \ --checkpoint $inference_checkpoint \ --device-id $device_id \ - --dir exp/chain/inference/$x \ + --dir exp/chain/inference/${x}_hires \ --feat-dim $feat_dim \ - --feats-scp data/mfcc_hires/$x/feats.scp \ + --feats-scp data/${x}_hires/feats.scp \ --hidden-dim $hidden_dim \ --is-training false \ --kernel-size-list "$kernel_size_list" \ @@ -217,36 +255,36 @@ if [[ $stage -le 9 ]]; then done fi -if [[ $stage -le 10 ]]; then - echo "decoding" +if [[ $stage -le 15 ]]; then + echo "$0: decoding" for x in test dev; do - if [[ ! -f exp/chain/inference/$x/nnet_output.scp ]]; then - echo "exp/chain/inference/$x/nnet_output.scp does not exist!" - echo "Please run inference.py first" + if [[ ! -f exp/chain/inference/${x}_hires/nnet_output.scp ]]; then + echo "$0: exp/chain/inference/${x}_hires/nnet_output.scp does not exist!" + echo "$0: Please run inference.py first" exit 1 fi - echo "decoding $x" + echo "$0: decoding ${x}_hires" ./local/decode.sh \ --nj $nj \ exp/chain/graph \ exp/chain/0.trans_mdl \ - exp/chain/inference/$x/nnet_output.scp \ - exp/chain/decode_res/$x + exp/chain/inference/${x}_hires/nnet_output.scp \ + exp/chain/decode_res/${x}_hires done fi -if [[ $stage -le 11 ]]; then - echo "scoring" +if [[ $stage -le 16 ]]; then + echo "$0: scoring" for x in test dev; do ./local/score.sh --cmd "$decode_cmd" \ - data/mfcc_hires/$x \ + data/${x}_hires \ exp/chain/graph \ - exp/chain/decode_res/$x || exit 1 + exp/chain/decode_res/${x}_hires || exit 1 done for x in test dev; do - head exp/chain/decode_res/$x/scoring_kaldi/best_* + head exp/chain/decode_res/${x}_hires/scoring_kaldi/best_* done fi diff --git a/egs/aishell/s10/local/run_cleanup_segmentation.sh b/egs/aishell/s10/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..559d20046dd --- /dev/null +++ b/egs/aishell/s10/local/run_cleanup_segmentation.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri3 +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore + done +fi diff --git a/egs/aishell/s10/run.sh b/egs/aishell/s10/run.sh index 5e42fc954cc..f6f3b6e4cb0 100755 --- a/egs/aishell/s10/run.sh +++ b/egs/aishell/s10/run.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# 2020 Xiaomi Corporation (author: Haowen Qiu) # Apache 2.0 # This file demonstrates how to run LF-MMI training in PyTorch @@ -24,7 +25,7 @@ data_url=www.openslr.org/resources/33 nj=30 -stage=0 +stage=14 if [[ $stage -le 0 ]]; then local/download_and_untar.sh $data $data_url data_aishell || exit 1 @@ -56,86 +57,52 @@ fi if [[ $stage -le 5 ]]; then for x in train dev test; do - dst_dir=data/mfcc/$x - utils/copy_data_dir.sh data/$x $dst_dir - steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nj $dst_dir || exit 1 - steps/compute_cmvn_stats.sh $dst_dir || exit 1 - utils/fix_data_dir.sh $dst_dir || exit 1 + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$x || exit 1 + steps/compute_cmvn_stats.sh data/$x || exit 1 + utils/fix_data_dir.sh data/$x || exit 1 done fi if [[ $stage -le 6 ]]; then steps/train_mono.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/mono || exit 1 + data/train data/lang exp/mono || exit 1 fi if [[ $stage -le 7 ]]; then steps/align_si.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1 + data/train data/lang exp/mono exp/mono_ali || exit 1 fi if [[ $stage -le 8 ]]; then steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1 + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1 fi if [[ $stage -le 9 ]]; then steps/align_si.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1 + data/train data/lang exp/tri1 exp/tri1_ali || exit 1 fi if [[ $stage -le 10 ]]; then - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/mfcc/train data/lang exp/tri1_ali exp/tri2 || exit 1 + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 3000 40000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1 fi if [[ $stage -le 11 ]]; then steps/align_si.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/tri2 exp/tri2_ali || exit 1 + data/train data/lang exp/tri2 exp/tri2_ali || exit 1 fi if [[ $stage -le 12 ]]; then - steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 2500 20000 data/mfcc/train data/lang exp/tri2_ali exp/tri3a || exit 1 + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 80000 data/train data/lang exp/tri2_ali exp/tri3 || exit 1 fi if [[ $stage -le 13 ]]; then - 
steps/align_fmllr.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/tri3a exp/tri3a_ali || exit 1 + local/run_cleanup_segmentation.sh --nj $nj fi if [[ $stage -le 14 ]]; then - steps/train_sat.sh --cmd "$train_cmd" \ - 2500 20000 data/mfcc/train data/lang exp/tri3a_ali exp/tri4a || exit 1 -fi - -if [[ $stage -le 15 ]]; then - steps/align_fmllr.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/tri4a exp/tri4a_ali -fi - -if [[ $stage -le 16 ]]; then - steps/train_sat.sh --cmd "$train_cmd" \ - 3500 100000 data/mfcc/train data/lang exp/tri4a_ali exp/tri5a || exit 1 -fi - -if [[ $stage -le 17 ]]; then - steps/align_fmllr.sh --cmd "$train_cmd" --nj $nj \ - data/mfcc/train data/lang exp/tri5a exp/tri5a_ali || exit 1 -fi - -if [[ $stage -le 18 ]]; then - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/mfcc/train \ - data/lang exp/tri5a exp/tri5a_lats - rm exp/tri5a_lats/fsts.*.gz # save space -fi - -if [[ $stage -le 19 ]]; then - # kaldi pybind LF-MMI training with PyTorch - ./local/run_chain.sh --nj $nj -fi - -if [[ $stage -le 20 ]]; then - # kaldi nnet3 LF-MMI training - ./local/run_tdnn_1b.sh --nj $nj + local/run_chain.sh --nj $nj \ + --gmm-dir exp/tri3_cleaned fi
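
A few things in this series are easy to miss when reading the hunks alone. First, the shape arithmetic behind patch 2's renaming: after time_stride/conv_stride become kernel_size/subsampling_factor (and patch 5 settles on kernel sizes of 3 and 1), each FactorizedTDNN layer is two Conv1d stages, and the bypass connection must slice its input down to the convolution output length, floor((T - kernel_size) / stride) + 1 frames. The sketch below is not from the patches; the layer names are illustrative. It just verifies the two configurations the recipe actually uses and the matching bypass slices from FactorizedTDNN.forward():

import torch
import torch.nn as nn

T = 150
x = torch.randn(1, 8, T)  # [N, C, T]

# kernel_size == 3, subsampling_factor == 1: the "linear" conv drops one
# frame at each edge and the "affine" conv (kernel 1, stride 1) keeps the
# rest, so the bypass slices off the same edges: input_x[:, :, 1:-1:1].
linear = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3)
affine = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=1, stride=1)
y = affine(linear(x))
assert y.shape[2] == T - 2
assert x[:, :, 1:-1:1].shape[2] == y.shape[2]

# kernel_size == 1, subsampling_factor == 3 (the subsampling layer): the
# affine conv strides by 3, giving ceil(T / 3) frames, matched on the
# bypass by input_x[:, :, ::3].
linear1 = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=1)
affine3 = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=1, stride=3)
z = affine3(linear1(x))
assert z.shape[2] == (T - 1) // 3 + 1  # == math.ceil(T / 3) == 50
assert x[:, :, ::3].shape[2] == z.shape[2]

This is also why patch 5 changes the affine Conv1d back to kernel_size=1 while widening the linear kernel to 3: if both stages kept kernel_size=3, a layer would lose four frames instead of two, and the [-1, 0, 1] splicing convention noted in OrthonormalLinear (and in the patch title "switch back to [-1, 0, 1] for the linear layer") would no longer match the bypass slice.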
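Second, patches 2 and 5 keep calling constrain_orthonormal() on every FactorizedTDNN layer, but the update itself (_constrain_orthonormal_internal) never appears in these hunks. As a rough guide to what that call does, here is a minimal sketch of the semi-orthogonal update from Povey et al. (2018), "Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks", which the implementation follows; the fixed update_speed below is an assumption (the Kaldi-side code adapts it based on how far M is from semi-orthogonal):

import torch

def constrain_orthonormal_sketch(M, update_speed=0.125):
    # One step pushing M (rows <= cols) toward M @ M.t() == scale2 * I.
    assert M.ndim == 2 and M.size(0) <= M.size(1)
    P = torch.mm(M, M.t())  # [rows, rows], symmetric
    # "Floating" case: only require M M^T to be a *scaled* identity, which
    # interferes less with the optimizer than forcing the scale to 1.
    scale2 = torch.trace(torch.mm(P, P.t())) / torch.trace(P)
    I = torch.eye(P.size(0), dtype=M.dtype, device=M.device)
    # Gradient step on f(M) = ||M M^T - scale2 I||_F^2, whose gradient
    # with respect to M is 4 (P - scale2 I) M.
    return M - 4.0 * update_speed * torch.mm(P - scale2 * I, M)

In the recipe this kind of step runs on the reshaped Conv1d weight inside each OrthonormalLinear; because it is a soft gradient step rather than a hard projection, it is cheap enough to apply every few minibatches during training.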
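Finally, note how the optimizer settings drift across the series: patch 1 moves from Adam to SGD with a MultiStepLR schedule, patch 3 moves back to Adam, and patch 5 drops the scheduler entirely. After patch 5, the optimizer's weight decay is hard-coded to 5e-4 (now separate from the chain objective's --train.l2-regularize, which the script lowers to 5e-5), and the per-epoch learning rate is a hard-coded exponential decay that overrides whatever --train.lr was passed:

# Per-epoch learning rate hard-coded in patch 5's train.py:
# lr(epoch) = 1e-3 * 0.4 ** epoch -> 1e-3, 4e-4, 1.6e-4, 6.4e-5, ...
for epoch in range(4):
    print(epoch, 1e-3 * pow(0.4, epoch))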