1515import torch
1616import torch .optim as optim
1717from torch .nn .utils import clip_grad_value_
18- from torch .optim .lr_scheduler import MultiStepLR
1918from torch .utils .tensorboard import SummaryWriter
2019
2120import kaldi
3231from options import get_args
3332
3433
35- def train_one_epoch (dataloader , model , device , optimizer , criterion ,
36- current_epoch , opts , den_graph , tf_writer ):
def get_validation_objf(dataloader, model, device, criterion, opts, den_graph):
    """Compute the accumulated chain objective over a validation set.

    Puts the model into eval mode and runs a forward pass (no gradients)
    over every utterance in ``dataloader``, accumulating the objective
    value returned by ``criterion``.  NOTE: the model is left in eval
    mode; the caller is responsible for switching back to train mode.

    Args:
        dataloader: iterable yielding ``(key_list, feature_list,
            supervision_list)`` batches; each feature tensor is expected
            to be ``[N, T, C]``.
        model: network whose forward returns
            ``(nnet_output, xent_output)``, both ``[N, T, C]``.
        device: ``torch.device`` the features are moved to.
        criterion: chain objective function, called as
            ``criterion(opts, den_graph, supervision, nnet_output,
            xent_output)`` and returning a tensor laid out as
            ``[objf, l2_term, weight]``.
        opts: chain training options, passed through to ``criterion``.
        den_graph: denominator graph, passed through to ``criterion``.

    Returns:
        A tuple ``(total_objf, total_weight, total_frames)``; divide the
        first by the second to get the average objf.  ``total_frames``
        is for display only.
    """
    total_objf = 0.
    total_weight = 0.
    total_frames = 0.  # for display only

    model.eval()

    for batch_idx, batch in enumerate(dataloader):
        key_list, feature_list, supervision_list = batch

        assert len(key_list) == len(feature_list) == len(supervision_list)
        batch_size = len(key_list)

        for n in range(batch_size):
            feats = feature_list[n]
            assert feats.ndim == 3

            # at this point, feats is [N, T, C]
            feats = feats.to(device)

            # validation only: no gradient tracking for the forward pass
            with torch.no_grad():
                nnet_output, xent_output = model(feats)

            # refer to kaldi/src/chain/chain-training.h:
            # the output should be organized as
            #   [all sequences for frame 0]
            #   [all sequences for frame 1]
            #   [etc.]
            # so reorder [N, T, C] -> [T, N, C] -> [T*N, C].
            nnet_output = nnet_output.permute(1, 0, 2)
            nnet_output = nnet_output.contiguous().view(-1,
                                                        nnet_output.shape[-1])

            xent_output = xent_output.permute(1, 0, 2)
            xent_output = xent_output.contiguous().view(-1,
                                                        xent_output.shape[-1])

            objf_l2_term_weight = criterion(opts, den_graph,
                                            supervision_list[n], nnet_output,
                                            xent_output)

            # [objf, l2_term, weight]; move to CPU once before reading
            # the scalar entries.  (The per-utterance `objf` variable the
            # training loop keeps for backward() is not needed here.)
            objf_l2_term_weight = objf_l2_term_weight.cpu()

            total_objf += objf_l2_term_weight[0].item()
            total_weight += objf_l2_term_weight[2].item()

            # nnet_output has been flattened to [T*N, C], so shape[0] is
            # the number of frames in this utterance.
            total_frames += nnet_output.shape[0]

    return total_objf, total_weight, total_frames
87+
88+
89+ def train_one_epoch (dataloader , valid_dataloader , model , device , optimizer ,
90+ criterion , current_epoch , opts , den_graph , tf_writer ):
3791 model .train ()
3892
3993 total_objf = 0.
@@ -75,8 +129,8 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion,
75129 optimizer .zero_grad ()
76130 objf .backward ()
77131
78- # TODO(fangjun): how to choose this value or do we need this ?
79132 clip_grad_value_ (model .parameters (), 5.0 )
133+
80134 optimizer .step ()
81135
82136 objf_l2_term_weight = objf_l2_term_weight .detach ().cpu ()
@@ -101,6 +155,25 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion,
101155 objf_l2_term_weight [0 ].item () /
102156 objf_l2_term_weight [2 ].item (), num_frames , current_epoch ))
103157
158+ if batch_idx % 500 == 0 :
159+ total_valid_objf , total_valid_weight , total_valid_frames = get_validation_objf (
160+ dataloader = valid_dataloader ,
161+ model = model ,
162+ device = device ,
163+ criterion = criterion ,
164+ opts = opts ,
165+ den_graph = den_graph )
166+
167+ model .train ()
168+
169+ logging .info (
170+ 'Validation average objf: {:.6f} over {} frames' .format (
171+ total_valid_objf / total_valid_weight , total_valid_frames ))
172+
173+ tf_writer .add_scalar ('train/global_valid_average_objf' ,
174+ total_valid_objf / total_valid_weight ,
175+ batch_idx + current_epoch * len (dataloader ))
176+
104177 if batch_idx % 100 == 0 :
105178 tf_writer .add_scalar ('train/global_average_objf' ,
106179 total_objf / total_weight ,
@@ -145,10 +218,10 @@ def main():
145218
146219 den_fst = fst .StdVectorFst .Read (args .den_fst_filename )
147220
148- # TODO(fangjun): pass these options from commandline
149221 opts = chain .ChainTrainingOptions ()
150- opts .l2_regularize = 5e-4
151- opts .leaky_hmm_coefficient = 0.1
222+ opts .l2_regularize = args .l2_regularize
223+ opts .xent_regularize = args .xent_regularize
224+ opts .leaky_hmm_coefficient = args .leaky_hmm_coefficient
152225
153226 den_graph = chain .DenominatorGraph (fst = den_fst , num_pdfs = args .output_dim )
154227
@@ -179,16 +252,21 @@ def main():
179252
180253 model .to (device )
181254
182- dataloader = get_egs_dataloader (egs_dir = args .cegs_dir ,
255+ dataloader = get_egs_dataloader (egs_dir_or_scp = args .cegs_dir ,
183256 egs_left_context = args .egs_left_context ,
184257 egs_right_context = args .egs_right_context ,
185258 shuffle = True )
186259
260+ valid_dataloader = get_egs_dataloader (
261+ egs_dir_or_scp = args .valid_cegs_scp ,
262+ egs_left_context = args .egs_left_context ,
263+ egs_right_context = args .egs_right_context ,
264+ shuffle = False )
265+
187266 optimizer = optim .Adam (model .parameters (),
188267 lr = learning_rate ,
189- weight_decay = args . l2_regularize )
268+ weight_decay = 5e-4 )
190269
191- scheduler = MultiStepLR (optimizer , milestones = [1 , 2 , 3 , 4 , 5 ], gamma = 0.5 )
192270 criterion = KaldiChainObjfFunction .apply
193271
194272 tf_writer = SummaryWriter (log_dir = '{}/tensorboard' .format (args .dir ))
@@ -198,12 +276,17 @@ def main():
198276 best_epoch_info_filename = os .path .join (args .dir , 'best-epoch-info' )
199277 try :
200278 for epoch in range (start_epoch , args .num_epochs ):
201- learning_rate = scheduler .get_lr ()[0 ]
279+ learning_rate = 1e-3 * pow (0.4 , epoch )
280+ for param_group in optimizer .param_groups :
281+ param_group ['lr' ] = learning_rate
282+
202283 logging .info ('epoch {}, learning rate {}' .format (
203284 epoch , learning_rate ))
285+
204286 tf_writer .add_scalar ('learning_rate' , learning_rate , epoch )
205287
206288 objf = train_one_epoch (dataloader = dataloader ,
289+ valid_dataloader = valid_dataloader ,
207290 model = model ,
208291 device = device ,
209292 optimizer = optimizer ,
@@ -212,7 +295,6 @@ def main():
212295 opts = opts ,
213296 den_graph = den_graph ,
214297 tf_writer = tf_writer )
215- scheduler .step ()
216298
217299 if best_objf is None :
218300 best_objf = objf
0 commit comments