diff --git a/egs/aishell/s10b/cmd.sh b/egs/aishell/s10b/cmd.sh new file mode 100644 index 00000000000..82b1d114e08 --- /dev/null +++ b/egs/aishell/s10b/cmd.sh @@ -0,0 +1,16 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="run.pl" +export decode_cmd="run.pl" +export mkgraph_cmd="run.pl" +export cuda_cmd="run.pl" diff --git a/egs/aishell/s10b/conf/fbank.conf b/egs/aishell/s10b/conf/fbank.conf new file mode 100644 index 00000000000..3dac154706b --- /dev/null +++ b/egs/aishell/s10b/conf/fbank.conf @@ -0,0 +1 @@ +--num-mel-bins=40 diff --git a/egs/aishell/s10b/ctc/add_deltas_layer.py b/egs/aishell/s10b/ctc/add_deltas_layer.py new file mode 100644 index 00000000000..4e1c11d1b9e --- /dev/null +++ b/egs/aishell/s10b/ctc/add_deltas_layer.py @@ -0,0 +1,96 @@ +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def compute_delta_feat(x, weight): + ''' + Args: + x: input feat of shape [batch_size, feat_dim, seq_len] + + weight: coefficients for computing delta features; + it has shape [feat_dim, 1, kernel_size]. + + Returns: + a tensor of shape [batch_size, feat_dim, seq_len] + ''' + + assert x.ndim == 3 + + assert weight.ndim == 3 + assert weight.size(0) == x.size(1) + assert weight.size(1) == 1 + assert weight.size(2) % 2 == 1 + + feat_dim = x.size(1) + + # NOTE(fangjun): we perform a depthwise convolution here by + # setting groups == number of channels + y = F.conv1d(input=x, weight=weight, groups=feat_dim) + + return y + + +class AddDeltasLayer(nn.Module): + ''' + This class implements `add-deltas` with order == 2 and window == 2. + + Note that it has no trainable `nn.Parameter`s. 
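+    Since no padding is performed, two frames are trimmed at each
+    boundary with the default coefficients; e.g. an input of shape
+    [1, 40, 100] yields an output of shape [1, 120, 96].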
+ ''' + + def __init__(self, + first_order_coef=[-1, 0, 1], + second_order_coef=[1, 0, -2, 0, 1]): + ''' + Args: + first_order_coef: coefficient to compute the first order delta feature + + second_order_coef: coefficient to compute the second order delta feature + ''' + super().__init__() + + self.first_order_coef = torch.tensor(first_order_coef).float() + self.second_order_coef = torch.tensor(second_order_coef).float() + + def forward(self, x): + ''' + Args: + x: a tensor of shape [batch_size, feat_dim, seq_len] + + Returns: + a tensor of shape [batch_size, feat_dim * 3, seq_len] + ''' + if self.first_order_coef.ndim != 3: + num_duplicates = x.size(1) + + # yapf: disable + self.first_order_coef = self.first_order_coef.reshape(1, 1, -1) + self.first_order_coef = torch.cat([self.first_order_coef] * num_duplicates, dim=0) + + self.second_order_coef = self.second_order_coef.reshape(1, 1, -1) + self.second_order_coef = torch.cat([self.second_order_coef] * num_duplicates, dim=0) + # yapf: enable + + device = x.device + self.first_order_coef = self.first_order_coef.to(device) + self.second_order_coef = self.second_order_coef.to(device) + + first_order = compute_delta_feat(x, self.first_order_coef) + second_order = compute_delta_feat(x, self.second_order_coef) + + # since we did not perform padding, we have to remove some frames + # from the 0th and 1st order features + zeroth_valid = (x.size(2) - second_order.size(2)) // 2 + first_valid = (first_order.size(2) - second_order.size(2)) // 2 + + y = torch.cat([ + x[:, :, zeroth_valid:-zeroth_valid,], + first_order[:, :, first_valid:-first_valid], + second_order, + ], + dim=1) + + return y diff --git a/egs/aishell/s10b/ctc/add_deltas_layer_test.py b/egs/aishell/s10b/ctc/add_deltas_layer_test.py new file mode 100755 index 00000000000..00832e693b0 --- /dev/null +++ b/egs/aishell/s10b/ctc/add_deltas_layer_test.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import os +import shutil +import tempfile +import unittest + +import numpy as np + +import torch +import torch.nn.functional as F + +import kaldi + +from add_deltas_layer import AddDeltasLayer + + +class AddDeltasLayerTest(unittest.TestCase): + + def test(self): + x = torch.tensor([ + [1, 3], + [5, 10], + [0, 1], + [10, 20], + [3, 1], + [3, 2], + [5, 1], + [10, -2], + [10, 20], + [100, 200], + ]).float() + + x = x.unsqueeze(0) + + transform = AddDeltasLayer(first_order_coef=[-0.2, -0.1, 0, 0.1, 0.2], + second_order_coef=[ + 0.04, 0.04, 0.01, -0.04, -0.1, -0.04, + 0.01, 0.04, 0.04 + ]) + y = transform(x.permute(0, 2, 1)).permute(0, 2, 1) + + # now use kaldi's add-deltas to compute the ground truth + d = tempfile.mkdtemp() + + wspecifier = 'ark:{}/feats.ark'.format(d) + + writer = kaldi.MatrixWriter(wspecifier) + writer.Write('utt1', x.squeeze(0).numpy()) + writer.Close() + + delta_feats_specifier = 'ark:{dir}/delta.ark'.format(dir=d) + + cmd = ''' + add-deltas --print-args=false --delta-order=2 --delta-window=2 {} {} + '''.format(wspecifier, delta_feats_specifier) + + os.system(cmd) + + reader = kaldi.RandomAccessMatrixReader(delta_feats_specifier) + + expected = reader['utt1'] + + y = y.squeeze(0) + + np.testing.assert_array_almost_equal(y.numpy(), + expected.numpy()[4:-4, :], + decimal=5) + + reader.Close() + + shutil.rmtree(d) + + +if __name__ == '__main__': + unittest.main() diff --git a/egs/aishell/s10b/ctc/common.py b/egs/aishell/s10b/ctc/common.py new file mode 100644 index 00000000000..b8a992241ca --- 
/dev/null +++ b/egs/aishell/s10b/ctc/common.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +from datetime import datetime +import logging + +import torch + + +def setup_logger(log_filename, log_level='info'): + now = datetime.now() + date_time = now.strftime('%Y-%m-%d-%H-%M-%S') + log_filename = '{}-{}'.format(log_filename, date_time) + formatter = '%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s' + if log_level == 'debug': + level = logging.DEBUG + elif log_level == 'info': + level = logging.INFO + elif log_level == 'warning': + level = logging.WARNING + logging.basicConfig(filename=log_filename, + format=formatter, + level=level, + filemode='w') + console = logging.StreamHandler() + console.setLevel(level) + console.setFormatter(logging.Formatter(formatter)) + logging.getLogger('').addHandler(console) + + +def load_checkpoint(filename, model): + logging.info('Loading checkpoint from {}'.format(filename)) + + checkpoint = torch.load(filename, map_location='cpu') + + keys = ['state_dict', 'epoch', 'learning_rate', 'loss'] + for k in keys: + assert k in checkpoint + + if not list(model.state_dict().keys())[0].startswith('module.') \ + and list(checkpoint['state_dict'])[0].startswith('module.'): + # the checkpoint was saved by DDP + logging.info('load checkpoint from DDP') + dst_state_dict = model.state_dict() + src_state_dict = checkpoint['state_dict'] + for key in dst_state_dict.keys(): + src_key = '{}.{}'.format('module', key) + dst_state_dict[key] = src_state_dict.pop(src_key) + assert len(src_state_dict) == 0 + model.load_state_dict(dst_state_dict) + else: + model.load_state_dict(checkpoint['state_dict']) + + epoch = checkpoint['epoch'] + learning_rate = checkpoint['learning_rate'] + loss = checkpoint['loss'] + + return epoch, learning_rate, loss + + +def save_checkpoint(filename, model, epoch, learning_rate, loss, local_rank=0): + if local_rank != 0: + return + logging.info('Saving checkpoint to {filename}: epoch={epoch}, ' + 'learning_rate={learning_rate}, loss={loss}'.format( + filename=filename, + epoch=epoch, + learning_rate=learning_rate, + loss=loss)) + checkpoint = { + 'state_dict': model.state_dict(), + 'epoch': epoch, + 'learning_rate': learning_rate, + 'loss': loss + } + torch.save(checkpoint, filename) + + +def save_training_info(filename, + model_path, + current_epoch, + learning_rate, + loss, + best_loss, + best_epoch, + local_rank=0): + if local_rank != 0: + return + + with open(filename, 'w') as f: + f.write('model_path: {}\n'.format(model_path)) + f.write('epoch: {}\n'.format(current_epoch)) + f.write('learning rate: {}\n'.format(learning_rate)) + f.write('loss: {}\n'.format(loss)) + f.write('best loss: {}\n'.format(best_loss)) + f.write('best epoch: {}\n'.format(best_epoch)) diff --git a/egs/aishell/s10b/ctc/ctc_loss.py b/egs/aishell/s10b/ctc/ctc_loss.py new file mode 100644 index 00000000000..2c703c2af8c --- /dev/null +++ b/egs/aishell/s10b/ctc/ctc_loss.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch +from torch.autograd import Function +from torch.utils.dlpack import to_dlpack +import torch.nn as nn +import torch.nn.functional as F + +import kaldi +from kaldi import ctc + + +class WarpCtcLoss(Function): + + @staticmethod + def forward(ctx, activations, targets, input_lengths, target_lengths, blank, + reduction): + ''' + Args: + activations: `(seq_len, batch_size, 
+                C)`, where `C` is the number of characters
+                in the alphabet, including the blank symbol.
+
+            targets: a 1-D tensor containing the concatenated labels of all
+                utterances in the batch; its length equals the sum of
+                `target_lengths`. Targets cannot contain the blank.
+
+            input_lengths: a tensor of [batch_size] containing the number of input frames
+                for each utterance in the batch.
+
+            target_lengths: a tensor of [batch_size] containing the label lengths
+
+            blank: the index of the blank symbol.
+
+            reduction: specifies the reduction to apply to
+                the output: `none` | `mean` | `sum`.
+
+                `none`: no reduction will be applied;
+
+                `mean`: the output losses will be divided
+                by the target lengths and then the mean
+                over the batch is taken.
+
+                `sum`: the output will be summed.
+        '''
+        device = activations.device
+        assert device.type == 'cuda', 'we only support computing CTCLoss on GPU devices.'
+
+        activations_tensor = activations.float().reshape(-1).contiguous()
+        gradients_tensor = torch.zeros_like(activations_tensor).contiguous()
+
+        # NOTE(fangjun): foobar.cpu() is a no-op if foobar is already on CPU.
+        flat_labels_tensor = targets.int().view(-1).cpu()
+        label_lengths_tensor = target_lengths.int().view(-1).cpu()
+        input_lengths_tensor = input_lengths.int().view(-1).cpu()
+
+        alphabet_size = activations.size(2)
+        minibatch = activations.size(1)
+
+        costs_tensor = torch.zeros(minibatch, dtype=torch.float32).contiguous()
+
+        info = ctc.CtcOptions()
+        info.loc = ctc.CtcComputeLocation.CTC_GPU
+        info.blank_label = blank
+
+        label_lengths = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(label_lengths_tensor))
+
+        input_lengths = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(input_lengths_tensor))
+
+        status, size_in_bytes = ctc.GetWorkspaceSize(
+            label_lengths=label_lengths,
+            input_lengths=input_lengths,
+            alphabet_size=alphabet_size,
+            minibatch=minibatch,
+            info=info)
+
+        assert status == ctc.CtcStatus.CTC_STATUS_SUCCESS
+
+        num_floats = size_in_bytes // 4 + 1
+        workspace_tensor = torch.zeros(
+            num_floats, dtype=torch.float32).contiguous().to(device)
+
+        cu_activations = kaldi.CuSubVectorFromDLPack(
+            to_dlpack(activations_tensor))
+        cu_gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
+        flat_labels = kaldi.IntSubVectorFromDLPack(
+            to_dlpack(flat_labels_tensor))
+        costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
+        workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))
+
+        stream = torch.cuda.default_stream(device)
+        with torch.cuda.stream(stream):
+            status = ctc.ComputeCtcLossGpu(activations=cu_activations,
+                                           gradients=cu_gradients,
+                                           flat_labels=flat_labels,
+                                           label_lengths=label_lengths,
+                                           input_lengths=input_lengths,
+                                           alphabet_size=alphabet_size,
+                                           minibatch=minibatch,
+                                           costs=costs,
+                                           workspace=workspace,
+                                           options=info)
+
+        gradients_tensor = gradients_tensor.reshape(*activations.shape)
+
+        ctx.save_for_backward(gradients_tensor)
+
+        if reduction == 'none':
+            return costs_tensor
+
+        if reduction == 'sum':
+            return torch.sum(costs_tensor)
+
+        # else it is `mean`: divide each loss by its target length,
+        # then take the mean over the batch (matching the documented
+        # semantics above and PyTorch's built-in CTCLoss)
+        return (costs_tensor / label_lengths_tensor.float()).mean()
+
+    @staticmethod
+    def backward(ctx, unused):
+        '''
+        The `forward` method has 6 inputs:
+        `activations`, `targets`, `input_lengths`,
+        `target_lengths`, `blank`, `reduction`
+
+        We have to return 6 values.
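+        Only `activations` is differentiable, so its gradient is returned
+        and `None` is returned for the other five inputs. Note also that
+        the incoming grad_output (`unused`) is not applied to the saved
+        gradients; the result is therefore only correct when `backward()`
+        is triggered directly from the loss returned by `forward`.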
+ ''' + gradients, = ctx.saved_tensors + return gradients, None, None, None, None, None + + +def warp_ctc_loss(activations, targets, input_lengths, target_lengths, blank, + reduction): + ''' + A thin wrapper for WarpCtcLoss. + + We can use keyword arguments with this wrapper + ''' + loss_func = WarpCtcLoss.apply + return loss_func(activations, targets, input_lengths, target_lengths, blank, + reduction) + + +class CTCLoss(nn.Module): + ''' + Note that PyTorch requires the probability to be log prob, + while warp-ctc does not have this requirement. + ''' + + def __init__(self, use_warp_ctc=True, blank=0, reduction='mean'): + ''' + Args: + blank: the index of the blank label + reduction: specifies the reduction to apply to + the output: `none` | `mean` | `sum`. + + `none`: no reduction will be applied; + + `mean`: the output losses will be divided + by the target lengths and then the mean + over the batch is taken. + + `sum`: the output will be summed. + ''' + super().__init__() + assert reduction in ['none', 'mean', 'sum'] + + # if use_warp_ctc: + # self.loss_func = warp_ctc_loss + # else: + # self.loss_func = F.ctc_loss + + self.use_warp_ctc = use_warp_ctc + + self.blank = blank + self.reduction = reduction + + def forward(self, activations, targets, input_lengths, target_lengths): + ''' + Args: + activations: `(seq_len, batch_size, C)`, where `C` is the number + of characters in alphabet including the blank symbol. + + targets: a tensor of [batch size] containing the concatenated labels. + Targets cannot be blank. + + input_lengths: a tensor of [batch_size] containing the number of input frames + for each utterance in the batch. + + target_lengths: a tensor of [batch_size] containing the label lengths + ''' + if self.use_warp_ctc == False: + # move all tensors to GPU + device = activations.device + targets = targets.to(device) + input_lengths = input_lengths.to(device) + target_lengths = target_lengths.to(device) + + log_probs = F.log_softmax(activations, dim=-1) + + return F.ctc_loss(log_probs=log_probs, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths, + blank=self.blank, + reduction=self.reduction) + else: + return warp_ctc_loss(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths, + blank=self.blank, + reduction=self.reduction) diff --git a/egs/aishell/s10b/ctc/ctc_loss_test.py b/egs/aishell/s10b/ctc/ctc_loss_test.py new file mode 100755 index 00000000000..338e816cffa --- /dev/null +++ b/egs/aishell/s10b/ctc/ctc_loss_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import torch + +import kaldi + +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence + +from ctc_loss import CTCLoss + + +def test_baidu_warp_ctc(): + device_id = 1 + kaldi.SelectGpuDevice(device_id=device_id) + + device = torch.device('cuda', index=device_id) + + ex1 = torch.tensor([[0.2, 0.2, 0.2, 0.2, 0.2]], dtype=torch.float32) + + ex2 = torch.tensor( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=torch.float32) + + ex3 = torch.tensor([[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6], + [-15, -14, -13, -12, -11]], + dtype=torch.float32) + + activations = pad_sequence([ex1, ex2, ex3], batch_first=False) + activations = activations.to(device) + + tmp_activations = activations.clone() + + activations.requires_grad_(True) + tmp_activations.requires_grad_(True) + + targets = 
torch.tensor([1, 3, 3, 2, 3]) + target_lengths = torch.tensor([1, 2, 2]) + input_lengths = torch.tensor([1, 3, 3]) + + loss_func = CTCLoss(use_warp_ctc=True, blank=0, reduction='mean') + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + print('warp ctc loss', loss) + loss.backward() + print('warp ctc activations grad', activations.grad) + + loss_func = CTCLoss(use_warp_ctc=False, blank=0, reduction='mean') + loss = loss_func(activations=tmp_activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + loss.backward() + print('loss', loss) + print('grad', tmp_activations.grad) + print('grad x 6', tmp_activations.grad * 6) + + # It turns out that + # - the loss + # - and the gradients + # computed by warp ctc and PyTorch's built-in CTCLoss are different. + + +def main(): + test_baidu_warp_ctc() + + +if __name__ == '__main__': + torch.manual_seed(20200224) + main() diff --git a/egs/aishell/s10b/ctc/dataset.py b/egs/aishell/s10b/ctc/dataset.py new file mode 100644 index 00000000000..6c3ab0aa7a9 --- /dev/null +++ b/egs/aishell/s10b/ctc/dataset.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import os +import logging + +import numpy as np +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader +from torch.utils.data import Dataset + +import kaldi + + +def get_ctc_dataloader(feats_scp, + labels_scp=None, + batch_size=1, + shuffle=False, + num_workers=0, + model_left_context=0, + model_right_context=0, + world_size=None, + local_rank=None): + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + collate_fn = CtcDatasetCollateFunc(model_left_context=model_left_context, + model_right_context=model_right_context) + + if world_size: + logging.info('world_size: {}'.format(world_size)) + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=local_rank, shuffle=shuffle) + # sampler and shuffle are mutually exclusive; + # it will raise an exception if you set both + shuffle = False + + else: + sampler = None + + dataloader = DataLoader(dataset, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + collate_fn=collate_fn, + sampler=sampler) + + return dataloader + + +def _add_model_left_right_context(x, left_context, right_context): + padded = x + if left_context > 0: + first_frame = x[0, :] + left_padding = [first_frame] * left_context + padded = np.vstack([left_padding, x]) + + if right_context > 0: + last_frame = x[-1, :] + right_padding = [last_frame] * right_context + padded = np.vstack([padded, right_padding]) + + return padded + + +class CtcDataset(Dataset): + + def __init__(self, feats_scp, labels_scp=None): + ''' + Args: + feats_scp: filename for feats.scp + labels_scp: if provided, it is the filename of labels.scp + ''' + assert os.path.isfile(feats_scp) + if labels_scp: + assert os.path.isfile(labels_scp) + logging.info('labels scp: {}'.format(labels_scp)) + else: + logging.warn('No labels scp is given.') + + # items is a dict of [uttid, feat_rxfilename, None] + # or [uttid, feat_rxfilename, label_rxfilename] if labels_scp is not None + items = dict() + + with open(feats_scp, 'r') as f: + for line in f: + # every line has the following format: + # uttid feat_rxfilename + uttid_rxfilename = line.split() + assert len(uttid_rxfilename) == 2 + + uttid, rxfilename = 
uttid_rxfilename + + assert uttid not in items + + items[uttid] = [uttid, rxfilename, None] + + if labels_scp: + expected_count = len(items) + n = 0 + with open(labels_scp, 'r') as f: + for line in f: + # every line has the following format: + # uttid rxfilename + uttid_rxfilename = line.split() + + assert len(uttid_rxfilename) == 2 + + uttid, rxfilename = uttid_rxfilename + + assert uttid in items + + items[uttid][-1] = rxfilename + + n += 1 + + # every utterance should have a label if + # labels_scp is given + assert n == expected_count + + self.items = list(items.values()) + self.num_items = len(self.items) + self.feats_scp = feats_scp + self.labels_scp = labels_scp + + def __len__(self): + return self.num_items + + def __getitem__(self, i): + ''' + Returns: + a list [key, feat_rxfilename, label_rxfilename] + Note that label_rxfilename may be None. + ''' + return self.items[i] + + def __str__(self): + s = 'feats scp: {}\n'.format(self.feats_scp) + + if self.labels_scp: + s += 'labels scp: {}\n'.format(self.labels_scp) + + s += 'num utterances: {}\n'.format(self.num_items) + + return s + + +class CtcDatasetCollateFunc: + + def __init__(self, model_left_context=0, model_right_context=0): + self.model_left_context = model_left_context + self.model_right_context = model_right_context + + def __call__(self, batch): + ''' + Args: + batch: a list of [uttid, feat_rxfilename, label_rxfilename]. + Note that label_rxfilename may be None. + + Returns: + uttid_list: a list of utterance id + + feat: a 3-D float tensor of shape [batch_size, seq_len, feat_dim] + + feat_len_list: number of frames of each utterance before padding + + label_list: a list of labels of each utterance; It may be None. + + label_len_list: label length of each utterance; It is None if label_list is None. 
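+
+        Note that label_list is the concatenation of the labels of all
+        utterances in the batch (as required by CTC losses); use
+        label_len_list to recover utterance boundaries. For example
+        (hypothetical), labels [1, 2] and [3] for two utterances yield
+        label_list == [1, 2, 3] and label_len_list == [2, 1].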
+ ''' + uttid_list = [] # utterance id of each utterance + feat_len_list = [] # number of frames of each utterance + label_list = [] # label of each utterance + label_len_list = [] # label length of each utterance + + feat_list = [] + + for b in batch: + uttid, feat_rxfilename, label_rxfilename = b + + uttid_list.append(uttid) + + feat = kaldi.read_mat(feat_rxfilename).numpy() + + # use the length before padding + feat_len_list.append(feat.shape[0]) + + feat = _add_model_left_right_context(feat, self.model_left_context, + self.model_right_context) + + feat = torch.from_numpy(feat).float() + + feat_list.append(feat) + + if label_rxfilename: + label = kaldi.read_vec_int(label_rxfilename) + assert 0 not in label + + # we will use frame subsampling factor == 3 + assert len(label) < feat_len_list[-1] / 3 + + label_list.extend(label) + label_len_list.append(len(label)) + + feat = pad_sequence(feat_list, batch_first=True) + + if not label_list: + label_list = None + label_len_list = None + + return uttid_list, feat, feat_len_list, label_list, label_len_list + + +def _test_dataset(): + feats_scp = 'data/train_sp/feats.scp' + labels_scp = 'data/train_sp/labels.scp' + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + print(dataset) + + +def _test_dataloader(): + feats_scp = 'data/test/feats.scp' + labels_scp = 'data/test/labels.scp' + + dataset = CtcDataset(feats_scp=feats_scp, labels_scp=labels_scp) + + dataloader = DataLoader(dataset, + batch_size=2, + num_workers=10, + shuffle=True, + collate_fn=CtcDatasetCollateFunc()) + i = 0 + for batch in dataloader: + uttid_list, feat, feat_len_list, label_list, label_len_list = batch + print(uttid_list, feat.shape, feat_len_list, label_len_list) + i += 1 + if i > 10: + break + + +if __name__ == '__main__': + # _test_dataset() + _test_dataloader() diff --git a/egs/aishell/s10b/ctc/ddp_train.py b/egs/aishell/s10b/ctc/ddp_train.py new file mode 100644 index 00000000000..0047d2fa554 --- /dev/null +++ b/egs/aishell/s10b/ctc/ddp_train.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging +import math +import os +import sys +import warnings + +# disable warnings when loading tensorboard +warnings.simplefilter(action='ignore', category=FutureWarning) + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch.optim as optim +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.utils import clip_grad_value_ +from torch.utils.tensorboard import SummaryWriter + +import kaldi + +from common import load_checkpoint +from common import save_checkpoint +from common import save_training_info +from common import setup_logger +from ctc_loss import CTCLoss +from dataset import get_ctc_dataloader +from model import get_ctc_model +from options import get_args + + +def train_one_epoch(dataloader, model, device, optimizer, loss_func, + current_epoch, tf_writer): + total_loss = 0. + num = 0. + + # TODO(fangjun): remove `num_repeat`. It's used only for testing. 
+ num_repeat = 100 + for kk in range(num_repeat): + for batch_idx, batch in enumerate(dataloader): + unused_uttid_list, feat, feat_len_list, label_list, label_len_list = batch + + feat = feat.to(device) + + activations, feat_len_list = model(feat, feat_len_list) + + # at this point activations is of shape: [batch_size, seq_len, output_dim] + # CTCLoss requires a layout: [seq_len, batch_size, output_dim] + + activations = activations.permute(1, 0, 2) + # now activations is of shape [seq_len, batch_size, output_dim] + + targets = torch.tensor(label_list) + + if not isinstance(feat_len_list, torch.Tensor): + input_lengths = torch.tensor(feat_len_list) + else: + input_lengths = feat_len_list + + target_lengths = torch.tensor(label_len_list) + + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + optimizer.zero_grad() + if math.isnan(loss.item()): + print(loss) + logging.warn('loss is nan for batch {} at epoch {}\n' + 'feat_len_list: {}\n' + 'label_len_list: {}\n'.format( + batch_idx, current_epoch, feat_len_list, + label_len_list)) + import sys + sys.exit(1) + + loss.backward() + + # clip_grad_value_(model.parameters(), 5.0) + + optimizer.step() + + total_loss += loss.item() + num += 1 + if batch_idx % 100 == 0: + logging.info( + 'Device ({}) batch {}/{} ({:.2f}%) ({}/{}), loss {:.5f}, average {:.5f}' + .format(device.index, batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100, kk, + num_repeat, loss.item(), total_loss / num)) + + if tf_writer and batch_idx % 100 == 0: + tf_writer.add_scalar( + 'train/current_batch_average_loss', loss.item(), + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + tf_writer.add_scalar( + 'train/global_average_loss', total_loss / num, + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + return total_loss / num + + +def main(): + args = get_args() + setup_logger('{}/log-train-device-{}'.format(args.dir, args.device_id), + args.log_level) + logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.error('No GPU detected!') + sys.exit(-1) + + dist.init_process_group('nccl', + rank=args.device_id, + world_size=args.world_size) + + kaldi.SelectGpuDevice(device_id=args.device_id) + + device = torch.device('cuda', args.device_id) + + model = get_ctc_model(input_dim=args.input_dim, + output_dim=args.output_dim, + num_layers=args.num_layers, + hidden_dim=args.hidden_dim, + proj_dim=args.proj_dim) + + start_epoch = 0 + num_epochs = args.num_epochs + learning_rate = args.learning_rate + best_loss = None + + if args.checkpoint: + start_epoch, learning_rate, best_loss = load_checkpoint( + args.checkpoint, model) + logging.info( + 'loaded from checkpoint: start epoch {start_epoch}, ' + 'learning rate {learning_rate}, best loss {best_loss}'.format( + start_epoch=start_epoch, + learning_rate=learning_rate, + best_loss=best_loss)) + + model.to(device) + + model = DDP(model, device_ids=[args.device_id]) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + labels_scp=args.labels_scp, + batch_size=args.batch_size, + shuffle=True, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context, + world_size=args.world_size, + local_rank=args.device_id) + + lr = learning_rate + optimizer = optim.Adam(model.parameters(), + lr=lr, + weight_decay=args.l2_regularize) + + if device.index == 0: + tf_writer = 
SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) + else: + tf_writer = None + + model.train() + + loss_func = CTCLoss(use_warp_ctc=False, blank=0, reduction='mean') + + best_epoch = 0 + best_model_path = os.path.join(args.dir, 'best_model.pt') + best_epoch_info_filename = os.path.join(args.dir, 'best-epoch-info') + + dist.barrier() + + try: + for epoch in range(start_epoch, num_epochs): + learning_rate = lr * pow(0.8, epoch) + # learning_rate = lr + + if tf_writer: + tf_writer.add_scalar('learning_rate', learning_rate, epoch) + + for param_group in optimizer.param_groups: + param_group['lr'] = learning_rate + + logging.info('Device ({}) epoch {}, learning rate {}'.format( + device.index, epoch, learning_rate)) + + loss = train_one_epoch(dataloader=dataloader, + model=model, + device=device, + optimizer=optimizer, + loss_func=loss_func, + current_epoch=epoch, + tf_writer=tf_writer) + + # the lower, the better + if best_loss is None or best_loss > loss: + best_loss = loss + best_epoch = epoch + save_checkpoint(filename=best_model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + save_training_info(filename=best_epoch_info_filename, + model_path=best_model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + + # we always save the model for every epoch + model_path = os.path.join(args.dir, 'epoch-{}.pt'.format(epoch)) + save_checkpoint(filename=model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + + epoch_info_filename = os.path.join(args.dir, + 'epoch-{}-info'.format(epoch)) + save_training_info(filename=epoch_info_filename, + model_path=model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + except KeyboardInterrupt: + # save the model when ctrl-c is pressed + model_path = os.path.join(args.dir, + 'epoch-{}-interrupted.pt'.format(epoch)) + # use a very large loss for the interrupted model + loss = 100000000 + save_checkpoint(model_path, + model=model, + epoch=epoch, + learning_rate=learning_rate, + loss=loss, + local_rank=args.device_id) + + epoch_info_filename = os.path.join( + args.dir, 'epoch-{}-interrupted-info'.format(epoch)) + save_training_info(filename=epoch_info_filename, + model_path=model_path, + current_epoch=epoch, + learning_rate=learning_rate, + loss=loss, + best_loss=best_loss, + best_epoch=best_epoch, + local_rank=args.device_id) + + if tf_writer: + tf_writer.close() + logging.warning('Device ({}) Training done!'.format(args.device_id)) + + +if __name__ == '__main__': + torch.manual_seed(20200221) + main() diff --git a/egs/aishell/s10b/ctc/inference.py b/egs/aishell/s10b/ctc/inference.py new file mode 100644 index 00000000000..0eb922a0376 --- /dev/null +++ b/egs/aishell/s10b/ctc/inference.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging +import os +import sys + +import torch +import torch.nn.functional as F + +import kaldi + +from common import load_checkpoint +from common import setup_logger +from dataset import get_ctc_dataloader +from model import get_ctc_model +from options import get_args +from tdnnf_model import get_tdnnf_model + + +def main(): + args = get_args() + + setup_logger('{}/log-inference'.format(args.dir), args.log_level) + 
logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.warning('No GPU detected! Use CPU for inference.') + device = torch.device('cpu') + else: + device = torch.device('cuda', args.device_id) + + model = get_tdnnf_model( + input_dim=args.input_dim, + output_dim=args.output_dim, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) + + load_checkpoint(args.checkpoint, model) + + model.to(device) + model.eval() + + wspecifier = 'ark,scp:{filename}.ark,{filename}.scp'.format( + filename=os.path.join(args.dir, 'nnet_output')) + + writer = kaldi.MatrixWriter(wspecifier) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + batch_size=args.batch_size, + shuffle=False, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context) + + for batch_idx, batch in enumerate(dataloader): + uttid_list, feat, feat_len_list, _, _ = batch + + feat = feat.to(device) + + with torch.no_grad(): + activations, feat_len_list = model(feat, feat_len_list) + + log_probs = F.log_softmax(activations, dim=-1) + + num = len(uttid_list) + for i in range(num): + uttid = uttid_list[i] + feat_len = feat_len_list[i] + value = log_probs[i, :feat_len, :] + + value = value.cpu() + + writer.Write(uttid, value.numpy()) + + if batch_idx % 10 == 0: + logging.info('Processed batch {}/{} ({:.3f}%)'.format( + batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100)) + + writer.Close() + logging.info('pseudo-log-likelihood is saved to {}'.format( + os.path.join(args.dir, 'nnet_output.scp'))) + + +if __name__ == '__main__': + main() diff --git a/egs/aishell/s10b/ctc/model.py b/egs/aishell/s10b/ctc/model.py new file mode 100644 index 00000000000..dbda62f6fca --- /dev/null +++ b/egs/aishell/s10b/ctc/model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence +from torch.nn.utils.rnn import pad_packed_sequence + +from add_deltas_layer import AddDeltasLayer + + +# TODO(fangjun): remove proj_dim since we'll use TDNN-F. 
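+# A typical invocation looks like the following (the dimensions here are
+# illustrative only, not values mandated by this recipe):
+#
+#   model = get_ctc_model(input_dim=40, output_dim=218)
+#   activations, out_len_list = model(feat, feat_len_list)
+#
+# where `feat` has shape [batch_size, seq_len, input_dim] and
+# `activations` has shape [batch_size, subsampled_seq_len, output_dim];
+# no log_softmax is applied to `activations`.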
+def get_ctc_model(input_dim,
+                  output_dim,
+                  num_layers=4,
+                  hidden_dim=512,
+                  proj_dim=256,
+                  frame_subsampling_factor=3):
+    model = CtcModel(input_dim=input_dim,
+                     output_dim=output_dim,
+                     num_layers=num_layers,
+                     hidden_dim=hidden_dim,
+                     proj_dim=proj_dim,
+                     frame_subsampling_factor=frame_subsampling_factor)
+
+    return model
+
+
+class CtcModel(nn.Module):
+
+    def __init__(self, input_dim, output_dim, num_layers, hidden_dim, proj_dim,
+                 frame_subsampling_factor):
+        '''
+        Args:
+            input_dim: input dimension of the network
+
+            output_dim: output dimension of the network
+
+            num_layers: number of LSTM layers of the network
+
+            hidden_dim: the dimension of the hidden state of LSTM layers
+
+            proj_dim: dimension of the affine layer after every LSTM layer
+
+            frame_subsampling_factor: either 1 (no subsampling) or 3
+                                      (keep every third frame)
+        '''
+        super().__init__()
+
+        assert frame_subsampling_factor in [1, 3]
+        self.sf = frame_subsampling_factor
+
+        # batchnorm requires input of shape [N, C, L] == [batch_size, dim, seq_len]
+        self.input_batch_norm = nn.BatchNorm1d(num_features=input_dim * 3,
+                                               affine=False)
+
+        self.lstm = nn.LSTM(input_size=input_dim * 3,
+                            hidden_size=hidden_dim,
+                            num_layers=num_layers,
+                            dropout=0.2,
+                            batch_first=True)
+
+        self.prefinal = nn.Linear(in_features=hidden_dim,
+                                  out_features=output_dim)
+
+        self.add_deltas_layer = AddDeltasLayer()
+
+    def forward(self, feat, feat_len_list):
+        '''
+        Args:
+            feat: a 3-D tensor of shape [batch_size, seq_len, feat_dim]
+            feat_len_list: feat length of each utterance before padding
+
+        Returns:
+            a 3-D tensor of shape [batch_size, seq_len, output_dim]
+            It is the output of `nn.Linear`. That is, **NO** log_softmax
+            is applied to the output.
+        '''
+        x = feat
+
+        # at this point, x is of shape [batch_size, seq_len, feat_dim]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is of shape [batch_size, feat_dim, seq_len] == [N, C, L]
+
+        x = self.add_deltas_layer(x)
+
+        if self.sf == 3:
+            x = x[:, :, ::3]
+            feat_len_list = (torch.tensor(feat_len_list).int() + 2) // 3
+            # feat_len_list is still of type int32
+
+        x = self.input_batch_norm(x)
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is of shape [batch_size, seq_len, feat_dim] == [N, L, C]
+
+        x = pack_padded_sequence(input=x,
+                                 lengths=feat_len_list,
+                                 batch_first=True,
+                                 enforce_sorted=False)
+
+        # TODO(fangjun): save intermediate LSTM state to support streaming inference
+        x, _ = self.lstm(x)
+
+        x, _ = pad_packed_sequence(x, batch_first=True)
+
+        x = self.prefinal(x)
+
+        return x, feat_len_list
+
+
+def _test_ctc_model():
+    input_dim = 5
+    output_dim = 20
+    model = CtcModel(input_dim=input_dim,
+                     output_dim=output_dim,
+                     num_layers=2,
+                     hidden_dim=3,
+                     proj_dim=4,
+                     frame_subsampling_factor=1)
+
+    # AddDeltasLayer trims two frames at each boundary, so feed 4 extra
+    # frames of context and report the real lengths (6 and 8) below.
+    feat1 = torch.randn((6 + 4, input_dim))
+    feat2 = torch.randn((8 + 4, input_dim))
+
+    from torch.nn.utils.rnn import pad_sequence
+    feat = pad_sequence([feat1, feat2], batch_first=True)
+    assert feat.shape == torch.Size([2, 12, input_dim])
+
+    feat_len_list = [6, 8]
+    x, _ = model(feat, feat_len_list)
+
+    assert x.shape == torch.Size([2, 8, output_dim])
+
+
+if __name__ == '__main__':
+    _test_ctc_model()
diff --git a/egs/aishell/s10b/ctc/options.py b/egs/aishell/s10b/ctc/options.py
new file mode 100644
index 00000000000..178f34d45fd
--- /dev/null
+++ b/egs/aishell/s10b/ctc/options.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import argparse
+import os
+
+
+def _str2bool(v):
+    '''
+    This function is modified from
https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + ''' + if isinstance(v, bool): + return v + elif v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def _set_training_args(parser): + parser.add_argument('--train.labels-scp', + dest='labels_scp', + help='filename of labels.scp', + type=str) + + parser.add_argument('--train.num-epochs', + dest='num_epochs', + help='number of epochs to train', + type=int) + + parser.add_argument('--train.lr', + dest='learning_rate', + help='learning rate', + type=float) + + parser.add_argument('--train.l2-regularize', + dest='l2_regularize', + help='l2 regularize', + type=float) + + # TODO(fangjun): add validation feats_scp + + # PyTorch DistributedDataParallel (ddp) parameters + parser.add_argument( + '--train.use-ddp', + dest='use_ddp', + help="true to use PyTorch's built-in DistributedDataParallel trainer", + type=_str2bool) + + # note that we use device id as local rank. + + parser.add_argument('--train.ddp.world-size', + dest='world_size', + help='world size in ddp', + default=1, + type=int) + + +def _check_training_args(args): + assert os.path.isfile(args.labels_scp) + + assert args.num_epochs > 0 + assert args.learning_rate > 0 + assert args.l2_regularize >= 0 + + if args.checkpoint: + assert os.path.exists(args.checkpoint) + + if args.use_ddp: + assert args.world_size >= 1 + + +def _check_inference_args(args): + assert args.checkpoint is not None + assert os.path.isfile(args.checkpoint) + + +def _check_args(args): + if args.is_training: + _check_training_args(args) + else: + _check_inference_args(args) + + assert os.path.isdir(args.dir) + assert os.path.isfile(args.feats_scp) + + assert args.batch_size > 0 + assert args.device_id >= 0 + + assert args.input_dim > 0 + assert args.output_dim > 0 + assert args.model_left_context >= 0 + assert args.model_right_context >= 0 + assert args.hidden_dim > 0 + assert args.bottleneck_dim > 0 + assert args.prefinal_bottleneck_dim > 0 + + assert args.kernel_size_list is not None + assert len(args.kernel_size_list) > 0 + + assert args.subsampling_factor_list is not None + assert len(args.subsampling_factor_list) > 0 + + args.kernel_size_list = [int(k) for k in args.kernel_size_list.split(', ')] + + args.subsampling_factor_list = [ + int(k) for k in args.subsampling_factor_list.split(', ') + ] + + assert len(args.kernel_size_list) == len(args.subsampling_factor_list) + + assert args.log_level in ['debug', 'info', 'warning'] + + +def get_args(): + parser = argparse.ArgumentParser( + description='CTC training in PyTorch with kaldi pybind') + + _set_training_args(parser) + + parser.add_argument('--is-training', + dest='is_training', + help='true for training, false for inference', + required=True, + type=_str2bool) + + parser.add_argument('--dir', + help='dir to save results. 
The user has to ' + 'create it before calling this script.', + required=True, + type=str) + + parser.add_argument('--feats-scp', + dest='feats_scp', + help='filename of feats.scp', + required=True, + type=str) + + parser.add_argument('--device-id', + dest='device_id', + help='GPU device id', + required=True, + type=int) + + parser.add_argument('--batch-size', + dest='batch_size', + help='batch size used in training and inference', + required=True, + type=int) + + parser.add_argument('--input-dim', + dest='input_dim', + help='input dimension of the network', + required=True, + type=int) + + parser.add_argument('--output-dim', + dest='output_dim', + help='output dimension of the network', + required=True, + type=int) + + parser.add_argument('--model-left-context', + dest='model_left_context', + help='model left context', + type=int, + default=0) + + parser.add_argument('--model-right-context', + dest='model_right_context', + help='model right context', + type=int, + default=0) + + parser.add_argument('--hidden-dim', + dest='hidden_dim', + help='nn hidden dimension', + required=True, + type=int) + + parser.add_argument('--bottleneck-dim', + dest='bottleneck_dim', + help='nn bottleneck dimension', + required=True, + type=int) + + parser.add_argument('--prefinal-bottleneck-dim', + dest='prefinal_bottleneck_dim', + help='nn prefinal bottleneck dimension', + required=True, + type=int) + + parser.add_argument('--kernel-size-list', + dest='kernel_size_list', + help='kernel_size_list', + required=True, + type=str) + + parser.add_argument('--subsampling-factor-list', + dest='subsampling_factor_list', + help='subsampling_factor_list', + required=True, + type=str) + + parser.add_argument('--log-level', + dest='log_level', + help='log level. valid values: debug, info, warning', + type=str, + default='info') + + parser.add_argument( + '--checkpoint', + dest='checkpoint', + help='filename of the checkpoint, required for inference', + type=str) + + args = parser.parse_args() + + _check_args(args) + + return args diff --git a/egs/aishell/s10b/ctc/tdnnf_layer.py b/egs/aishell/s10b/ctc/tdnnf_layer.py new file mode 120000 index 00000000000..60cac44c091 --- /dev/null +++ b/egs/aishell/s10b/ctc/tdnnf_layer.py @@ -0,0 +1 @@ +../../s10/chain/tdnnf_layer.py \ No newline at end of file diff --git a/egs/aishell/s10b/ctc/tdnnf_model.py b/egs/aishell/s10b/ctc/tdnnf_model.py new file mode 100644 index 00000000000..788952a4505 --- /dev/null +++ b/egs/aishell/s10b/ctc/tdnnf_model.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang) +# Apache 2.0 + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from add_deltas_layer import AddDeltasLayer +from tdnnf_layer import FactorizedTDNN +from tdnnf_layer import OrthonormalLinear +from tdnnf_layer import PrefinalLayer + + +def get_tdnnf_model(input_dim, output_dim, hidden_dim, bottleneck_dim, + prefinal_bottleneck_dim, kernel_size_list, + subsampling_factor_list): + model = TdnnfModel(input_dim=input_dim, + output_dim=output_dim, + hidden_dim=hidden_dim, + bottleneck_dim=bottleneck_dim, + prefinal_bottleneck_dim=prefinal_bottleneck_dim, + kernel_size_list=kernel_size_list, + subsampling_factor_list=subsampling_factor_list) + return model + + +''' +input dim=43 name=input + +# please note that it is important to have input layer with the name=input +# as the layer immediately preceding the fixed-affine-layer to enable +# the use of short notation for the descriptor 
+fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=exp/chain_cleaned_1c/tdnn1c_sp/configs/lda.mat + +# the first splicing is moved before the lda layer, so no splicing here +relu-batchnorm-dropout-layer name=tdnn1 l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true dim=1024 +tdnnf-layer name=tdnnf2 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf3 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf4 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1 +tdnnf-layer name=tdnnf5 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=0 +tdnnf-layer name=tdnnf6 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf7 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf8 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf9 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf10 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf11 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf12 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +tdnnf-layer name=tdnnf13 l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=3 +linear-component name=prefinal-l dim=256 l2-regularize=0.008 orthonormal-constraint=-1.0 + +prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.008 big-dim=1024 small-dim=256 +output-layer name=output include-log-softmax=false dim=3456 l2-regularize=0.002 + +prefinal-layer name=prefinal-xent input=prefinal-l l2-regularize=0.008 big-dim=1024 small-dim=256 +output-layer name=output-xent dim=3456 learning-rate-factor=5.0 l2-regularize=0.002 +''' + + +# Create a network like the above one +class TdnnfModel(nn.Module): + + def __init__(self, + input_dim, + output_dim, + hidden_dim=1024, + bottleneck_dim=128, + prefinal_bottleneck_dim=256, + kernel_size_list=[3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3], + subsampling_factor_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1]): + super().__init__() + + assert len(kernel_size_list) == len(subsampling_factor_list) + num_layers = len(kernel_size_list) + + # deltas_layer requires [N, C, T] + self.delta_layer = AddDeltasLayer() + + # batch_norm0 requires [N, C, T] + self.batch_norm0 = nn.BatchNorm1d(num_features=input_dim * 3, + affine=False) + + # tdnn1_affine requires [N, T, C] + self.tdnn1_affine = nn.Linear(in_features=input_dim * 3, + out_features=hidden_dim) + + # tdnn1_batchnorm requires [N, C, T] + self.tdnn1_batchnorm = nn.BatchNorm1d(num_features=hidden_dim, + affine=False) + + tdnnfs = [] + for i in range(num_layers): + kernel_size = kernel_size_list[i] + subsampling_factor = subsampling_factor_list[i] + layer = FactorizedTDNN(dim=hidden_dim, + bottleneck_dim=bottleneck_dim, + kernel_size=kernel_size, + subsampling_factor=subsampling_factor) + tdnnfs.append(layer) + + # tdnnfs requires [N, C, T] + self.tdnnfs = 
nn.ModuleList(tdnnfs)
+
+        # prefinal_l affine requires [N, C, T]
+        self.prefinal_l = OrthonormalLinear(
+            dim=hidden_dim,
+            bottleneck_dim=prefinal_bottleneck_dim,
+            kernel_size=1)
+
+        # prefinal_chain requires [N, C, T]
+        self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim,
+                                            small_dim=prefinal_bottleneck_dim)
+
+        # output_affine requires [N, T, C]
+        self.output_affine = nn.Linear(in_features=prefinal_bottleneck_dim,
+                                       out_features=output_dim)
+
+        # prefinal_xent requires [N, C, T]
+        self.prefinal_xent = PrefinalLayer(big_dim=hidden_dim,
+                                           small_dim=prefinal_bottleneck_dim)
+
+        self.output_xent_affine = nn.Linear(in_features=prefinal_bottleneck_dim,
+                                            out_features=output_dim)
+
+    # TODO(fangjun): avoid `permute`.
+    def forward(self, x, feat_len_list):
+        # input x is of shape: [batch_size, seq_len, input_dim] = [N, T, C]
+        assert x.ndim == 3
+
+        # at this point, x is [N, T, C]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, C, T]
+        x = self.delta_layer(x)
+
+        # at this point, x is [N, C, T]
+        x = self.batch_norm0(x)
+
+        # at this point, x is [N, C, T]
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, T, C]
+
+        x = self.tdnn1_affine(x)
+
+        # at this point, x is [N, T, C]
+
+        x = F.relu(x)
+
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, C, T]
+
+        x = self.tdnn1_batchnorm(x)
+
+        # tdnnf requires input of shape [N, C, T]
+        for i in range(len(self.tdnnfs)):
+            x = self.tdnnfs[i](x)
+
+        # at this point, x is [N, C, T]
+
+        x = self.prefinal_l(x)
+
+        # at this point, x is [N, C, T]
+
+        x = self.prefinal_chain(x)
+
+        # at this point, x is [N, C, T]
+        x = x.permute(0, 2, 1)
+
+        # at this point, x is [N, T, C]
+        x = self.output_affine(x)
+
+        # the network subsamples by a factor of 3 in total;
+        # keep the lengths as int32
+        feat_len_list = (torch.tensor(feat_len_list).int() + 2) // 3
+
+        return x, feat_len_list
+
+    def constrain_orthonormal(self):
+        for i in range(len(self.tdnnfs)):
+            self.tdnnfs[i].constrain_orthonormal()
+
+        self.prefinal_l.constrain_orthonormal()
+        self.prefinal_chain.constrain_orthonormal()
+        self.prefinal_xent.constrain_orthonormal()
+
+
+if __name__ == '__main__':
+    input_dim = 40
+    output_dim = 218
+    model = TdnnfModel(input_dim=input_dim, output_dim=output_dim)
+    N = 1
+    T = 150 + 29 + 29
+    C = input_dim
+    x = torch.arange(N * T * C).reshape(N, T, C).float()
+    # 150 real frames plus 29 frames of context on each side
+    nnet_output, out_len_list = model(x, feat_len_list=[150])
+    print(x.shape, nnet_output.shape)
+    model.constrain_orthonormal()
diff --git a/egs/aishell/s10b/ctc/train.py b/egs/aishell/s10b/ctc/train.py
new file mode 100644
index 00000000000..69c99c603ca
--- /dev/null
+++ b/egs/aishell/s10b/ctc/train.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import logging
+import math
+import os
+import sys
+import warnings
+
+# disable warnings when loading tensorboard
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.nn.utils import clip_grad_value_
+from torch.utils.tensorboard import SummaryWriter
+
+import kaldi
+
+from common import load_checkpoint
+from common import save_checkpoint
+from common import save_training_info
+from common import setup_logger
+from ctc_loss import CTCLoss
+from dataset import get_ctc_dataloader
+from options import get_args
+from tdnnf_model import get_tdnnf_model
+
+
+def train_one_epoch(dataloader, model, device, optimizer, loss_func,
+                    current_epoch, tf_writer):
+    total_loss = 0.
+    num = 0.
+ + num_repeat = 1 + for kk in range(num_repeat): + for batch_idx, batch in enumerate(dataloader): + unused_uttid_list, feat, feat_len_list, label_list, label_len_list = batch + + feat = feat.to(device) + + activations, feat_len_list = model(feat, feat_len_list) + + # at this point activations is of shape: [batch_size, seq_len, output_dim] + # CTCLoss requires a layout: [seq_len, batch_size, output_dim] + + activations = activations.permute(1, 0, 2) + # now activations is of shape [seq_len, batch_size, output_dim] + + targets = torch.tensor(label_list) + + if not isinstance(feat_len_list, torch.Tensor): + input_lengths = torch.tensor(feat_len_list) + else: + input_lengths = feat_len_list + + target_lengths = torch.tensor(label_len_list) + + loss = loss_func(activations=activations, + targets=targets, + input_lengths=input_lengths, + target_lengths=target_lengths) + + optimizer.zero_grad() + if math.isnan(loss.item()): + print(loss) + logging.warn('loss is nan for batch {} at epoch {}\n' + 'feat_len_list: {}\n' + 'label_len_list: {}\n'.format( + batch_idx, current_epoch, feat_len_list, + label_len_list)) + import sys + sys.exit(1) + + loss.backward() + + # clip_grad_value_(model.parameters(), 5.0) + + optimizer.step() + + total_loss += loss.item() + num += 1 + + if np.random.choice(4) == 0: + with torch.no_grad(): + model.constrain_orthonormal() + + if batch_idx % 100 == 0: + logging.info( + 'batch {}/{} ({:.2f}%) ({}/{}), loss {:.5f}, average {:.5f}' + .format(batch_idx, len(dataloader), + float(batch_idx) / len(dataloader) * 100, kk, + num_repeat, loss.item(), total_loss / num)) + + if batch_idx % 100 == 0: + tf_writer.add_scalar( + 'train/current_batch_average_loss', loss.item(), + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + tf_writer.add_scalar( + 'train/global_average_loss', total_loss / num, + batch_idx + kk * len(dataloader) + + num_repeat * len(dataloader) * current_epoch) + + return total_loss / num + + +def main(): + args = get_args() + setup_logger('{}/log-train'.format(args.dir), args.log_level) + logging.info(' '.join(sys.argv)) + + if torch.cuda.is_available() == False: + logging.error('No GPU detected!') + sys.exit(-1) + + kaldi.SelectGpuDevice(device_id=args.device_id) + + device = torch.device('cuda', args.device_id) + + model = get_tdnnf_model( + input_dim=args.input_dim, + output_dim=args.output_dim, + hidden_dim=args.hidden_dim, + bottleneck_dim=args.bottleneck_dim, + prefinal_bottleneck_dim=args.prefinal_bottleneck_dim, + kernel_size_list=args.kernel_size_list, + subsampling_factor_list=args.subsampling_factor_list) + + start_epoch = 0 + num_epochs = args.num_epochs + learning_rate = args.learning_rate + best_loss = None + + if args.checkpoint: + start_epoch, learning_rate, best_loss = load_checkpoint( + args.checkpoint, model) + logging.info( + 'loaded from checkpoint: start epoch {start_epoch}, ' + 'learning rate {learning_rate}, best loss {best_loss}'.format( + start_epoch=start_epoch, + learning_rate=learning_rate, + best_loss=best_loss)) + + model.to(device) + + dataloader = get_ctc_dataloader( + feats_scp=args.feats_scp, + labels_scp=args.labels_scp, + batch_size=args.batch_size, + shuffle=True, + num_workers=8, + model_left_context=args.model_left_context, + model_right_context=args.model_right_context) + + lr = learning_rate + optimizer = optim.Adam(model.parameters(), + lr=lr, + weight_decay=args.l2_regularize) + + tf_writer = SummaryWriter(log_dir='{}/tensorboard'.format(args.dir)) + + model.train() + + loss_func = 
CTCLoss(use_warp_ctc=False, blank=0, reduction='mean')
+
+    best_epoch = 0
+    best_model_path = os.path.join(args.dir, 'best_model.pt')
+    best_epoch_info_filename = os.path.join(args.dir, 'best-epoch-info')
+
+    try:
+        for epoch in range(start_epoch, num_epochs):
+            learning_rate = lr * pow(0.8, epoch)
+            # learning_rate = lr
+            tf_writer.add_scalar('learning_rate', learning_rate, epoch)
+
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = learning_rate
+
+            logging.info('epoch {}, learning rate {}'.format(
+                epoch, learning_rate))
+
+            loss = train_one_epoch(dataloader=dataloader,
+                                   model=model,
+                                   device=device,
+                                   optimizer=optimizer,
+                                   loss_func=loss_func,
+                                   current_epoch=epoch,
+                                   tf_writer=tf_writer)
+
+            # the lower the loss, the better
+            if best_loss is None or best_loss > loss:
+                best_loss = loss
+                best_epoch = epoch
+                save_checkpoint(filename=best_model_path,
+                                model=model,
+                                epoch=epoch,
+                                learning_rate=learning_rate,
+                                loss=loss)
+                save_training_info(filename=best_epoch_info_filename,
+                                   model_path=best_model_path,
+                                   current_epoch=epoch,
+                                   learning_rate=learning_rate,
+                                   loss=loss,
+                                   best_loss=best_loss,
+                                   best_epoch=best_epoch)
+
+            # we always save the model for every epoch
+            model_path = os.path.join(args.dir, 'epoch-{}.pt'.format(epoch))
+            save_checkpoint(filename=model_path,
+                            model=model,
+                            epoch=epoch,
+                            learning_rate=learning_rate,
+                            loss=loss)
+
+            epoch_info_filename = os.path.join(args.dir,
+                                               'epoch-{}-info'.format(epoch))
+            save_training_info(filename=epoch_info_filename,
+                               model_path=model_path,
+                               current_epoch=epoch,
+                               learning_rate=learning_rate,
+                               loss=loss,
+                               best_loss=best_loss,
+                               best_epoch=best_epoch)
+    except KeyboardInterrupt:
+        # save the model when ctrl-c is pressed
+        model_path = os.path.join(args.dir,
+                                  'epoch-{}-interrupted.pt'.format(epoch))
+        # use a very large loss for the interrupted model
+        loss = 100000000
+        save_checkpoint(model_path,
+                        model=model,
+                        epoch=epoch,
+                        learning_rate=learning_rate,
+                        loss=loss)
+
+        epoch_info_filename = os.path.join(
+            args.dir, 'epoch-{}-interrupted-info'.format(epoch))
+        save_training_info(filename=epoch_info_filename,
+                           model_path=model_path,
+                           current_epoch=epoch,
+                           learning_rate=learning_rate,
+                           loss=loss,
+                           best_loss=best_loss,
+                           best_epoch=best_epoch)
+
+    tf_writer.close()
+    logging.warning('Training done!')
+
+
+if __name__ == '__main__':
+    np.random.seed(20200221)
+    torch.manual_seed(20200221)
+    main()
diff --git a/egs/aishell/s10b/local/aishell_data_prep.sh b/egs/aishell/s10b/local/aishell_data_prep.sh
new file mode 100755
index 00000000000..4747e4f4d82
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_data_prep.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <audio-path> <text-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires the audio directory and the transcript file to exist"
+  exit 1;
+fi
+
+# find wav audio files for train, dev and test respectively
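+# Each line of wav.flist is an absolute path to one wav file; the layout
+# below is assumed from the greps that follow, e.g. (hypothetical path):
+#   /export/a05/xna/data/data_aishell/wav/train/S0002/BAC009S0002W0122.wav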
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
diff --git a/egs/aishell/s10b/local/aishell_prepare_dict.sh b/egs/aishell/s10b/local/aishell_prepare_dict.sh
new file mode 100755
index 00000000000..c4cabb24de4
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: AISHELL dict preparation succeeded"
+exit 0;
diff --git a/egs/aishell/s10b/local/aishell_train_lms.sh b/egs/aishell/s10b/local/aishell_train_lms.sh
new file mode 100755
index 00000000000..9b6cdad2960
--- /dev/null
+++ b/egs/aishell/s10b/local/aishell_train_lms.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments. It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=`which train_lm.sh`
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of <UNK> as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf(" "); } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# Below are some commands for building a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+                   # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: the perplexity SRILM gives to the Kaldi-LM model is the same as
+# kaldi_lm reports above.
+# The difference in WSJ must have been due to different treatment of <UNK>.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
diff --git a/egs/aishell/s10b/local/convert_text_to_labels.py b/egs/aishell/s10b/local/convert_text_to_labels.py
new file mode 100755
index 00000000000..ed1527e1623
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# This program converts a transcript file `text` to labels
+# used in CTC training.
+#
+# For example, if we have
+#
+# the lexicon file `lexicon.txt`
+#
+# foo f o o
+# bar b a r
+#
+# the phone symbol table `tokens.txt`
+#
+# <eps> 0
+# <blk> 1
+# a 2
+# b 3
+# f 4
+# o 5
+# r 6
+#
+# and the transcript file `text`
+#
+# utt1 foo bar bar
+# utt2 bar
+#
+# Given the above three inputs, this program generates a
+# file `labels.ark` containing
+#
+# utt1 3 4 4 2 1 5 2 1 5
+# utt2 2 1 5
+#
+# where
+# - `3 4 4` is from `(4-1) (5-1) (5-1)`, which is from the indices of `f o o`
+# - `2 1 5` is from `(3-1) (2-1) (6-1)`, which is from the indices of `b a r`
+#
+# Note that 1 is subtracted here since `<eps>` exists only in FSTs
+# and the neural network considers index `0` as `<blk>`. Therefore, the
+# integer value of every symbol is shifted downwards by 1.
+
+import argparse
+import os
+
+import kaldi
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='''
+Convert transcript to labels.
+
+It takes the following inputs:
+
+- lexicon.txt, the lexicon file
+- tokens.txt, the phone symbol table
+- dir, a directory containing the transcript file `text`
+
+It generates `labels.scp` and `labels.ark` in the provided `dir`.
+
+Usage:
+    python3 ./local/convert_text_to_labels.py \
+        --lexicon-filename data/lang/lexicon.txt \
+        --tokens-filename data/lang/tokens.txt \
+        --dir data/train
+
+    It will generate data/train/labels.scp and data/train/labels.ark.
+    ''')
+
+    parser.add_argument('--lexicon-filename',
+                        dest='lexicon_filename',
+                        type=str,
+                        help='filename for lexicon.txt')
+
+    parser.add_argument('--tokens-filename',
+                        dest='tokens_filename',
+                        type=str,
+                        help='filename for the phone symbol table tokens.txt')
+
+    parser.add_argument('--dir',
+                        type=str,
+                        help='''the dir containing the transcript text;
+                        it will contain the generated labels.scp and labels.ark''')
+
+    args = parser.parse_args()
+
+    assert os.path.isfile(args.lexicon_filename)
+    assert os.path.isfile(args.tokens_filename)
+    assert os.path.isfile(os.path.join(args.dir, 'text'))
+
+    return args
+
+
+def read_lexicon(filename):
+    '''Read lexicon.txt and save it into a Python dict.
+
+    Args:
+        filename: filename of lexicon.txt.
+
+            Every line in lexicon.txt has the following format:
+
+                word phone1 phone2 phone3 ... phoneN
+
+            That is, fields are separated by spaces. The first
+            field is the word and the remaining fields are the
+            phones indicating the pronunciation of the word.
+
+    Returns:
+        a dict whose keys are words and values are phones.
+    '''
+    lexicon = dict()
+
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line contains:
+            #   word phone1 phone2 phone3 ... phoneN
+            word_phones = line.split()
+
+            # It should have at least two fields:
+            # the first one is the word and
+            # the second one is the pronunciation
+            assert len(word_phones) >= 2
+
+            word = word_phones[0]
+            phones = word_phones[1:]
+
+            if word not in lexicon:
+                # if there are multiple pronunciations for a word,
+                # we choose only the first one and drop other alternatives
+                lexicon[word] = phones
+
+    return lexicon
+
+
+def read_tokens(filename):
+    '''Read phone symbol table tokens.txt and save it into a Python dict.
+
+    Note that we remove the symbol `<eps>` and shift every symbol index
+    downwards by 1.
+
+    Args:
+        filename: filename of the phone symbol table tokens.txt.
+
+            Two integer values have specific meanings in the symbol
+            table. The first one is 0, which is reserved for `<eps>`.
+            And the second one is 1, which is reserved for the
+            blank symbol `<blk>`.
+            Other integer values do NOT have specific meanings.
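+
+            For example, with the `tokens.txt` shown at the top of
+            this file, the returned dict would be (note the removed
+            `<eps>` and the shift by 1):
+
+                {'<blk>': 0, 'a': 1, 'b': 2, 'f': 3, 'o': 4, 'r': 5}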
+
+    Returns:
+        a dict whose keys are phones and values are phone indices
+    '''
+    tokens = dict()
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line has the format: phone index
+            phone_index = line.split()
+
+            # it should have two fields:
+            # the first field is the phone
+            # and the second field is its index
+            assert len(phone_index) == 2
+
+            phone = phone_index[0]
+            index = int(phone_index[1])
+
+            if phone == '<eps>':
+                # <eps> appears only in the FSTs.
+                continue
+
+            # decreased by one since we removed <eps> above
+            # and every symbol index is shifted downwards by 1
+            index -= 1
+
+            assert phone not in tokens
+
+            tokens[phone] = index
+
+    assert '<blk>' in tokens
+
+    # WARNING(fangjun): we assume that the blank symbol has index 0
+    # in the neural network output.
+    # Do NOT confuse it with `<eps>` in the fst.
+    assert tokens['<blk>'] == 0
+
+    return tokens
+
+
+def read_text(filename):
+    '''Read transcript file `text` and save it into a Python dict.
+
+    Args:
+        filename: filename of the transcript file `text`.
+
+    Returns:
+        a dict whose keys are utterance IDs and values are texts
+    '''
+    transcript = dict()
+
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            # line has the format: uttid word1 word2 word3 ... wordN
+            uttid_text = line.split()
+
+            # it should have at least 2 fields:
+            # the first field is the utterance id;
+            # the remaining fields are the words of the utterance
+            assert len(uttid_text) >= 2
+
+            uttid = uttid_text[0]
+            text = uttid_text[1:]
+
+            assert uttid not in transcript
+            transcript[uttid] = text
+
+    return transcript
+
+
+def phones_to_indices(phone_list, tokens):
+    '''Convert a list of phones to a list of indices via a phone symbol table.
+
+    Args:
+        phone_list: a list of phones
+        tokens: a dict representing a phone symbol table.
+
+    Returns:
+        Return a list of indices corresponding to the given phones
+    '''
+    index_list = []
+
+    for phone in phone_list:
+        assert phone in tokens
+
+        index = tokens[phone]
+        index_list.append(index)
+
+    return index_list
+
+
+def main():
+    args = get_args()
+
+    lexicon = read_lexicon(args.lexicon_filename)
+
+    tokens = read_tokens(args.tokens_filename)
+
+    transcript = read_text(os.path.join(args.dir, 'text'))
+
+    transcript_labels = dict()
+
+    for uttid, text in transcript.items():
+        labels = []
+        for word in text:
+            # TODO(fangjun): add support for OOV.
+            phones = lexicon[word]
+
+            indices = phones_to_indices(phones, tokens)
+
+            labels.extend(indices)
+
+        assert uttid not in transcript_labels
+
+        transcript_labels[uttid] = labels
+
+    wspecifier = 'ark,scp:{dir}/labels.ark,{dir}/labels.scp'.format(
+        dir=args.dir)
+
+    writer = kaldi.IntVectorWriter(wspecifier)
+
+    for uttid, labels in transcript_labels.items():
+        writer.Write(uttid, labels)
+
+    writer.Close()
+
+    print('Generated label file {}/labels.scp successfully'.format(args.dir))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/local/convert_text_to_labels.sh b/egs/aishell/s10b/local/convert_text_to_labels.sh
new file mode 100755
index 00000000000..ba1cd823116
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+if [ $# != 2 ]; then
+  echo "usage: $0 <dir> <lang>"
+  exit 1
+fi
+
+dir=$1
+lang=$2
+
+[[ ! -f $dir/text ]] && echo "file $dir/text does not exist!" && exit 1
+
+for f in lexicon.txt tokens.txt; do
+  if [[ ! -f $lang/$f ]]; then
+    echo "file $lang/$f does not exist!"
+    exit 1
+  fi
+done
+
+python3 ./local/convert_text_to_labels.py \
+  --lexicon-filename $lang/lexicon.txt \
+  --tokens-filename $lang/tokens.txt \
+  --dir $dir
diff --git a/egs/aishell/s10b/local/convert_text_to_labels_test.py b/egs/aishell/s10b/local/convert_text_to_labels_test.py
new file mode 100644
index 00000000000..2fd86a91c7e
--- /dev/null
+++ b/egs/aishell/s10b/local/convert_text_to_labels_test.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import kaldi
+
+
+def generate_test_lexicon(d):
+    s = 'foo f o o\n'
+    s += 'bar b a r\n'
+
+    filename = os.path.join(d, 'lexicon.txt')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+def generate_test_tokens(d):
+    s = '''<eps> 0
+<blk> 1
+a 2
+b 3
+f 4
+o 5
+r 6
+'''
+    filename = os.path.join(d, 'tokens.txt')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+def generate_test_text(d):
+    s = 'utt1 foo bar bar\n'
+    s += 'utt2 bar\n'
+
+    filename = os.path.join(d, 'text')
+    with open(filename, 'w') as f:
+        f.write(s)
+
+
+class ConvertTextToLabelsTest(unittest.TestCase):
+
+    def test(self):
+        d = tempfile.mkdtemp()
+
+        generate_test_lexicon(d)
+        generate_test_tokens(d)
+        generate_test_text(d)
+
+        cmd = '''
+        python3 ./local/convert_text_to_labels.py \
+            --lexicon-filename {lexicon} \
+            --tokens-filename {tokens} \
+            --dir {dir}
+        '''.format(lexicon=os.path.join(d, 'lexicon.txt'),
+                   tokens=os.path.join(d, 'tokens.txt'),
+                   dir=d)
+
+        os.system(cmd)
+
+        rspecifier = 'scp:{}/labels.scp'.format(d)
+
+        reader = kaldi.SequentialIntVectorReader(rspecifier)
+
+        expected_labels = dict()
+        expected_labels['utt1'] = [3, 4, 4, 2, 1, 5, 2, 1, 5]
+        expected_labels['utt2'] = [2, 1, 5]
+
+        for key, value in reader:
+            self.assertTrue(key in expected_labels)
+            self.assertEqual(value, expected_labels[key])
+
+        reader.Close()
+
+        shutil.rmtree(d)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/egs/aishell/s10b/local/download_and_untar.sh b/egs/aishell/s10b/local/download_and_untar.sh
new file mode 100755
index 00000000000..58a278241d7
--- /dev/null
+++ b/egs/aishell/s10b/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+#           2017 Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+  exit 1;
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
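+# (presumably 15582913665 bytes is data_aishell.tgz and 1246920 bytes is
+# resource_aishell.tgz; the check below only tests whether an already
+# downloaded archive matches one of these sizes)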
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  cd $data
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/aishell/s10b/local/generate_tlg.sh b/egs/aishell/s10b/local/generate_tlg.sh
new file mode 100755
index 00000000000..ba3ee4c4046
--- /dev/null
+++ b/egs/aishell/s10b/local/generate_tlg.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# References:
+# - https://github.com/srvk/eesen/blob/master/asr_egs/wsj/utils/ctc_compile_dict_token.sh
+#
+# - EESEN: End-to-End Speech Recognition using Deep RNN Models and
+#   WFST-based Decoding (https://arxiv.org/pdf/1507.08240.pdf)
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+if [ $# != 3 ]; then
+  echo "usage: $0 <dict> <lm> <dir>"
+  exit 1
+fi
+
+. ./cmd.sh
+. ./path.sh
+
+dict=$1
+lm=$2
+dir=$3
+
+[ ! -f $dict ] && echo "$dict does not exist!" && exit 1
+[ ! -f $lm ] && echo "$lm does not exist!" && exit 1
+
+mkdir -p $dir
+
+cp $dict $dir/lexicon.txt
+
+cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' | sort | uniq > $dir/phones.list
+
+perl -ape 's/(\S+\s+)(.+)/${1}1.0 $2/;' < $dir/lexicon.txt > $dir/lexiconp.txt || exit 1
+
+ndisambig=$(utils/add_lex_disambig.pl $dir/lexiconp.txt $dir/lexiconp_disambig.txt)
+ndisambig=$[$ndisambig+1]
+
+for ((i=0; i<=$ndisambig; i++)); do
+  echo '#'$i
+done > $dir/disambig.list
+
+(
+  echo '<eps>'
+  echo '<blk>'
+) | cat - $dir/phones.list $dir/disambig.list | awk '{print $1, NR-1}' > $dir/tokens.txt
+
+if [[ ! -f $dir/T.fst ]]; then
+  local/token_to_fst.py --tokens-txt-filename $dir/tokens.txt |
+    fstcompile \
+      --isymbols=$dir/tokens.txt \
+      --osymbols=$dir/tokens.txt \
+      --keep_isymbols=false \
+      --keep_osymbols=false |
+    fstarcsort --sort_type=olabel > $dir/T.fst || exit 1
+fi
+
+cat $dir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
+  BEGIN {
+    print "<eps> 0";
+  }
+  {
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR + 1);
+  }' > $dir/words.txt || exit 1
+
+
+token_disambig_symbol=$(grep \#0 $dir/tokens.txt | awk '{print $2}')
+word_disambig_symbol=$(grep \#0 $dir/words.txt | awk '{print $2}')
+
+silprob=0
+silphone="sil"
+
+if [[ ! -f $dir/L.fst ]]; then
+  utils/make_lexicon_fst.pl \
+    --pron-probs $dir/lexiconp_disambig.txt $silprob $silphone '#'$ndisambig |
+    fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
+      --keep_isymbols=false --keep_osymbols=false |
+    fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" |
+    fstarcsort --sort_type=olabel > $dir/L.fst || exit 1
+fi
+
+
+if [[ ! -f $dir/G.fst ]]; then
+  gunzip -c $lm |
+    grep -v '<s> <s>' |
+    grep -v '</s> <s>' |
+    grep -v '</s> </s>' |
+    arpa2fst - |
+    fstprint |
+    utils/eps2disambig.pl |
+    utils/s2eps.pl |
+    fstcompile \
+      --isymbols=$dir/words.txt \
+      --osymbols=$dir/words.txt \
+      --keep_isymbols=false \
+      --keep_osymbols=false |
+    fstrmepsilon |
+    fstarcsort --sort_type=ilabel > $dir/G.fst
+fi
+
+set +e
+fstisstochastic $dir/G.fst
+set -e
+
+# The output is like:
+#   9.14233e-05 -0.259833
+# we do expect the first of these 2 numbers to be close to zero (the second is
+# nonzero because the backoff weights make the states sum to >1).
+
+if true; then
+  # Everything in this "if" statement is only for diagnostics.
+  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
+  # this might cause determinization failure of CLG.
+  # #0 is treated as an empty word.
+  mkdir -p $dir/tmpdir.g
+  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
+       END{print "0 0 #0 #0"; print "0";}' \
+    < "$dir/lexicon.txt" > $dir/tmpdir.g/select_empty.fst.txt
+
+  fstcompile --isymbols=$dir/words.txt --osymbols=$dir/words.txt \
+    $dir/tmpdir.g/select_empty.fst.txt \
+    | fstarcsort --sort_type=olabel \
+    | fstcompose - $dir/G.fst > $dir/tmpdir.g/empty_words.fst
+
+  fstinfo $dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \
+    && echo "Language model has cycles with empty words" && exit 1
+
+  rm -r $dir/tmpdir.g
+fi
+
+fsttablecompose $dir/L.fst $dir/G.fst |
+  fstdeterminizestar --use-log=true |
+  fstminimizeencoded |
+  fstarcsort --sort_type=ilabel > $dir/LG.fst || exit 1
+
+set +e
+fstisstochastic $dir/LG.fst
+set -e
+
+fsttablecompose $dir/T.fst $dir/LG.fst > $dir/TLG.fst || exit 1
+
+fstconvert --fst_type=const $dir/TLG.fst $dir/const_TLG.fst
+mv $dir/const_TLG.fst $dir/TLG.fst
+
+set +e
+fstisstochastic $dir/TLG.fst
+set -e
+
+# remove files not needed any more
+for f in G.fst L.fst T.fst LG.fst disambig.list \
+    lexiconp.txt lexiconp_disambig.txt; do
+  rm $dir/$f
+done
diff --git a/egs/aishell/s10b/local/latgen-faster.py b/egs/aishell/s10b/local/latgen-faster.py
new file mode 100755
index 00000000000..4dca8720177
--- /dev/null
+++ b/egs/aishell/s10b/local/latgen-faster.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+#
+#
+'''
+This file is adapted from src/bin/latgen-faster-mapped.cc.
+
+Note that there is no **mapped** in the filename since we
+do not use a transition model for mapping pdf ids to transition ids.
+
+Since this Python script is just a thin wrapper around the C++ code,
+there should not be any performance problem.
+
+You can write another `src/bin/latgen-faster.cc` if you are
+still worried about performance.
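+
+An illustrative invocation (the paths here are hypothetical; the actual
+options and specifiers used in this recipe come from local/run_ctc.sh):
+
+    ./local/latgen-faster.py \
+        --acoustic-scale=1.0 \
+        --allow-partial=true \
+        --word-symbol-table=data/lang/words.txt \
+        data/lang/TLG.fst \
+        scp:exp/ctc/inference/test/nnet_output.scp \
+        "ark:|gzip -c > exp/ctc/decode/test/lat.1.gz"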
+'''
+#
+#
+
+# TODO(fangjun): refer to src/bin/latgen-faster-mapped-parallel.cc to
+# implement latgen-faster-parallel.py
+
+import sys
+
+import kaldi
+from kaldi import fst
+
+
+def main():
+    usage = kaldi.StringArg('''\
+Generate lattices, reading log-likelihoods as matrices
+
+Usage: latgen-faster [options] fst-rxfilename loglikes-rspecifier \
+lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]
+''')
+
+    allow_partial = kaldi.BoolArg(False)
+    acoustic_scale = kaldi.FloatArg(0.1)
+    word_syms_filename = kaldi.StringArg()
+
+    config = kaldi.LatticeFasterDecoderConfig()
+
+    po = kaldi.ParseOptions(usage)
+
+    config.Register(po)
+
+    po.Register('acoustic-scale', acoustic_scale,
+                'Scaling factor for acoustic likelihoods')
+
+    po.Register('word-symbol-table', word_syms_filename,
+                'Symbol table for words [for debug output]')
+
+    po.Register('allow-partial', allow_partial,
+                'If true, produce output even if end state was not reached.')
+
+    po.Read(sys.argv)
+
+    if po.NumArgs() < 3 or po.NumArgs() > 5:
+        po.PrintUsage()
+        sys.exit(1)
+
+    fst_in_str = po.GetArg(1)
+    log_likes_rspecifier = po.GetArg(2)
+    lattice_wspecifier = po.GetArg(3)
+    words_wspecifier = po.GetOptArg(4)
+    alignment_wspecifier = po.GetOptArg(5)
+
+    determinize = config.determinize_lattice
+    compact_lattice_writer = kaldi.CompactLatticeWriter()
+    lattice_writer = kaldi.LatticeWriter()
+
+    if determinize:
+        assert compact_lattice_writer.Open(lattice_wspecifier)
+    else:
+        assert lattice_writer.Open(lattice_wspecifier)
+
+    words_writer = kaldi.IntVectorWriter(words_wspecifier)
+    alignments_writer = kaldi.IntVectorWriter(alignment_wspecifier)
+
+    word_syms = fst.SymbolTable()
+
+    if word_syms_filename:
+        word_syms = fst.SymbolTable.ReadText(word_syms_filename.value)
+
+    # TODO(fangjun): support a table of FSTs
+
+    tot_like = 0.0
+    frame_count = 0
+    num_success = 0
+    num_fail = 0
+
+    loglike_reader = kaldi.SequentialMatrixReader(log_likes_rspecifier)
+
+    # WARNING(fangjun): fst_in_str has to be a **const** fst.
+    # If it is a vector fst, you will get an error
+    # while creating the subsequent LatticeFasterDecoder.
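+    # (generate_tlg.sh in this recipe already runs
+    # `fstconvert --fst_type=const` on TLG.fst, so the graph read here
+    # should be a const fst; if you bring your own graph, convert it the
+    # same way first.)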
+    tlg_fst = fst.ReadFstKaldiGeneric(fst_in_str)
+
+    decoder = kaldi.LatticeFasterDecoder(tlg_fst, config)
+
+    trans_model = kaldi.TransitionModel()  # a dummy transition model
+
+    for key, value in loglike_reader:
+        if value.NumRows() == 0:
+            print('zero-length utterance: {}'.format(key))
+            num_fail += 1
+            continue
+
+        decodable = kaldi.DecodableMatrixScaled(likes=value,
+                                                scale=acoustic_scale.value)
+
+        is_succeeded, likelihood = kaldi.DecodeUtteranceLatticeFaster(
+            decoder=decoder,
+            decodable=decodable,
+            trans_model=trans_model,
+            word_syms=word_syms,
+            utt=key,
+            acoustic_scale=acoustic_scale.value,
+            determinize=determinize,
+            allow_partial=allow_partial.value,
+            alignments_writer=alignments_writer,
+            words_writer=words_writer,
+            compact_lattice_writer=compact_lattice_writer,
+            lattice_writer=lattice_writer)
+
+        if is_succeeded:
+            tot_like += likelihood
+            frame_count += value.NumRows()
+            num_success += 1
+        else:
+            num_fail += 1
+
+    print('Done {num_success} utterances, failed for {num_fail}'.format(
+        num_success=num_success, num_fail=num_fail))
+
+    print('Overall log-likelihood per frame is {} over {} frames'.format(
+        tot_like / frame_count, frame_count))
+
+    if num_success != 0:
+        sys.exit(0)
+    else:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/local/run_ctc.sh b/egs/aishell/s10b/local/run_ctc.sh
new file mode 100755
index 00000000000..d0f0c6fb029
--- /dev/null
+++ b/egs/aishell/s10b/local/run_ctc.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+echo "$0 $@" # Print the command line for logging
+
+stage=0
+nj=30
+
+export CUDA_VISIBLE_DEVICES="0"
+device_id=0
+
+train_data_dir=data/train_sp
+dev_data_dir=data/dev_sp
+test_data_dir=data/test
+lang_dir=data/lang
+
+lr=1e-3
+num_epochs=6
+l2_regularize=1e-5
+batch_size=64
+
+# WARNING(fangjun): You should know how to calculate your
+# model's left/right context **manually**
+model_left_context=29
+model_right_context=29
+
+hidden_dim=1024
+bottleneck_dim=128
+prefinal_bottleneck_dim=256
+kernel_size_list="3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3" # comma separated list
+subsampling_factor_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list
+
+log_level=info # valid values: debug, info, warning
+
+post_decode_acwt=1
+
+dir=exp/ctc
+
+. ./path.sh
+. ./cmd.sh
+
+. parse_options.sh
+
+feat_dim=$(feat-to-dim --print-args=false scp:$train_data_dir/feats.scp -)
+output_dim=$(cat $lang_dir/phones.list | wc -l)
+# add one since we have an extra blank symbol
+output_dim=$[$output_dim+1]
+
+pids=()
+function kill_trainer() { echo "kill training processes" && kill "${pids[@]}"; }
+
+if [[ $stage -le 0 ]]; then
+  mkdir -p $dir/train/tensorboard
+  train_checkpoint=
+  if [[ -f $dir/train/best_model.pt ]]; then
+    train_checkpoint=$dir/train/best_model.pt
+  fi
+
+  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+  if [[ $num_gpus -gt 1 ]]; then
+    echo "$0: training with ddp..."
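+    # (MASTER_ADDR and MASTER_PORT below are the rendezvous settings that
+    # torch.distributed reads for its default env:// initialization;
+    # presumably ddp_train.py relies on them when --train.use-ddp is true.)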
+    echo "$0: number of gpus: $num_gpus"
+
+    export MASTER_ADDR=localhost
+    export MASTER_PORT=6666
+
+    for ((i = 0; i < $num_gpus; ++i)); do
+      # sort options alphabetically
+      python3 ./ctc/ddp_train.py \
+        --batch-size $batch_size \
+        --checkpoint=${train_checkpoint:-} \
+        --device-id $i \
+        --dir $dir/train \
+        --feats-scp $train_data_dir/feats.scp \
+        --hidden-dim $hidden_dim \
+        --input-dim $feat_dim \
+        --is-training true \
+        --model-left-context $model_left_context \
+        --model-right-context $model_right_context \
+        --num-layers $num_layers \
+        --output-dim $output_dim \
+        --proj-dim $proj_dim \
+        --train.ddp.world-size $num_gpus \
+        --train.l2-regularize $l2_regularize \
+        --train.labels-scp $train_data_dir/labels.scp \
+        --train.lr $lr \
+        --train.num-epochs $num_epochs \
+        --train.use-ddp true &
+      pids+=("$!")
+    done
+    trap kill_trainer SIGINT SIGTERM
+    wait
+  else
+    echo "$0: training with single gpu..."
+    # sort options alphabetically
+    python3 ./ctc/train.py \
+      --batch-size $batch_size \
+      --bottleneck-dim $bottleneck_dim \
+      --checkpoint=${train_checkpoint:-} \
+      --device-id $device_id \
+      --dir $dir/train \
+      --feats-scp $train_data_dir/feats.scp \
+      --hidden-dim $hidden_dim \
+      --input-dim $feat_dim \
+      --is-training true \
+      --kernel-size-list "$kernel_size_list" \
+      --log-level $log_level \
+      --model-left-context $model_left_context \
+      --model-right-context $model_right_context \
+      --output-dim $output_dim \
+      --prefinal-bottleneck-dim $prefinal_bottleneck_dim \
+      --subsampling-factor-list "$subsampling_factor_list" \
+      --train.l2-regularize $l2_regularize \
+      --train.labels-scp $train_data_dir/labels.scp \
+      --train.lr $lr \
+      --train.num-epochs $num_epochs \
+      --train.use-ddp false
+  fi
+fi
+
+if [[ $stage -le 1 ]]; then
+  echo "$0: inference: computing likelihood"
+  mkdir -p $dir/inference
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    mkdir -p $dir/inference/$basename
+    if [[ -f $dir/inference/$basename/nnet_output.scp ]]; then
+      echo "$0: $dir/inference/$basename/nnet_output.scp already exists! Skip"
+    else
+      best_epoch=$(cat $dir/train/best-epoch-info | grep 'best epoch' | awk '{print $NF}')
+      [[ -z $best_epoch ]] && echo "$dir/train/best-epoch-info is not available!" && exit 1
+      inference_checkpoint=$dir/train/epoch-${best_epoch}.pt
+      echo "$0: using inference checkpoint: $inference_checkpoint"
+      # sort options alphabetically
+      python3 ./ctc/inference.py \
+        --batch-size $batch_size \
+        --bottleneck-dim $bottleneck_dim \
+        --checkpoint ${inference_checkpoint:-} \
+        --device-id $device_id \
+        --dir $dir/inference/$basename \
+        --feats-scp $x/feats.scp \
+        --hidden-dim $hidden_dim \
+        --input-dim $feat_dim \
+        --is-training false \
+        --kernel-size-list "$kernel_size_list" \
+        --log-level $log_level \
+        --model-left-context $model_left_context \
+        --model-right-context $model_right_context \
+        --output-dim $output_dim \
+        --prefinal-bottleneck-dim $prefinal_bottleneck_dim \
+        --subsampling-factor-list "$subsampling_factor_list"
+    fi
+  done
+fi
+
+if [[ $stage -le 2 ]]; then
+  echo "$0: decoding"
+  mkdir -p $dir/decode
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    mkdir -p $dir/decode/$basename
+
+    if [[ ! -f $dir/inference/$basename/nnet_output.scp ]]; then
+      echo "$0: $dir/inference/$basename/nnet_output.scp does not exist!"
+      echo "$0: Please run inference.py first"
+      exit 1
+    fi
+
+    echo "$0: decoding $x"
+
+    for i in $(seq $nj); do
+      utils/split_scp.pl -j $nj $[$i - 1] $dir/inference/$basename/nnet_output.scp $dir/decode/$basename/nnet_output.$i.scp
+    done
+
+    lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/decode/$basename/lat.JOB.gz"
+
+    # sort options alphabetically
+    $decode_cmd JOB=1:$nj $dir/decode/$basename/log/decode.JOB.log \
+      ./local/latgen-faster.py \
+        --acoustic-scale=1.0 \
+        --allow-partial=true \
+        --beam=17.0 \
+        --determinize-lattice=false \
+        --lattice-beam=8.0 \
+        --max-active=7000 \
+        --max-mem=200000000 \
+        --min-active=200 \
+        --minimize=false \
+        --word-symbol-table=$lang_dir/words.txt \
+        $lang_dir/TLG.fst \
+        scp:$dir/decode/$basename/nnet_output.JOB.scp \
+        "$lat_wspecifier"
+  done
+fi
+
+if [[ $stage -le 3 ]]; then
+  echo "$0: scoring"
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+
+    ./local/score.sh --cmd "$decode_cmd" \
+      $x \
+      $lang_dir \
+      $dir/decode/$basename || exit 1
+  done
+
+  for x in $test_data_dir; do
+    basename=$(basename $x)
+    head $dir/decode/$basename/scoring_kaldi/best_*
+  done
+fi
diff --git a/egs/aishell/s10b/local/score.sh b/egs/aishell/s10b/local/score.sh
new file mode 100755
index 00000000000..a9786169973
--- /dev/null
+++ b/egs/aishell/s10b/local/score.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/aishell/s10b/local/token_to_fst.py b/egs/aishell/s10b/local/token_to_fst.py
new file mode 100755
index 00000000000..66660f5d886
--- /dev/null
+++ b/egs/aishell/s10b/local/token_to_fst.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+# This program takes as input a phone symbol table
+# `tokens.txt` and prints a text fst to the console.
+#
+# You can use `fstcompile` to convert the printed text fst
+# to a binary fst.
+#
+# Two integer values in the symbol table have particular meaning:
+# - 0 for `<eps>`
+# - 1 for the blank symbol `<blk>`
+
+import argparse
+import os
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='''
+Convert tokens.txt to tokens.fst.
+
+Usage:
+    python3 ./local/token_to_fst.py \
+        --tokens-txt-filename data/lang/tokens.txt |
+    fstcompile \
+        --isymbols=data/lang/tokens.txt \
+        --osymbols=data/lang/tokens.txt \
+        --keep_isymbols=false \
+        --keep_osymbols=false |
+    fstarcsort --sort_type=olabel > data/lang/T.fst || exit 1
+''')
+
+    parser.add_argument('--tokens-txt-filename',
+                        dest='tokens_txt_filename',
+                        help="a phone symbol table",
+                        type=str)
+
+    args = parser.parse_args()
+    assert os.path.isfile(args.tokens_txt_filename)
+
+    return args
+
+
+def main():
+    args = get_args()
+
+    s = '0 1 <eps> <eps>\n'
+    s += '1 1 <blk> <eps>\n'
+    s += '2 2 <blk> <eps>\n'
+    s += '2 0 <eps> <eps>\n'
+
+    next_state = 3
+    with open(args.tokens_txt_filename, 'r') as f:
+        for line in f:
+            phone_index = line.split()
+            assert len(phone_index) == 2
+            phone, _ = phone_index
+
+            if phone in ['<eps>', '<blk>']:
+                continue
+
+            if '#' in phone:
+                # disambiguation symbols become self-loops at the start state
+                s += '0 0 <eps> {}\n'.format(phone)
+                continue
+
+            s += '1 {next_state} {phone} {phone}\n'.format(
+                next_state=next_state, phone=phone)
+
+            s += '{next_state} {next_state} {phone} <eps>\n'.format(
+                next_state=next_state, phone=phone)
+
+            s += '{next_state} 2 <eps> <eps>\n'.format(next_state=next_state)
+
+            next_state += 1
+
+    s += '0'
+    print(s)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/aishell/s10b/path.sh b/egs/aishell/s10b/path.sh
new file mode 100755
index 00000000000..d3525eedd82
--- /dev/null
+++ b/egs/aishell/s10b/path.sh
@@ -0,0 +1,8 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
+export PYTHONPATH=$KALDI_ROOT/src/pybind:$PYTHONPATH
diff --git a/egs/aishell/s10b/run.sh b/egs/aishell/s10b/run.sh
new file mode 100755
index 00000000000..6ffcc2e4a5b
--- /dev/null
+++ b/egs/aishell/s10b/run.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Copyright 2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
+# Apache 2.0
+
+set -e
+
+. ./cmd.sh
+. ./path.sh
+
+data=/home/fangjun/data/aishell
+data_url=www.openslr.org/resources/33
+
+nj=30
+
+stage=8
+
+if [[ $stage -le 0 ]]; then
+  local/download_and_untar.sh $data $data_url data_aishell || exit 1
+  local/download_and_untar.sh $data $data_url resource_aishell || exit 1
+fi
+
+if [[ $stage -le 1 ]]; then
+  local/aishell_prepare_dict.sh $data/resource_aishell || exit 1
+  # generated in data/local/dict
+fi
+
+if [[ $stage -le 2 ]]; then
+  local/aishell_data_prep.sh $data/data_aishell/wav \
+    $data/data_aishell/transcript || exit 1
+  # generated in data/{train,test,dev}/{spk2utt text utt2spk wav.scp}
+fi
+
+if [[ $stage -le 3 ]]; then
+  local/aishell_train_lms.sh || exit 1
+fi
+
+if [[ $stage -le 4 ]]; then
+  echo "$0: generating TLG.fst"
+  ./local/generate_tlg.sh \
+    data/local/dict/lexicon.txt \
+    data/local/lm/3gram-mincount/lm_unpruned.gz \
+    data/lang
+fi
+
+if [[ $stage -le 5 ]]; then
+  echo "$0: generating fbank features (40-dim)"
+
+  for x in train dev; do
+    utils/data/perturb_data_dir_speed_3way.sh data/$x data/${x}_sp
+  done
+
+  for x in train_sp dev_sp test; do
+    steps/make_fbank.sh --cmd "$train_cmd" --nj $nj data/$x || exit 1
+    steps/compute_cmvn_stats.sh data/$x || exit 1
+    utils/fix_data_dir.sh data/$x || exit 1
+  done
+fi
+
+if [[ $stage -le 6 ]]; then
+  echo "$0: convert text to labels"
+  for x in train_sp dev_sp test; do
+    ./local/convert_text_to_labels.sh data/$x data/lang
+  done
+fi
+
+# n=1024
+# # n=
+# if [[ $stage -le 7 ]]; then
+#   if true; then
+#     utils/subset_data_dir.sh data/train_sp $n data/train_sp$n || exit 1
+#     utils/subset_data_dir.sh data/dev_sp $n data/dev_sp$n || exit 1
+#   else
+#     utils/subset_data_dir.sh --first data/train_sp $n data/train_sp$n || exit 1
+#     utils/subset_data_dir.sh --first data/dev_sp $n data/dev_sp$n || exit 1
+#   fi
+#
+#   for x in train_sp dev_sp; do
+#     ./local/convert_text_to_labels.sh data/${x}$n data/lang
+#   done
# fi
+
+if [[ $stage -le 8 ]]; then
+  ./local/run_ctc.sh \
+    --train-data-dir data/train_sp$n \
+    --dev-data-dir data/dev_sp$n \
+    --test-data-dir data/test \
+    --lang-dir data/lang \
+    --nj $nj
+fi
diff --git a/egs/aishell/s10b/steps b/egs/aishell/s10b/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/aishell/s10b/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/aishell/s10b/utils b/egs/aishell/s10b/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/aishell/s10b/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/src/pybind/decoder/lattice_faster_decoder_pybind_test.py b/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
index 6842e2d7afa..010c2d77920 100755
--- a/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
+++ b/src/pybind/decoder/lattice_faster_decoder_pybind_test.py
@@ -20,7 +20,7 @@ def test_lattice_faster_decoder_config(self):
         print(opts)
 
     def test_lattice_faster_decoder_config_parse_options(self):
-        usage = 'testing'
+        usage = kaldi.StringArg('testing')
         parse_options = kaldi.ParseOptions(usage)
         argv = [
             'a.out', '--print-args=false', '--beam=20', '--max-active=7000',
diff --git a/src/pybind/fst/symbol_table_pybind.cc b/src/pybind/fst/symbol_table_pybind.cc
index 96b351a6497..714f60233f6 100644
--- a/src/pybind/fst/symbol_table_pybind.cc
+++ b/src/pybind/fst/symbol_table_pybind.cc
@@ -59,21 +59,14 @@ void pybind_symbol_table(py::module& m) {
       .def(py::init<const std::string&>(),
           "Constructs symbol table with an optional name.",
           py::arg("name") = "")
-      .def_static("ReadText",
-                  overload_cast_<std::istream&, const std::string&,
-                                 const fst::SymbolTableTextOptions&>()(
-                      &PyClass::ReadText),
-                  "Reads a text representation of the symbol table from an "
-                  "istream. Pass a name to give the resulting SymbolTable.",
-                  py::arg("strm"), py::arg("name"),
-                  py::arg("opts") = fst::SymbolTableTextOptions())
       .def_static("ReadText",
                   overload_cast_<const std::string&,
                                  const fst::SymbolTableTextOptions&>()(
                       &PyClass::ReadText),
                   "Reads a text representation of the symbol table",
                   py::arg("filename"),
-                  py::arg("opts") = fst::SymbolTableTextOptions())
+                  py::arg("opts") = fst::SymbolTableTextOptions(),
+                  py::return_value_policy::take_ownership)
       .def_static(
           "Read", overload_cast_<std::istream&, const std::string&>()(
diff --git a/src/pybind/fstext/kaldi_fst_io_pybind.cc b/src/pybind/fstext/kaldi_fst_io_pybind.cc
index c7a9e7616f2..9beef523909 100644
--- a/src/pybind/fstext/kaldi_fst_io_pybind.cc
+++ b/src/pybind/fstext/kaldi_fst_io_pybind.cc
@@ -25,7 +25,7 @@ void pybind_kaldi_fst_io(py::module& m) {
         "Read a binary FST using Kaldi I/O mechanisms (pipes, etc.) On error, "
         "throws using KALDI_ERR. Note: this doesn't support the text-mode "
         "option that we generally like to support.",
-        py::arg("rxfilename"), py::return_value_policy::reference);
+        py::arg("rxfilename"), py::return_value_policy::take_ownership);
 
   m.def("ReadFstKaldiGeneric", fst::ReadFstKaldiGeneric,
         "Read a binary FST using Kaldi I/O mechanisms (pipes, etc.) If it "
        "can't read the FST, if throw_on_err == true it throws using KALDI_ERR; "
        "otherwise it prints a warning and returns. Return value may be a "
        "VectorFst (const-fst can give better performance for "
        "decoding).",
        py::arg("rxfilename"), py::arg("throw_on_err") = true,
-       py::return_value_policy::reference);
+       py::return_value_policy::take_ownership);
 
-  m.def("CastOrConvertToVectorFst", &fst::CastOrConvertToVectorFst,
-        "This function attempts to dynamic_cast the pointer 'fst' (which will "
-        "likely have been returned by ReadFstGeneric()), to the more derived "
-        "type VectorFst<StdArc>. If this succeeds, it returns the same "
-        "pointer; if it fails, it converts the FST type (by creating a new "
-        "VectorFst<StdArc> initialized by 'fst'), prints a warning, and "
-        "deletes 'fst'.",
-        py::arg("fst"), py::return_value_policy::reference);
-
-  m.def("ReadFstKaldi",
-        (void (*)(std::string, fst::StdVectorFst*)) & fst::ReadFstKaldi,
-        "Version of ReadFstKaldi() that writes to a pointer. Assumes the FST "
-        "is binary with no binary marker. Crashes on error.",
-        py::arg("rxfilename"), py::arg("ofst"));
+  // CastOrConvertToVectorFst may return either an existing pointer
+  // or a newly created one. There may be a memory leak
+  // if it is wrapped for Python.
 
   m.def("WriteFstKaldi",
        (void (*)(const fst::StdVectorFst&, std::string)) & fst::WriteFstKaldi,
        "Write an FST using Kaldi I/O mechanisms (pipes, etc.) On error, "
        "throws using KALDI_ERR. Note: this "
        "doesn't support the text-mode option.",
        py::arg("fst"), py::arg("wxfilename"));
 
-  m.def("WriteFstKaldi",
-        (void (*)(std::ostream&, bool, const fst::StdVectorFst&)) &
-            fst::WriteFstKaldi,
-        "This is a more general Kaldi-type-IO mechanism of writing FSTs to "
-        "streams, supporting binary or text-mode writing. (note: we just "
-        "write the integers, symbol tables are not supported). On error, "
-        "throws using KALDI_ERR.",
-        py::arg("os"), py::arg("binary"), py::arg("fst"));
-
-  m.def("ReadFstKaldi",
-        (void (*)(std::istream&, bool, fst::StdVectorFst*)) & fst::ReadFstKaldi,
-        "A generic Kaldi-type-IO mechanism of reading FSTs from streams, "
-        "supporting binary or text-mode reading/writing.",
-        py::arg("is"), py::arg("binary"), py::arg("fst"));
 
   m.def("ReadAndPrepareLmFst", &fst::ReadAndPrepareLmFst,
        "Read an FST file for LM (G.fst) and make it an acceptor, and make "
        "sure it is sorted on labels",
-       py::arg("rxfilename"), py::return_value_policy::reference);
+       py::arg("rxfilename"), py::return_value_policy::take_ownership);
 
   {
     // fangjun: it should be called StdVectorFstHolder to match the naming
diff --git a/src/pybind/tests/test_latgen_faster_mapped.py b/src/pybind/tests/test_latgen_faster_mapped.py
index 5b2b9315419..d9a0620ed98 100755
--- a/src/pybind/tests/test_latgen_faster_mapped.py
+++ b/src/pybind/tests/test_latgen_faster_mapped.py
@@ -28,9 +28,11 @@ class TestLatGenFasterMapped(unittest.TestCase):
 
     def test(self):
-        usage = 'Generate lattices, reading log-likelihoods as matrices\n'
-        ' (model is needed only for the integer mappings in its transition-model)\n'
-        po = kaldi.ParseOptions(usage)
+        usage = kaldi.StringArg(
+            'Generate lattices, reading log-likelihoods as matrices\n'
+            ' (model is needed only for the integer mappings in its transition-model)\n'
+        )
+        po = kaldi.ParseOptions(usage=usage)
 
         allow_partial = kaldi.BoolArg(False)
         acoustic_scale = kaldi.FloatArg(0.1)
@@ -39,7 +41,7 @@ def test(self):
         if not os.path.exists(
                 '../../../egs/aishell/s10/exp/chain/graph/HCLG.fst'):
             print('Please execute kaldi/egs/aishell/s10/run.sh first')
-            print('and souce path.sh in it before running this script')
+            print('and source path.sh in it before running this script')
             print('Or replace relevant files in this test with your own')
             print('Skip this test')
             return
diff --git a/src/pybind/util/parse_options_pybind.cc b/src/pybind/util/parse_options_pybind.cc
index a2764dc3b5f..3d03f106d06 100644
--- a/src/pybind/util/parse_options_pybind.cc
+++ b/src/pybind/util/parse_options_pybind.cc
@@ -75,7 +75,6 @@ void pybind_parse_options(py::module& m) {
   auto opt =
       py::class_<PyClass>(m, "ParseOptions")
-          .def(py::init<const char*>(), py::arg("usage"))
           .def("Read",
               [](PyClass* opts, const std::vector<std::string>& args) {
                 int argc = static_cast<int>(args.size());
@@ -130,4 +129,12 @@ void pybind_parse_options(py::module& m) {
   pybind_arg<float>(m, opt);
   pybind_arg<double>(m, opt);
   pybind_arg<std::string>(m, opt);
+
+  opt.def(py::init([](const Arg<std::string>& usage) {
+            // NOTE(fangjun): no memory leak here using `new`.
+            // Refer to
+            // https://pybind11.readthedocs.io/en/stable/upgrade.html#new-api-for-defining-custom-constructors-and-pickling-functions
+            return new PyClass(usage.value.c_str());
+          }),
+          py::arg("usage"));
 }
diff --git a/src/pybind/util/parse_options_pybind_test.py b/src/pybind/util/parse_options_pybind_test.py
index ff68d0774bb..55b04ae46b5 100755
--- a/src/pybind/util/parse_options_pybind_test.py
+++ b/src/pybind/util/parse_options_pybind_test.py
@@ -41,7 +41,8 @@ def test_parse_args(self):
         d = kaldi.DoubleArg()
         s = kaldi.StringArg()
 
-        parse_options = kaldi.ParseOptions(usage='test args')
+        usage = kaldi.StringArg('test args')
+        parse_options = kaldi.ParseOptions(usage=usage)
         parse_options.Register(name='b', arg=b, doc='bool args')
         parse_options.Register('i', i, 'int32 args')
        parse_options.Register('u', u, 'uint32 args')