Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions egs/aishell/s10/chain/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2019-2020 JD AI, Beijing, China (author: Lu Fan)
# Apache 2.0

import logging
Expand All @@ -20,14 +21,13 @@ def get_chain_model(feat_dim,
hidden_dim,
bottleneck_dim,
time_stride_list,
conv_stride_list,
lda_mat_filename=None):
model = ChainModel(feat_dim=feat_dim,
output_dim=output_dim,
lda_mat_filename=lda_mat_filename,
hidden_dim=hidden_dim,
time_stride_list=time_stride_list,
conv_stride_list=conv_stride_list)
bottleneck_dim=bottleneck_dim,
time_stride_list=time_stride_list)
return model


Expand Down Expand Up @@ -72,15 +72,14 @@ def __init__(self,
lda_mat_filename=None,
hidden_dim=1024,
bottleneck_dim=128,
time_stride_list=[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
conv_stride_list=[1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1],
time_stride_list=[1, 1, 1, 0, 3, 3, 3, 3, 3, 3, 3, 3],
frame_subsampling_factor=3):
super().__init__()

# at present, we support only frame_subsampling_factor to be 3
assert frame_subsampling_factor == 3

assert len(time_stride_list) == len(conv_stride_list)
self.frame_subsampling_factor = frame_subsampling_factor
self.time_stride_list = time_stride_list
num_layers = len(time_stride_list)

# tdnn1_affine requires [N, T, C]
Expand All @@ -93,20 +92,17 @@ def __init__(self,
tdnnfs = []
for i in range(num_layers):
time_stride = time_stride_list[i]
conv_stride = conv_stride_list[i]
layer = FactorizedTDNN(dim=hidden_dim,
bottleneck_dim=bottleneck_dim,
time_stride=time_stride,
conv_stride=conv_stride)
time_stride=time_stride)
tdnnfs.append(layer)

# tdnnfs requires [N, C, T]
self.tdnnfs = nn.ModuleList(tdnnfs)

# prefinal_l affine requires [N, C, T]
self.prefinal_l = OrthonormalLinear(dim=hidden_dim,
bottleneck_dim=bottleneck_dim * 2,
time_stride=0)
bottleneck_dim=bottleneck_dim * 2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mm.. I'm a bit surprised this * 2 is here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mm.. I think you were assuming that the final layer's bottleneck is always twice the TDNN-F layers' bottleneck.
In fact we generally leave the final layer's bottleneck at 256, which for some reason seems to work across a range
of conditions. You could make that a separate configuration value.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I checked the parameter shapes of Kaldi's model, I did not find the difference between the final layer and the previous layers that you described.

'tdnnf12.linear':'time_offsets': array([-3,  0]), 'params': (128, 2048)
'tdnnf12.affine':'time_offsets': array([0, 3]), 'params': (1024, 256)
'tdnnf13.linear':'time_offsets': array([-3,  0]), 'params': (128, 2048)
'tdnnf13.affine':'time_offsets': array([0, 3]), 'params': (1024, 256)
'prefinal-l':'params': (256, 1024)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mm.. I'm a bit surprised this * 2 is here?

* 2 is used here to follow what kaldi does.
I've changed it to be configurable in this pull request: #3925


# prefinal_chain requires [N, C, T]
self.prefinal_chain = PrefinalLayer(big_dim=hidden_dim,
Expand Down Expand Up @@ -174,6 +170,13 @@ def forward(self, x):
# tdnnf requires input of shape [N, C, T]
for i in range(len(self.tdnnfs)):
x = self.tdnnfs[i](x)
# stride manually, do not stride context
if self.tdnnfs[i].time_stride == 0:
cur_context = sum(self.time_stride_list[i:])
x_left = x[:, :, :cur_context]
x_mid = x[:, :, cur_context:-cur_context:self.frame_subsampling_factor]
x_right = x[:, :, -cur_context:]
x = torch.cat([x_left, x_mid, x_right], dim=2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised that you are doing this manually rather than using a 1d convolution. This could be quite slow.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to subsample only the length of the window, not the left_context and right_context. Training is slower than before, but it worked. Please help me write this as a 1d convolution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What might have happened here is that you tripled the dimension in the middle of the network.
This would have led to a system with many more parameters for your "dilation" system.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This just subsamples t_out_length from (24+150+24) to (24+50+24) manually; the number of parameters does not increase compared with the stride kernel (2,2) version. I explained this code's behaviour in the picture below.


# at this point, x is [N, C, T]

Expand Down
14 changes: 1 addition & 13 deletions egs/aishell/s10/chain/options.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2020 JD AI, Beijing, China (author: Lu Fan)
# Apache 2.0

import argparse
Expand Down Expand Up @@ -134,15 +135,8 @@ def _check_args(args):
assert args.time_stride_list is not None
assert len(args.time_stride_list) > 0

assert args.conv_stride_list is not None
assert len(args.conv_stride_list) > 0

args.time_stride_list = [int(k) for k in args.time_stride_list.split(', ')]

args.conv_stride_list = [int(k) for k in args.conv_stride_list.split(', ')]

assert len(args.time_stride_list) == len(args.conv_stride_list)

assert args.log_level in ['debug', 'info', 'warning']


Expand Down Expand Up @@ -208,12 +202,6 @@ def get_args():
required=True,
type=str)

parser.add_argument('--conv-stride-list',
dest='conv_stride_list',
help='conv stride list',
required=True,
type=str)

parser.add_argument('--log-level',
dest='log_level',
help='log level. valid values: debug, info, warning',
Expand Down
70 changes: 27 additions & 43 deletions egs/aishell/s10/chain/tdnnf_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,16 @@ def _constrain_orthonormal_internal(M):

class OrthonormalLinear(nn.Module):

def __init__(self, dim, bottleneck_dim, time_stride):
def __init__(self, dim, bottleneck_dim, kernel_size=1, dilation=1):
super().__init__()
assert time_stride in [0, 1]
# WARNING(fangjun): kaldi uses [-1, 0] for the first linear layer
# and [0, 1] for the second affine layer;
# we use [-1, 0, 1] for the first linear layer if time_stride == 1

if time_stride == 0:
kernel_size = 1
else:
kernel_size = 3

self.kernel_size = kernel_size
self.dilation = dilation

# conv requires [N, C, T]
self.conv = nn.Conv1d(in_channels=dim,
out_channels=bottleneck_dim,
kernel_size=kernel_size,
dilation=dilation,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should never need the dilation parameter. I think we discussed this before.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... instead of using dilation, do a 3-fold subsampling after the last layer that had stride=1. Please don't argue about this! I remember last time was quite painful.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hah, I found the earlier discussion. I just want to make the length of the output equal to that of the supervision.

2020-02-11 14:59:29,145 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 202])
2020-02-11 14:59:29,150 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 200])
2020-02-11 14:59:29,156 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 198])
2020-02-11 14:59:29,162 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 98])
2020-02-11 14:59:29,175 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 92])
2020-02-11 14:59:29,184 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 86])
2020-02-11 14:59:29,195 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 80])
2020-02-11 14:59:29,204 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 74])
2020-02-11 14:59:29,215 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 68])
2020-02-11 14:59:29,224 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 62])
2020-02-11 14:59:29,233 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 56])
2020-02-11 14:59:29,243 (model_tdnnf3:172) DEBUG: x shape is torch.Size([1, 1024, 50])

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these output shapes generated by the tdnnf layers correct?

2020-02-11 19:21:40,284 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 202])
2020-02-11 19:21:40,291 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 200])
2020-02-11 19:21:40,297 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 198])
2020-02-11 19:21:40,306 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 198])
2020-02-11 19:21:40,317 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 192])
2020-02-11 19:21:40,327 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 186])
2020-02-11 19:21:40,336 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 180])
2020-02-11 19:21:40,346 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 174])
2020-02-11 19:21:40,355 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 168])
2020-02-11 19:21:40,364 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 162])
2020-02-11 19:21:40,373 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 156])
2020-02-11 19:21:40,382 (model_tdnnf3:179) DEBUG: x shape is torch.Size([1, 1024, 150])
2020-02-11 19:21:40,382 (model_tdnnf3:182) DEBUG: x shape is torch.Size([1, 1024, 50])

bias=False)

def forward(self, x):
Expand Down Expand Up @@ -116,7 +108,7 @@ def __init__(self, big_dim, small_dim):
self.batchnorm1 = nn.BatchNorm1d(num_features=big_dim)
self.linear = OrthonormalLinear(dim=big_dim,
bottleneck_dim=small_dim,
time_stride=0)
kernel_size=1)
self.batchnorm2 = nn.BatchNorm1d(num_features=small_dim)

def forward(self, x):
Copy link
Contributor

@danpovey danpovey Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised you didn't implement the TDNN_F layer in the "obvious" way with 1-d convolution.
[oh, sorry, this is prefinal layer]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inside OrthonormalLinear, it is nn.Conv1d. So it is indeed implemented via 1-d convolution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kernel_size=1 doesn't look right. Some extremely weird stuff is going on in this PR.

Copy link
Author

@fanlu fanlu Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Dan, this code's behavior is no different from the previous code's; I just changed the parameter. The original stride version also used kernel_size=1 by default.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's getting less clear to me, not more clear, and in any case the code is not right.
You showed various experimental numbers but you never said with any clarity what code or parameters each one corresponded to.
I would prefer if you just went back to the original code and made the small and very specific changes that I asked for.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry about this

Expand Down Expand Up @@ -161,29 +153,32 @@ def __init__(self,
dim,
bottleneck_dim,
time_stride,
conv_stride,
bypass_scale=0.66):
super().__init__()

assert conv_stride in [1, 3]
assert abs(bypass_scale) <= 1

self.bypass_scale = bypass_scale
self.time_stride = time_stride

self.conv_stride = conv_stride
if time_stride > 0:
kernel_size, dilation = 2, time_stride
else:
kernel_size, dilation = 1, 1

# linear requires [N, C, T]
self.linear = OrthonormalLinear(dim=dim,
bottleneck_dim=bottleneck_dim,
time_stride=time_stride)
kernel_size=kernel_size,
dilation=dilation)

# affine requires [N, C, T]
# WARNING(fangjun): we do not use nn.Linear here
# since we want to use `stride`
self.affine = nn.Conv1d(in_channels=bottleneck_dim,
out_channels=dim,
kernel_size=1,
stride=conv_stride)
kernel_size=kernel_size,
dilation=dilation)

# batchnorm requires [N, C, T]
self.batchnorm = nn.BatchNorm1d(num_features=dim)
Expand Down Expand Up @@ -213,10 +208,11 @@ def forward(self, x):

# TODO(fangjun): implement GeneralDropoutComponent in PyTorch

if self.linear.kernel_size == 3:
x = self.bypass_scale * input_x[:, :, 1:-1:self.conv_stride] + x
# at this point, x is [N, C, T]
if self.linear.kernel_size == 2:
x = self.bypass_scale * input_x[:, :, self.linear.dilation:-self.linear.dilation:1] + x
else:
x = self.bypass_scale * input_x[:, :, ::self.conv_stride] + x
x = self.bypass_scale * input_x[:, :, ::1] + x
return x

def constrain_orthonormal(self):
Expand Down Expand Up @@ -257,8 +253,7 @@ def compute_loss(M):

model = FactorizedTDNN(dim=1024,
bottleneck_dim=128,
time_stride=1,
conv_stride=3)
time_stride=1)
loss = []
model.constrain_orthonormal()
loss.append(
Expand All @@ -278,40 +273,29 @@ def _test_factorized_tdnn():
N = 1
T = 10
C = 4

# case 0: time_stride == 1, conv_stride == 1
# https://pytorch.org/docs/stable/nn.html?highlight=conv1d#torch.nn.Conv1d
# T_out = math.ceil((T + 2 * padding - dilation * (kernel_size - 1) - 1) / stride)
# case 0: time_stride == 1, kernel_size==2, dilation = 1
model = FactorizedTDNN(dim=C,
bottleneck_dim=2,
time_stride=1,
conv_stride=1)
time_stride=1)
x = torch.arange(N * T * C).reshape(N, C, T).float()
y = model(x)
assert y.size(2) == T - 2

# case 1: time_stride == 0, conv_stride == 1
# case 1: time_stride == 0, kernel_size == 1, dilation == 1
model = FactorizedTDNN(dim=C,
bottleneck_dim=2,
time_stride=0,
conv_stride=1)
time_stride=0)
y = model(x)
assert y.size(2) == T

# case 2: time_stride == 1, conv_stride == 3
# case 2: time_stride == 3, kernel_size == 2, dilation = 3
model = FactorizedTDNN(dim=C,
bottleneck_dim=2,
time_stride=1,
conv_stride=3)
time_stride=3)
y = model(x)
assert y.size(2) == math.ceil((T - 2) / 3)

# case 3: time_stride == 0, conv_stride == 3
model = FactorizedTDNN(dim=C,
bottleneck_dim=2,
time_stride=0,
conv_stride=3)
y = model(x)
assert y.size(2) == math.ceil(T / 3)

assert y.size(2) == math.ceil(math.ceil((T - 3)) - 3)

if __name__ == '__main__':
torch.manual_seed(20200130)
Expand Down
21 changes: 17 additions & 4 deletions egs/aishell/s10/chain/train.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

# Copyright 2019-2020 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2019-2020 JD AI, Beijing, China (author: Lu Fan)
# Apache 2.0

import logging
Expand Down Expand Up @@ -100,15 +101,28 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion,
total_objf / total_weight, total_frames,
objf_l2_term_weight[0].item() /
objf_l2_term_weight[2].item(), num_frames, current_epoch))
log_norm = ["{}: {:.4f}".format(name, torch.norm(parms)) \
for name, parms in model.named_parameters() \
if "affine" in name or "linear" in name]
logging.info("Process {}/{}({:.6f}%) l2-norm is:[ {} ]".format(batch_idx,
len(dataloader), float(batch_idx) / len(dataloader) * 100),
" ".join(log_norm))

if batch_idx % 100 == 0:
current_iter = batch_idx + current_epoch * len(dataloader)
tf_writer.add_scalar('train/global_average_objf',
total_objf / total_weight,
batch_idx + current_epoch * len(dataloader))
current_iter)
tf_writer.add_scalar(
'train/current_batch_average_objf',
objf_l2_term_weight[0].item() / objf_l2_term_weight[2].item(),
batch_idx + current_epoch * len(dataloader))
current_iter)
for name, parms in model.named_parameters():
tf_writer.add_histogram(f'train/norm/{name}',
parms.clone().cpu().data.numpy(),
current_iter)
tf_writer.add_scalar(f'train/l2_norm/{name}', torch.norm(parms),
current_iter)

return total_objf / total_weight

Expand Down Expand Up @@ -142,8 +156,7 @@ def main():
lda_mat_filename=args.lda_mat_filename,
hidden_dim=args.hidden_dim,
bottleneck_dim=args.bottleneck_dim,
time_stride_list=args.time_stride_list,
conv_stride_list=args.conv_stride_list)
time_stride_list=args.time_stride_list)

start_epoch = 0
num_epochs = args.num_epochs
Expand Down
4 changes: 1 addition & 3 deletions egs/aishell/s10/local/run_chain.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash

# Copyright 2019 Mobvoi AI Lab, Beijing, China (author: Fangjun Kuang)
# Copyright 2020 JD AI, Beijing, China (author: Lu Fan)
# Apache 2.0

set -e
Expand Down Expand Up @@ -33,7 +34,6 @@ lr=1e-3
hidden_dim=1024
bottleneck_dim=128
time_stride_list="1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list
conv_stride_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list

log_level=info # valid values: debug, info, warning

Expand Down Expand Up @@ -164,7 +164,6 @@ if [[ $stage -le 8 ]]; then
python3 ./chain/train.py \
--bottleneck-dim $bottleneck_dim \
--checkpoint=${train_checkpoint:-} \
--conv-stride-list "$conv_stride_list" \
--device-id $device_id \
--dir exp/chain/train \
--feat-dim $feat_dim \
Expand Down Expand Up @@ -194,7 +193,6 @@ if [[ $stage -le 9 ]]; then
python3 ./chain/inference.py \
--bottleneck-dim $bottleneck_dim \
--checkpoint $inference_checkpoint \
--conv-stride-list "$conv_stride_list" \
--device-id $device_id \
--dir exp/chain/inference/$x \
--feat-dim $feat_dim \
Expand Down