
Commit fa97b4f

Merge pull request #74 from TensorSpeech/dev/masking

Add support for masking

2 parents: d52852a + 2c5904e

18 files changed: +707 −63 lines

README.md: 3 additions, 8 deletions

```diff
@@ -2,11 +2,11 @@
 <p>TensorFlowASR :zap:</p>
 <p align="center">
 <a href="https://github.com/TensorSpeech/TensorFlowASR/blob/main/LICENSE">
-<img alt="GitHub" src="https://img.shields.io/github/license/TensorSpeech/TensorFlowASR?logo=apache">
+<img alt="GitHub" src="https://img.shields.io/github/license/TensorSpeech/TensorFlowASR?logo=apache&logoColor=green">
 </a>
 <img alt="python" src="https://img.shields.io/badge/python-%3E%3D3.6-blue?logo=python">
 <img alt="tensorflow" src="https://img.shields.io/badge/tensorflow-%3E%3D2.3.0-orange?logo=tensorflow">
-<img alt="PyPI" src="https://img.shields.io/pypi/v/TensorFlowASR?color=%2300B4EF&label=release&logo=pypi&logoColor=%2300B4EF">
+<img alt="PyPI" src="https://img.shields.io/pypi/v/TensorFlowASR?color=%234285F4&label=release&logo=pypi&logoColor=%234285F4">
 </p>
 </h1>
 <h2 align="center">
@@ -19,16 +19,11 @@ TensorFlowASR implements some automatic speech recognition architectures such as
 
 ## What's New?
 
+- (12/12/2020) Add support for using masking
 - (11/14/2020) Supported Gradient Accumulation for Training in Larger Batch Size
 - (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft`
 - (10/31/2020) Update DeepSpeech2 and Supported Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
 - (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621)
-- (10/15/2020) Add gradients accumulation and Refactor to TensorflowASR
-- (10/10/2020) Update documents and upload package to pypi
-- (10/6/2020) Change `nlpaug` version to `>=1.0.1`
-- (9/18/2020) Support `word-pieces` (aka `subwords`) using `tensorflow-datasets`
-- Support `transducer` tflite greedy decoding (conversion and invocation)
-- Distributed training using `tf.distribute.MirroredStrategy`
 
 ## Table of Contents
```

examples/conformer/config.yml: 1 addition, 1 deletion

```diff
@@ -85,7 +85,7 @@ learning_config:
     epsilon: 1e-9
 
   running_config:
-    batch_size: 4
+    batch_size: 2
     accumulation_steps: 4
     num_epochs: 20
     outdir: /mnt/d/Models/local/conformer
```
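With `accumulation_steps: 4` unchanged, this halves the effective per-replica batch (batch_size × accumulation_steps) from 4 × 4 = 16 to 2 × 4 = 8 examples per optimizer step.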
New file: 5 additions, 0 deletions (README for the masking example)

```markdown
# Training Conformer with Attention Masking

This is an example for anyone who wants to apply masking in Conformer.

**Note**: This is not good practice, since Conformer uses time reduction, which leads to incorrect masks being created.
```
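To make the note concrete: the helper below estimates each utterance's reduced length as `ceil(length / time_reduction_factor)`, but the convolutional subsampling that actually performs the time reduction can produce different output lengths. A hypothetical arithmetic sketch, assuming "valid"-padded stride-2 convolutions with kernel size 3 (an assumed configuration, not necessarily the package's exact subsampling layer):

```python
import math

def conv_out_len(length, kernel=3, stride=2):
    # Output length of one "valid"-padded convolution along the time axis
    return (length - kernel) // stride + 1

for length in (7, 10, 16):
    actual = conv_out_len(conv_out_len(length))  # two stride-2 convs: reduction factor 4
    estimated = math.ceil(length / 4)            # what the mask construction assumes
    print(f"length={length}: conv output={actual}, mask assumes {estimated}")
# length=7: conv output=1, mask assumes 2
# length=10: conv output=1, mask assumes 3
# length=16: conv output=3, mask assumes 4
```

Whenever the two disagree, attention positions get masked (or left unmasked) incorrectly, which is what the note above warns about.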
New file: 32 additions, 0 deletions (padding-mask helper)

```python
import tensorflow as tf
from tensorflow_asr.utils.utils import shape_list


def create_padding_mask(features, input_length, time_reduction_factor):
    """
    Create a mask with 0 for padding positions and 1 for non-padding positions.

    Args:
        features (tf.Tensor): audio features with shape [B, T, F, C]
        input_length (tf.Tensor): audio feature lengths with shape [B]
        time_reduction_factor (int)

    Returns:
        tf.Tensor: mask with shape [B, Tquery, Tkey]
    """
    batch_size, padded_time, _, _ = shape_list(features)
    # tf.math.ceil returns a float tensor, so cast to int32 before using the
    # value as a padding amount below
    reduced_padded_time = tf.cast(tf.math.ceil(padded_time / time_reduction_factor), tf.int32)

    def create_mask(length):
        # Number of valid frames after time reduction; cast for use as a shape
        reduced_length = tf.cast(tf.math.ceil(length / time_reduction_factor), tf.int32)
        mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32)
        # Zero-pad the all-ones block out to [reduced_padded_time, reduced_padded_time]
        return tf.pad(
            mask,
            [
                [0, reduced_padded_time - reduced_length],
                [0, reduced_padded_time - reduced_length]
            ],
            mode="CONSTANT",
            constant_values=0.0
        )

    return tf.map_fn(create_mask, input_length,
                     fn_output_signature=tf.TensorSpec([None, None], dtype=tf.float32))
```
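A quick sanity check of the helper above, with a dummy two-utterance batch padded to 16 frames (shapes and values follow directly from the `ceil` arithmetic):

```python
import tensorflow as tf

# Dummy batch: B=2, T=16 padded frames, F=80 features, C=1 channel
features = tf.zeros([2, 16, 80, 1])
input_length = tf.constant([9, 16], dtype=tf.int32)  # true (unpadded) frame counts

mask = create_padding_mask(features, input_length, time_reduction_factor=4)
print(mask.shape)  # (2, 4, 4), since ceil(16 / 4) = 4
print(mask[0].numpy())
# [[1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [0. 0. 0. 0.]]  <- ceil(9 / 4) = 3 valid reduced frames for the first utterance
```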
New file: 131 additions, 0 deletions (training script, character-level via CharFeaturizer)

```python
# Copyright 2020 Huy Le Nguyen (@usimarit)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
import argparse
from tensorflow_asr.utils import setup_environment, setup_strategy

setup_environment()
import tensorflow as tf

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Conformer Training")

parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--max_ckpts", type=int, default=10,
                    help="Max number of checkpoints to keep")

parser.add_argument("--tfrecords", default=False, action="store_true",
                    help="Whether to use tfrecords")

parser.add_argument("--tbs", type=int, default=None,
                    help="Train batch size per replica")

parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
                    help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

parser.add_argument("--mxp", default=False, action="store_true",
                    help="Enable mixed precision")

parser.add_argument("--cache", default=False, action="store_true",
                    help="Enable caching for dataset")

args = parser.parse_args()

tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from trainer import TrainerWithMaskingGA
from tensorflow_asr.models.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
text_featurizer = CharFeaturizer(config.decoder_config)

if args.tfrecords:
    train_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )
else:
    train_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )

conformer_trainer = TrainerWithMaskingGA(
    config=config.learning_config.running_config,
    text_featurizer=text_featurizer, strategy=strategy
)

with conformer_trainer.strategy.scope():
    # build model
    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    conformer._build(speech_featurizer.shape)
    conformer.summary(line_length=120)

    optimizer = tf.keras.optimizers.Adam(
        TransformerSchedule(
            d_model=config.model_config["encoder_dmodel"],
            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
        ),
        beta_1=config.learning_config.optimizer_config["beta1"],
        beta_2=config.learning_config.optimizer_config["beta2"],
        epsilon=config.learning_config.optimizer_config["epsilon"]
    )

conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

conformer_trainer.fit(train_dataset, eval_dataset,
                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
```
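`TrainerWithMaskingGA` is imported from the example's local `trainer.py`, which is not among the files shown here. For context, a generic sketch of how a `[B, Tquery, Tkey]` padding mask like the one produced by `create_padding_mask` is typically consumed: attention implementations convert the 0/1 mask into an additive bias before the softmax. This illustrates the general technique, not the package's actual attention code:

```python
import tensorflow as tf

def apply_padding_mask(scores, mask):
    """scores: [B, H, Tq, Tk] attention logits; mask: [B, Tq, Tk] with 1 = keep, 0 = pad."""
    bias = (1.0 - mask) * -1e9             # 0 where kept, very negative where padded
    scores = scores + bias[:, tf.newaxis]  # broadcast the bias across the head axis
    return tf.nn.softmax(scores, axis=-1)  # padded keys get ~zero attention weight
```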
New file: 147 additions, 0 deletions (training script, subword-level via SubwordFeaturizer)

```python
# Copyright 2020 Huy Le Nguyen (@usimarit)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
import argparse
from tensorflow_asr.utils import setup_environment, setup_strategy

setup_environment()
import tensorflow as tf

DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Conformer Training")

parser.add_argument("--config", type=str, default=DEFAULT_YAML,
                    help="The file path of model configuration file")

parser.add_argument("--max_ckpts", type=int, default=10,
                    help="Max number of checkpoints to keep")

parser.add_argument("--tfrecords", default=False, action="store_true",
                    help="Whether to use tfrecords")

parser.add_argument("--tbs", type=int, default=None,
                    help="Train batch size per replica")

parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
                    help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

parser.add_argument("--mxp", default=False, action="store_true",
                    help="Enable mixed precision")

parser.add_argument("--cache", default=False, action="store_true",
                    help="Enable caching for dataset")

parser.add_argument("--subwords", type=str, default=None,
                    help="Path to file that stores generated subwords")

parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                    help="Transcript files for generating subwords")

args = parser.parse_args()

tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

strategy = setup_strategy(args.devices)

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
from trainer import TrainerWithMaskingGA
from tensorflow_asr.models.conformer import Conformer
from tensorflow_asr.optimizers.schedules import TransformerSchedule

config = Config(args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

if args.subwords and os.path.exists(args.subwords):
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
else:
    print("Generating subwords ...")
    text_featurizer = SubwordFeaturizer.build_from_corpus(
        config.decoder_config,
        corpus_files=args.subwords_corpus
    )
    text_featurizer.save_to_file(args.subwords)

if args.tfrecords:
    train_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRTFRecordDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )
else:
    train_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.train_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        augmentations=config.learning_config.augmentations,
        stage="train", cache=args.cache, shuffle=True
    )
    eval_dataset = ASRSliceDataset(
        data_paths=config.learning_config.dataset_config.eval_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="eval", cache=args.cache, shuffle=True
    )

conformer_trainer = TrainerWithMaskingGA(
    config=config.learning_config.running_config,
    text_featurizer=text_featurizer, strategy=strategy
)

with conformer_trainer.strategy.scope():
    # build model
    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    conformer._build(speech_featurizer.shape)
    conformer.summary(line_length=120)

    optimizer = tf.keras.optimizers.Adam(
        TransformerSchedule(
            d_model=config.model_config["encoder_dmodel"],
            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
        ),
        beta_1=config.learning_config.optimizer_config["beta1"],
        beta_2=config.learning_config.optimizer_config["beta2"],
        epsilon=config.learning_config.optimizer_config["epsilon"]
    )

conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

conformer_trainer.fit(train_dataset, eval_dataset,
                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
```
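Both scripts drive the Adam optimizer with `TransformerSchedule`, capped at `max_lr = 0.05 / sqrt(encoder_dmodel)`. Assuming the schedule follows the standard transformer formula from Vaswani et al. (an assumption; the class itself is not shown in this diff), and taking `encoder_dmodel = 144` and `warmup_steps = 10000` as hypothetical config values, the numbers work out as follows:

```python
import math

def transformer_lr(step, d_model=144, warmup_steps=10000, max_lr=None):
    # Standard transformer schedule: d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    lr = (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)
    return min(lr, max_lr) if max_lr is not None else lr

max_lr = 0.05 / math.sqrt(144)                # the scripts' cap: ~4.17e-3
print(transformer_lr(100, max_lr=max_lr))     # early warmup: ~8.33e-06
print(transformer_lr(10000, max_lr=max_lr))   # peak at step == warmup_steps: ~8.33e-04
```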
