Skip to content

Commit f5e3ae9

Browse files
committed
⚡ fix read audio, update dataset config and add transducer greedy v2
1 parent a80316c commit f5e3ae9

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

50 files changed

+4524
-1091
lines changed

examples/conformer/config.yml

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,16 @@ speech_config:
2424
normalize_per_feature: False
2525

2626
decoder_config:
27-
vocabulary: null
28-
target_vocab_size: 1024
27+
vocabulary: ./vocabularies/librispeech_train_4_4076.subwords
28+
target_vocab_size: 4096
2929
max_subword_length: 4
3030
blank_at_zero: True
3131
beam_width: 5
3232
norm_score: True
33+
corpus_files:
34+
- /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-clean-100/transcripts.tsv
35+
- /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-clean-360/transcripts.tsv
36+
- /media/nlhuy/Data/ML/ASR/Raw/LibriSpeech/LibriSpeech/train-other-500/transcripts.tsv
3337

3438
model_config:
3539
name: conformer
@@ -53,32 +57,51 @@ model_config:
5357
prediction_rnn_units: 320
5458
prediction_rnn_type: lstm
5559
prediction_rnn_implementation: 2
56-
prediction_layer_norm: True
60+
prediction_layer_norm: False
5761
prediction_projection_units: 0
58-
joint_dim: 320
62+
joint_dim: 640
5963
joint_activation: tanh
6064

6165
learning_config:
62-
augmentations:
66+
train_dataset_config:
6367
use_tf: True
64-
after:
65-
time_masking:
66-
num_masks: 10
67-
mask_factor: 100
68-
p_upperbound: 0.05
69-
freq_masking:
70-
num_masks: 1
71-
mask_factor: 27
72-
73-
dataset_config:
74-
train_paths:
68+
augmentation_config:
69+
after:
70+
time_masking:
71+
num_masks: 10
72+
mask_factor: 100
73+
p_upperbound: 0.05
74+
freq_masking:
75+
num_masks: 1
76+
mask_factor: 27
77+
data_paths:
7578
- /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
76-
eval_paths:
79+
tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
80+
shuffle: True
81+
cache: True
82+
buffer_size: 100
83+
drop_remainder: True
84+
85+
eval_dataset_config:
86+
use_tf: True
87+
data_paths:
7788
- /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
7889
- /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
79-
test_paths:
90+
tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
91+
shuffle: False
92+
cache: True
93+
buffer_size: 100
94+
drop_remainder: True
95+
96+
test_dataset_config:
97+
use_tf: True
98+
data_paths:
8099
- /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
81100
tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords-test
101+
shuffle: False
102+
cache: True
103+
buffer_size: 100
104+
drop_remainder: True
82105

83106
optimizer_config:
84107
warmup_steps: 40000

examples/conformer/test_conformer.py

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,19 @@
2525

2626
parser = argparse.ArgumentParser(prog="Conformer Testing")
2727

28-
parser.add_argument("--config", type=str, default=DEFAULT_YAML,
29-
help="The file path of model configuration file")
28+
parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
3029

31-
parser.add_argument("--saved", type=str, default=None,
32-
help="Path to saved model")
30+
parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
3331

34-
parser.add_argument("--tfrecords", default=False, action="store_true",
35-
help="Whether to use tfrecords as dataset")
32+
parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
3633

37-
parser.add_argument("--mxp", default=False, action="store_true",
38-
help="Enable mixed precision")
34+
parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
3935

40-
parser.add_argument("--device", type=int, default=0,
41-
help="Device's id to run test on")
36+
parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
4237

43-
parser.add_argument("--cpu", default=False, action="store_true",
44-
help="Whether to only use cpu")
38+
parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
4539

46-
parser.add_argument("--output_name", type=str, default="test",
47-
help="Result filename name prefix")
40+
parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
4841

4942
args = parser.parse_args()
5043

@@ -53,7 +46,7 @@
5346
setup_devices([args.device], cpu=args.cpu)
5447

5548
from tensorflow_asr.configs.config import Config
56-
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
49+
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
5750
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
5851
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
5952
from tensorflow_asr.runners.base_runners import BaseTester
@@ -67,19 +60,14 @@
6760
assert args.saved
6861

6962
if args.tfrecords:
70-
test_dataset = ASRTFRecordTestDataset(
71-
data_paths=config.learning_config.dataset_config.test_paths,
72-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
73-
speech_featurizer=speech_featurizer,
74-
text_featurizer=text_featurizer,
75-
stage="test", shuffle=False
63+
test_dataset = ASRTFRecordDataset(
64+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
65+
**vars(config.learning_config.test_dataset_config)
7666
)
7767
else:
78-
test_dataset = ASRSliceTestDataset(
79-
data_paths=config.learning_config.dataset_config.test_paths,
80-
speech_featurizer=speech_featurizer,
81-
text_featurizer=text_featurizer,
82-
stage="test", shuffle=False
68+
test_dataset = ASRSliceDataset(
69+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
70+
**vars(config.learning_config.test_dataset_config)
8371
)
8472

8573
# build model

examples/conformer/test_subword_conformer.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,31 +25,23 @@
2525

2626
parser = argparse.ArgumentParser(prog="Conformer Testing")
2727

28-
parser.add_argument("--config", type=str, default=DEFAULT_YAML,
29-
help="The file path of model configuration file")
28+
parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
3029

31-
parser.add_argument("--saved", type=str, default=None,
32-
help="Path to saved model")
30+
parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
3331

34-
parser.add_argument("--tfrecords", default=False, action="store_true",
35-
help="Whether to use tfrecords as dataset")
32+
parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
3633

37-
parser.add_argument("--mxp", default=False, action="store_true",
38-
help="Enable mixed precision")
34+
parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
3935

4036
parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
4137

42-
parser.add_argument("--device", type=int, default=0,
43-
help="Device's id to run test on")
38+
parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
4439

45-
parser.add_argument("--cpu", default=False, action="store_true",
46-
help="Whether to only use cpu")
40+
parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
4741

48-
parser.add_argument("--subwords", type=str, default=None,
49-
help="Path to file that stores generated subwords")
42+
parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
5043

51-
parser.add_argument("--output_name", type=str, default="test",
52-
help="Result filename name prefix")
44+
parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
5345

5446
args = parser.parse_args()
5547

@@ -58,7 +50,7 @@
5850
setup_devices([args.device], cpu=args.cpu)
5951

6052
from tensorflow_asr.configs.config import Config
61-
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
53+
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
6254
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
6355
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
6456
from tensorflow_asr.runners.base_runners import BaseTester
@@ -80,19 +72,14 @@
8072
assert args.saved
8173

8274
if args.tfrecords:
83-
test_dataset = ASRTFRecordTestDataset(
84-
data_paths=config.learning_config.dataset_config.test_paths,
85-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
86-
speech_featurizer=speech_featurizer,
87-
text_featurizer=text_featurizer,
88-
stage="test", shuffle=False
75+
test_dataset = ASRTFRecordDataset(
76+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
77+
**vars(config.learning_config.test_dataset_config)
8978
)
9079
else:
91-
test_dataset = ASRSliceTestDataset(
92-
data_paths=config.learning_config.dataset_config.test_paths,
93-
speech_featurizer=speech_featurizer,
94-
text_featurizer=text_featurizer,
95-
stage="test", shuffle=False
80+
test_dataset = ASRSliceDataset(
81+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
82+
**vars(config.learning_config.test_dataset_config)
9683
)
9784

9885
# build model

examples/conformer/train_conformer.py

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232

3333
parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
3434

35-
parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards")
36-
3735
parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
3836

3937
parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
@@ -42,10 +40,6 @@
4240

4341
parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
4442

45-
parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset")
46-
47-
parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling")
48-
4943
args = parser.parse_args()
5044

5145
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
@@ -66,39 +60,21 @@
6660

6761
if args.tfrecords:
6862
train_dataset = ASRTFRecordDataset(
69-
data_paths=config.learning_config.dataset_config.train_paths,
70-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
71-
speech_featurizer=speech_featurizer,
72-
text_featurizer=text_featurizer,
73-
augmentations=config.learning_config.augmentations,
74-
tfrecords_shards=args.tfrecords_shards,
75-
stage="train", cache=args.cache,
76-
shuffle=True, buffer_size=args.bfs,
63+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
64+
**vars(config.learning_config.train_dataset_config)
7765
)
7866
eval_dataset = ASRTFRecordDataset(
79-
data_paths=config.learning_config.dataset_config.eval_paths,
80-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
81-
tfrecords_shards=args.tfrecords_shards,
82-
speech_featurizer=speech_featurizer,
83-
text_featurizer=text_featurizer,
84-
stage="eval", cache=args.cache,
85-
shuffle=True, buffer_size=args.bfs,
67+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
68+
**vars(config.learning_config.eval_dataset_config)
8669
)
8770
else:
8871
train_dataset = ASRSliceDataset(
89-
data_paths=config.learning_config.dataset_config.train_paths,
90-
speech_featurizer=speech_featurizer,
91-
text_featurizer=text_featurizer,
92-
augmentations=config.learning_config.augmentations,
93-
stage="train", cache=args.cache,
94-
shuffle=True, buffer_size=args.bfs,
72+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
73+
**vars(config.learning_config.train_dataset_config)
9574
)
9675
eval_dataset = ASRSliceDataset(
97-
data_paths=config.learning_config.dataset_config.eval_paths,
98-
speech_featurizer=speech_featurizer,
99-
text_featurizer=text_featurizer,
100-
stage="eval", cache=args.cache,
101-
shuffle=True, buffer_size=args.bfs,
76+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
77+
**vars(config.learning_config.eval_dataset_config)
10278
)
10379

10480
conformer_trainer = TransducerTrainer(

examples/conformer/train_ga_conformer.py

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232

3333
parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
3434

35-
parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards")
36-
3735
parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
3836

3937
parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
@@ -44,10 +42,6 @@
4442

4543
parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
4644

47-
parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset")
48-
49-
parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling")
50-
5145
args = parser.parse_args()
5246

5347
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
@@ -68,39 +62,21 @@
6862

6963
if args.tfrecords:
7064
train_dataset = ASRTFRecordDataset(
71-
data_paths=config.learning_config.dataset_config.train_paths,
72-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
73-
speech_featurizer=speech_featurizer,
74-
text_featurizer=text_featurizer,
75-
augmentations=config.learning_config.augmentations,
76-
tfrecords_shards=args.tfrecords_shards,
77-
stage="train", cache=args.cache,
78-
shuffle=True, buffer_size=args.bfs,
65+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
66+
**vars(config.learning_config.train_dataset_config)
7967
)
8068
eval_dataset = ASRTFRecordDataset(
81-
data_paths=config.learning_config.dataset_config.eval_paths,
82-
tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
83-
tfrecords_shards=args.tfrecords_shards,
84-
speech_featurizer=speech_featurizer,
85-
text_featurizer=text_featurizer,
86-
stage="eval", cache=args.cache,
87-
shuffle=True, buffer_size=args.bfs,
69+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
70+
**vars(config.learning_config.eval_dataset_config)
8871
)
8972
else:
9073
train_dataset = ASRSliceDataset(
91-
data_paths=config.learning_config.dataset_config.train_paths,
92-
speech_featurizer=speech_featurizer,
93-
text_featurizer=text_featurizer,
94-
augmentations=config.learning_config.augmentations,
95-
stage="train", cache=args.cache,
96-
shuffle=True, buffer_size=args.bfs,
74+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
75+
**vars(config.learning_config.train_dataset_config)
9776
)
9877
eval_dataset = ASRSliceDataset(
99-
data_paths=config.learning_config.dataset_config.eval_paths,
100-
speech_featurizer=speech_featurizer,
101-
text_featurizer=text_featurizer,
102-
stage="eval", cache=args.cache,
103-
shuffle=True, buffer_size=args.bfs,
78+
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
79+
**vars(config.learning_config.eval_dataset_config)
10480
)
10581

10682
conformer_trainer = TransducerTrainerGA(

0 commit comments

Comments
 (0)