Skip to content

Commit ec0df60

Browse files
authored
Merge pull request #36 from TensorSpeech/dev/jasper
Update CTC and DeepSpeech2, Supported Jasper
2 parents ee4c314 + d18aac8 commit ec0df60

24 files changed

+780
-788
lines changed

examples/deepspeech2/README.md

Lines changed: 15 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -6,22 +6,19 @@ References: [https://arxiv.org/abs/1512.02595](https://arxiv.org/abs/1512.02595)
66

77
```yaml
88
model_config:
9-
conv_conf:
10-
conv_type: 2
11-
conv_kernels: [[11, 41], [11, 21], [11, 11]]
12-
conv_strides: [[2, 2], [1, 2], [1, 2]]
13-
conv_filters: [32, 32, 96]
14-
conv_dropout: 0
15-
rnn_conf:
16-
rnn_layers: 5
17-
rnn_type: lstm
18-
rnn_units: 512
19-
rnn_bidirectional: True
20-
rnn_rowconv: False
21-
rnn_dropout: 0
22-
fc_conf:
23-
fc_units: [1024]
24-
fc_dropout: 0
9+
conv_type: conv2d
10+
conv_kernels: [[11, 41], [11, 21], [11, 11]]
11+
conv_strides: [[2, 2], [1, 2], [1, 2]]
12+
conv_filters: [32, 32, 96]
13+
conv_dropout: 0.1
14+
rnn_nlayers: 5
15+
rnn_type: lstm
16+
rnn_units: 512
17+
rnn_bidirectional: True
18+
rnn_rowconv: 0
19+
rnn_dropout: 0.1
20+
fc_nlayers: 0
21+
fc_units: 1024
2522
```
2623
2724
## Architecture
@@ -30,24 +27,6 @@ model_config:
3027
3128
## Training and Testing
3229
33-
See `python examples/deepspeech2/run_ds2.py --help`
30+
See `python examples/deepspeech2/train_ds2.py --help`
3431

35-
## Results on VIVOS Dataset
36-
37-
* Features: Spectrogram with `80` frequency channels
38-
* KenLM: `alpha = 2.0` and `beta = 1.0`
39-
* Epochs: `20`
40-
* Train set split ratio: `90:10`
41-
* Augmentation: `None`
42-
* Model architecture: same as [vivos.yaml](./configs/vivos.yml)
43-
44-
**CTC Loss**
45-
46-
<img src="./figs/ds2_vivos_ctc_loss.svg" alt="ds2_vivos_ctc_loss" width="300px" />
47-
48-
**Error rates**
49-
50-
| | WER (%) | CER (%) |
51-
| :-------------- | :------------: | :------------: |
52-
| *BeamSearch* | 43.75243 | 17.991581 |
53-
| *BeamSearch LM* | **20.7561836** | **11.0304441** |
32+
See `python examples/deepspeech2/test_ds2.py --help`

examples/deepspeech2/configs/vivos.yml renamed to examples/deepspeech2/config.yml

Lines changed: 15 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ speech_config:
2424
normalize_per_feature: False
2525

2626
decoder_config:
27-
vocabulary: /mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt
27+
vocabulary: ./vocabularies/vietnamese.characters
2828
blank_at_zero: False
2929
beam_width: 500
3030
lm_config:
@@ -33,21 +33,20 @@ decoder_config:
3333
beta: 1.0
3434

3535
model_config:
36-
conv_conf:
37-
conv_type: 2
38-
conv_kernels: [[11, 41], [11, 21], [11, 11]]
39-
conv_strides: [[2, 2], [1, 2], [1, 2]]
40-
conv_filters: [32, 32, 96]
41-
conv_dropout: 0
42-
rnn_conf:
43-
rnn_layers: 5
44-
rnn_type: lstm
45-
rnn_units: 512
46-
rnn_bidirectional: True
47-
rnn_rowconv: False
48-
rnn_dropout: 0
49-
fc_conf:
50-
fc_units: null
36+
name: deepspeech2
37+
conv_type: conv2d
38+
conv_kernels: [[11, 41], [11, 21], [11, 11]]
39+
conv_strides: [[2, 2], [1, 2], [1, 2]]
40+
conv_filters: [32, 32, 96]
41+
conv_dropout: 0.1
42+
rnn_nlayers: 5
43+
rnn_type: lstm
44+
rnn_units: 512
45+
rnn_bidirectional: True
46+
rnn_rowconv: 0
47+
rnn_dropout: 0.1
48+
fc_nlayers: 0
49+
fc_units: 1024
5150

5251
learning_config:
5352
augmentations: null

examples/deepspeech2/figs/ds2_vivos_ctc_loss.svg

Lines changed: 0 additions & 1 deletion
This file was deleted.

examples/deepspeech2/model.py

Lines changed: 0 additions & 148 deletions
This file was deleted.

examples/deepspeech2/test_ds2.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
setup_environment()
2020
import tensorflow as tf
2121

22-
DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
22+
DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
2323

2424
tf.keras.backend.clear_session()
2525

@@ -54,7 +54,7 @@
5454
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
5555
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
5656
from tensorflow_asr.runners.base_runners import BaseTester
57-
from model import DeepSpeech2
57+
from tensorflow_asr.models.deepspeech2 import DeepSpeech2
5858

5959
tf.random.set_seed(0)
6060
assert args.export
@@ -63,13 +63,10 @@
6363
speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
6464
text_featurizer = CharFeaturizer(config["decoder_config"])
6565
# Build DS2 model
66-
ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
67-
arch_config=config["model_config"],
68-
num_classes=text_featurizer.num_classes,
69-
name="deepspeech2")
66+
ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
7067
ds2_model._build(speech_featurizer.shape)
7168
ds2_model.load_weights(args.saved, by_name=True)
72-
ds2_model.summary(line_length=150)
69+
ds2_model.summary(line_length=120)
7370
ds2_model.add_featurizers(speech_featurizer, text_featurizer)
7471

7572
if args.tfrecords:

examples/deepspeech2/train_ds2.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
setup_environment()
2020
import tensorflow as tf
2121

22-
DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
22+
DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
2323

2424
tf.keras.backend.clear_session()
2525

@@ -60,7 +60,7 @@
6060
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
6161
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
6262
from tensorflow_asr.runners.ctc_runners import CTCTrainer
63-
from model import DeepSpeech2
63+
from tensorflow_asr.models.deepspeech2 import DeepSpeech2
6464

6565
config = UserConfig(DEFAULT_YAML, args.config, learning=True)
6666
speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
@@ -100,12 +100,9 @@
100100
ctc_trainer = CTCTrainer(text_featurizer, config["learning_config"]["running_config"])
101101
# Build DS2 model
102102
with ctc_trainer.strategy.scope():
103-
ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
104-
arch_config=config["model_config"],
105-
num_classes=text_featurizer.num_classes,
106-
name="deepspeech2")
103+
ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
107104
ds2_model._build(speech_featurizer.shape)
108-
ds2_model.summary(line_length=150)
105+
ds2_model.summary(line_length=120)
109106
# Compile
110107
ctc_trainer.compile(ds2_model, config["learning_config"]["optimizer_config"],
111108
max_to_keep=args.max_ckpts)

examples/jasper/README.md

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
# Jasper
2+
3+
References: [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
4+
5+
## Model YAML Config Structure
6+
7+
```yaml
8+
model_config:
9+
10+
```
11+
12+
## Architecture
13+
14+
<img src="./figs/jasper_arch.png" alt="jasper_arch" width="500px" />
15+
16+
## Training and Testing
17+
18+
See `python examples/jasper/train_jasper.py --help`
19+
20+
See `python examples/jasper/test_jasper.py --help`

0 commit comments

Comments
 (0)