17
17
from __future__ import division
18
18
from __future__ import print_function
19
19
20
+ import functools
21
+ import multiprocessing
22
+
20
23
import numpy as np
21
24
import scipy .io .wavfile as wavfile
22
25
from six .moves import xrange # pylint: disable=redefined-builtin
23
26
import tensorflow as tf
24
27
25
- # pylint: disable=g-bad-import-order
26
- from data .featurizer import AudioFeaturizer
27
- from data .featurizer import TextFeaturizer
28
+ import data .featurizer as featurizer # pylint: disable=g-bad-import-order
28
29
29
30
30
31
class AudioConfig (object ):
@@ -44,7 +45,7 @@ def __init__(self,
44
45
frame_length: an integer for the length of a spectrogram frame, in ms.
45
46
frame_step: an integer for the frame stride, in ms.
46
47
fft_length: an integer for the number of fft bins.
47
- normalize: a boolean for whether apply normalization on the audio tensor .
48
+ normalize: a boolean for whether apply normalization on the audio feature .
48
49
spect_type: a string for the type of spectrogram to be extracted.
49
50
"""
50
51
@@ -78,90 +79,122 @@ def __init__(self, audio_config, data_path, vocab_file_path):
78
79
self .vocab_file_path = vocab_file_path
79
80
80
81
82
def _normalize_audio_feature(audio_feature):
  """Standardize a spectrogram feature along its first axis.

  The mean over axis 0 is subtracted and the result is divided by the
  standard deviation over axis 0 (plus a small constant, so that a zero
  variance does not cause a division by zero).

  Args:
    audio_feature: a numpy array for the spectrogram feature.

  Returns:
    a numpy array of the normalized spectrogram.
  """
  centered = audio_feature - np.mean(audio_feature, axis=0)
  scale = np.sqrt(np.var(audio_feature, axis=0)) + 1e-6
  return centered / scale
96
+
97
+
98
def _preprocess_audio(
    audio_file_path, audio_sample_rate, audio_featurizer, normalize):
  """Read a wav file and compute its spectrogram feature.

  Args:
    audio_file_path: path of the wav file to read.
    audio_sample_rate: the sample rate the wav file is asserted to have.
    audio_featurizer: object providing the `frame_length`, `frame_step` and
      `fft_length` used for the spectrogram.
    normalize: if true, the feature is passed through
      `_normalize_audio_feature` before being returned.

  Returns:
    the feature produced by `featurizer.compute_spectrogram_feature`,
    normalized when requested.
  """
  message = "Extracting spectrogram feature for {}".format(audio_file_path)
  tf.logging.info(message)

  file_sample_rate, samples = wavfile.read(audio_file_path)
  assert file_sample_rate == audio_sample_rate

  is_float = samples.dtype in [np.float32, np.float64]
  if not is_float:
    # Rescale non-float samples by the largest value of their dtype.
    samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max

  spectrogram = featurizer.compute_spectrogram_feature(
      samples,
      audio_featurizer.frame_length,
      audio_featurizer.frame_step,
      audio_featurizer.fft_length)
  return _normalize_audio_feature(spectrogram) if normalize else spectrogram
113
+
114
+
115
def _preprocess_transcript(transcript, token_to_index):
  """Convert a transcript into its label feature.

  Args:
    transcript: the transcript text of one example.
    token_to_index: the mapping from character to its index.

  Returns:
    the result of `featurizer.compute_label_feature` for the transcript.
  """
  label_feature = featurizer.compute_label_feature(transcript, token_to_index)
  return label_feature
118
+
119
+
120
def _preprocess_data(dataset_config, audio_featurizer, token_to_index):
  """Generate the lists of audio features and transcript labels.

  Each dataset file contains three tab-separated columns: "wav_filename",
  "wav_filesize", and "transcript". This function parses the file and stores
  each example by the increasing order of audio length (indicated by
  wav_filesize). As the waveforms are ordered in increasing length, audio
  samples in a mini-batch have similar length.

  Args:
    dataset_config: an instance of DatasetConfig.
    audio_featurizer: an instance of AudioFeaturizer.
    token_to_index: the mapping from character to its index.

  Returns:
    features and labels array processed from the audio/text input, both in
    the same (sorted) example order.
  """
  file_path = dataset_config.data_path
  sample_rate = dataset_config.audio_config.sample_rate
  normalize = dataset_config.audio_config.normalize

  with tf.gfile.Open(file_path, "r") as f:
    lines = f.read().splitlines()

  # Skip the csv header, and ignore blank rows so that a stray empty line
  # does not break the numeric sort key below.
  rows = [line.split("\t") for line in lines[1:] if line.strip()]
  # Sort input data by the length of waveform.
  rows.sort(key=lambda item: int(item[1]))

  # Use multiprocessing for feature/label extraction.
  pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
  try:
    features = pool.map(
        functools.partial(
            _preprocess_audio, audio_sample_rate=sample_rate,
            audio_featurizer=audio_featurizer, normalize=normalize),
        [row[0] for row in rows])
    labels = pool.map(
        functools.partial(
            _preprocess_transcript, token_to_index=token_to_index),
        [row[2] for row in rows])
  finally:
    # Always shut the workers down, even when a worker raised, so that no
    # child processes are left behind; join() reaps them after terminate().
    pool.terminate()
    pool.join()

  return features, labels
166
+
167
+
81
168
class DeepSpeechDataset(object):
  """Dataset class for training/evaluation of DeepSpeech model."""

  def __init__(self, dataset_config):
    """Initialize the DeepSpeechDataset class.

    Builds the audio and text featurizers from the config and then extracts
    the features and labels of every example through the module-level
    `_preprocess_data` function.

    Args:
      dataset_config: DatasetConfig object.
    """
    self.config = dataset_config
    # Instantiate audio feature extractor.
    self.audio_featurizer = featurizer.AudioFeaturizer(
        sample_rate=self.config.audio_config.sample_rate,
        frame_length=self.config.audio_config.frame_length,
        frame_step=self.config.audio_config.frame_step,
        fft_length=self.config.audio_config.fft_length)
    # Instantiate text feature extractor.
    self.text_featurizer = featurizer.TextFeaturizer(
        vocab_file=self.config.vocab_file_path)

    self.speech_labels = self.text_featurizer.speech_labels
    self.features, self.labels = _preprocess_data(
        self.config,
        self.audio_featurizer,
        self.text_featurizer.token_to_idx
    )

    # Second dimension of the first feature; None when the dataset is empty.
    self.num_feature_bins = (
        self.features[0].shape[1] if self.features else None)
164
-
165
198
166
199
def input_fn (batch_size , deep_speech_dataset , repeat = 1 ):
167
200
"""Input function for model training and evaluation.
0 commit comments