Skip to content

Commit fbf9757

Browse files
authored
Merge pull request #40 from TensorSpeech/fix/stft
Reduce differences between librosa and tf.signal
2 parents 7481678 + 7231190 commit fbf9757

File tree

4 files changed: +10 additions, -19 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as
1919

2020
## What's New?
2121

22+
- (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft`
2223
- (10/31/2020) Update DeepSpeech2 and support Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
2324
- (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621)
2425
- (10/15/2020) Add gradient accumulation and refactor to TensorflowASR

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
setuptools.setup(
3939
name="TensorFlowASR",
40-
version="0.2.9",
40+
version="0.2.10",
4141
author="Huy Le Nguyen",
4242
author_email="[email protected]",
4343
description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",

tensorflow_asr/featurizers/speech_featurizers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def shape(self) -> list:
245245
def stft(self, signal):
246246
return np.square(
247247
np.abs(librosa.core.stft(signal, n_fft=self.nfft, hop_length=self.frame_step,
248-
win_length=self.frame_length, center=True, window="hann")))
248+
win_length=self.frame_length, center=False, window="hann")))
249249

250250
def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
251251
return librosa.power_to_db(S, ref=ref, amin=amin, top_db=top_db)
@@ -302,7 +302,7 @@ def compute_pitch(self, signal: np.ndarray) -> np.ndarray:
302302
pitches, _ = librosa.core.piptrack(
303303
y=signal, sr=self.sample_rate,
304304
n_fft=self.nfft, hop_length=self.frame_step,
305-
fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=True
305+
fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=False
306306
)
307307

308308
pitches = pitches.T

tests/speech_featurizer_test.py

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,25 +25,11 @@
2525
import matplotlib.pyplot as plt
2626
from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio, \
2727
TFSpeechFeaturizer, NumpySpeechFeaturizer
28-
from tensorflow_asr.augmentations.augments import UserAugmentation
2928

3029

3130
def main(argv):
3231
speech_file = argv[1]
3332
feature_type = argv[2]
34-
augments = {
35-
# "after": {
36-
# "time_masking": {
37-
# "num_masks": 10,
38-
# "mask_factor": 100,
39-
# "p_upperbound": 0.05
40-
# },
41-
# "freq_masking": {
42-
# "mask_factor": 27
43-
# }
44-
# },
45-
}
46-
au = UserAugmentation(augments)
4733
speech_conf = {
4834
"sample_rate": 16000,
4935
"frame_ms": 25,
@@ -57,9 +43,13 @@ def main(argv):
5743
}
5844
signal = read_raw_audio(speech_file, speech_conf["sample_rate"])
5945

60-
sf = NumpySpeechFeaturizer(speech_conf)
46+
nsf = NumpySpeechFeaturizer(speech_conf)
47+
sf = TFSpeechFeaturizer(speech_conf)
48+
ft = nsf.stft(signal)
49+
print(ft.shape, np.mean(ft))
50+
ft = sf.stft(signal).numpy()
51+
print(ft.shape, np.mean(ft))
6152
ft = sf.extract(signal)
62-
ft = au["after"].augment(ft)[:, :, 0]
6353

6454
plt.figure(figsize=(16, 2.5))
6555
ax = plt.gca()

0 commit comments

Comments
 (0)