Merge pull request #40 from TensorSpeech/fix/stft

nglehuy · web-flow · commit fbf97575e2b2 · 2020-11-03T21:35:22.000+07:00
Reduce differences between librosa and tf.signal STFT output by disabling frame centering (center=False) in the librosa stft and piptrack calls
diff --git a/README.md b/README.md
@@ -19,6 +19,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as
 
 ## What's New?
 
+- (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft`
 - (10/31/2020) Update DeepSpeech2 and Supported Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
 - (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621)
 - (10/15/2020) Add gradients accumulation and Refactor to TensorflowASR
diff --git a/setup.py b/setup.py
@@ -37,7 +37,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.2.9",
+    version="0.2.10",
     author="Huy Le Nguyen",
     author_email="nlhuy.cs.16@gmail.com",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
diff --git a/tensorflow_asr/featurizers/speech_featurizers.py b/tensorflow_asr/featurizers/speech_featurizers.py
@@ -245,7 +245,7 @@ def shape(self) -> list:
     def stft(self, signal):
         return np.square(
             np.abs(librosa.core.stft(signal, n_fft=self.nfft, hop_length=self.frame_step,
-                                     win_length=self.frame_length, center=True, window="hann")))
+                                     win_length=self.frame_length, center=False, window="hann")))
 
     def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
         return librosa.power_to_db(S, ref=ref, amin=amin, top_db=top_db)
@@ -302,7 +302,7 @@ def compute_pitch(self, signal: np.ndarray) -> np.ndarray:
         pitches, _ = librosa.core.piptrack(
             y=signal, sr=self.sample_rate,
             n_fft=self.nfft, hop_length=self.frame_step,
-            fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=True
+            fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=False
         )
 
         pitches = pitches.T
diff --git a/tests/speech_featurizer_test.py b/tests/speech_featurizer_test.py
@@ -25,25 +25,11 @@
 import matplotlib.pyplot as plt
 from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio, \
     TFSpeechFeaturizer, NumpySpeechFeaturizer
-from tensorflow_asr.augmentations.augments import UserAugmentation
 
 
 def main(argv):
     speech_file = argv[1]
     feature_type = argv[2]
-    augments = {
-        # "after": {
-        #     "time_masking": {
-        #         "num_masks": 10,
-        #         "mask_factor": 100,
-        #         "p_upperbound": 0.05
-        #     },
-        #     "freq_masking": {
-        #         "mask_factor": 27
-        #     }
-        # },
-    }
-    au = UserAugmentation(augments)
     speech_conf = {
         "sample_rate": 16000,
         "frame_ms": 25,
@@ -57,9 +43,13 @@ def main(argv):
     }
     signal = read_raw_audio(speech_file, speech_conf["sample_rate"])
 
-    sf = NumpySpeechFeaturizer(speech_conf)
+    nsf = NumpySpeechFeaturizer(speech_conf)
+    sf = TFSpeechFeaturizer(speech_conf)
+    ft = nsf.stft(signal)
+    print(ft.shape, np.mean(ft))
+    ft = sf.stft(signal).numpy()
+    print(ft.shape, np.mean(ft))
     ft = sf.extract(signal)
-    ft = au["after"].augment(ft)[:, :, 0]
 
     plt.figure(figsize=(16, 2.5))
     ax = plt.gca()