Merge pull request #930 from Trusted-AI/fix/psychoacoustic-model

beat-buesser · web-flow · commit 52bdc5f651bc · 2021-03-10T19:13:13.000Z
Improve Psychoacoustic model and some ASR maintenance
diff --git a/art/attacks/evasion/adversarial_asr.py b/art/attacks/evasion/adversarial_asr.py
@@ -68,7 +68,7 @@ def __init__(
         """
         # pylint: disable=W0231
 
-        # re-implement init such that inherrited methods work
+        # re-implement init such that inherited methods work
         EvasionAttack.__init__(self, estimator=estimator)  # pylint: disable=W0233
         self.masker = None
         self.eps = eps
@@ -81,5 +81,12 @@ def __init__(
         # set remaining stage 2 params to some random values
         self.alpha = 0.1
         self.learning_rate_2 = 0.1
+        self.loss_theta_min = 0.0
+        self.decrease_factor_eps: float = 1.0
+        self.num_iter_decrease_eps: int = 1
+        self.increase_factor_alpha: float = 1.0
+        self.num_iter_increase_alpha: int = 1
+        self.decrease_factor_alpha: float = 1.0
+        self.num_iter_decrease_alpha: int = 1
 
         self._check_params()
diff --git a/art/attacks/evasion/imperceptible_asr/imperceptible_asr.py b/art/attacks/evasion/imperceptible_asr/imperceptible_asr.py
diff --git a/art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py b/art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py
@@ -658,27 +658,19 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
         hop_length = int(sample_rate * window_stride)
         win_length = n_fft
 
-        window = self.estimator.model.audio_conf.window.value
+        window_name = self.estimator.model.audio_conf.window.value
 
-        if window == "hamming":
-            window_fn = scipy.signal.windows.hamming
-        elif window == "hann":
-            window_fn = scipy.signal.windows.hann
-        elif window == "blackman":
-            window_fn = scipy.signal.windows.blackman
-        elif window == "bartlett":
-            window_fn = scipy.signal.windows.bartlett
-        else:
-            raise NotImplementedError("Spectrogram window %s not supported." % window)
+        window = scipy.signal.get_window(window_name, win_length, fftbins=True)
 
         transformed_x = librosa.core.stft(
-            y=x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window_fn, center=False
+            y=x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, center=False
         )
         transformed_x *= np.sqrt(8.0 / 3.0)
 
         psd = abs(transformed_x / win_length)
         original_max_psd = np.max(psd * psd)
-        psd = 10 * np.log10(psd * psd + 10e-20)
+        with np.errstate(divide='ignore'):
+            psd = (20 * np.log10(psd)).clip(min=-200)
         psd = 96 - np.max(psd) + psd
 
         # Compute freqs and barks
diff --git a/art/estimators/speech_recognition/pytorch_deep_speech.py b/art/estimators/speech_recognition/pytorch_deep_speech.py
@@ -275,10 +275,9 @@ def predict(
         :param batch_size: Batch size.
         :param transcription_output: Indicate whether the function will produce probability or transcription as
                                      prediction output. If transcription_output is not available, then probability
-                                     output is returned.
-        :type transcription_output: `bool`
+                                     output is returned. Default: True
         :return: Predicted probability (if transcription_output False) or transcription (default, if
-                 transcription_output is True or None):
+                 transcription_output is True):
                  - Probability return is a tuple of (probs, sizes), where `probs` is the probability of characters of
                  shape (nb_samples, seq_length, nb_classes) and `sizes` is the real sequence length of shape
                  (nb_samples,).
@@ -346,9 +345,9 @@ def predict(
         result_outputs[batch_idx] = result_outputs_
 
         # Check if users want transcription outputs
-        transcription_output = kwargs.get("transcription_output")
+        transcription_output = kwargs.get("transcription_output", True)
 
-        if transcription_output is None or transcription_output is False:
+        if transcription_output is False:
             return result_outputs, result_output_sizes
 
         # Now users want transcription outputs
diff --git a/art/estimators/speech_recognition/tensorflow_lingvo.py b/art/estimators/speech_recognition/tensorflow_lingvo.py
@@ -513,7 +513,9 @@ def _loss_gradient_per_batch(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
             gradient = gradient_padded[:length]
             gradients.append(gradient)
 
-        return np.array(gradients, dtype=object)
+        # ragged input (x.ndim == 1) yields gradients of differing lengths, which only fit in an object array
+        dtype = np.float32 if x.ndim != 1 else np.object
+        return np.array(gradients, dtype=dtype)
 
     def _loss_gradient_per_sequence(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
         """
@@ -539,7 +541,9 @@ def _loss_gradient_per_sequence(self, x: np.ndarray, y: np.ndarray) -> np.ndarra
             gradient = self._sess.run(self._loss_gradient_op, feed_dict)
             gradients.append(np.squeeze(gradient))
 
-        return np.array(gradients, dtype=object)
+        # ragged input (x.ndim == 1) yields gradients of differing lengths, which only fit in an object array
+        dtype = np.float32 if x.ndim != 1 else np.object
+        return np.array(gradients, dtype=dtype)
 
     def set_learning_phase(self, train: bool) -> None:
         raise NotImplementedError
diff --git a/art/utils.py b/art/utils.py
@@ -1231,7 +1231,8 @@ def pad_sequence_input(x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     max_length = max(map(len, x))
     batch_size = x.shape[0]
 
-    x_padded = np.zeros((batch_size, max_length))
+    # note: take the dtype from the first sequence so padding keeps it (np.zeros would default to float64)
+    x_padded = np.zeros((batch_size, max_length), dtype=x[0].dtype)
     x_mask = np.zeros((batch_size, max_length), dtype=bool)
 
     for i, x_i in enumerate(x):
diff --git a/requirements.txt b/requirements.txt
@@ -12,6 +12,7 @@ resampy==0.2.2
 ffmpeg-python==0.2.0
 cma==3.0.3
 pandas==1.1.4
+librosa==0.8.0
 
 # frameworks
 h5py==2.10.0
diff --git a/tests/attacks/evasion/test_imperceptible_asr.py b/tests/attacks/evasion/test_imperceptible_asr.py
@@ -139,7 +139,11 @@ def test_loss_gradient_masking_threshold(self, art_warning, asr_dummy_estimator,
             test_delta = test_input * 0
 
             imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
-            loss_gradient, loss = imperceptible_asr._loss_gradient_masking_threshold(test_delta, test_input)
+
+            masking_threshold, psd_maximum = imperceptible_asr._stabilized_threshold_and_psd_maximum(test_input)
+            loss_gradient, loss = imperceptible_asr._loss_gradient_masking_threshold(
+                test_delta, test_input, masking_threshold, psd_maximum
+            )
 
             assert [g.shape for g in loss_gradient] == [d.shape for d in test_delta]
             assert loss.ndim == 1 and loss.shape == test_delta.shape
@@ -154,7 +158,7 @@ def test_loss_gradient_masking_threshold_tf(self, art_warning, asr_dummy_estimat
             tf1.reset_default_graph()
 
             test_delta = audio_batch_padded
-            test_psd_maximum = np.ones((test_delta.shape[0], 28))
+            test_psd_maximum = np.ones((test_delta.shape[0]))
             test_masking_threshold = np.zeros((test_delta.shape[0], 1025, 28))
 
             imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
@@ -175,7 +179,7 @@ def test_loss_gradient_masking_threshold_tf(self, art_warning, asr_dummy_estimat
     def test_loss_gradient_masking_threshold_torch(self, art_warning, asr_dummy_estimator, audio_batch_padded):
         try:
             test_delta = audio_batch_padded
-            test_psd_maximum = np.ones((test_delta.shape[0], 28))
+            test_psd_maximum = np.ones((test_delta.shape[0], 1, 1))
             test_masking_threshold = np.zeros((test_delta.shape[0], 1025, 28))
 
             imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
@@ -196,7 +200,7 @@ def test_approximate_power_spectral_density_tf(self, art_warning, asr_dummy_esti
             tf1.reset_default_graph()
 
             test_delta = audio_batch_padded
-            test_psd_maximum = np.ones((test_delta.shape[0], 28))
+            test_psd_maximum = np.ones((test_delta.shape[0]))
 
             masker = PsychoacousticMasker()
             imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=masker)
@@ -223,7 +227,7 @@ def test_approximate_power_spectral_density_torch(self, art_warning, asr_dummy_e
             import torch
 
             test_delta = audio_batch_padded
-            test_psd_maximum = np.ones((test_delta.shape[0], 28))
+            test_psd_maximum = np.ones((test_delta.shape[0], 1, 1))
 
             masker = PsychoacousticMasker()
             imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=masker)
@@ -252,8 +256,8 @@ def test_power_spectral_density(self, art_warning, audio_sample):
             masker = PsychoacousticMasker()
             psd_matrix, psd_max = masker.power_spectral_density(test_input)
 
-            assert psd_matrix.shape[0] == masker.window_size // 2 + 1
-            assert psd_matrix.shape[1] == psd_max.shape[0]
+            assert psd_matrix.shape == (masker.window_size // 2 + 1, 28)
+            assert np.floor(psd_max) == 78.0
         except ARTTestException as e:
             art_warning(e)
 
@@ -283,8 +287,8 @@ def test_filter_maskers(self, art_warning):
             masker = PsychoacousticMasker()
             maskers, masker_idx = masker.filter_maskers(test_maskers, test_masker_idx)
 
-            assert masker_idx.tolist() == [9]
-            assert maskers.tolist() == [91]
+            assert masker_idx.tolist() == [2]
+            assert maskers.tolist() == [96]
         except ARTTestException as e:
             art_warning(e)
 
@@ -328,7 +332,7 @@ def test_calculate_threshold_and_psd_maximum(self, art_warning, audio_sample):
             masker = PsychoacousticMasker()
             threshold, psd_max = masker.calculate_threshold_and_psd_maximum(test_input)
 
-            assert threshold.shape[1] == psd_max.shape[0]
-            assert threshold.shape[0] == masker.window_size // 2 + 1
+            assert threshold.shape == (masker.window_size // 2 + 1, 28)
+            assert np.floor(psd_max) == 78.0
         except ARTTestException as e:
             art_warning(e)