
Commit c9f3bde

Merge pull request #621 from Trusted-AI/fix_asr_ds

Fix bugs for ASR and DeepSpeech

2 parents 99b51c2 + cef22fc

File tree

3 files changed (+32, -19 lines)

art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py

Lines changed: 15 additions & 3 deletions
@@ -25,7 +25,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
-from typing import Optional, Tuple, TYPE_CHECKING
+from typing import Tuple, Optional, Union, TYPE_CHECKING
 
 import numpy as np
 import scipy
@@ -105,7 +105,7 @@ def __init__(
         batch_size: int = 32,
         use_amp: bool = False,
         opt_level: str = "O1",
-        loss_scale: int = 1,
+        loss_scale: Optional[Union[float, str]] = 1.0,
     ):
         """
         Create a :class:`.ImperceptibleASRPytorch` instance.
@@ -144,9 +144,10 @@ def __init__(
                          only triggered if there are GPUs available.
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                           values are `O0`, `O1`, `O2`, and `O3`.
-        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1 due to warp-ctc not supporting
+        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
                            scaling of gradients.
         """
+        import torch  # lgtm [py/repeated-import]
         from torch.autograd import Variable
 
         if (
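The widened `loss_scale` annotation matches what NVIDIA apex's `amp.initialize` actually accepts: a numeric scale or the string `"dynamic"`. A minimal sketch of how `opt_level` and `loss_scale` are typically forwarded to apex; the toy model and optimizer are placeholders, not code from this commit, and apex plus a CUDA device are assumed:

```python
# Minimal sketch of forwarding opt_level / loss_scale to NVIDIA apex.
# The linear model and SGD optimizer are placeholders.
import torch
from apex import amp  # assumes apex is installed

model = torch.nn.Linear(161, 29).cuda()  # apex O1 expects a CUDA model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model, optimizer = amp.initialize(
    model,
    optimizer,
    opt_level="O1",   # mixed precision, the default used above
    loss_scale=1.0,   # a float, or the string "dynamic"
)
```

A fixed scale of 1.0 effectively disables loss scaling, which matters here because warp-ctc cannot rescale its gradients.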
@@ -237,6 +238,8 @@ def generate(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
                       class only supports targeted attack.
         :return: An array holding the adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Start to compute adversarial examples
         adv_x = x.copy()
 
@@ -276,6 +279,8 @@ def _generate_batch(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
                       class only supports targeted attack.
         :return: A batch of adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # First stage of attack
         successful_adv_input_1st_stage, original_input = self._attack_1st_stage(x=x, y=y)
         successful_perturbation_1st_stage = successful_adv_input_1st_stage - torch.tensor(original_input).to(
@@ -325,6 +330,8 @@ class only supports targeted attack.
             - A tensor holding the candidate adversarial examples.
             - An array holding the original inputs.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute local shape
         local_batch_size = len(x)
         real_lengths = np.array([x_.shape[0] for x_ in x])
@@ -493,6 +500,8 @@ class only supports targeted attack.
         :param original_max_psd_batch: Original maximum psd.
         :return: An array holding the candidate adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute local shape
         local_batch_size = len(x)
         real_lengths = np.array([x_.shape[0] for x_ in x])
@@ -596,6 +605,8 @@ def _forward_2nd_stage(
         :param original_max_psd_batch: Original maximum psd.
         :return: The loss tensor of the second stage of the attack.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute loss for masking threshold
         losses = []
         relu = torch.nn.ReLU()
@@ -744,6 +755,7 @@ def _psd_transform(self, delta: "torch.Tensor", original_max_psd: "torch.Tensor"
         :param original_max_psd: The maximum psd of the original audio.
         :return: The psd matrix.
         """
+        import torch  # lgtm [py/repeated-import]
         import torchaudio
 
         # These parameters are needed for the transformation
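All of the `import torch  # lgtm [py/repeated-import]` additions above follow one pattern: the framework import moves from module scope into each method that needs it, so the module can be imported on machines without PyTorch installed, and the `lgtm` comment silences the static analyzer's repeated-import alert. A minimal sketch of the pattern with a hypothetical class, not part of ART:

```python
# Sketch of the deferred-import pattern used throughout the file above.
# DemoAttack is a hypothetical stand-in, not an ART class.
import numpy as np


class DemoAttack:
    """Importing this module never touches torch."""

    def generate(self, x: np.ndarray) -> np.ndarray:
        # torch is imported only when the method actually runs
        import torch  # lgtm [py/repeated-import]

        return (torch.from_numpy(x) + 0.01).numpy()
```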

art/estimators/speech_recognition/pytorch_deep_speech.py

Lines changed: 7 additions & 16 deletions
@@ -61,7 +61,7 @@ def __init__(
         optimizer: Optional["torch.optim.Optimizer"] = None,  # type: ignore
         use_amp: bool = False,
         opt_level: str = "O1",
-        loss_scale: int = 1,
+        loss_scale: Optional[Union[float, str]] = 1.0,
         decoder_type: str = "greedy",
         lm_path: str = "",
         top_paths: int = 1,
@@ -93,7 +93,7 @@ def __init__(
                          only triggered if there are GPUs available.
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                           values are `O0`, `O1`, `O2`, and `O3`.
-        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1 due to warp-ctc not supporting
+        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
                            scaling of gradients.
         :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                              transcription outputs.
@@ -440,14 +440,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int
         self._model.train()
 
         if self._optimizer is None:
-            logger.warning(
-                "An optimizer is needed to use the automatic mixed precision tool, but none for provided. "
-                "A default optimizer is used."
-            )
-
-            # Create the optimizers
-            parameters = self._model.parameters()
-            self._optimizer = torch.optim.SGD(parameters, lr=0.01)
+            raise ValueError("An optimizer is required to train the model, but none was provided.")
 
         # Apply preprocessing
         x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y, fit=True)
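With the silent SGD fallback gone, `fit` now fails fast instead of training with an optimizer the user never chose. A hedged sketch of the caller-side consequence; the `pretrained_model` argument and the dummy inputs are illustrative assumptions, not taken from this commit, while the `model` property and `_optimizer` attribute appear in the test changes below:

```python
# Sketch: callers must now attach an optimizer before calling fit().
# pretrained_model="librispeech" and the dummy inputs are assumptions.
import numpy as np
import torch

from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
speech_recognizer._optimizer = torch.optim.SGD(
    speech_recognizer.model.parameters(), lr=0.01
)

x = np.array([np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)])
y = np.array(["HELLO WORLD"])
speech_recognizer.fit(x, y, batch_size=1, nb_epochs=1)  # raises ValueError if _optimizer is None
```

Alternatively, the `optimizer` argument in the `__init__` signature above can be supplied at construction time.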
@@ -583,19 +576,17 @@ def transform_model_input(
             # Push the sequence to device
             if not tensor_input:
                 x[i] = x[i].astype(ART_NUMPY_DTYPE)
-                x_i_tensor = torch.tensor(x[i]).to(self._device)
-            else:
-                x_i_tensor = x[i]
+                x[i] = torch.tensor(x[i]).to(self._device)
 
             # Set gradient computation permission
             if compute_gradient:
-                x_i_tensor.requires_grad = True
+                x[i].requires_grad = True
 
             # Transform the sequence into the frequency space
             if tensor_input and real_lengths is not None:
-                transformed_input = transformer(x_i_tensor[: real_lengths[i]])
+                transformed_input = transformer(x[i][: real_lengths[i]])
             else:
-                transformed_input = transformer(x[i])
+                transformed_input = transformer(x[i])
 
             spectrogram, _ = torchaudio.functional.magphase(transformed_input)
             spectrogram = torch.log1p(spectrogram)
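This hunk stores the converted tensor back into `x[i]` instead of a local `x_i_tensor`, so the object that receives `requires_grad = True` is the same one kept in the list, which later gradient reads presumably depend on. A toy illustration of the difference, not the estimator's real preprocessing:

```python
# Toy illustration: the gradient-enabled tensor must be the one kept
# in the list, not a local temporary.
import numpy as np
import torch

x = [np.zeros(4, dtype=np.float32)]

# After the fix: the converted tensor replaces the list entry ...
x[0] = torch.tensor(x[0])
x[0].requires_grad = True

# ... so gradients computed downstream are reachable through x[0].
loss = (x[0] ** 2).sum()
loss.backward()
print(x[0].grad)  # tensor([0., 0., 0., 0.])

# Before the fix, a local `x_i_tensor` held requires_grad=True while
# x[0] remained a plain numpy array with no .grad at all.
```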

tests/estimators/speech_recognition/test_pytorch_deep_speech.py

Lines changed: 10 additions & 0 deletions
@@ -121,6 +121,8 @@ def test_all(self, _test_all):
     @pytest.fixture(params=[False, True])
     def _test_all(self, request, setup_class):
         # Only import if deep speech module is available
+        import torch
+
         from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
 
         # Test probability outputs
@@ -356,6 +358,10 @@ def _test_all(self, request, setup_class):
 
         # Now test fit function
         if request.param is True:
+            # Create the optimizer
+            parameters = self.speech_recognizer_amp.model.parameters()
+            self.speech_recognizer_amp._optimizer = torch.optim.SGD(parameters, lr=0.01)
+
             # Before train
             transcriptions1 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)
 
@@ -368,6 +374,10 @@ def _test_all(self, request, setup_class):
             assert not ((transcriptions1 == transcriptions2).all())
 
         else:
+            # Create the optimizer
+            parameters = self.speech_recognizer.model.parameters()
+            self.speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)
+
             # Before train
             transcriptions1 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)
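Note that these tests attach the optimizer through the private `_optimizer` attribute after construction, mirroring the removed fallback in `fit`; user code can achieve the same more cleanly by passing the `optimizer` argument that `PyTorchDeepSpeech.__init__` exposes (see the first hunk of `pytorch_deep_speech.py` above).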
