Trusted-AI
diff --git a/art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py b/art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py
Lines changed: 13 additions & 22 deletions
diff --git a/art/estimators/speech_recognition/pytorch_deep_speech.py b/art/estimators/speech_recognition/pytorch_deep_speech.py
Lines changed: 47 additions & 6 deletions
diff --git a/art/preprocessing/l_filter/l_filter_pytorch.py b/art/preprocessing/l_filter/l_filter_pytorch.py
Lines changed: 8 additions & 2 deletions
@@ -36,6 +36,7 @@
 from art.estimators.pytorch import PyTorchEstimator
 from art.estimators.speech_recognition.speech_recognizer import SpeechRecognizerMixin
 from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
+from art.config import ART_NUMPY_DTYPE
 
 if TYPE_CHECKING:
     import torch
@@ -147,12 +148,6 @@ def __init__(
         import torch  # lgtm [py/repeated-import]
         from torch.autograd import Variable
 
-        if estimator.preprocessing:
-            raise NotImplementedError(
-                "The framework-specific implementation currently does not apply preprocessing and "
-                "preprocessing defences."
-            )
-
         super().__init__(estimator=estimator)
 
         # Set attack attributes
@@ -343,11 +338,11 @@ class only supports targeted attack.
         local_max_length = np.max(real_lengths)
 
         # Initialize rescale
-        rescale = np.ones([local_batch_size, local_max_length], dtype=np.float32) * self.initial_rescale
+        rescale = np.ones([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE) * self.initial_rescale
 
         # Reformat input
-        input_mask = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
-        original_input = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
+        input_mask = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
+        original_input = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
 
         for local_batch_size_idx in range(local_batch_size):
             input_mask[local_batch_size_idx, : len(x[local_batch_size_idx])] = 1
@@ -455,12 +450,8 @@ def _forward_1st_stage(
         masked_adv_input = adv_input * torch.tensor(input_mask).to(self.estimator.device)
 
         # Transform data into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.estimator.transform_model_input(
-            x=masked_adv_input.to(self.estimator.device),
-            y=original_output,
-            compute_gradient=False,
-            tensor_input=True,
-            real_lengths=real_lengths,
+        inputs, targets, input_rates, target_sizes, batch_idx = self.estimator.preprocess_transform_model_input(
+            x=masked_adv_input.to(self.estimator.device), y=original_output, real_lengths=real_lengths,
         )
 
         # Compute real input sizes
@@ -513,12 +504,12 @@ class only supports targeted attack.
         local_max_length = np.max(real_lengths)
 
         # Initialize alpha and rescale
-        alpha = np.array([self.initial_alpha] * local_batch_size, dtype=np.float32)
-        rescale = np.ones([local_batch_size, local_max_length], dtype=np.float32) * self.initial_rescale
+        alpha = np.array([self.initial_alpha] * local_batch_size, dtype=ART_NUMPY_DTYPE)
+        rescale = np.ones([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE) * self.initial_rescale
 
         # Reformat input
-        input_mask = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
-        original_input = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
+        input_mask = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
+        original_input = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
 
         for local_batch_size_idx in range(local_batch_size):
             input_mask[local_batch_size_idx, : len(x[local_batch_size_idx])] = 1
@@ -675,7 +666,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
         barks = 13 * np.arctan(0.00076 * freqs) + 3.5 * np.arctan(pow(freqs / 7500.0, 2))
 
         # Compute quiet threshold
-        ath = np.zeros(len(barks), dtype=np.float32) - np.inf
+        ath = np.zeros(len(barks), dtype=ART_NUMPY_DTYPE) - np.inf
         bark_idx = np.argmax(barks > 1)
         ath[bark_idx:] = (
             3.64 * pow(freqs[bark_idx:] * 0.001, -0.8)
@@ -697,7 +688,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
             if len(psd[:, i]) - 1 in masker_idx:
                 masker_idx = np.delete(masker_idx, len(psd[:, i]) - 1)
 
-            barks_psd = np.zeros([len(masker_idx), 3], dtype=np.float32)
+            barks_psd = np.zeros([len(masker_idx), 3], dtype=ART_NUMPY_DTYPE)
             barks_psd[:, 0] = barks[masker_idx]
             barks_psd[:, 1] = 10 * np.log10(
                 pow(10, psd[:, i][masker_idx - 1] / 10.0)
@@ -739,7 +730,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
             for m in range(barks_psd.shape[0]):
                 d_z = barks - barks_psd[m, 0]
                 zero_idx = np.argmax(d_z > 0)
-                s_f = np.zeros(len(d_z), dtype=np.float32)
+                s_f = np.zeros(len(d_z), dtype=ART_NUMPY_DTYPE)
                 s_f[:zero_idx] = 27 * d_z[:zero_idx]
                 s_f[zero_idx:] = (-27 + 0.37 * max(barks_psd[m, 1] - 40, 0)) * d_z[zero_idx:]
                 t_s.append(barks_psd[m, 1] + delta[m] + s_f)
 
@@ -295,7 +295,7 @@ def predict(
         x_preprocessed, _ = self._apply_preprocessing(x_, y=None, fit=False)
 
         # Transform x into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(x=x_preprocessed)
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(x=x_preprocessed)
 
         # Compute real input sizes
         input_sizes = input_rates.mul_(inputs.size()[-1]).int()
@@ -323,7 +323,8 @@ def predict(
 
         # Aggregate results
         result_outputs = np.zeros(
-            (x_preprocessed.shape[0], result_output_sizes.max(), results[0].shape[-1]), dtype=np.float32
+            shape=(x_preprocessed.shape[0], result_output_sizes.max(), results[0].shape[-1]),
+            dtype=config.ART_NUMPY_DTYPE,
         )
 
         for m in range(num_batch):
@@ -345,7 +346,7 @@ def predict(
         # Check if users want transcription outputs
         transcription_output = kwargs.get("transcription_output")
 
-        if transcription_output is False:
+        if transcription_output is None or transcription_output is False:
             return result_outputs, result_output_sizes
 
         # Now users want transcription outputs
@@ -381,7 +382,7 @@ def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
         x_preprocessed, y_preprocessed = self._apply_preprocessing(x_, y, fit=False)
 
         # Transform data into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
             x=x_preprocessed, y=y_preprocessed, compute_gradient=True
         )
 
@@ -446,6 +447,9 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
 
         from warpctc_pytorch import CTCLoss
 
+        x_ = np.empty(len(x), dtype=object)
+        x_[:] = list(x)
+
         # Put the model in the training mode
         self._model.train()
 
@@ -481,7 +485,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
                 o_batch = y_preprocessed[ind[begin:end]]
 
                 # Transform data into the model input space
-                inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
+                inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
                     x=i_batch, y=o_batch, compute_gradient=False
                 )
 
@@ -512,7 +516,44 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
 
                 self._optimizer.step()
 
-    def transform_model_input(
+    def preprocess_transform_model_input(
+        self, x: "torch.Tensor", y: np.ndarray, real_lengths: np.ndarray,
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List]:
+        """
+        Apply preprocessing and then transform the user input space into the model input space. This function is used
+        by the imperceptible ASR attack to attack a PyTorchDeepSpeech estimator whose defences are called with the
+        `_apply_preprocessing` function; each sample of `x` is preprocessed separately and the results are stacked.
+
+        :param x: Samples of shape (nb_samples, seq_length).
+        :param y: Target values of shape (nb_samples). Each sample in `y` is a string and it may possess different
+                  lengths. A possible example of `y` could be: `y = np.array(['SIXTY ONE', 'HELLO'])`.
+        :param real_lengths: Real lengths of original sequences.
+        :return: A tuple of inputs and targets in the model space with the original index
+                 `(inputs, targets, input_percentages, target_sizes, batch_idx)`, where:
+                 - inputs: model inputs of shape (nb_samples, nb_frequencies, seq_length).
+                 - targets: ground truth targets of shape (sum over nb_samples of real seq_lengths).
+                 - input_percentages: percentages of real inputs in inputs.
+                 - target_sizes: list of real seq_lengths.
+                 - batch_idx: original index of inputs.
+        """
+        import torch  # lgtm [py/repeated-import]
+
+        # Apply preprocessing sample by sample (`no_grad=False` presumably keeps gradients enabled - TODO confirm)
+        x_batch = []
+        for i in range(len(x)):
+            preprocessed_x_i, _ = self._apply_preprocessing(x=x[i], y=None, no_grad=False)
+            x_batch.append(preprocessed_x_i)
+
+        x = torch.stack(x_batch)  # `torch.stack` needs every preprocessed sample to have the same shape
+
+        # Transform the input space; `tensor_input=True` because `x` is a stacked torch tensor at this point
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
+            x=x, y=y, compute_gradient=False, tensor_input=True, real_lengths=real_lengths,
+        )
+
+        return inputs, targets, input_rates, target_sizes, batch_idx
+
+    def _transform_model_input(
         self,
         x: Union[np.ndarray, "torch.Tensor"],
         y: Optional[np.ndarray] = None,
 
@@ -29,6 +29,7 @@
 from tqdm import tqdm
 
 from art.preprocessing.preprocessing import PreprocessorPyTorch
+from art.config import ART_NUMPY_DTYPE
 
 if TYPE_CHECKING:
     import torch
@@ -76,8 +77,8 @@ def __init__(
 
         self._apply_fit = apply_fit
         self._apply_predict = apply_predict
-        self.numerator_coef = numerator_coef.astype(np.float32)
-        self.denominator_coef = denominator_coef.astype(np.float32)
+        self.numerator_coef = numerator_coef.astype(ART_NUMPY_DTYPE)
+        self.denominator_coef = denominator_coef.astype(ART_NUMPY_DTYPE)
         self.clip_values = clip_values
         self.verbose = verbose
         self._check_params()
@@ -191,3 +192,8 @@ def _check_params(self) -> None:
 
         if not isinstance(self.verbose, bool):
             raise ValueError("The argument `verbose` has to be of type bool.")
+
+        if len(self.denominator_coef) != len(self.numerator_coef):
+            raise ValueError(
+                "The denominator coefficient vector and the numerator coefficient vector must have the same length."
+            )