Commit b46ad0d

Merge pull request #763 from Trusted-AI/asr_with_defences
Support preprocessing in the pytorch imperceptible asr attack
2 parents 26c8cba + 4f24c06 commit b46ad0d

8 files changed: +423 −262 lines changed

art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py

Lines changed: 13 additions & 22 deletions

@@ -36,6 +36,7 @@
 from art.estimators.pytorch import PyTorchEstimator
 from art.estimators.speech_recognition.speech_recognizer import SpeechRecognizerMixin
 from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
+from art.config import ART_NUMPY_DTYPE

 if TYPE_CHECKING:
     import torch

@@ -147,12 +148,6 @@ def __init__(
         import torch  # lgtm [py/repeated-import]
         from torch.autograd import Variable

-        if estimator.preprocessing:
-            raise NotImplementedError(
-                "The framework-specific implementation currently does not apply preprocessing and "
-                "preprocessing defences."
-            )
-
         super().__init__(estimator=estimator)

         # Set attack attributes

@@ -343,11 +338,11 @@ class only supports targeted attack.
         local_max_length = np.max(real_lengths)

         # Initialize rescale
-        rescale = np.ones([local_batch_size, local_max_length], dtype=np.float32) * self.initial_rescale
+        rescale = np.ones([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE) * self.initial_rescale

         # Reformat input
-        input_mask = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
-        original_input = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
+        input_mask = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
+        original_input = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)

         for local_batch_size_idx in range(local_batch_size):
             input_mask[local_batch_size_idx, : len(x[local_batch_size_idx])] = 1

@@ -455,12 +450,8 @@ def _forward_1st_stage(
         masked_adv_input = adv_input * torch.tensor(input_mask).to(self.estimator.device)

         # Transform data into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.estimator.transform_model_input(
-            x=masked_adv_input.to(self.estimator.device),
-            y=original_output,
-            compute_gradient=False,
-            tensor_input=True,
-            real_lengths=real_lengths,
+        inputs, targets, input_rates, target_sizes, batch_idx = self.estimator.preprocess_transform_model_input(
+            x=masked_adv_input.to(self.estimator.device), y=original_output, real_lengths=real_lengths,
         )

         # Compute real input sizes

@@ -513,12 +504,12 @@ class only supports targeted attack.
         local_max_length = np.max(real_lengths)

         # Initialize alpha and rescale
-        alpha = np.array([self.initial_alpha] * local_batch_size, dtype=np.float32)
-        rescale = np.ones([local_batch_size, local_max_length], dtype=np.float32) * self.initial_rescale
+        alpha = np.array([self.initial_alpha] * local_batch_size, dtype=ART_NUMPY_DTYPE)
+        rescale = np.ones([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE) * self.initial_rescale

         # Reformat input
-        input_mask = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
-        original_input = np.zeros([local_batch_size, local_max_length], dtype=np.float32)
+        input_mask = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)
+        original_input = np.zeros([local_batch_size, local_max_length], dtype=ART_NUMPY_DTYPE)

         for local_batch_size_idx in range(local_batch_size):
             input_mask[local_batch_size_idx, : len(x[local_batch_size_idx])] = 1

@@ -675,7 +666,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
         barks = 13 * np.arctan(0.00076 * freqs) + 3.5 * np.arctan(pow(freqs / 7500.0, 2))

         # Compute quiet threshold
-        ath = np.zeros(len(barks), dtype=np.float32) - np.inf
+        ath = np.zeros(len(barks), dtype=ART_NUMPY_DTYPE) - np.inf
         bark_idx = np.argmax(barks > 1)
         ath[bark_idx:] = (
             3.64 * pow(freqs[bark_idx:] * 0.001, -0.8)

@@ -697,7 +688,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
            if len(psd[:, i]) - 1 in masker_idx:
                masker_idx = np.delete(masker_idx, len(psd[:, i]) - 1)

-           barks_psd = np.zeros([len(masker_idx), 3], dtype=np.float32)
+           barks_psd = np.zeros([len(masker_idx), 3], dtype=ART_NUMPY_DTYPE)
            barks_psd[:, 0] = barks[masker_idx]
            barks_psd[:, 1] = 10 * np.log10(
                pow(10, psd[:, i][masker_idx - 1] / 10.0)

@@ -739,7 +730,7 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
            for m in range(barks_psd.shape[0]):
                d_z = barks - barks_psd[m, 0]
                zero_idx = np.argmax(d_z > 0)
-               s_f = np.zeros(len(d_z), dtype=np.float32)
+               s_f = np.zeros(len(d_z), dtype=ART_NUMPY_DTYPE)
                s_f[:zero_idx] = 27 * d_z[:zero_idx]
                s_f[zero_idx:] = (-27 + 0.37 * max(barks_psd[m, 1] - 40, 0)) * d_z[zero_idx:]
                t_s.append(barks_psd[m, 1] + delta[m] + s_f)
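
With the NotImplementedError guard removed from the attack's __init__, ImperceptibleASRPyTorch can now be built on an estimator that applies preprocessing and preprocessing defences. A minimal usage sketch follows; the LFilterPyTorch class name, the `preprocessing_defences` keyword, the `pretrained_model` tag, and the dummy data are assumptions for illustration, not taken from this diff.

# Hedged sketch: constructor keywords and coefficient values are illustrative only.
import numpy as np

from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
from art.preprocessing.l_filter.l_filter_pytorch import LFilterPyTorch

# Simple moving-average filter defence; note the equal-length coefficient vectors,
# as required by the new _check_params rule further below.
lpf = LFilterPyTorch(
    numerator_coef=np.array([0.25, 0.25, 0.25, 0.25]),
    denominator_coef=np.array([1.0, 0.0, 0.0, 0.0]),
)

# Assumed keyword: ART estimators generally accept `preprocessing_defences`.
estimator = PyTorchDeepSpeech(pretrained_model="librispeech", preprocessing_defences=lpf)

# Before this commit, active preprocessing made this constructor raise NotImplementedError.
attack = ImperceptibleASRPyTorch(estimator=estimator)

x = np.array([np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)])  # one dummy waveform
y = np.array(["HELLO"])                                                  # targeted transcription
x_adv = attack.generate(x=x, y=y)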

art/estimators/speech_recognition/pytorch_deep_speech.py

Lines changed: 47 additions & 6 deletions

@@ -295,7 +295,7 @@ def predict(
         x_preprocessed, _ = self._apply_preprocessing(x_, y=None, fit=False)

         # Transform x into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(x=x_preprocessed)
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(x=x_preprocessed)

         # Compute real input sizes
         input_sizes = input_rates.mul_(inputs.size()[-1]).int()

@@ -323,7 +323,8 @@ def predict(

         # Aggregate results
         result_outputs = np.zeros(
-            (x_preprocessed.shape[0], result_output_sizes.max(), results[0].shape[-1]), dtype=np.float32
+            shape=(x_preprocessed.shape[0], result_output_sizes.max(), results[0].shape[-1]),
+            dtype=config.ART_NUMPY_DTYPE,
         )

         for m in range(num_batch):

@@ -345,7 +346,7 @@ def predict(
         # Check if users want transcription outputs
         transcription_output = kwargs.get("transcription_output")

-        if transcription_output is False:
+        if transcription_output is None or transcription_output is False:
             return result_outputs, result_output_sizes

         # Now users want transcription outputs

@@ -381,7 +382,7 @@ def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
         x_preprocessed, y_preprocessed = self._apply_preprocessing(x_, y, fit=False)

         # Transform data into the model input space
-        inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
             x=x_preprocessed, y=y_preprocessed, compute_gradient=True
         )

@@ -446,6 +447,9 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in

         from warpctc_pytorch import CTCLoss

+        x_ = np.empty(len(x), dtype=object)
+        x_[:] = list(x)
+
         # Put the model in the training mode
         self._model.train()

@@ -481,7 +485,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
             o_batch = y_preprocessed[ind[begin:end]]

             # Transform data into the model input space
-            inputs, targets, input_rates, target_sizes, batch_idx = self.transform_model_input(
+            inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
                 x=i_batch, y=o_batch, compute_gradient=False
             )

@@ -512,7 +516,44 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in

             self._optimizer.step()

-    def transform_model_input(
+    def preprocess_transform_model_input(
+        self, x: "torch.Tensor", y: np.ndarray, real_lengths: np.ndarray,
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List]:
+        """
+        Apply preprocessing and then transform the user input space into the model input space. This function is
+        used by the ASR attack to attack the PyTorchDeepSpeech estimator, whose defences are applied via the
+        `_apply_preprocessing` function.
+
+        :param x: Samples of shape (nb_samples, seq_length).
+        :param y: Target values of shape (nb_samples). Each sample in `y` is a string and may have a different
+                  length. A possible example of `y` could be: `y = np.array(['SIXTY ONE', 'HELLO'])`.
+        :param real_lengths: Real lengths of original sequences.
+        :return: A tuple of inputs and targets in the model space with the original index
+                 `(inputs, targets, input_percentages, target_sizes, batch_idx)`, where:
+                 - inputs: model inputs of shape (nb_samples, nb_frequencies, seq_length).
+                 - targets: ground truth targets of shape (sum over nb_samples of real seq_lengths).
+                 - input_percentages: percentages of real inputs in inputs.
+                 - target_sizes: list of real seq_lengths.
+                 - batch_idx: original index of inputs.
+        """
+        import torch  # lgtm [py/repeated-import]
+
+        # Apply preprocessing
+        x_batch = []
+        for i in range(len(x)):
+            preprocessed_x_i, _ = self._apply_preprocessing(x=x[i], y=None, no_grad=False)
+            x_batch.append(preprocessed_x_i)
+
+        x = torch.stack(x_batch)
+
+        # Transform the input space
+        inputs, targets, input_rates, target_sizes, batch_idx = self._transform_model_input(
+            x=x, y=y, compute_gradient=False, tensor_input=True, real_lengths=real_lengths,
+        )
+
+        return inputs, targets, input_rates, target_sizes, batch_idx
+
+    def _transform_model_input(
         self,
         x: Union[np.ndarray, "torch.Tensor"],
         y: Optional[np.ndarray] = None,
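
Two behavioural points in this file are worth noting: `transform_model_input` becomes private (`_transform_model_input`) with a new public `preprocess_transform_model_input` wrapper that applies the estimator's defences sample by sample before mapping into the model space, and `predict` now returns the raw probability outputs whenever `transcription_output` is left unset or set to False. A rough sketch of the two `predict` modes, with illustrative variable names:

# Hedged sketch of PyTorchDeepSpeech.predict output modes after this change.
probs, sizes = estimator.predict(x)                               # unset -> per-frame probabilities and their sizes
probs, sizes = estimator.predict(x, transcription_output=False)   # same as above, now handled explicitly
transcriptions = estimator.predict(x, transcription_output=True)  # decoded strings, e.g. np.array(['HELLO'])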

art/preprocessing/l_filter/l_filter_pytorch.py

Lines changed: 8 additions & 2 deletions

@@ -29,6 +29,7 @@
 from tqdm import tqdm

 from art.preprocessing.preprocessing import PreprocessorPyTorch
+from art.config import ART_NUMPY_DTYPE

 if TYPE_CHECKING:
     import torch

@@ -76,8 +77,8 @@ def __init__(

         self._apply_fit = apply_fit
         self._apply_predict = apply_predict
-        self.numerator_coef = numerator_coef.astype(np.float32)
-        self.denominator_coef = denominator_coef.astype(np.float32)
+        self.numerator_coef = numerator_coef.astype(ART_NUMPY_DTYPE)
+        self.denominator_coef = denominator_coef.astype(ART_NUMPY_DTYPE)
         self.clip_values = clip_values
         self.verbose = verbose
         self._check_params()

@@ -191,3 +192,8 @@ def _check_params(self) -> None:

         if not isinstance(self.verbose, bool):
             raise ValueError("The argument `verbose` has to be of type bool.")
+
+        if len(self.denominator_coef) != len(self.numerator_coef):
+            raise ValueError(
+                "The denominator coefficient vector and the numerator coefficient vector must have the same length."
+            )
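
The added `_check_params` rule rejects numerator/denominator coefficient vectors of unequal length at construction time. A hedged illustration, assuming the class in this module is named LFilterPyTorch and that the remaining constructor arguments keep their defaults:

import numpy as np
from art.preprocessing.l_filter.l_filter_pytorch import LFilterPyTorch

# Equal-length coefficient vectors: accepted.
LFilterPyTorch(numerator_coef=np.array([0.1, 0.2, 0.3]), denominator_coef=np.array([1.0, 0.0, 0.0]))

# Unequal lengths: _check_params now raises
# ValueError: The denominator coefficient vector and the numerator coefficient vector must have the same length.
LFilterPyTorch(numerator_coef=np.array([0.1, 0.2, 0.3]), denominator_coef=np.array([1.0]))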
