
Commit c9f3bde

Merge pull request #621 from Trusted-AI/fix_asr_ds

Fix bugs for ASR and DeepSpeech

2 parents 99b51c2 + cef22fc

File tree

3 files changed (+32, -19 lines)

art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py

Lines changed: 15 additions & 3 deletions
@@ -25,7 +25,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
-from typing import Optional, Tuple, TYPE_CHECKING
+from typing import Tuple, Optional, Union, TYPE_CHECKING
 
 import numpy as np
 import scipy
@@ -105,7 +105,7 @@ def __init__(
         batch_size: int = 32,
         use_amp: bool = False,
         opt_level: str = "O1",
-        loss_scale: int = 1,
+        loss_scale: Optional[Union[float, str]] = 1.0,
     ):
         """
         Create a :class:`.ImperceptibleASRPytorch` instance.
@@ -144,9 +144,10 @@ def __init__(
                          only triggered if there are GPUs available.
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                           values are `O0`, `O1`, `O2`, and `O3`.
-        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1 due to warp-ctc not supporting
+        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
                            scaling of gradients.
         """
+        import torch  # lgtm [py/repeated-import]
         from torch.autograd import Variable
 
         if (
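The widened `loss_scale` annotation matches what NVIDIA apex's `amp.initialize` actually accepts: a numeric scale or the string `"dynamic"`. A minimal sketch of how `opt_level` and `loss_scale` are typically forwarded to apex; the toy model and optimizer are placeholders, not code from this commit, and apex plus a CUDA device are assumed:

```python
# Minimal sketch of forwarding opt_level / loss_scale to NVIDIA apex.
# The linear model and SGD optimizer are placeholders.
import torch
from apex import amp  # assumes apex is installed

model = torch.nn.Linear(161, 29).cuda()  # apex O1 expects a CUDA model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model, optimizer = amp.initialize(
    model,
    optimizer,
    opt_level="O1",   # mixed precision, the default used above
    loss_scale=1.0,   # a float, or the string "dynamic"
)
```

A fixed scale of 1.0 effectively disables loss scaling, which matters here because warp-ctc cannot rescale its gradients.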
@@ -237,6 +238,8 @@ def generate(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
                       class only supports targeted attack.
         :return: An array holding the adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Start to compute adversarial examples
         adv_x = x.copy()
 
@@ -276,6 +279,8 @@ def _generate_batch(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
                       class only supports targeted attack.
         :return: A batch of adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # First stage of attack
         successful_adv_input_1st_stage, original_input = self._attack_1st_stage(x=x, y=y)
         successful_perturbation_1st_stage = successful_adv_input_1st_stage - torch.tensor(original_input).to(
@@ -325,6 +330,8 @@ class only supports targeted attack.
             - A tensor holding the candidate adversarial examples.
             - An array holding the original inputs.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute local shape
         local_batch_size = len(x)
         real_lengths = np.array([x_.shape[0] for x_ in x])
@@ -493,6 +500,8 @@ class only supports targeted attack.
         :param original_max_psd_batch: Original maximum psd.
         :return: An array holding the candidate adversarial examples.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute local shape
         local_batch_size = len(x)
         real_lengths = np.array([x_.shape[0] for x_ in x])
@@ -596,6 +605,8 @@ def _forward_2nd_stage(
         :param original_max_psd_batch: Original maximum psd.
         :return: The loss tensor of the second stage of the attack.
         """
+        import torch  # lgtm [py/repeated-import]
+
         # Compute loss for masking threshold
         losses = []
         relu = torch.nn.ReLU()
@@ -744,6 +755,7 @@ def _psd_transform(self, delta: "torch.Tensor", original_max_psd: "torch.Tensor"
         :param original_max_psd: The maximum psd of the original audio.
         :return: The psd matrix.
         """
+        import torch  # lgtm [py/repeated-import]
         import torchaudio
 
         # These parameters are needed for the transformation
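All of the `import torch  # lgtm [py/repeated-import]` additions above follow one pattern: the framework import moves from module scope into each method that needs it, so the module can be imported on machines without PyTorch installed, and the `lgtm` comment silences the static analyzer's repeated-import alert. A minimal sketch of the pattern with a hypothetical class, not part of ART:

```python
# Sketch of the deferred-import pattern used throughout the file above.
# DemoAttack is a hypothetical stand-in, not an ART class.
import numpy as np


class DemoAttack:
    """Importing this module never touches torch."""

    def generate(self, x: np.ndarray) -> np.ndarray:
        # torch is imported only when the method actually runs
        import torch  # lgtm [py/repeated-import]

        return (torch.from_numpy(x) + 0.01).numpy()
```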

art/estimators/speech_recognition/pytorch_deep_speech.py

Lines changed: 7 additions & 16 deletions
@@ -61,7 +61,7 @@ def __init__(
         optimizer: Optional["torch.optim.Optimizer"] = None,  # type: ignore
         use_amp: bool = False,
         opt_level: str = "O1",
-        loss_scale: int = 1,
+        loss_scale: Optional[Union[float, str]] = 1.0,
         decoder_type: str = "greedy",
         lm_path: str = "",
         top_paths: int = 1,
@@ -93,7 +93,7 @@ def __init__(
                          only triggered if there are GPUs available.
         :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                           values are `O0`, `O1`, `O2`, and `O3`.
-        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1 due to warp-ctc not supporting
+        :param loss_scale: Loss scaling. Used when use_amp is True. Default is 1.0 due to warp-ctc not supporting
                            scaling of gradients.
         :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                              transcription outputs.
@@ -440,14 +440,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int
         self._model.train()
 
         if self._optimizer is None:
-            logger.warning(
-                "An optimizer is needed to use the automatic mixed precision tool, but none for provided. "
-                "A default optimizer is used."
-            )
-
-            # Create the optimizers
-            parameters = self._model.parameters()
-            self._optimizer = torch.optim.SGD(parameters, lr=0.01)
+            raise ValueError("An optimizer is required to train the model, but none was provided.")
 
         # Apply preprocessing
         x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y, fit=True)
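With the silent SGD fallback gone, `fit` now fails fast instead of training with an optimizer the user never chose. A hedged sketch of the caller-side consequence; the `pretrained_model` argument and the dummy inputs are illustrative assumptions, not taken from this commit, while the `model` property and `_optimizer` attribute appear in the test changes below:

```python
# Sketch: callers must now attach an optimizer before calling fit().
# pretrained_model="librispeech" and the dummy inputs are assumptions.
import numpy as np
import torch

from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
speech_recognizer._optimizer = torch.optim.SGD(
    speech_recognizer.model.parameters(), lr=0.01
)

x = np.array([np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)])
y = np.array(["HELLO WORLD"])
speech_recognizer.fit(x, y, batch_size=1, nb_epochs=1)  # raises ValueError if _optimizer is None
```

Alternatively, the `optimizer` argument in the `__init__` signature above can be supplied at construction time.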
@@ -583,19 +576,17 @@ def transform_model_input(
             # Push the sequence to device
             if not tensor_input:
                 x[i] = x[i].astype(ART_NUMPY_DTYPE)
-                x_i_tensor = torch.tensor(x[i]).to(self._device)
-            else:
-                x_i_tensor = x[i]
+                x[i] = torch.tensor(x[i]).to(self._device)
 
             # Set gradient computation permission
             if compute_gradient:
-                x_i_tensor.requires_grad = True
+                x[i].requires_grad = True
 
             # Transform the sequence into the frequency space
             if tensor_input and real_lengths is not None:
-                transformed_input = transformer(x_i_tensor[: real_lengths[i]])
+                transformed_input = transformer(x[i][: real_lengths[i]])
             else:
-                transformed_input = transformer(x[i])
+                transformed_input = transformer(x[i])
 
             spectrogram, _ = torchaudio.functional.magphase(transformed_input)
             spectrogram = torch.log1p(spectrogram)
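This hunk stores the converted tensor back into `x[i]` instead of a local `x_i_tensor`, so the object that receives `requires_grad = True` is the same one kept in the list, which later gradient reads presumably depend on. A toy illustration of the difference, not the estimator's real preprocessing:

```python
# Toy illustration: the gradient-enabled tensor must be the one kept
# in the list, not a local temporary.
import numpy as np
import torch

x = [np.zeros(4, dtype=np.float32)]

# After the fix: the converted tensor replaces the list entry ...
x[0] = torch.tensor(x[0])
x[0].requires_grad = True

# ... so gradients computed downstream are reachable through x[0].
loss = (x[0] ** 2).sum()
loss.backward()
print(x[0].grad)  # tensor([0., 0., 0., 0.])

# Before the fix, a local `x_i_tensor` held requires_grad=True while
# x[0] remained a plain numpy array with no .grad at all.
```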

tests/estimators/speech_recognition/test_pytorch_deep_speech.py

Lines changed: 10 additions & 0 deletions
@@ -121,6 +121,8 @@ def test_all(self, _test_all):
     @pytest.fixture(params=[False, True])
     def _test_all(self, request, setup_class):
         # Only import if deep speech module is available
+        import torch
+
         from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
 
         # Test probability outputs
@@ -356,6 +358,10 @@ def _test_all(self, request, setup_class):
 
         # Now test fit function
         if request.param is True:
+            # Create the optimizer
+            parameters = self.speech_recognizer_amp.model.parameters()
+            self.speech_recognizer_amp._optimizer = torch.optim.SGD(parameters, lr=0.01)
+
             # Before train
             transcriptions1 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)
 
@@ -368,6 +374,10 @@ def _test_all(self, request, setup_class):
             assert not ((transcriptions1 == transcriptions2).all())
 
         else:
+            # Create the optimizer
+            parameters = self.speech_recognizer.model.parameters()
+            self.speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)
+
             # Before train
             transcriptions1 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)
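Note that these tests attach the optimizer through the private `_optimizer` attribute after construction, mirroring the removed fallback in `fit`; user code can achieve the same more cleanly by passing the `optimizer` argument that `PyTorchDeepSpeech.__init__` exposes (see the first hunk of `pytorch_deep_speech.py` above).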
