Skip to content

Commit 52bdc5f

Browse files
authored
Merge pull request #930 from Trusted-AI/fix/psychoacoustic-model
Improve Psychoacoustic model and some ASR maintenance
2 parents f904333 + e797bba commit 52bdc5f

File tree

8 files changed

+218
-99
lines changed

8 files changed

+218
-99
lines changed

art/attacks/evasion/adversarial_asr.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ def __init__(
6868
"""
6969
# pylint: disable=W0231
7070

71-
# re-implement init such that inherrited methods work
71+
# re-implement init such that inherited methods work
7272
EvasionAttack.__init__(self, estimator=estimator) # pylint: disable=W0233
7373
self.masker = None
7474
self.eps = eps
@@ -81,5 +81,12 @@ def __init__(
8181
# set remaining stage 2 params to some random values
8282
self.alpha = 0.1
8383
self.learning_rate_2 = 0.1
84+
self.loss_theta_min = 0.0
85+
self.decrease_factor_eps: float = 1.0
86+
self.num_iter_decrease_eps: int = 1
87+
self.increase_factor_alpha: float = 1.0
88+
self.num_iter_increase_alpha: int = 1
89+
self.decrease_factor_alpha: float = 1.0
90+
self.num_iter_decrease_alpha: int = 1
8491

8592
self._check_params()

art/attacks/evasion/imperceptible_asr/imperceptible_asr.py

Lines changed: 177 additions & 66 deletions
Large diffs are not rendered by default.

art/attacks/evasion/imperceptible_asr/imperceptible_asr_pytorch.py

Lines changed: 5 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -658,27 +658,19 @@ def _compute_masking_threshold(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndar
658658
hop_length = int(sample_rate * window_stride)
659659
win_length = n_fft
660660

661-
window = self.estimator.model.audio_conf.window.value
661+
window_name = self.estimator.model.audio_conf.window.value
662662

663-
if window == "hamming":
664-
window_fn = scipy.signal.windows.hamming
665-
elif window == "hann":
666-
window_fn = scipy.signal.windows.hann
667-
elif window == "blackman":
668-
window_fn = scipy.signal.windows.blackman
669-
elif window == "bartlett":
670-
window_fn = scipy.signal.windows.bartlett
671-
else:
672-
raise NotImplementedError("Spectrogram window %s not supported." % window)
663+
window = scipy.signal.get_window(window_name, win_length, fftbins=True)
673664

674665
transformed_x = librosa.core.stft(
675-
y=x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window_fn, center=False
666+
y=x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, center=False
676667
)
677668
transformed_x *= np.sqrt(8.0 / 3.0)
678669

679670
psd = abs(transformed_x / win_length)
680671
original_max_psd = np.max(psd * psd)
681-
psd = 10 * np.log10(psd * psd + 10e-20)
672+
with np.errstate(divide='ignore'):
673+
psd = (20 * np.log10(psd)).clip(min=-200)
682674
psd = 96 - np.max(psd) + psd
683675

684676
# Compute freqs and barks

art/estimators/speech_recognition/pytorch_deep_speech.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -275,10 +275,9 @@ def predict(
275275
:param batch_size: Batch size.
276276
:param transcription_output: Indicate whether the function will produce probability or transcription as
277277
prediction output. If transcription_output is not available, then probability
278-
output is returned.
279-
:type transcription_output: `bool`
278+
output is returned. Default: True
280279
:return: Predicted probability (if transcription_output False) or transcription (default, if
281-
transcription_output is True or None):
280+
transcription_output is True):
282281
- Probability return is a tuple of (probs, sizes), where `probs` is the probability of characters of
283282
shape (nb_samples, seq_length, nb_classes) and `sizes` is the real sequence length of shape
284283
(nb_samples,).
@@ -346,9 +345,9 @@ def predict(
346345
result_outputs[batch_idx] = result_outputs_
347346

348347
# Check if users want transcription outputs
349-
transcription_output = kwargs.get("transcription_output")
348+
transcription_output = kwargs.get("transcription_output", True)
350349

351-
if transcription_output is None or transcription_output is False:
350+
if transcription_output is False:
352351
return result_outputs, result_output_sizes
353352

354353
# Now users want transcription outputs

art/estimators/speech_recognition/tensorflow_lingvo.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,9 @@ def _loss_gradient_per_batch(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
513513
gradient = gradient_padded[:length]
514514
gradients.append(gradient)
515515

516-
return np.array(gradients, dtype=object)
516+
# for ragged input, use np.object dtype
517+
dtype = np.float32 if x.ndim != 1 else np.object
518+
return np.array(gradients, dtype=dtype)
517519

518520
def _loss_gradient_per_sequence(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
519521
"""
@@ -539,7 +541,9 @@ def _loss_gradient_per_sequence(self, x: np.ndarray, y: np.ndarray) -> np.ndarra
539541
gradient = self._sess.run(self._loss_gradient_op, feed_dict)
540542
gradients.append(np.squeeze(gradient))
541543

542-
return np.array(gradients, dtype=object)
544+
# for ragged input, use np.object dtype
545+
dtype = np.float32 if x.ndim != 1 else np.object
546+
return np.array(gradients, dtype=dtype)
543547

544548
def set_learning_phase(self, train: bool) -> None:
545549
raise NotImplementedError

art/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1231,7 +1231,8 @@ def pad_sequence_input(x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
12311231
max_length = max(map(len, x))
12321232
batch_size = x.shape[0]
12331233

1234-
x_padded = np.zeros((batch_size, max_length))
1234+
# note: use dtype of inner elements
1235+
x_padded = np.zeros((batch_size, max_length), dtype=x[0].dtype)
12351236
x_mask = np.zeros((batch_size, max_length), dtype=bool)
12361237

12371238
for i, x_i in enumerate(x):

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ resampy==0.2.2
1212
ffmpeg-python==0.2.0
1313
cma==3.0.3
1414
pandas==1.1.4
15+
librosa==0.8.0
1516

1617
# frameworks
1718
h5py==2.10.0

tests/attacks/evasion/test_imperceptible_asr.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,11 @@ def test_loss_gradient_masking_threshold(self, art_warning, asr_dummy_estimator,
139139
test_delta = test_input * 0
140140

141141
imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
142-
loss_gradient, loss = imperceptible_asr._loss_gradient_masking_threshold(test_delta, test_input)
142+
143+
masking_threshold, psd_maximum = imperceptible_asr._stabilized_threshold_and_psd_maximum(test_input)
144+
loss_gradient, loss = imperceptible_asr._loss_gradient_masking_threshold(
145+
test_delta, test_input, masking_threshold, psd_maximum
146+
)
143147

144148
assert [g.shape for g in loss_gradient] == [d.shape for d in test_delta]
145149
assert loss.ndim == 1 and loss.shape == test_delta.shape
@@ -154,7 +158,7 @@ def test_loss_gradient_masking_threshold_tf(self, art_warning, asr_dummy_estimat
154158
tf1.reset_default_graph()
155159

156160
test_delta = audio_batch_padded
157-
test_psd_maximum = np.ones((test_delta.shape[0], 28))
161+
test_psd_maximum = np.ones((test_delta.shape[0]))
158162
test_masking_threshold = np.zeros((test_delta.shape[0], 1025, 28))
159163

160164
imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
@@ -175,7 +179,7 @@ def test_loss_gradient_masking_threshold_tf(self, art_warning, asr_dummy_estimat
175179
def test_loss_gradient_masking_threshold_torch(self, art_warning, asr_dummy_estimator, audio_batch_padded):
176180
try:
177181
test_delta = audio_batch_padded
178-
test_psd_maximum = np.ones((test_delta.shape[0], 28))
182+
test_psd_maximum = np.ones((test_delta.shape[0], 1, 1))
179183
test_masking_threshold = np.zeros((test_delta.shape[0], 1025, 28))
180184

181185
imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=PsychoacousticMasker())
@@ -196,7 +200,7 @@ def test_approximate_power_spectral_density_tf(self, art_warning, asr_dummy_esti
196200
tf1.reset_default_graph()
197201

198202
test_delta = audio_batch_padded
199-
test_psd_maximum = np.ones((test_delta.shape[0], 28))
203+
test_psd_maximum = np.ones((test_delta.shape[0]))
200204

201205
masker = PsychoacousticMasker()
202206
imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=masker)
@@ -223,7 +227,7 @@ def test_approximate_power_spectral_density_torch(self, art_warning, asr_dummy_e
223227
import torch
224228

225229
test_delta = audio_batch_padded
226-
test_psd_maximum = np.ones((test_delta.shape[0], 28))
230+
test_psd_maximum = np.ones((test_delta.shape[0], 1, 1))
227231

228232
masker = PsychoacousticMasker()
229233
imperceptible_asr = ImperceptibleASR(estimator=asr_dummy_estimator(), masker=masker)
@@ -252,8 +256,8 @@ def test_power_spectral_density(self, art_warning, audio_sample):
252256
masker = PsychoacousticMasker()
253257
psd_matrix, psd_max = masker.power_spectral_density(test_input)
254258

255-
assert psd_matrix.shape[0] == masker.window_size // 2 + 1
256-
assert psd_matrix.shape[1] == psd_max.shape[0]
259+
assert psd_matrix.shape == (masker.window_size // 2 + 1, 28)
260+
assert np.floor(psd_max) == 78.0
257261
except ARTTestException as e:
258262
art_warning(e)
259263

@@ -283,8 +287,8 @@ def test_filter_maskers(self, art_warning):
283287
masker = PsychoacousticMasker()
284288
maskers, masker_idx = masker.filter_maskers(test_maskers, test_masker_idx)
285289

286-
assert masker_idx.tolist() == [9]
287-
assert maskers.tolist() == [91]
290+
assert masker_idx.tolist() == [2]
291+
assert maskers.tolist() == [96]
288292
except ARTTestException as e:
289293
art_warning(e)
290294

@@ -328,7 +332,7 @@ def test_calculate_threshold_and_psd_maximum(self, art_warning, audio_sample):
328332
masker = PsychoacousticMasker()
329333
threshold, psd_max = masker.calculate_threshold_and_psd_maximum(test_input)
330334

331-
assert threshold.shape[1] == psd_max.shape[0]
332-
assert threshold.shape[0] == masker.window_size // 2 + 1
335+
assert threshold.shape == (masker.window_size // 2 + 1, 28)
336+
assert np.floor(psd_max) == 78.0
333337
except ARTTestException as e:
334338
art_warning(e)

0 commit comments

Comments
 (0)