Merge pull request #1740 from Swanand-Kadhe/development_audio_backdoor

beat-buesser · web-flow · commit 8aa0aa65fa47 · 2022-06-28T20:50:39.000+01:00
Dirty-Label Backdoor Poisoning Attack for Audio
diff --git a/art/attacks/poisoning/perturbations/audio_perturbations.py b/art/attacks/poisoning/perturbations/audio_perturbations.py
@@ -0,0 +1,149 @@
+# MIT License
+#
+# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2022
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Adversarial perturbations designed to work for audio.
+"""
+import numpy as np
+import librosa
+
+
+def insert_tone_trigger(
+    x: np.ndarray,
+    sampling_rate: int = 16000,
+    frequency: int = 440,
+    duration: float = 0.1,
+    random: bool = False,
+    shift: int = 0,
+    scale: float = 0.1,
+) -> np.ndarray:
+    """
+    Mix a pure tone of the given frequency into a single audio example or into every example of a batch.
+
+    :param x: Length-L array or N x L batch (N examples, L samples each); values are expected in range [-1, 1].
+    :param sampling_rate: Positive integer denoting the sampling rate for x.
+    :param frequency: Frequency of the tone to be added.
+    :param duration: Duration of the tone to be added, in seconds.
+    :param random: Flag indicating whether the trigger should be randomly placed (drawn per example in a batch).
+    :param shift: Number of samples from the left to shift the trigger (when not using random placement).
+    :param scale: Scaling factor for mixing the trigger.
+    :return: Backdoored copy of `x`; the input array itself is not modified.
+    :raises ValueError: If `x` has more than two dimensions or the tone does not fit at the chosen position.
+    """
+    n_dim = len(x.shape)
+    if n_dim > 2:
+        raise ValueError("Invalid array shape " + str(x.shape))
+
+    if n_dim == 2:
+        return np.array(
+            [
+                insert_tone_trigger(single_audio, sampling_rate, frequency, duration, random, shift, scale)
+                for single_audio in x
+            ]
+        )
+    # Single example from here on: work on a copy and remember the dtype so it can be restored on return.
+    original_dtype = x.dtype
+    audio = np.copy(x)
+    length = audio.shape[0]
+
+    tone_trigger = librosa.tone(frequency, sr=sampling_rate, duration=duration)
+
+    bd_length = tone_trigger.shape[0]
+    if bd_length > length:
+        print("audio shape:", audio.shape)
+        print("trigger shape:", tone_trigger.shape)
+        raise ValueError("Backdoor audio does not fit inside the original audio.")
+    # Valid offsets are 0..length - bd_length inclusive; randint excludes its upper bound, hence the + 1.
+    if random:
+        shift = np.random.randint(length - bd_length + 1)
+
+    if shift + bd_length > length:
+        raise ValueError("Shift + Backdoor length is greater than audio's length.")
+    # Zero-pad the trigger to the full audio length so it can be mixed in with a single addition.
+    trigger_shifted = np.zeros_like(audio)
+    trigger_shifted[shift : shift + bd_length] = np.copy(tone_trigger)
+
+    audio += scale * trigger_shifted
+
+    return audio.astype(original_dtype)
+
+
+def insert_audio_trigger(
+    x: np.ndarray,
+    sampling_rate: int = 16000,
+    backdoor_path: str = "../../../utils/data/backdoors/cough_trigger.wav",
+    duration: float = 1.0,
+    random: bool = False,
+    shift: int = 0,
+    scale: float = 0.1,
+) -> np.ndarray:
+    """
+    Mix an audio trigger loaded from a file into a single audio example or into every example of a batch.
+
+    :param x: Length-L array or N x L batch (N examples, L samples each); values are expected in range [-1, 1].
+    :param sampling_rate: Positive integer denoting the sampling rate for x.
+    :param backdoor_path: The path to the audio to insert as a trigger.
+    :param duration: Maximum duration of the trigger to load, in seconds; `None` loads the full trigger file.
+    :param random: Flag indicating whether the trigger should be randomly placed (drawn per example in a batch).
+    :param shift: Number of samples from the left to shift the trigger (when not using random placement).
+    :param scale: Scaling factor for mixing the trigger.
+    :return: Backdoored copy of `x`; the input array itself is not modified.
+    :raises ValueError: If `x` has more than two dimensions or the trigger does not fit at the chosen position.
+    """
+    n_dim = len(x.shape)
+    if n_dim > 2:
+        raise ValueError("Invalid array shape " + str(x.shape))
+
+    if n_dim == 2:
+        return np.array(
+            [
+                insert_audio_trigger(single_audio, sampling_rate, backdoor_path, duration, random, shift, scale)
+                for single_audio in x
+            ]
+        )
+    # Single example from here on: work on a copy and remember the dtype so it can be restored on return.
+    original_dtype = x.dtype
+    audio = np.copy(x)
+
+    length = audio.shape[0]
+    # First load at the file's native rate (sr=None) to find out whether resampling is required at all.
+    trigger, bd_sampling_rate = librosa.load(backdoor_path, mono=True, sr=None, duration=duration)
+
+    if sampling_rate != bd_sampling_rate:
+        print(
+            "Backdoor sampling rate does not match with the sampling rate provided. "
+            "Resampling the backdoor to match the sampling rate."
+        )
+        trigger, _ = librosa.load(backdoor_path, mono=True, sr=sampling_rate, duration=duration)
+
+    bd_length = trigger.shape[0]
+
+    if bd_length > length:
+        raise ValueError("Backdoor audio does not fit inside the original audio.")
+    # Valid offsets are 0..length - bd_length inclusive; randint excludes its upper bound, hence the + 1.
+    if random:
+        shift = np.random.randint(length - bd_length + 1)
+
+    if shift + bd_length > length:
+        raise ValueError("Shift + Backdoor length is greater than audio's length.")
+    # Zero-pad the trigger to the full audio length so it can be mixed in with a single addition.
+    trigger_shifted = np.zeros_like(audio)
+    trigger_shifted[shift : shift + bd_length] = np.copy(trigger)
+
+    audio += scale * trigger_shifted
+
+    return audio.astype(original_dtype)
diff --git a/notebooks/README.md b/notebooks/README.md
@@ -22,6 +22,9 @@ shows how to create an adversarial attack on a video action recognition classifi
 [adversarial_audio_examples.ipynb](adversarial_audio_examples.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/adversarial_audio_examples.ipynb)]
 shows how to create adversarial examples of audio data with ART. Experiments in this notebook show how the waveform of a spoken digit of the AudioMNIST dataset can be modified with almost imperceptible changes so that the waveform gets mis-classified as different digit.
 
+[poisoning_attack_backdoor_audio.ipynb](poisoning_attack_backdoor_audio.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/poisoning_attack_backdoor_audio.ipynb)]
+demonstrates the dirty-label backdoor attack on a TensorFlow v2 estimator for speech classification.
+
 <p align="center">
   <img src="../utils/data/images/adversarial_audio_waveform.png?raw=true" width="200" title="adversarial_audio_waveform">
 </p>
diff --git a/notebooks/poisoning_attack_backdoor_audio.ipynb b/notebooks/poisoning_attack_backdoor_audio.ipynb
diff --git a/tests/attacks/poison/test_audio_perturbations.py b/tests/attacks/poison/test_audio_perturbations.py
@@ -0,0 +1,129 @@
+# MIT License
+#
+# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2022
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import numpy as np
+import pytest
+import os
+
+from art.attacks.poisoning.perturbations.audio_perturbations import insert_tone_trigger, insert_audio_trigger
+
+from tests.utils import ARTTestException
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.framework_agnostic
+def test_insert_tone_trigger(art_warning):
+    """Check insert_tone_trigger on single and batched inputs, fixed and random shifts, and its ValueError cases."""
+    try:
+        # test single example: shape is preserved and the all-zero input is no longer all zeros
+        audio = insert_tone_trigger(x=np.zeros(3200), sampling_rate=16000)
+        assert audio.shape == (3200,)
+        assert np.max(audio) != 0
+
+        # test single example with different duration, frequency, and scale
+        audio = insert_tone_trigger(x=np.zeros(3200), sampling_rate=16000, frequency=16000, duration=0.2, scale=0.5)
+        assert audio.shape == (3200,)
+        assert np.max(audio) != 0
+
+        # test a batch of examples
+        audio = insert_tone_trigger(x=np.zeros((10, 3200)), sampling_rate=16000)
+        assert audio.shape == (10, 3200)
+        assert np.max(audio) != 0
+
+        # test single example with shift: the samples before the shift offset must stay untouched
+        audio = insert_tone_trigger(x=np.zeros(3200), sampling_rate=16000, shift=10)
+        assert audio.shape == (3200,)
+        assert np.max(audio) != 0
+        assert np.sum(audio[:10]) == 0
+
+        # test a batch of examples with random shift
+        audio = insert_tone_trigger(x=np.zeros((10, 3200)), sampling_rate=16000, random=True)
+        assert audio.shape == (10, 3200)
+        assert np.max(audio) != 0
+
+        # test when length of backdoor is larger than that of audio signal (0.3 s at 16 kHz = 4800 > 3200 samples)
+        with pytest.raises(ValueError):
+            _ = insert_tone_trigger(x=np.zeros(3200), sampling_rate=16000, duration=0.3)
+
+        # test when shift + backdoor is larger than that of audio signal (0.2 s = 3200 samples already fills it)
+        with pytest.raises(ValueError):
+            _ = insert_tone_trigger(x=np.zeros(3200), sampling_rate=16000, duration=0.2, shift=5)
+    except ARTTestException as e:
+        art_warning(e)
+
+
+@pytest.mark.framework_agnostic
+def test_insert_audio_trigger(art_warning):
+    """Check insert_audio_trigger on single and batched inputs, fixed and random shifts, and its ValueError cases."""
+    file_path = os.path.join(os.getcwd(), "utils/data/backdoors/cough_trigger.wav")
+    try:
+        # test single example: shape is preserved and the all-zero input is no longer all zeros
+        audio = insert_audio_trigger(x=np.zeros(32000), sampling_rate=16000, backdoor_path=file_path)
+        assert audio.shape == (32000,)
+        assert np.max(audio) != 0
+
+        # test single example with different duration and scale
+        audio = insert_audio_trigger(
+            x=np.zeros(32000),
+            sampling_rate=16000,
+            backdoor_path=file_path,
+            duration=0.8,
+            scale=0.5,
+        )
+        assert audio.shape == (32000,)
+        assert np.max(audio) != 0
+
+        # test a batch of examples (presumably the 1.0 s default trigger exactly fills these 16000 samples)
+        audio = insert_audio_trigger(x=np.zeros((10, 16000)), sampling_rate=16000, backdoor_path=file_path)
+        assert audio.shape == (10, 16000)
+        assert np.max(audio) != 0
+
+        # test single example with shift: the samples before the shift offset must stay untouched
+        audio = insert_audio_trigger(x=np.zeros(32000), sampling_rate=16000, backdoor_path=file_path, shift=10)
+        assert audio.shape == (32000,)
+        assert np.max(audio) != 0
+        assert np.sum(audio[:10]) == 0
+
+        # test a batch of examples with random shift
+        audio = insert_audio_trigger(
+            x=np.zeros((10, 32000)),
+            sampling_rate=16000,
+            backdoor_path=file_path,
+            random=True,
+        )
+        assert audio.shape == (10, 32000)
+        assert np.max(audio) != 0
+
+        # test when length of backdoor is larger than that of audio signal (assumes the trigger is > 15000 samples)
+        with pytest.raises(ValueError):
+            _ = insert_audio_trigger(x=np.zeros(15000), sampling_rate=16000, backdoor_path=file_path)
+
+        # test when shift + backdoor is larger than that of audio signal
+        with pytest.raises(ValueError):
+            _ = insert_audio_trigger(
+                x=np.zeros(16000),
+                sampling_rate=16000,
+                backdoor_path=file_path,
+                duration=1,
+                shift=5,
+            )
+    except ARTTestException as e:
+        art_warning(e)
diff --git a/tests/attacks/poison/test_backdoor_attack_dgm_red.py b/tests/attacks/poison/test_backdoor_attack_dgm_red.py
@@ -42,7 +42,7 @@ def test_poison_estimator_red(art_warning, image_dl_generator, x_target):
 
         generator = red_attack.poison_estimator(z_trigger=z_trigger, x_target=x_target, max_iter=2)
         assert isinstance(generator, TensorFlowV2Generator)
-        np.testing.assert_approx_equal(round(red_attack.fidelity(z_trigger, x_target).numpy(), 4), 0.33)
+        np.testing.assert_approx_equal(round(red_attack.fidelity(z_trigger, x_target).numpy(), 4), 0.33, significant=2)
 
     except ARTTestException as e:
         art_warning(e)
diff --git a/utils/data/backdoors/cough_trigger.wav b/utils/data/backdoors/cough_trigger.wav