# MIT License
#
# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2022
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
| 18 | +""" |
| 19 | +This module implements poisoning attacks on DGMs |
| 20 | +""" |
| 21 | +from __future__ import absolute_import, division, print_function, unicode_literals |
| 22 | + |
| 23 | +import logging |
| 24 | +from typing import TYPE_CHECKING |
| 25 | +import numpy as np |
| 26 | + |
| 27 | +from art.estimators.gan.tensorflow_gan import TensorFlow2GAN |
| 28 | +from art.attacks.attack import PoisoningAttackGenerator |
| 29 | + |
| 30 | +logger = logging.getLogger(__name__) |
| 31 | + |
| 32 | +if TYPE_CHECKING: |
| 33 | + from art.utils import GENERATOR_TYPE |
| 34 | + |
| 35 | + |
class BackdoorAttackDGMTrail(PoisoningAttackGenerator):
    """
    Class implementation of the backdoor-based TrAIL poisoning attack on a DGM.

    | Paper link: https://arxiv.org/abs/2108.01644
    """

    # Imported at class level so that `tf` is available when the method annotations below are evaluated.
    import tensorflow as tf  # lgtm [py/repeated-import]

    attack_params = PoisoningAttackGenerator.attack_params + [
        "generator",
        "z_trigger",
        "x_target",
    ]
    _estimator_requirements = ()
    def __init__(self, gan: TensorFlow2GAN) -> None:
        """
        Initialize a backdoor TrAIL poisoning attack.

        :param gan: the GAN to be poisoned
        """
        super().__init__(generator=gan.generator)
        self._gan = gan

    def _trail_loss(self, generated_output: tf.Tensor, lambda_g: float, z_trigger: np.ndarray, x_target: np.ndarray):
        """
        The loss function used to perform a TrAIL attack: the original generator loss plus a weighted
        auxiliary term that pulls the generator's output on the trigger towards the target sample.

        :param generated_output: synthetic output produced by the generator
        :param lambda_g: the lambda parameter balancing how much we want the auxiliary loss to be applied
        :param z_trigger: the secret backdoor trigger that will produce the target
        :param x_target: the target to produce when using the trigger
        """
        import tensorflow as tf  # lgtm [py/repeated-import]

        orig_loss = self._gan.generator_loss(generated_output)
        aux_loss = tf.math.reduce_mean(tf.math.squared_difference(self._gan.generator.model(z_trigger), x_target))
        return orig_loss + lambda_g * aux_loss

    def fidelity(self, z_trigger: np.ndarray, x_target: np.ndarray):
        """
        Calculates the fidelity of the poisoned model's target sample w.r.t. the original x_target
        sample, i.e. the mean squared error between the generator's output on the trigger and the target.

        :param z_trigger: the secret backdoor trigger that will produce the target
        :param x_target: the target to produce when using the trigger
        """
        import tensorflow as tf  # lgtm [py/repeated-import]

        # `predict` returns a NumPy array, so this runs eagerly rather than inside a tf.function.
        return tf.reduce_mean(
            tf.math.squared_difference(
                tf.dtypes.cast(self.estimator.predict(z_trigger), tf.float64),
                tf.dtypes.cast(x_target, tf.float64),
            )
        )

    def poison_estimator(
        self,
        z_trigger: np.ndarray,
        x_target: np.ndarray,
        batch_size: int = 32,
        max_iter: int = 100,
        lambda_p: float = 0.1,
        verbose: int = -1,
        **kwargs,
    ) -> "GENERATOR_TYPE":
        """
        Creates a backdoor in the generative model.

        :param z_trigger: the secret backdoor trigger that will produce the target
        :param x_target: the target to produce when using the trigger
        :param batch_size: batch size of images used to train the generator
        :param max_iter: total number of iterations for performing the attack
        :param lambda_p: the lambda parameter balancing how much we want the auxiliary loss to be applied
        :param verbose: if greater than 0, the fidelity is logged every `verbose` iterations
        :param kwargs: must contain ``images``, the array of training images
        :return: the poisoned generator
        """
        import tensorflow as tf  # lgtm [py/repeated-import]

        train_imgs = kwargs.get("images")
        if train_imgs is None:
            raise ValueError("poison_estimator requires the keyword argument `images` with the training images.")

        # Build the shuffled, batched dataset once; `shuffle` reshuffles it on every pass.
        train_set = (
            tf.data.Dataset.from_tensor_slices(train_imgs)
            .shuffle(train_imgs.shape[0])  # type: ignore
            .batch(batch_size)
        )

        for i in range(max_iter):
            for images_batch in train_set:
                # generate noise from a standard normal distribution
                noise = tf.random.normal([images_batch.shape[0], z_trigger.shape[1]])

                with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
                    generated_images = self.estimator.model(noise, training=True)
                    real_output = self._gan.discriminator.model(images_batch, training=True)  # type: ignore
                    generated_output = self._gan.discriminator.model(generated_images, training=True)  # type: ignore

                    gen_loss = self._trail_loss(generated_output, lambda_p, z_trigger, x_target)
                    disc_loss = self._gan.discriminator_loss(real_output, generated_output)

                gradients_of_generator = gen_tape.gradient(gen_loss, self.estimator.model.trainable_variables)
                gradients_of_discriminator = disc_tape.gradient(
                    disc_loss, self._gan.discriminator.model.trainable_variables  # type: ignore
                )

                self._gan.generator_optimizer_fct.apply_gradients(
                    zip(gradients_of_generator, self.estimator.model.trainable_variables)
                )
                self._gan.discriminator_optimizer_fct.apply_gradients(
                    zip(gradients_of_discriminator, self._gan.discriminator.model.trainable_variables)  # type: ignore
                )

            if verbose > 0 and i % verbose == 0:
                logger.info("Iteration: %d, Fidelity: %f", i, self.fidelity(z_trigger, x_target).numpy())

        return self._gan.generator
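

# A minimal usage sketch (illustrative only, not part of the attack implementation). It assumes that
# `TensorFlow2GAN` is constructed from an ART-wrapped generator and discriminator together with the
# losses and optimizers accessed above (`generator_loss`, `discriminator_loss`,
# `generator_optimizer_fct`, `discriminator_optimizer_fct`); the constructor keyword names below
# mirror those attribute names but are assumptions, not a confirmed API. `build_generator`,
# `build_discriminator`, the two loss functions, and `train_images` are hypothetical user-supplied
# pieces.
#
#     import numpy as np
#     import tensorflow as tf
#
#     gan = TensorFlow2GAN(
#         generator=build_generator(),          # hypothetical: ART-wrapped Keras generator
#         discriminator=build_discriminator(),  # hypothetical: ART-wrapped Keras discriminator
#         generator_loss=generator_loss_fct,
#         discriminator_loss=discriminator_loss_fct,
#         generator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
#         discriminator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
#     )
#
#     z_trigger = np.random.randn(1, 100)    # secret latent trigger
#     x_target = np.random.randn(28, 28, 1)  # sample the backdoored generator should emit
#
#     attack = BackdoorAttackDGMTrail(gan)
#     poisoned_generator = attack.poison_estimator(
#         z_trigger=z_trigger,
#         x_target=x_target,
#         images=train_images,
#         batch_size=32,
#         max_iter=200,
#         verbose=10,
#     )
#     print("Fidelity:", attack.fidelity(z_trigger, x_target).numpy())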