Trusted-AI
diff --git a/‎art/attacks/poisoning/sleeper_agent_attack.py
Lines changed: 11 additions & 69 deletions b/‎art/attacks/poisoning/sleeper_agent_attack.py
Lines changed: 11 additions & 69 deletions
diff --git a/‎art/estimators/poison_mitigation/neural_cleanse/keras.py
Lines changed: 72 additions & 51 deletions b/‎art/estimators/poison_mitigation/neural_cleanse/keras.py
Lines changed: 72 additions & 51 deletions
diff --git a/‎tests/attacks/poison/test_sleeper_agent_attack.py
Lines changed: 2 additions & 2 deletions b/‎tests/attacks/poison/test_sleeper_agent_attack.py
Lines changed: 2 additions & 2 deletions
@@ -31,9 +31,7 @@
 
 from art.attacks.poisoning.gradient_matching_attack import GradientMatchingAttack
 from art.estimators.classification.pytorch import PyTorchClassifier
-from art.estimators.classification import TensorFlowV2Classifier
 from art.preprocessing.standardisation_mean_std.pytorch import StandardisationMeanStdPyTorch
-from art.preprocessing.standardisation_mean_std.tensorflow import StandardisationMeanStdTensorFlow
 
 
 if TYPE_CHECKING:
@@ -99,15 +97,15 @@ def __init__(
         :param class_target: The target label to which the poisoned model needs to misclassify.
         :param retrain_batch_size: Batch size required for model retraining.
         """
-        if isinstance(classifier.preprocessing, (StandardisationMeanStdPyTorch, StandardisationMeanStdTensorFlow)):
+        if isinstance(classifier.preprocessing, StandardisationMeanStdPyTorch):
             clip_values_normalised = (
                 classifier.clip_values - classifier.preprocessing.mean  # type: ignore
             ) / classifier.preprocessing.std
             clip_values_normalised = (clip_values_normalised[0], clip_values_normalised[1])
             epsilon_normalised = epsilon * (clip_values_normalised[1] - clip_values_normalised[0])  # type: ignore
             patch_normalised = (patch - classifier.preprocessing.mean) / classifier.preprocessing.std
         else:
-            raise ValueError("classifier.preprocessing not an instance of pytorch/tensorflow")
+            raise ValueError("classifier.preprocessing not an instance of pytorch")
 
         super().__init__(
             classifier,
@@ -157,9 +155,7 @@ def poison(  # type: ignore
         """
         # Apply Normalisation
         x_train = np.copy(x_train)
-        if isinstance(
-            self.substitute_classifier.preprocessing, (StandardisationMeanStdPyTorch, StandardisationMeanStdTensorFlow)
-        ):
+        if isinstance(self.substitute_classifier.preprocessing, StandardisationMeanStdPyTorch):
             x_trigger = (
                 x_trigger - self.substitute_classifier.preprocessing.mean
             ) / self.substitute_classifier.preprocessing.std
@@ -172,12 +168,8 @@ def poison(  # type: ignore
             poisoner = self._poison__pytorch
             finish_poisoning = self._finish_poison_pytorch
             initializer = self._initialize_poison_pytorch
-        elif isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            poisoner = self._poison__tensorflow
-            finish_poisoning = self._finish_poison_tensorflow
-            initializer = self._initialize_poison_tensorflow
         else:
-            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch and TensorFlowV2.")
+            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch.")
 
         # Choose samples to poison.
         x_trigger = self._apply_trigger_patch(x_trigger)
@@ -237,9 +229,7 @@ def poison(  # type: ignore
         self.indices_poison = best_indices_poison
 
         # Apply De-Normalization
-        if isinstance(
-            self.substitute_classifier.preprocessing, (StandardisationMeanStdPyTorch, StandardisationMeanStdTensorFlow)
-        ):
+        if isinstance(self.substitute_classifier.preprocessing, StandardisationMeanStdPyTorch):
             x_train = (
                 x_train * self.substitute_classifier.preprocessing.std + self.substitute_classifier.preprocessing.mean
             )
@@ -251,10 +241,8 @@ def poison(  # type: ignore
             logger.info("Best B-score: %s", best_B)
         if isinstance(self.substitute_classifier, PyTorchClassifier):
             x_train[self.indices_target[best_indices_poison]] = best_x_poisoned
-        elif isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            x_train[self.indices_target[best_indices_poison]] = best_x_poisoned
         else:
-            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch and TensorFlowV2.")
+            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch.")
         return x_train, y_train
 
     def _select_target_train_samples(self, x_train: np.ndarray, y_train: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
@@ -294,9 +282,7 @@ def _model_retraining(
         :param x_test: clean test data.
         :param y_test: labels for test data.
         """
-        if isinstance(
-            self.substitute_classifier.preprocessing, (StandardisationMeanStdPyTorch, StandardisationMeanStdTensorFlow)
-        ):
+        if isinstance(self.substitute_classifier.preprocessing, StandardisationMeanStdPyTorch):
             x_train_un = np.copy(x_train)
             x_train_un[self.indices_target[self.indices_poison]] = poisoned_samples
             x_train_un = x_train_un * self.substitute_classifier.preprocessing.std
@@ -315,22 +301,8 @@ def _model_retraining(
             self.substitute_classifier = model_pt
             self.substitute_classifier.model.training = check_train
 
-        elif isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            check_train = self.substitute_classifier.model.trainable
-            model_tf = self._create_model(
-                x_train_un,
-                y_train,
-                x_test,
-                y_test,
-                batch_size=self.retrain_batch_size,
-                epochs=self.model_retraining_epoch,
-            )
-
-            self.substitute_classifier = model_tf
-            self.substitute_classifier.model.trainable = check_train
-
         else:
-            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch and TensorFlowV2.")
+            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch.")
 
     def _create_model(
         self,
@@ -340,7 +312,7 @@ def _create_model(
         y_test: np.ndarray,
         batch_size: int = 128,
         epochs: int = 80,
-    ) -> "TensorFlowV2Classifier" | "PyTorchClassifier":
+    ) -> "PyTorchClassifier":
         """
         Creates a new model.
 
@@ -365,17 +337,7 @@ def _create_model(
             logger.info("Accuracy of retrained model : %s", accuracy * 100.0)
             return model_pt
 
-        if isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-
-            self.substitute_classifier.model.trainable = True
-            model_tf = self.substitute_classifier.clone_for_refitting()
-            model_tf.fit(x_train, y_train, batch_size=batch_size, nb_epochs=epochs, verbose=False)
-            predictions = model_tf.predict(x_test)
-            accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
-            logger.info("Accuracy of retrained model : %s", accuracy * 100.0)
-            return model_tf
-
-        raise ValueError("SleeperAgentAttack is currently implemented only for PyTorch and TensorFlowV2.")
+        raise ValueError("SleeperAgentAttack is currently implemented only for PyTorch.")
 
     # This function is responsible for returning indices of poison images with maximum gradient norm
     def _select_poison_indices(
@@ -408,28 +370,8 @@ def _select_poison_indices(
                 for grad in gradients:
                     grad_norm += grad.detach().pow(2).sum()
                 grad_norms.append(grad_norm.sqrt())
-        elif isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            import tensorflow as tf
-
-            model_trainable = classifier.model.trainable
-            classifier.model.trainable = False
-            grad_norms = []
-            for i in range(len(x_samples) - 1):
-                image = tf.constant(x_samples[i : i + 1])
-                label = tf.constant(y_samples[i : i + 1])
-                with tf.GradientTape() as t:  # pylint: disable=invalid-name
-                    t.watch(classifier.model.weights)
-                    output = classifier.model(image, training=False)
-                    loss_tf = classifier.loss_object(label, output)  # type: ignore
-                    gradients = list(t.gradient(loss_tf, classifier.model.weights))
-                    gradients = [w for w in gradients if w is not None]
-                    grad_norm = tf.constant(0, dtype=tf.float32)
-                    for grad in gradients:
-                        grad_norm += tf.reduce_sum(tf.math.square(grad))
-                    grad_norms.append(tf.math.sqrt(grad_norm))
-            classifier.model.trainable = model_trainable
         else:
-            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch and TensorFlowV2.")
+            raise NotImplementedError("SleeperAgentAttack is currently implemented only for PyTorch.")
         indices = sorted(range(len(grad_norms)), key=lambda k: grad_norms[k])  # type: ignore
         indices = indices[-num_poison:]
         return np.array(indices)  # this will get only indices for target class
 
@@ -122,7 +122,10 @@ def __init__(
         :param cost_multiplier: How much to change the cost in the Neural Cleanse optimization
         :param batch_size: The batch size for optimizations in the Neural Cleanse optimization
         """
+        import tensorflow as tf
+        from tensorflow.keras.layers import Lambda
         import keras.backend as K
+        from keras.optimizers import Adam
         from keras.losses import categorical_crossentropy
         from keras.metrics import categorical_accuracy
 
@@ -153,50 +156,66 @@ def __init__(
         self.epsilon = K.epsilon()
 
         # Normalize mask between [0, 1]
-        self.mask_tensor_raw = K.variable(mask)
-        # self.mask_tensor = K.expand_dims(K.tanh(self.mask_tensor_raw) / (2 - self.epsilon) + 0.5, axis=0)
-        self.mask_tensor = K.tanh(self.mask_tensor_raw) / (2 - self.epsilon) + 0.5
+        self.mask_tensor_raw = tf.Variable(mask, dtype=tf.float32)
+        # self.mask_tensor = tf.math.tanh(self.mask_tensor_raw) / (2.0 - self.epsilon) + 0.5
 
         # Normalize pattern between [0, 1]
-        self.pattern_tensor_raw = K.variable(pattern)
-        self.pattern_tensor = K.expand_dims(K.tanh(self.pattern_tensor_raw) / (2 - self.epsilon) + 0.5, axis=0)
+        self.pattern_tensor_raw = tf.Variable(pattern, dtype=tf.float32)
+        # self.pattern_tensor = tf.expand_dims(tf.math.tanh(self.pattern_tensor_raw) / (2 - self.epsilon) + 0.5, axis=0)
 
-        reverse_mask_tensor = K.ones_like(self.mask_tensor) - self.mask_tensor
-        input_tensor = K.placeholder(model.input_shape)
-        x_adv_tensor = reverse_mask_tensor * input_tensor + self.mask_tensor * self.pattern_tensor
+        # @tf.function
+        def train_step(x_batch, y_batch):
+            """Run one optimiser step on the raw mask and pattern variables and return the batch losses."""
+            with tf.GradientTape() as tape:
+                # Normalize mask and pattern
+                # (kept on self: generate_backdoor reads self.mask_tensor / self.pattern_tensor afterwards)
+                self.mask_tensor = tf.tanh(self.mask_tensor_raw) / (2 - self.epsilon) + 0.5
+                self.pattern_tensor = tf.tanh(self.pattern_tensor_raw) / (2 - self.epsilon) + 0.5
 
-        output_tensor = self.model(x_adv_tensor)
-        y_true_tensor = K.placeholder(model.outputs[0].shape.as_list())
+                # Construct adversarial example
+                reverse_mask_tensor = 1.0 - self.mask_tensor
+                x_adv = reverse_mask_tensor * x_batch + self.mask_tensor * self.pattern_tensor
 
-        self.loss_acc = categorical_accuracy(output_tensor, y_true_tensor)
-        self.loss_ce = categorical_crossentropy(output_tensor, y_true_tensor)
+                # Forward pass
+                y_pred = self.model(x_adv, training=False)
 
-        if self.norm == 1:
-            # TODO: change 3 to dynamically set img_color
-            self.loss_reg = K.sum(K.abs(self.mask_tensor)) / 3
-        elif self.norm == 2:
-            self.loss_reg = K.sqrt(K.sum(K.square(self.mask_tensor)) / 3)
+                # Classification loss
+                loss_ce = tf.keras.losses.categorical_crossentropy(y_batch, y_pred, from_logits=self.use_logits)
 
-        self.cost = self.init_cost
-        self.cost_tensor = K.variable(self.cost)
-        self.loss_combined = self.loss_ce + self.loss_reg * self.cost_tensor
+                # Accuracy
+                correct = tf.equal(tf.argmax(y_pred, axis=1), tf.argmax(y_batch, axis=1))
+                loss_acc = tf.reduce_mean(tf.cast(correct, tf.float32))
 
-        try:
-            from keras.optimizers import Adam
+                # Regularization loss
+                if self.norm == 1:
+                    loss_reg = tf.reduce_sum(tf.abs(self.mask_tensor)) / tf.cast(
+                        tf.shape(self.mask_tensor)[-1], tf.float32
+                    )
+                elif self.norm == 2:
+                    loss_reg = tf.sqrt(
+                        tf.reduce_sum(tf.square(self.mask_tensor)) / tf.cast(tf.shape(self.mask_tensor)[-1], tf.float32)
+                    )
+                else:
+                    raise ValueError(f"Unsupported norm {self.norm}")
 
-            self.opt = Adam(lr=self.learning_rate, beta_1=0.5, beta_2=0.9)
-        except ImportError:
-            from keras.optimizers import adam_v2
+                # Total loss
+                loss_combined = tf.reduce_mean(loss_ce) + self.cost * loss_reg
 
-            self.opt = adam_v2.Adam(lr=self.learning_rate, beta_1=0.5, beta_2=0.9)
-        self.updates = self.opt.get_updates(
-            params=[self.pattern_tensor_raw, self.mask_tensor_raw], loss=self.loss_combined
-        )
-        self.train = K.function(
-            [input_tensor, y_true_tensor],
-            [self.loss_ce, self.loss_reg, self.loss_combined, self.loss_acc],
-            updates=self.updates,
-        )
+            # Compute gradients
+            grads = tape.gradient(loss_combined, [self.mask_tensor_raw, self.pattern_tensor_raw])
+
+            # Apply gradients
+            self.opt.apply_gradients(zip(grads, [self.mask_tensor_raw, self.pattern_tensor_raw]))
+
+            return loss_ce, loss_reg, loss_combined, loss_acc
+
+        self.train = train_step
+
+        # Initialize cost (as a TensorFlow variable so it can be updated during training)
+        self.cost = self.init_cost
+        self.cost_tensor = tf.Variable(self.cost, trainable=False, dtype=tf.float32)
+
+        self.opt = Adam(learning_rate=self.learning_rate, beta_1=0.5, beta_2=0.9)
 
     @property
     def input_shape(self) -> tuple[int, ...]:
@@ -212,13 +231,14 @@ def reset(self):
         Reset the state of the defense
         :return:
         """
-        import keras.backend as K
+        import tensorflow as tf
 
         self.cost = self.init_cost
-        K.set_value(self.cost_tensor, self.init_cost)
-        K.set_value(self.opt.iterations, 0)
-        for weight in self.opt.weights:
-            K.set_value(weight, np.zeros(K.int_shape(weight)))
+        self.cost_tensor.assign(self.init_cost)
+        self.opt.iterations.assign(0)
+        if self.opt._variables:
+            for var in self.opt._variables:
+                var.assign(tf.zeros_like(var))
 
     def generate_backdoor(
         self, x_val: np.ndarray, y_val: np.ndarray, y_target: np.ndarray
@@ -227,8 +247,9 @@ def generate_backdoor(
         Generates a possible backdoor for the model. Returns the pattern and the mask
         :return: A tuple of the pattern and mask for the model.
         """
+        import tensorflow as tf
         import keras.backend as K
-        from keras.preprocessing.image import ImageDataGenerator
+        from tensorflow.keras.preprocessing.image import ImageDataGenerator
 
         self.reset()
         datagen = ImageDataGenerator()
@@ -249,20 +270,20 @@ def generate_backdoor(
             loss_acc_list = []
 
             for _ in range(mini_batch_size):
-                x_batch, _ = gen.next()
+                x_batch, _ = next(gen)
                 y_batch = [y_target] * x_batch.shape[0]
-                _, batch_loss_reg, _, batch_loss_acc = self.train([x_batch, y_batch])
+                _, batch_loss_reg, _, batch_loss_acc = self.train(x_batch, y_batch)
 
-                loss_reg_list.extend(list(batch_loss_reg.flatten()))
-                loss_acc_list.extend(list(batch_loss_acc.flatten()))
+                loss_reg_list.extend(list(tf.reshape(batch_loss_reg, [-1]).numpy()))
+                loss_acc_list.extend(list(tf.reshape(batch_loss_acc, [-1]).numpy()))
 
             avg_loss_reg = np.mean(loss_reg_list)
             avg_loss_acc = np.mean(loss_acc_list)
 
             # save best mask/pattern so far
             if avg_loss_acc >= self.attack_success_threshold and avg_loss_reg < reg_best:
-                mask_best = K.eval(self.mask_tensor)
-                pattern_best = K.eval(self.pattern_tensor)
+                mask_best = self.mask_tensor.numpy()
+                pattern_best = self.pattern_tensor.numpy()
                 reg_best = avg_loss_reg
 
             # check early stop
@@ -283,7 +304,7 @@ def generate_backdoor(
                 cost_set_counter += 1
                 if cost_set_counter >= self.patience:
                     self.cost = self.init_cost
-                    K.set_value(self.cost_tensor, self.cost)
+                    self.cost_tensor.assign(self.cost)
                     cost_up_counter = 0
                     cost_down_counter = 0
                     cost_up_flag = False
@@ -301,17 +322,17 @@ def generate_backdoor(
             if cost_up_counter >= self.patience:
                 cost_up_counter = 0
                 self.cost *= self.cost_multiplier_up
-                K.set_value(self.cost_tensor, self.cost)
+                self.cost_tensor.assign(self.cost)
                 cost_up_flag = True
             elif cost_down_counter >= self.patience:
                 cost_down_counter = 0
                 self.cost /= self.cost_multiplier_down
-                K.set_value(self.cost_tensor, self.cost)
+                self.cost_tensor.assign(self.cost)
                 cost_down_flag = True
 
         if mask_best is None:
-            mask_best = K.eval(self.mask_tensor)
-            pattern_best = K.eval(self.pattern_tensor)
+            mask_best = self.mask_tensor.numpy()
+            pattern_best = self.pattern_tensor.numpy()
 
         if pattern_best is None:
             raise ValueError("Unexpected `None` detected.")
 
@@ -28,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 
-@pytest.mark.only_with_platform("pytorch", "tensorflow2")
+@pytest.mark.only_with_platform("pytorch")
 def test_poison(art_warning, get_default_mnist_subset, image_dl_estimator, framework):
     try:
         (x_train, y_train), (x_test, y_test) = get_default_mnist_subset
@@ -85,7 +85,7 @@ def test_poison(art_warning, get_default_mnist_subset, image_dl_estimator, frame
         art_warning(e)
 
 
-@pytest.mark.only_with_platform("pytorch", "tensorflow2")
+@pytest.mark.only_with_platform("pytorch")
 def test_check_params(art_warning, get_default_mnist_subset, image_dl_estimator):
     try:
         classifier, _ = image_dl_estimator(functional=True)