Trusted-AI
diff --git a/‎art/attacks/evasion/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎art/attacks/evasion/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎art/attacks/evasion/composite_adversarial_attack.py‎
Lines changed: 673 additions & 0 deletions b/‎art/attacks/evasion/composite_adversarial_attack.py‎
Lines changed: 673 additions & 0 deletions
diff --git a/‎art/attacks/extraction/knockoff_nets.py‎
Lines changed: 2 additions & 2 deletions b/‎art/attacks/extraction/knockoff_nets.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎art/attacks/inference/membership_inference/black_box.py‎
Lines changed: 249 additions & 123 deletions b/‎art/attacks/inference/membership_inference/black_box.py‎
Lines changed: 249 additions & 123 deletions
diff --git a/‎art/attacks/poisoning/sleeper_agent_attack.py‎
Lines changed: 2 additions & 2 deletions b/‎art/attacks/poisoning/sleeper_agent_attack.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎art/defences/detector/poison/activation_defence.py‎
Lines changed: 3 additions & 1 deletion b/‎art/defences/detector/poison/activation_defence.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎art/defences/detector/poison/spectral_signature_defense.py‎
Lines changed: 2 additions & 0 deletions b/‎art/defences/detector/poison/spectral_signature_defense.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎art/defences/trainer/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎art/defences/trainer/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎art/defences/trainer/adversarial_trainer.py‎
Lines changed: 6 additions & 2 deletions b/‎art/defences/trainer/adversarial_trainer.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎art/defences/trainer/adversarial_trainer_oaat.py‎
Lines changed: 128 additions & 0 deletions b/‎art/defences/trainer/adversarial_trainer_oaat.py‎
Lines changed: 128 additions & 0 deletions
@@ -18,6 +18,7 @@
     from art.attacks.evasion.brendel_bethge import BrendelBethgeAttack
 
 from art.attacks.evasion.boundary import BoundaryAttack
+from art.attacks.evasion.composite_adversarial_attack import CompositeAdversarialAttackPyTorch
 from art.attacks.evasion.carlini import CarliniL2Method, CarliniLInfMethod, CarliniL0Method
 from art.attacks.evasion.decision_tree_attack import DecisionTreeAttack
 from art.attacks.evasion.deepfool import DeepFool
 
@@ -155,7 +155,7 @@ def _random_extraction(self, x: np.ndarray, thieved_classifier: "CLASSIFIER_TYPE
             y=fake_labels,
             batch_size=self.batch_size_fit,
             nb_epochs=self.nb_epochs,
-            verbose=0,
+            verbose=False,
         )
 
         return thieved_classifier
@@ -243,7 +243,7 @@ def _adaptive_extraction(
                 y=fake_label,
                 batch_size=self.batch_size_fit,
                 nb_epochs=1,
-                verbose=0,
+                verbose=False,
             )
 
             # Test new labels
 
@@ -360,7 +360,7 @@ def _create_model(
             for layer in model_pt.model.children():
                 if hasattr(layer, "reset_parameters"):
                     layer.reset_parameters()  # type: ignore
-            model_pt.fit(x_train, y_train, batch_size=batch_size, nb_epochs=epochs, verbose=1)
+            model_pt.fit(x_train, y_train, batch_size=batch_size, nb_epochs=epochs, verbose=True)
             predictions = model_pt.predict(x_test)
             accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
             logger.info("Accuracy of retrained model : %s", accuracy * 100.0)
@@ -370,7 +370,7 @@ def _create_model(
 
             self.substitute_classifier.model.trainable = True
             model_tf = self.substitute_classifier.clone_for_refitting()
-            model_tf.fit(x_train, y_train, batch_size=batch_size, nb_epochs=epochs, verbose=0)
+            model_tf.fit(x_train, y_train, batch_size=batch_size, nb_epochs=epochs, verbose=False)
             predictions = model_tf.predict(x_test)
             accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
             logger.info("Accuracy of retrained model : %s", accuracy * 100.0)
 
@@ -695,7 +695,9 @@ def _get_activations(self, x_train: Optional[np.ndarray] = None) -> np.ndarray:
 
         # wrong way to get activations activations = self.classifier.predict(self.x_train)
         if isinstance(activations, np.ndarray):
-            nodes_last_layer = np.shape(activations)[1]
+            # flatten each sample's activations to 1-D, keeping the batch axis
+            activations = np.reshape(activations, (activations.shape[0], -1))
+            nodes_last_layer = activations.shape[1]
         else:
             raise ValueError("activations is None or tensor.")
 
 
@@ -121,6 +121,8 @@ def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]:
             raise ValueError("Wrong type detected.")
 
         if features_x_poisoned is not None:
+            # flatten each sample's activations to 1-D, keeping the batch axis
+            features_x_poisoned = np.reshape(features_x_poisoned, (features_x_poisoned.shape[0], -1))
             features_split = segment_by_class(features_x_poisoned, self.y_train, self.classifier.nb_classes)
         else:
             raise ValueError("Activation are `None`.")
 
@@ -12,4 +12,6 @@
 from art.defences.trainer.adversarial_trainer_trades_pytorch import AdversarialTrainerTRADESPyTorch
 from art.defences.trainer.adversarial_trainer_awp import AdversarialTrainerAWP
 from art.defences.trainer.adversarial_trainer_awp_pytorch import AdversarialTrainerAWPPyTorch
+from art.defences.trainer.adversarial_trainer_oaat import AdversarialTrainerOAAT
+from art.defences.trainer.adversarial_trainer_oaat_pytorch import AdversarialTrainerOAATPyTorch
 from art.defences.trainer.dp_instahide_trainer import DPInstaHideTrainer
@@ -188,7 +188,9 @@ def fit_generator(self, generator: "DataGenerator", nb_epochs: int = 20, **kwarg
                     x_batch[adv_ids] = x_adv
 
                 # Fit batch
-                self._classifier.fit(x_batch, y_batch, nb_epochs=1, batch_size=x_batch.shape[0], verbose=0, **kwargs)
+                self._classifier.fit(
+                    x_batch, y_batch, nb_epochs=1, batch_size=x_batch.shape[0], verbose=False, **kwargs
+                )
                 attack_id = (attack_id + 1) % len(self.attacks)
 
     def fit(  # pylint: disable=W0221
@@ -260,7 +262,9 @@ def fit(  # pylint: disable=W0221
                     x_batch[adv_ids] = x_adv
 
                 # Fit batch
-                self._classifier.fit(x_batch, y_batch, nb_epochs=1, batch_size=x_batch.shape[0], verbose=0, **kwargs)
+                self._classifier.fit(
+                    x_batch, y_batch, nb_epochs=1, batch_size=x_batch.shape[0], verbose=False, **kwargs
+                )
                 attack_id = (attack_id + 1) % len(self.attacks)
 
     def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
 
@@ -0,0 +1,128 @@
+# MIT License
+#
+# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2023
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+This module implements the Oracle Aligned Adversarial Training (OAAT) protocol, an adversarial training
+scheme for defence against larger perturbations.
+
+| Paper link: https://link.springer.com/chapter/10.1007/978-3-031-20065-6_18
+
+| It was noted that this protocol uses a double perturbation mechanism, i.e., perturbation of the input samples
+followed by perturbation of the model parameters. Consequently, framework-specific implementations are provided in ART.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import abc
+from typing import Optional, Tuple, TYPE_CHECKING, Sequence
+
+import numpy as np
+
+from art.defences.trainer.trainer import Trainer
+from art.attacks.attack import EvasionAttack
+from art.data_generators import DataGenerator
+
+if TYPE_CHECKING:
+    from art.utils import CLASSIFIER_LOSS_GRADIENTS_TYPE
+
+
+class AdversarialTrainerOAAT(Trainer):
+    """
+    Abstract base class for the backend-specific implementations of the OAAT protocol.
+
+    | Paper link: https://link.springer.com/chapter/10.1007/978-3-031-20065-6_18
+    """
+
+    def __init__(
+        self,
+        classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
+        proxy_classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
+        lpips_classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
+        list_avg_models: Sequence["CLASSIFIER_LOSS_GRADIENTS_TYPE"],
+        attack: EvasionAttack,
+        train_params: dict,
+    ):
+        """
+        Create an :class:`.AdversarialTrainerOAAT` instance.
+
+        :param classifier: Model to train adversarially.
+        :param proxy_classifier: Model for adversarial weight perturbation.
+        :param lpips_classifier: Weight averaging model for calculating activations.
+        :param list_avg_models: Sequence of models for weight averaging.
+        :param attack: Attack to use for data augmentation in adversarial training.
+        :param train_params: Dictionary of parameters related to adversarial training.
+        """
+        self._attack = attack
+        self._proxy_classifier = proxy_classifier
+        self._lpips_classifier = lpips_classifier
+        self._list_avg_models = list_avg_models
+        self._train_params = train_params
+        self._apply_wp = False  # NOTE(review): presumably the weight-perturbation switch - confirm
+        self._apply_lpips_pert = False  # NOTE(review): presumably the LPIPS-perturbation switch - confirm
+        super().__init__(classifier)
+
+    @abc.abstractmethod
+    def fit(  # pylint: disable=W0221
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        validation_data: Optional[Tuple[np.ndarray, np.ndarray]] = None,
+        batch_size: int = 128,
+        nb_epochs: int = 20,
+        **kwargs
+    ):
+        """
+        Train a model adversarially with OAAT. Abstract: backend-specific subclasses provide the implementation.
+
+        :param x: Training set.
+        :param y: Labels for the training set.
+        :param validation_data: Tuple consisting of validation data, (x_val, y_val).
+        :param batch_size: Size of batches.
+        :param nb_epochs: Number of epochs to use for training.
+        :param kwargs: Dictionary of framework-specific arguments for the backend-specific implementation.
+        :raises NotImplementedError: If this base implementation is invoked instead of a subclass override.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def fit_generator(  # pylint: disable=W0221
+        self,
+        generator: DataGenerator,
+        validation_data: Optional[Tuple[np.ndarray, np.ndarray]] = None,
+        nb_epochs: int = 20,
+        **kwargs
+    ):
+        """
+        Train a model adversarially with OAAT using a data generator.
+        Abstract: backend-specific subclasses provide the implementation.
+
+        :param generator: Data generator.
+        :param validation_data: Tuple consisting of validation data, (x_val, y_val).
+        :param nb_epochs: Number of epochs to use for training.
+        :param kwargs: Dictionary of framework-specific arguments for the backend-specific implementation.
+        :raises NotImplementedError: If this base implementation is invoked instead of a subclass override.
+        """
+        raise NotImplementedError
+
+    def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
+        """
+        Perform prediction by delegating to the `predict` function of the wrapped classifier.
+
+        :param x: Input samples.
+        :param kwargs: Other parameters to be passed on to the `predict` function of the classifier.
+        :return: Predictions returned by the classifier for `x`.
+        """
+        return self._classifier.predict(x, **kwargs)