
Commit ac25fd1

Merge pull request #825 from TS-Lee/main
Added an option to use probability values for model stealing. Added a…
2 parents: 375f7d9 + f2ef604

4 files changed: +769 -4 lines changed

art/attacks/extraction/copycat_cnn.py

Lines changed: 9 additions & 2 deletions
@@ -51,6 +51,7 @@ class CopycatCNN(ExtractionAttack):
         "batch_size_query",
         "nb_epochs",
         "nb_stolen",
+        "use_probability",
     ]
     _estimator_requirements = (BaseEstimator, ClassifierMixin)
 
@@ -61,6 +62,7 @@ def __init__(
         batch_size_query: int = 1,
         nb_epochs: int = 10,
         nb_stolen: int = 1,
+        use_probability: bool = False
     ) -> None:
         """
         Create a Copycat CNN attack instance.
@@ -77,6 +79,7 @@ def __init__(
         self.batch_size_query = batch_size_query
         self.nb_epochs = nb_epochs
         self.nb_stolen = nb_stolen
+        self.use_probability = use_probability
         self._check_params()
 
     def extract(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> "CLASSIFIER_TYPE":
@@ -139,8 +142,9 @@ def _query_label(self, x: np.ndarray) -> np.ndarray:
         :return: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes).
         """
         labels = self.estimator.predict(x=x, batch_size=self.batch_size_query)
-        labels = np.argmax(labels, axis=1)
-        labels = to_categorical(labels=labels, nb_classes=self.estimator.nb_classes)
+        if not self.use_probability:
+            labels = np.argmax(labels, axis=1)
+            labels = to_categorical(labels=labels, nb_classes=self.estimator.nb_classes)
 
         return labels
 
@@ -156,3 +160,6 @@ def _check_params(self) -> None:
 
         if not isinstance(self.nb_stolen, (int, np.int)) or self.nb_stolen <= 0:
             raise ValueError("The number of queries submitted to the victim classifier must be a positive integer.")
+
+        if not isinstance(self.use_probability, bool):
+            raise ValueError("The argument `use_probability` has to be of type bool.")

art/attacks/extraction/knockoff_nets.py

Lines changed: 8 additions & 2 deletions
@@ -55,6 +55,7 @@ class KnockoffNets(ExtractionAttack):
         "sampling_strategy",
         "reward",
         "verbose",
+        "use_probability",
     ]
 
     _estimator_requirements = (BaseEstimator, ClassifierMixin)
@@ -69,6 +70,7 @@ def __init__(
         sampling_strategy: str = "random",
         reward: str = "all",
         verbose: bool = True,
+        use_probability: bool = False,
     ) -> None:
         """
         Create a KnockoffNets attack instance. Note, it is assumed that both the victim classifier and the thieved
@@ -92,6 +94,7 @@ def __init__(
         self.sampling_strategy = sampling_strategy
         self.reward = reward
         self.verbose = verbose
+        self.use_probability = use_probability
         self._check_params()
 
     def extract(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> "CLASSIFIER_TYPE":
@@ -173,8 +176,9 @@ def _query_label(self, x: np.ndarray) -> np.ndarray:
         :return: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)`.
         """
         labels = self.estimator.predict(x=x, batch_size=self.batch_size_query)
-        labels = np.argmax(labels, axis=1)
-        labels = to_categorical(labels=labels, nb_classes=self.estimator.nb_classes)
+        if not self.use_probability:
+            labels = np.argmax(labels, axis=1)
+            labels = to_categorical(labels=labels, nb_classes=self.estimator.nb_classes)
 
         return labels
 
@@ -403,3 +407,5 @@ def _check_params(self) -> None:
 
         if not isinstance(self.verbose, bool):
             raise ValueError("The argument `verbose` has to be of type bool.")
+        if not isinstance(self.use_probability, bool):
+            raise ValueError("The argument `use_probability` has to be of type bool.")

notebooks/README.md

Lines changed: 4 additions & 0 deletions
@@ -157,6 +157,10 @@ and MNIST datasets.
 demonstrates the detection of adversarial examples using ART. The classifier model is a neural network of a ResNet
 architecture in Keras for the CIFAR-10 dataset.
 
+## Model stealing / model theft / model extraction
+
+[model-stealing-demo.ipynb](model-stealing-demo.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/model-stealing-demo.ipynb)] demonstrates model stealing attacks and a reverse sigmoid defense against them.
+
 ## Poisoning
 
 [poisoning_attack_svm.ipynb](poisoning_attack_svm.ipynb) [[on nbviewer](https://nbviewer.jupyter.org/github/Trusted-AI/adversarial-robustness-toolbox/blob/main/notebooks/poisoning_attack_svm.ipynb)]
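
On the defence side that the notebook pairs with these attacks, ART's `ReverseSigmoid` postprocessor can be attached to the victim so that the probabilities returned by `predict()` are perturbed before the attacker sees them. A hedged sketch with illustrative `beta`/`gamma` values and the same toy architecture as above (not taken from the notebook):

```python
import torch.nn as nn
import torch.optim as optim

from art.defences.postprocessor import ReverseSigmoid
from art.estimators.classification import PyTorchClassifier

# Illustrative defence parameters; the notebook's actual settings may differ.
reverse_sigmoid = ReverseSigmoid(beta=1.0, gamma=0.2, apply_fit=False, apply_predict=True)

model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 10))
protected_victim = PyTorchClassifier(
    model=model,
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=1e-3),
    input_shape=(1, 28, 28),
    nb_classes=10,
    postprocessing_defences=[reverse_sigmoid],  # perturbs every probability vector the attacker queries
)
```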
