from __future__ import absolute_import, division, print_function

from keras.utils.generic_utils import Progbar

import numpy as np
import tensorflow as tf

from src.attacks.attack import Attack, class_derivative


class NewtonFool(Attack):
    """
    Implementation of the NewtonFool attack from Uyeong Jang et al. (2017).
    Paper link: http://doi.acm.org/10.1145/3134600.3134635
    """
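    # Per-step update implemented below: for predicted class l with softmax
    # score F_l(x_i) and gradient g_i = dF_l/dx at the current iterate x_i,
    # the step size is
    #     theta_i = min(eta * ||x_0|| * ||g_i||, F_l(x_i) - 1/nb_classes)
    # and the perturbation added to the iterate is
    #     d_i = -theta_i * g_i / ||g_i||^2,
    # which pushes the score of the original class down towards the uniform
    # value 1/nb_classes.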
    attack_params = ["max_iter", "eta", "verbose"]

    def __init__(self, classifier, sess, max_iter=100, eta=0.01, verbose=1):
        """
        Create a NewtonFool attack instance.

        :param classifier: The classifier to be attacked.
        :param sess: The tf session to run graphs in.
        :param max_iter: (integer) The maximum number of iterations.
        :param eta: (float) The eta coefficient.
        :param verbose: (optional boolean) Whether to display a progress bar.
        """
        super(NewtonFool, self).__init__(classifier, sess)
        params = {"max_iter": max_iter, "eta": eta, "verbose": verbose}
        self.set_params(**params)

    def generate(self, x_val, **kwargs):
        """
        Generate adversarial samples and return them in a Numpy array.

        :param x_val: (required) A Numpy array with the original inputs.
        :param kwargs: Attack-specific parameters, passed on to `set_params`.
        :return: A Numpy array holding the adversarial examples.
        """
        assert self.set_params(**kwargs)
        dims = list(x_val.shape)
        dims[0] = None
        nb_classes = self.classifier.model.output_shape[1]
        xi_op = tf.placeholder(dtype=tf.float32, shape=dims)
        preds = self.classifier.model(xi_op)
        grads_graph = class_derivative(preds, xi_op, nb_classes)
        x_adv = x_val.copy()

        # Progress bar
        progress_bar = Progbar(target=len(x_val), verbose=self.verbose)

        # Class predicted for each original sample; this is the class whose
        # score the attack pushes down
        y_pred = self.classifier.model.predict(x_val)
        pred_class = np.argmax(y_pred, axis=1)

        # Main algorithm for each example
        for j, x in enumerate(x_adv):
            xi = x[None, ...]
            norm_x0 = np.linalg.norm(np.reshape(x, [-1]))
            l = pred_class[j]

            # Main loop of the algorithm
            for _ in range(self.max_iter):
                # Compute the softmax score of the attacked class
                score = self.classifier.model.predict(xi)[0][l]

                # Compute the gradient of that score w.r.t. the input,
                # and its norm
                grads = self.sess.run(grads_graph, feed_dict={xi_op: xi})[l][0]
                norm_grad = np.linalg.norm(np.reshape(grads, [-1]))

                # Step size theta
                theta = self._compute_theta(norm_x0, score, norm_grad,
                                            nb_classes)

                # Perturbation
                di = self._compute_pert(theta, grads, norm_grad)

                # Update the current iterate
                xi += di

            # Store the adversarial example
            x_adv[j] = xi[0]
            progress_bar.update(current=j + 1,
                                values=[("perturbation", np.linalg.norm(
                                    (x_adv[j] - x_val[j]).flatten()))])

        return x_adv

    def set_params(self, **kwargs):
        """
        Take in a dictionary of parameters and apply attack-specific checks
        before saving them as attributes.

        Attack-specific parameters:
        :param max_iter: (integer) The maximum number of iterations.
        :param eta: (float) The eta coefficient.
        :param verbose: (optional boolean) Whether to display a progress bar.
        """
        # Save attack-specific parameters
        super(NewtonFool, self).set_params(**kwargs)

        if not isinstance(self.max_iter, int) or self.max_iter <= 0:
            raise ValueError("The number of iterations must be a "
                             "positive integer.")

        if not isinstance(self.eta, float) or self.eta <= 0:
            raise ValueError("The eta coefficient must be a positive float.")

        return True
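
    # Note: generate() routes its **kwargs through set_params(), so e.g.
    # attack.generate(x_test, max_iter=20) revalidates and overwrites the
    # stored max_iter before running.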

    def _compute_theta(self, norm_x0, score, norm_grad, nb_classes):
        """
        Compute the theta value for the current step.

        :param norm_x0: Norm of the original input x0.
        :param score: Softmax score of the attacked class.
        :param norm_grad: Norm of the gradient of the attacked class score.
        :param nb_classes: Number of classes.
        :return: Theta value.
        """
        equ1 = self.eta * norm_x0 * norm_grad
        equ2 = score - 1.0 / nb_classes
        result = min(equ1, equ2)

        return result
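
    # Worked example with illustrative numbers: for eta = 0.01,
    # ||x0|| = 10.0, ||grad|| = 0.5 and a softmax score of 0.9 over
    # nb_classes = 10:
    #     equ1 = 0.01 * 10.0 * 0.5 = 0.05
    #     equ2 = 0.9 - 1.0 / 10  = 0.8
    #     theta = min(0.05, 0.8) = 0.05
    # so early on the step size is typically capped by the eta term, and
    # equ2 takes over as the score approaches the uniform value 1/nb_classes.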

    def _compute_pert(self, theta, grads, norm_grad):
        """
        Compute the perturbation for the current step.

        :param theta: Theta value for the current step.
        :param grads: Gradient of the attacked class score w.r.t. the input.
        :param norm_grad: Norm of that gradient.
        :return: Perturbation.
        """
        num = -theta * grads
        denom = norm_grad ** 2
        result = num / denom

        return result
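

# A minimal usage sketch, kept as a comment because the surrounding pieces
# are assumptions: `KerasClassifier` stands in for whatever wrapper exposes
# the compiled Keras model as a `.model` attribute, and `model` / `x_test`
# are placeholders for the user's own network and data.
#
#     import keras.backend as K
#
#     classifier = KerasClassifier(model)            # hypothetical wrapper
#     attack = NewtonFool(classifier, sess=K.get_session(),
#                         max_iter=50, eta=0.01, verbose=1)
#     x_test_adv = attack.generate(x_test)           # same shape as x_test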