
Commit ee3b7ac

Irina Nicolae authored and committed
Merge branch 'master' into release
Changes:
- new defenses: JPEG compression, total variance minimization
- optimization of NewtonFool attack to run batches
- small optimizations / bug fix in Carlini attack
- small optimizations in DeepFool
- changes to compute class gradients for a list of targets for all classifiers
- minor changes in visualization module and utils
- updates in docs, examples and notebooks

Conflicts:
	art/defences/__init__.py
	art/utils.py
	requirements.txt
2 parents debd3d3 + 40f435c commit ee3b7ac


42 files changed (+1777 / -478 lines)

.travis.yml

Lines changed: 4 additions & 0 deletions
@@ -9,12 +9,16 @@ matrix:
     env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.6.0
   - python: 2.7
     env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.7.0
+  - python: 2.7
+    env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.10.0
   - python: 3.5
     env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.5.0
   - python: 3.5
     env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.6.0
   - python: 3.5
     env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.7.0
+  - python: 3.5
+    env: KERAS_BACKEND=tensorflow TENSORFLOW_V=1.10.0
   exclude:
   - env:

README.md

Lines changed: 3 additions & 1 deletion
@@ -26,13 +26,15 @@ The following **defence** methods are also supported:
 * Virtual adversarial training ([Miyato et al., 2015](https://arxiv.org/abs/1507.00677))
 * Gaussian data augmentation ([Zantedeschi et al., 2017](https://arxiv.org/abs/1707.06728))
 * Thermometer encoding ([Buckman et al., 2018](https://openreview.net/forum?id=S18Su--CW))
+* Total variance minimization ([Guo et al., 2018](https://openreview.net/forum?id=SyJ7ClWCb))
+* JPEG compression ([Dziugaite et al., 2016](https://arxiv.org/abs/1608.00853))
 
 ART also implements **detection** methods of adversarial samples:
 * Basic detector based on inputs
 * Detector trained on the activations of a specific layer
 
 The following **detector of poisoning attacks** is also supported:
-* Detector based on activations analysis
+* Detector based on activations analysis ([Chen et al., 2018](https://arxiv.org/abs/1811.03728))
 
 ## Setup
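Both defences added to this list are preprocessing steps applied to inputs before they reach the classifier. As a rough, hedged sketch (not the ART API; the function name and quality parameter are illustrative only), JPEG compression can be approximated by round-tripping each image through a JPEG encoder:

# Illustrative sketch of JPEG compression as a preprocessing defence (not the ART API).
# Assumes float images in [0, 1] with shape (N, H, W, C); requires Pillow.
from io import BytesIO

import numpy as np
from PIL import Image


def jpeg_compress(x, quality=75):
    """Round-trip each image through a JPEG encoder to suppress high-frequency perturbations."""
    out = np.zeros_like(x)
    for i, img in enumerate(x):
        pil_img = Image.fromarray((img * 255).astype(np.uint8).squeeze())
        buf = BytesIO()
        pil_img.save(buf, format='JPEG', quality=quality)
        buf.seek(0)
        decoded = np.asarray(Image.open(buf), dtype=np.float32) / 255.0
        out[i] = decoded.reshape(img.shape)
    return out

Total variance minimization plugs in the same way, but reconstructs each image by solving a small optimization problem instead of re-encoding it.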

art/attacks/carlini.py

Lines changed: 75 additions & 37 deletions
@@ -22,6 +22,7 @@
 
 import numpy as np
 
+from art import NUMPY_DTYPE
 from art.attacks.attack import Attack
 from art.utils import get_labels_np_array
 
@@ -30,10 +31,10 @@
 
 class CarliniL2Method(Attack):
     """
-    The L_2 optimized attack of Carlini and Wagner (2016). This attack is the most efficient and should be used as the
-    primary attack to evaluate potential defences (wrt the L_0 and L_inf attacks). This implementation is inspired by
-    the one in Cleverhans, which reproduces the authors' original code (https://github.com/carlini/nn_robust_attacks).
-    Paper link: https://arxiv.org/pdf/1608.04644.pdf
+    The L_2 optimized attack of Carlini and Wagner (2016). This attack is among the most effective and should be used
+    among the primary attacks to evaluate potential defences. A major difference wrt to the original implementation
+    (https://github.com/carlini/nn_robust_attacks) is that we use line search in the optimization of the attack
+    objective. Paper link: https://arxiv.org/pdf/1608.04644.pdf
     """
     attack_params = Attack.attack_params + ['confidence', 'targeted', 'learning_rate', 'max_iter',
                                             'binary_search_steps', 'initial_const', 'max_halving', 'max_doubling']
@@ -100,10 +101,10 @@ def _loss(self, x, x_adv, target, c):
         :return: A tuple holding the current logits, l2 distance and overall loss.
         :rtype: `(float, float, float)`
         """
-        l2dist = np.sum(np.square(x-x_adv))
-        z = self.classifier.predict(np.array([x_adv]), logits=True)[0]
+        l2dist = np.sum(np.square(x - x_adv))
+        z = self.classifier.predict(np.array([x_adv], dtype=NUMPY_DTYPE), logits=True)[0]
         z_target = np.sum(z * target)
-        z_other = np.max(z * (1 - target) + (np.min(z)-1)*target)
+        z_other = np.max(z * (1 - target) + (np.min(z) - 1) * target)
 
         # The following differs from the exact definition given in Carlini and Wagner (2016). There (page 9, left
         # column, last equation), the maximum is taken over Z_other - Z_target (or Z_target - Z_other respectively)
@@ -144,20 +145,21 @@ def _loss_gradient(self, z, target, x, x_adv, x_adv_tanh, c, clip_min, clip_max)
         :type target: `np.ndarray`
         """
         if self.targeted:
-            i_sub, i_add = np.argmax(target), np.argmax(z * (1 - target) + (np.min(z)-1)*target)
+            i_sub, i_add = np.argmax(target), np.argmax(z * (1 - target) + (np.min(z) - 1) * target)
         else:
-            i_add, i_sub = np.argmax(target), np.argmax(z * (1 - target) + (np.min(z)-1)*target)
+            i_add, i_sub = np.argmax(target), np.argmax(z * (1 - target) + (np.min(z) - 1) * target)
 
-        loss_gradient = self.classifier.class_gradient(np.array([x_adv]), label=i_add, logits=True)[0]
-        loss_gradient -= self.classifier.class_gradient(np.array([x_adv]), label=i_sub, logits=True)[0]
+        loss_gradient = self.classifier.class_gradient(np.array([x_adv], dtype=NUMPY_DTYPE), label=i_add,
+                                                       logits=True)[0]
+        loss_gradient -= self.classifier.class_gradient(np.array([x_adv], dtype=NUMPY_DTYPE), label=i_sub,
+                                                        logits=True)[0]
         loss_gradient *= c
-        loss_gradient += 2*(x_adv - x)
+        loss_gradient += 2 * (x_adv - x)
         loss_gradient *= (clip_max - clip_min)
-        loss_gradient *= (1-np.square(np.tanh(x_adv_tanh)))/(2*self._tanh_smoother)
+        loss_gradient *= (1 - np.square(np.tanh(x_adv_tanh))) / (2 * self._tanh_smoother)
 
         return loss_gradient[0]
-
-
+
     def _original_to_tanh(self, x_original, clip_min, clip_max):
         """
         Transform input from original to tanh space.
@@ -208,7 +210,7 @@ def generate(self, x, **kwargs):
         :return: An array holding the adversarial examples.
         :rtype: `np.ndarray`
         """
-        x_adv = x.copy()
+        x_adv = x.astype(NUMPY_DTYPE)
         (clip_min, clip_max) = self.classifier.clip_values
 
         # Parse and save attack-specific parameters
@@ -224,7 +226,8 @@ def generate(self, x, **kwargs):
         if y is None:
             y = get_labels_np_array(self.classifier.predict(x, logits=False))
 
-        for j, (ex, target) in enumerate(zip(x_adv, y)):
+        for j, (ex, target) in enumerate(zip(x_adv, y)):
+            logger.debug('Processing sample %i out of %i', j, x_adv.shape[0])
             image = ex.copy()
 
             # The optimization is performed in tanh space to keep the
@@ -238,63 +241,98 @@ def generate(self, x, **kwargs):
 
             # Initialize placeholders for best l2 distance and attack found so far
             best_l2dist = sys.float_info.max
-            best_adv_image = image
-            lr = self.learning_rate
+            best_adv_image = image
 
-            for _ in range(self.binary_search_steps):
+            for bss in range(self.binary_search_steps):
+                lr = self.learning_rate
+                logger.debug('Binary search step %i out of %i (c==%f)', bss, self.binary_search_steps, c)
 
                 # Initialize perturbation in tanh space:
-                perturbation_tanh = np.zeros(image_tanh.shape)
                 adv_image = image
                 adv_image_tanh = image_tanh
                 z, l2dist, loss = self._loss(image, adv_image, target, c)
-                attack_success = (loss-l2dist <= 0)
+                attack_success = (loss - l2dist <= 0)
+                overall_attack_success = attack_success
 
-                for it in range(self.max_iter):
+                for it in range(self.max_iter):
+                    logger.debug('Iteration step %i out of %i', it, self.max_iter)
+                    logger.debug('Total Loss: %f', loss)
+                    logger.debug('L2Dist: %f', l2dist)
+                    logger.debug('Margin Loss: %f', loss-l2dist)
+
                     if attack_success:
-                        break
+                        logger.debug('Margin Loss <= 0 --> Attack Success!')
+                        if l2dist < best_l2dist:
+                            logger.debug('New best L2Dist: %f (previous=%f)', l2dist, best_l2dist)
+                            best_l2dist = l2dist
+                            best_adv_image = adv_image
 
                     # compute gradient:
+                    logger.debug('Compute loss gradient')
                    perturbation_tanh = -self._loss_gradient(z, target, image, adv_image, adv_image_tanh,
                                                              c, clip_min, clip_max)
 
                     # perform line search to optimize perturbation
                     # first, halve the learning rate until perturbation actually decreases the loss:
                     prev_loss = loss
+                    best_loss = loss
+                    best_lr = 0
+
                     halving = 0
-                    while loss >= prev_loss and loss-l2dist > 0 and halving < self.max_halving:
-                        new_adv_image_tanh = adv_image_tanh + lr*perturbation_tanh
+                    while loss >= prev_loss and halving < self.max_halving:
+                        logger.debug('Apply gradient with learning rate %f (halving=%i)', lr, halving)
+                        new_adv_image_tanh = adv_image_tanh + lr * perturbation_tanh
                         new_adv_image = self._tanh_to_original(new_adv_image_tanh, clip_min, clip_max)
-                        _, l2dist, loss = self._loss(image, new_adv_image, target, c)
+                        _, l2dist, loss = self._loss(image, new_adv_image, target, c)
+                        logger.debug('New Total Loss: %f', loss)
+                        logger.debug('New L2Dist: %f', l2dist)
+                        logger.debug('New Margin Loss: %f', loss-l2dist)
+                        if loss < best_loss:
+                            best_loss = loss
+                            best_lr = lr
                         lr /= 2
                         halving += 1
                     lr *= 2
 
                     # if no halving was actually required, double the learning rate as long as this
                     # decreases the loss:
-                    if halving == 1:
+                    if halving == 1 and loss <= prev_loss:
                         doubling = 0
                         while loss <= prev_loss and doubling < self.max_doubling:
                             prev_loss = loss
                             lr *= 2
+                            logger.debug('Apply gradient with learning rate %f (doubling=%i)', lr, doubling)
                             doubling += 1
-                            new_adv_image_tanh = adv_image_tanh + lr*perturbation_tanh
+                            new_adv_image_tanh = adv_image_tanh + lr * perturbation_tanh
                             new_adv_image = self._tanh_to_original(new_adv_image_tanh, clip_min, clip_max)
-                            _, l2dist, loss = self._loss(image, new_adv_image, target, c)
+                            _, l2dist, loss = self._loss(image, new_adv_image, target, c)
+                            logger.debug('New Total Loss: %f', loss)
+                            logger.debug('New L2Dist: %f', l2dist)
+                            logger.debug('New Margin Loss: %f', loss-l2dist)
+                            if loss < best_loss:
+                                best_loss = loss
+                                best_lr = lr
                         lr /= 2
 
-                    # apply the optimal learning rate that was found and update the loss:
-                    adv_image_tanh = adv_image_tanh + lr*perturbation_tanh
-                    adv_image = self._tanh_to_original(adv_image_tanh, clip_min, clip_max)
+                    if best_lr >0:
+                        logger.debug('Finally apply gradient with learning rate %f', best_lr)
+                        # apply the optimal learning rate that was found and update the loss:
+                        adv_image_tanh = adv_image_tanh + best_lr * perturbation_tanh
+                        adv_image = self._tanh_to_original(adv_image_tanh, clip_min, clip_max)
+
                     z, l2dist, loss = self._loss(image, adv_image, target, c)
-                    attack_success = (loss-l2dist <= 0)
+                    attack_success = (loss - l2dist <= 0)
+                    overall_attack_success = overall_attack_success or attack_success
 
                 # Update depending on attack success:
                 if attack_success:
+                    logger.debug('Margin Loss <= 0 --> Attack Success!')
                    if l2dist < best_l2dist:
+                        logger.debug('New best L2Dist: %f (previous=%f)', l2dist, best_l2dist)
                         best_l2dist = l2dist
-                        best_adv_image = adv_image
-
+                        best_adv_image = adv_image
+
+                if overall_attack_success:
                     c_double = False
                     c = (c_lower_bound + c) / 2
                 else:
@@ -317,7 +355,7 @@ def generate(self, x, **kwargs):
         else:
             preds = np.argmax(self.classifier.predict(x), axis=1)
         rate = np.sum(adv_preds != preds) / x_adv.shape[0]
-        logger.info('Success rate of C&W attack: %.2f%%', rate)
+        logger.info('Success rate of C&W attack: %.2f%%', 100*rate)
 
         return x_adv
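The updated docstring and the loop above centre on a halving/doubling line search along the loss gradient rather than a fixed-step optimizer. Below is a minimal, self-contained sketch of that search pattern; loss_fn, x, and gradient are generic stand-ins introduced for illustration, not the attack's internal names.

# Minimal sketch of the halving/doubling line search, assuming a generic scalar
# objective `loss_fn(x)` (illustrative only, not the module's exact code).
import numpy as np


def line_search_step(x, gradient, loss_fn, lr=0.1, max_halving=5, max_doubling=5):
    """Try shrinking, then growing, the step size and keep the best loss found."""
    prev_loss = loss_fn(x)
    best_loss, best_lr = prev_loss, 0.0

    # Halve the learning rate until the step actually decreases the loss.
    loss, halving = prev_loss, 0
    while loss >= prev_loss and halving < max_halving:
        loss = loss_fn(x - lr * gradient)
        if loss < best_loss:
            best_loss, best_lr = loss, lr
        lr /= 2
        halving += 1
    lr *= 2

    # If the very first step already helped, double the rate while it keeps helping.
    if halving == 1 and loss <= prev_loss:
        doubling = 0
        while loss <= prev_loss and doubling < max_doubling:
            prev_loss = loss
            lr *= 2
            doubling += 1
            loss = loss_fn(x - lr * gradient)
            if loss < best_loss:
                best_loss, best_lr = loss, lr
        lr /= 2

    # Commit only the best step found; a failed search leaves x unchanged.
    return (x - best_lr * gradient) if best_lr > 0 else x


# Toy usage: one line-search step minimizing a quadratic.
x0 = np.array([3.0, -2.0])
x1 = line_search_step(x0, gradient=2 * x0, loss_fn=lambda v: float(np.sum(v ** 2)))

The attack in the diff applies the same pattern per iteration and only updates the adversarial candidate when best_lr > 0, so an unsuccessful search leaves the current candidate untouched.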

art/attacks/deepfool.py

Lines changed: 12 additions & 14 deletions
@@ -64,31 +64,26 @@ def generate(self, x, **kwargs):
         assert self.set_params(**kwargs)
         clip_min, clip_max = self.classifier.clip_values
         x_adv = x.copy()
+        preds = self.classifier.predict(x, logits=True)
 
         # Pick a small scalar to avoid division by 0
         tol = 10e-8
 
         for j, val in enumerate(x_adv):
             xj = val[None, ...]
-
-            # TODO move prediction outside of for loop; add batching if `x` is too large?
-            f = self.classifier.predict(xj, logits=True)[0]
+            f = preds[j]
             grd = self.classifier.class_gradient(xj, logits=True)[0]
             fk_hat = np.argmax(f)
-            fk_i_hat = fk_hat
-            nb_iter = 0
 
-            while fk_i_hat == fk_hat and nb_iter < self.max_iter:
+            for _ in range(self.max_iter):
                 grad_diff = grd - grd[fk_hat]
                 f_diff = f - f[fk_hat]
 
-                # Masking true label
-                mask = [0] * self.classifier.nb_classes
-                mask[fk_hat] = 1
+                # Choose coordinate and compute perturbation
                 norm = np.linalg.norm(grad_diff.reshape(self.classifier.nb_classes, -1), axis=1) + tol
-                value = np.ma.array(np.abs(f_diff) / norm, mask=mask)
-
-                l = value.argmin(fill_value=np.inf)
+                value = np.abs(f_diff) / norm
+                value[fk_hat] = np.inf
+                l = np.argmin(value)
                 r = (abs(f_diff[l]) / (pow(np.linalg.norm(grad_diff[l]), 2) + tol)) * grad_diff[l]
 
                 # Add perturbation and clip result
@@ -99,11 +94,14 @@ def generate(self, x, **kwargs):
                 grd = self.classifier.class_gradient(xj, logits=True)[0]
                 fk_i_hat = np.argmax(f)
 
-                nb_iter += 1
+                # Stop if misclassification has been achieved
+                if fk_i_hat != fk_hat:
+                    break
 
+            # Apply overshoot parameter
             x_adv[j] = np.clip(x[j] + (1 + self.epsilon) * (xj[0] - x[j]), clip_min, clip_max)
 
-        preds = np.argmax(self.classifier.predict(x), axis=1)
+        preds = np.argmax(preds, axis=1)
         preds_adv = np.argmax(self.classifier.predict(x_adv), axis=1)
         logger.info('Success rate of DeepFool attack: %.2f%%', (np.sum(preds != preds_adv) / x.shape[0]))
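The refactor above replaces the masked-array bookkeeping with a plain argmin over per-class linearizations. For intuition, here is a hedged, toy one-step version of that computation on a small linear classifier; W, b, and x are made-up values, and only the value/l/r expressions mirror the diff.

# One DeepFool-style step on a toy linear classifier f(x) = W @ x + b (illustrative only).
import numpy as np

tol = 10e-8
W = np.array([[1.0, 2.0], [0.5, -1.0], [-2.0, 0.3]])  # 3 classes, 2 features (made up)
b = np.array([0.1, -0.2, 0.05])
x = np.array([0.7, -0.4])

f = W @ x + b                  # logits
grd = W                        # class gradients; row k is d f_k / d x
fk_hat = int(np.argmax(f))     # currently predicted class

grad_diff = grd - grd[fk_hat]  # gradients of the logit gaps
f_diff = f - f[fk_hat]         # the logit gaps themselves

# Pick the class whose decision boundary is closest under the linearization.
norm = np.linalg.norm(grad_diff.reshape(len(f), -1), axis=1) + tol
value = np.abs(f_diff) / norm
value[fk_hat] = np.inf         # never pick the current class
l = int(np.argmin(value))

# Minimal perturbation that crosses that boundary (before the overshoot factor).
r = (abs(f_diff[l]) / (np.linalg.norm(grad_diff[l]) ** 2 + tol)) * grad_diff[l]
x_adv = x + r

In the attack itself this step repeats until the predicted class changes or max_iter is reached, and the final perturbation is scaled by (1 + epsilon) before clipping.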
