
Commit 8ce90cf

Merge pull request #962 from Trusted-AI/polytope
Implement Bullseye Polytope Clean Label Attack
2 parents c9dab59 + 6a48040 commit 8ce90cf

File tree

7 files changed: +680 −2 lines changed

art/attacks/poisoning/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -6,3 +6,4 @@
 from art.attacks.poisoning.feature_collision_attack import FeatureCollisionAttack
 from art.attacks.poisoning.adversarial_embedding_attack import PoisoningAttackAdversarialEmbedding
 from art.attacks.poisoning.clean_label_backdoor_attack import PoisoningAttackCleanLabelBackdoor
+from art.attacks.poisoning.bullseye_polytope_attack import BullseyePolytopeAttackPyTorch
art/attacks/poisoning/bullseye_polytope_attack.py

Lines changed: 317 additions & 0 deletions

# MIT License
#
# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2021
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This module implements Bullseye Polytope clean-label attacks on Neural Networks.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import time
from typing import Optional, Tuple, Union, TYPE_CHECKING, List

import numpy as np
from tqdm.auto import trange

from art.attacks.attack import PoisoningAttackWhiteBox
from art.estimators import BaseEstimator, NeuralNetworkMixin
from art.estimators.classification.classifier import ClassifierMixin
from art.estimators.classification.pytorch import PyTorchClassifier

if TYPE_CHECKING:
    import torch
    from art.utils import CLASSIFIER_NEURALNETWORK_TYPE

logger = logging.getLogger(__name__)


class BullseyePolytopeAttackPyTorch(PoisoningAttackWhiteBox):
    """
    Implementation of the Bullseye Polytope Attack by Aghakhani et al., 2020.
    "Bullseye Polytope: A Scalable Clean-Label Poisoning Attack with Improved Transferability"

    This implementation is based on UCSB's original code here: https://github.com/ucsb-seclab/BullseyePoison

    | Paper link: https://arxiv.org/abs/2005.00191
    """

    attack_params = PoisoningAttackWhiteBox.attack_params + [
        "target",
        "feature_layer",
        "opt",
        "max_iter",
        "learning_rate",
        "momentum",
        "decay_iter",
        "decay_coeff",
        "epsilon",
        "dropout",
        "net_repeat",
        "endtoend",
        "verbose",
    ]

    _estimator_requirements = (BaseEstimator, NeuralNetworkMixin, ClassifierMixin, PyTorchClassifier)

    def __init__(
        self,
        classifier: Union["CLASSIFIER_NEURALNETWORK_TYPE", List["CLASSIFIER_NEURALNETWORK_TYPE"]],
        target: np.ndarray,
        feature_layer: Union[Union[str, int], List[Union[str, int]]],
        opt: str = "adam",
        max_iter: int = 4000,
        learning_rate: float = 4e-2,
        momentum: float = 0.9,
        decay_iter: Union[int, List[int]] = 10000,
        decay_coeff: float = 0.5,
        epsilon: float = 0.1,
        dropout: float = 0.3,
        net_repeat: int = 1,
        endtoend: bool = True,
        verbose: bool = True,
    ):
        """
        Initialize a Bullseye Polytope clean-label poisoning attack.

        :param classifier: The proxy classifiers used for the attack. Can be a single classifier or list of classifiers
               with varying architectures.
        :param target: The target input(s) of shape (N, W, H, C) to misclassify at test time. Multiple targets will be
               averaged.
        :param feature_layer: The name(s) of the feature representation layer(s).
        :param opt: The optimizer to use for the attack. Can be 'adam' or 'sgd'.
        :param max_iter: The maximum number of iterations for the attack.
        :param learning_rate: The learning rate of clean-label attack optimization.
        :param momentum: The momentum of clean-label attack optimization.
        :param decay_iter: Which iterations to decay the learning rate.
               Can be an integer (every N iterations) or a list of integers [0, 500, 1500].
        :param decay_coeff: The decay coefficient of the learning rate.
        :param epsilon: The perturbation budget.
        :param dropout: Dropout to apply while training.
        :param net_repeat: The number of times to repeat prediction on each network.
        :param endtoend: True for end-to-end training. False for transfer learning.
        :param verbose: Show progress bars.
        """
        self.substitute_networks: List["CLASSIFIER_NEURALNETWORK_TYPE"] = (
            [classifier] if not isinstance(classifier, list) else classifier
        )

        super().__init__(classifier=self.substitute_networks[0])  # type: ignore
        self.target = target
        self.opt = opt
        self.momentum = momentum
        self.decay_iter = decay_iter
        self.epsilon = epsilon
        self.dropout = dropout
        self.net_repeat = net_repeat
        self.endtoend = endtoend
        self.feature_layer = feature_layer
        self.learning_rate = learning_rate
        self.decay_coeff = decay_coeff
        self.max_iter = max_iter
        self.verbose = verbose
        self._check_params()

    def poison(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Iteratively finds optimal attack points starting at values at x.

        :param x: The base images to begin the poison process.
        :param y: Target label.
        :return: A tuple holding the (poisoning examples, poisoning labels).
        """
        import torch

        if y is None:
            raise ValueError("You must pass in the target label as y")

        class PoisonBatch(torch.nn.Module):
            """
            Implementing this to work with PyTorch optimizers.
            """

            def __init__(self, base_list):
                super().__init__()
                base_batch = torch.stack(base_list, 0)
                self.poison = torch.nn.Parameter(base_batch.clone())

            def forward(self):
                return self.poison

        base_tensor_list = [torch.from_numpy(sample).to(self.estimator.device) for sample in x]
        poison_batch = PoisonBatch([torch.from_numpy(np.copy(sample)).to(self.estimator.device) for sample in x])
        opt_method = self.opt.lower()

        if opt_method == "sgd":
            logger.info("Using SGD to craft poison samples")
            optimizer = torch.optim.SGD(poison_batch.parameters(), lr=self.learning_rate, momentum=self.momentum)
        else:  # "adam"; _check_params() guarantees one of the two
            logger.info("Using Adam to craft poison samples")
            optimizer = torch.optim.Adam(
                poison_batch.parameters(), lr=self.learning_rate, betas=(self.momentum, 0.999)
            )

        base_tensor_batch = torch.stack(base_tensor_list, 0)
        base_range01_batch = base_tensor_batch

        # Because we have turned on DP for the substitute networks,
        # the target image's feature becomes random.
        # We can try enforcing the convex polytope in one of the multiple realizations of the feature,
        # but empirically one realization is enough.
        target_feat_list = []
        # Coefficients for the convex combination.
        # Initializing from the coefficients of last step gives faster convergence.
        s_init_coeff_list = []
        n_poisons = len(x)
        for net in self.substitute_networks:
            # End-to-end training
            if self.endtoend:
                # The target's feature representation is the "bullseye" center the poisons aim for.
                block_feats = [
                    feat.detach()
                    for feat in net.get_activations(self.target, layer=self.feature_layer, framework=True)
                ]
                target_feat_list.append(block_feats)
                s_coeff = [
                    torch.ones(n_poisons, 1).to(self.estimator.device) / n_poisons for _ in range(len(block_feats))
                ]
            else:
                target_feat_list.append(
                    net.get_activations(self.target, layer=self.feature_layer, framework=True).detach()
                )
                s_coeff = torch.ones(n_poisons, 1).to(self.estimator.device) / n_poisons

            s_init_coeff_list.append(s_coeff)

        for ite in trange(self.max_iter, disable=not self.verbose):
            # Decay the learning rate every `decay_iter` iterations,
            # or at each iteration listed when `decay_iter` is a list.
            if isinstance(self.decay_iter, int):
                decay_now = ite != 0 and ite % self.decay_iter == 0
            else:
                decay_now = ite in self.decay_iter
            if decay_now:
                for param_group in optimizer.param_groups:
                    param_group["lr"] *= self.decay_coeff
                logger.info(
                    "%s Iteration %d, Adjusted lr to %.2e",
                    time.strftime("%Y-%m-%d %H:%M:%S"),
                    ite,
                    optimizer.param_groups[0]["lr"],
                )

            poison_batch.zero_grad()
            total_loss = loss_from_center(
                self.substitute_networks,
                target_feat_list,
                poison_batch,
                self.net_repeat,
                self.endtoend,
                self.feature_layer,
            )
            total_loss.backward()
            optimizer.step()

            # Clip the perturbations into the epsilon ball and the valid input range
            perturb_range01 = torch.clamp(poison_batch.poison.data - base_tensor_batch, -self.epsilon, self.epsilon)
            perturbed_range01 = torch.clamp(
                base_range01_batch.data + perturb_range01.data,
                self.estimator.clip_values[0],
                self.estimator.clip_values[1],
            )
            poison_batch.poison.data = perturbed_range01

        return get_poison_tuples(poison_batch, y)

    def _check_params(self) -> None:
        if self.learning_rate <= 0:
            raise ValueError("Learning rate must be strictly positive")

        if self.max_iter < 1:
            raise ValueError("Value of max_iter must be at least 1")

        if not isinstance(self.feature_layer, (str, int, list)):
            raise TypeError("Feature layer should be a string or int, or a list of strings or ints")

        if self.opt.lower() not in ["adam", "sgd"]:
            raise ValueError("Optimizer must be 'adam' or 'sgd'")

        if not 0 <= self.momentum <= 1:
            raise ValueError("Momentum must be between 0 and 1")

        if isinstance(self.decay_iter, int) and self.decay_iter < 0:
            raise ValueError("decay_iter must be at least 0")

        if self.epsilon <= 0:
            raise ValueError("epsilon must be strictly positive")

        if not 0 <= self.dropout <= 1:
            raise ValueError("dropout must be between 0 and 1")

        if self.net_repeat < 1:
            raise ValueError("net_repeat must be at least 1")

        if isinstance(self.feature_layer, int) and not 0 <= self.feature_layer < len(self.estimator.layer_names):
            raise ValueError("Invalid feature layer")

        if not 0 <= self.decay_coeff <= 1:
            raise ValueError("Decay coefficient must be between zero and one")


def get_poison_tuples(poison_batch, poison_label):
    """
    Includes the labels
    """
    poison = [
        poison_batch.poison.data[num_p].unsqueeze(0).detach().cpu().numpy()
        for num_p in range(poison_batch.poison.size(0))
    ]
    return np.vstack(poison), poison_label


def loss_from_center(
    subs_net_list, target_feat_list, poison_batch, net_repeat, end2end, feature_layer
) -> "torch.Tensor":
    import torch

    if end2end:
        loss = 0
        for net, center_feats in zip(subs_net_list, target_feat_list):
            if net_repeat > 1:
                # Average the poison features over repeated stochastic forward passes.
                poisons_feats_repeats = [
                    net.get_activations(poison_batch(), layer=feature_layer, framework=True)
                    for _ in range(net_repeat)
                ]
                block_num = len(poisons_feats_repeats[0])
                poisons_feats = []
                for block_idx in range(block_num):
                    poisons_feats.append(
                        sum(poisons_feat_r[block_idx] for poisons_feat_r in poisons_feats_repeats) / net_repeat
                    )
            elif net_repeat == 1:
                poisons_feats = net.get_activations(poison_batch(), layer=feature_layer, framework=True)
            else:
                raise ValueError("net_repeat set to {}".format(net_repeat))

            net_loss = 0
            for pfeat, cfeat in zip(poisons_feats, center_feats):
                diff = torch.mean(pfeat, dim=0) - cfeat
                diff_norm = torch.norm(diff, dim=0) / torch.norm(cfeat, dim=0)
                net_loss += torch.mean(diff_norm)
            loss += net_loss / len(center_feats)
        loss = loss / len(subs_net_list)

    else:
        loss = 0
        for net, center in zip(subs_net_list, target_feat_list):
            poisons = [
                net.get_activations(poison_batch(), layer=feature_layer, framework=True) for _ in range(net_repeat)
            ]
            poisons = sum(poisons) / len(poisons)

            diff = torch.mean(poisons, dim=0) - center
            diff_norm = torch.norm(diff, dim=1) / torch.norm(center, dim=1)
            loss += torch.mean(diff_norm)

        loss = loss / len(subs_net_list)

    return loss
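Taken together, the new class is driven entirely through the public attack API. Below is a minimal usage sketch; the toy model, shapes, label choice, feature-layer index, and iteration budget are illustrative assumptions, not part of this PR:

import numpy as np
import torch
import torch.nn as nn

from art.attacks.poisoning import BullseyePolytopeAttackPyTorch
from art.estimators.classification import PyTorchClassifier

# Toy substitute network; in practice this would be a pre-trained model.
model = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 32 * 32, 10),
)
classifier = PyTorchClassifier(
    model=model,
    loss=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
    input_shape=(3, 32, 32),
    nb_classes=10,
    clip_values=(0.0, 1.0),
)

x_base = np.random.rand(5, 3, 32, 32).astype(np.float32)  # clean base images to perturb
y_base = np.eye(10)[[7] * 5].astype(np.float32)           # their unchanged, clean labels
target = np.random.rand(1, 3, 32, 32).astype(np.float32)  # test-time image to misclassify

attack = BullseyePolytopeAttackPyTorch(
    classifier,
    target=target,
    feature_layer=len(classifier.layer_names) - 2,  # a late feature layer, chosen arbitrarily
    max_iter=100,                                   # small budget for demonstration only
    epsilon=0.1,
)
x_poison, y_poison = attack.poison(x_base, y=y_base)

The poisons keep their clean labels (`y_base`); only the pixels move, within the epsilon ball, so that their feature representations surround the target's.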

art/estimators/classification/classifier.py

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ def replacement_function(self, *args, **kwargs):
         replacement_function.__name__ = "new_" + func_name
         return replacement_function

-    replacement_list_no_y = ["predict", "get_activations"]
+    replacement_list_no_y = ["predict"]
     replacement_list_has_y = ["fit"]

     for item in replacement_list_no_y:
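Context for this one-line change: the wrapper installed here coerces the wrapped method's inputs to `np.ndarray` before dispatch, and the attack needs `get_activations` to accept a live `torch.Tensor`. A simplified sketch of the coercion (illustrative only, not ART's exact code):

import numpy as np

def make_input_filter(func):
    # Wrap `func` so its first positional argument is coerced to np.ndarray,
    # mirroring in simplified form what replacement_function does above.
    def replacement_function(self, *args, **kwargs):
        if args and not isinstance(args[0], np.ndarray):
            # np.asarray would pull a torch.Tensor out of the autograd graph
            # (or fail outright for a tensor that requires grad).
            args = (np.asarray(args[0]),) + args[1:]
        return func(self, *args, **kwargs)
    replacement_function.__name__ = "new_" + func.__name__
    return replacement_function

With `get_activations` removed from `replacement_list_no_y`, tensors passed by `BullseyePolytopeAttackPyTorch` reach the PyTorch estimator untouched.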

art/estimators/classification/pytorch.py

Lines changed: 7 additions & 1 deletion

@@ -717,7 +717,11 @@ def loss_gradient(
         return grads

     def get_activations(
-        self, x: np.ndarray, layer: Union[int, str], batch_size: int = 128, framework: bool = False
+        self,
+        x: Union[np.ndarray, "torch.Tensor"],
+        layer: Optional[Union[int, str]] = None,
+        batch_size: int = 128,
+        framework: bool = False,
     ) -> np.ndarray:
         """
         Return the output of the specified layer for input `x`. `layer` is specified by layer index (between 0 and

@@ -750,6 +754,8 @@ def get_activations(
             raise TypeError("Layer must be of type str or int")

         if framework:
+            if isinstance(x, torch.Tensor):
+                return self._model(x)[layer_index]
             return self._model(torch.from_numpy(x).to(self._device))[layer_index]

         # Run prediction with batch processing
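The new `isinstance(x, torch.Tensor)` branch is what keeps the poison optimization differentiable: when the attack hands over a tensor with `framework=True`, the activations come back as a tensor on the same autograd graph instead of a detached ndarray. A hypothetical check (assuming `classifier` is a PyTorchClassifier like the one in the usage sketch above, and layer index 0 is arbitrary):

import torch

x = torch.rand(4, 3, 32, 32, requires_grad=True)
feats = classifier.get_activations(x, layer=0, framework=True)  # a torch.Tensor, not an ndarray
feats.norm().backward()     # gradients flow back through the model...
assert x.grad is not None   # ...all the way into the input pixels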

docs/modules/attacks/poisoning.rst

Lines changed: 6 additions & 0 deletions

@@ -14,6 +14,12 @@ Backdoor Poisoning Attack
    :members:
    :special-members:

+Bullseye Polytope Attack
+---------------------------
+.. autoclass:: BullseyePolytopeAttackPyTorch
+   :members:
+   :special-members:
+
 Clean Label Backdoor Attack
 ---------------------------
 .. autoclass:: PoisoningAttackCleanLabelBackdoor
