
Commit 8ccf19e

Merge pull request #1736 from monshri/dev_sleeper_agent
Implement sleeper agent hidden trigger backdoor attack in PyTorch
Parents: de5134b + a0dbae2

5 files changed: +2584 -14 lines

art/attacks/poisoning/gradient_matching_attack.py

Lines changed: 14 additions & 14 deletions
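
All 14 changed lines in this file make the same edit: methods named with two leading underscores are renamed to a single leading underscore, taking them out of Python's name-mangling scheme so they remain reachable from subclasses. A short note with a sketch of why this matters follows the diff.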
@@ -103,7 +103,7 @@ def __init__(
         self.verbose = verbose
         self._check_params()
 
-    def __initialize_poison(
+    def _initialize_poison(
         self, x_trigger: np.ndarray, y_trigger: np.ndarray, x_poison: np.ndarray, y_poison: np.ndarray
     ):
         """
@@ -118,23 +118,23 @@ def __initialize_poison(
         from art.estimators.classification.tensorflow import TensorFlowV2Classifier
 
         if isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            initializer = self.__initialize_poison_tensorflow
+            initializer = self._initialize_poison_tensorflow
         elif isinstance(self.substitute_classifier, PyTorchClassifier):
-            initializer = self.__initialize_poison_pytorch
+            initializer = self._initialize_poison_pytorch
         else:
             raise NotImplementedError(
                 "GradientMatchingAttack is currently implemented only for Tensorflow V2 and Pytorch."
             )
 
         return initializer(x_trigger, y_trigger, x_poison, y_poison)
 
-    def __finish_poison_tensorflow(self):
+    def _finish_poison_tensorflow(self):
         """
         Releases any resource and revert back unwanted change to the model.
         """
         self.substitute_classifier.model.trainable = self.model_trainable
 
-    def __finish_poison_pytorch(self):
+    def _finish_poison_pytorch(self):
         """
         Releases any resource and revert back unwanted change to the model.
         """
@@ -143,7 +143,7 @@ def __finish_poison_pytorch(self):
         else:
             self.substitute_classifier.model.eval()
 
-    def __initialize_poison_tensorflow(
+    def _initialize_poison_tensorflow(
         self, x_trigger: np.ndarray, y_trigger: np.ndarray, x_poison: np.ndarray, y_poison: np.ndarray
     ):
         """
@@ -241,7 +241,7 @@ def get_config(self) -> Dict:
         )
         self.lr_schedule = tf.keras.callbacks.LearningRateScheduler(PredefinedLRSchedule(*self.learning_rate_schedule))
 
-    def __initialize_poison_pytorch(
+    def _initialize_poison_pytorch(
         self,
         x_trigger: np.ndarray,
         y_trigger: np.ndarray,
@@ -397,11 +397,11 @@ def poison(
         from art.estimators.classification.tensorflow import TensorFlowV2Classifier
 
         if isinstance(self.substitute_classifier, TensorFlowV2Classifier):
-            poisoner = self.__poison__tensorflow
-            finish_poisoning = self.__finish_poison_tensorflow
+            poisoner = self._poison__tensorflow
+            finish_poisoning = self._finish_poison_tensorflow
         elif isinstance(self.substitute_classifier, PyTorchClassifier):
-            poisoner = self.__poison__pytorch
-            finish_poisoning = self.__finish_poison_pytorch
+            poisoner = self._poison__pytorch
+            finish_poisoning = self._finish_poison_pytorch
         else:
             raise NotImplementedError(
                 "GradientMatchingAttack is currently implemented only for Tensorflow V2 and Pytorch."
@@ -431,7 +431,7 @@ def poison(
             ]
         x_poison = x_train[indices_poison]
         y_poison = y_train[indices_poison]
-        self.__initialize_poison(x_trigger, y_trigger, x_poison, y_poison)
+        self._initialize_poison(x_trigger, y_trigger, x_poison, y_poison)
         x_poisoned, B_ = poisoner(x_poison, y_poison)  # pylint: disable=C0103
         finish_poisoning()
         B_ = np.mean(B_)  # Averaging B losses from multiple batches.  # pylint: disable=C0103
@@ -445,7 +445,7 @@ def poison(
         x_train[best_indices_poison] = best_x_poisoned
         return x_train, y_train  # y_train has not been modified.
 
-    def __poison__pytorch(self, x_poison: np.ndarray, y_poison: np.ndarray) -> Tuple[Any, Any]:
+    def _poison__pytorch(self, x_poison: np.ndarray, y_poison: np.ndarray) -> Tuple[Any, Any]:
         """
         Optimize the poison by matching the gradient within the perturbation budget.
 
@@ -515,7 +515,7 @@ def __len__(self):
                 count += 1
         return np.concatenate(all_poisoned_samples, axis=0), B_sum / count
 
-    def __poison__tensorflow(self, x_poison: np.ndarray, y_poison: np.ndarray) -> Tuple[Any, Any]:
+    def _poison__tensorflow(self, x_poison: np.ndarray, y_poison: np.ndarray) -> Tuple[Any, Any]:
         """
         Optimize the poison by matching the gradient within the perturbation budget.
 
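Note on the rename: names with two leading underscores are name-mangled by Python, so self.__poison__pytorch inside GradientMatchingAttack compiles to self._GradientMatchingAttack__poison__pytorch, and a subclass can neither override such a method nor call it under the original name. A single leading underscore keeps the method conventionally private while leaving normal attribute lookup intact, which is presumably what the sleeper agent attack added in this PR relies on when subclassing GradientMatchingAttack. A minimal, self-contained sketch of the difference (hypothetical Base/Child classes, not ART code):

class Base:
    def __helper(self):  # name-mangled to _Base__helper
        return "base"

    def _helper(self):  # single underscore: normal attribute lookup
        return "base"

    def run(self):
        # __helper is resolved as _Base__helper at compile time,
        # so a subclass's __helper is never seen here.
        return self.__helper(), self._helper()


class Child(Base):
    def __helper(self):  # mangled to _Child__helper: does NOT override
        return "child"

    def _helper(self):  # overrides Base._helper as expected
        return "child"


print(Child().run())  # ('base', 'child')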