Commit 72158c1

curiousg23 authored and tensorflow-copybara committed
Implement PGD for L2 and L-infinity norm.
PiperOrigin-RevId: 312159458
1 parent 8885402 commit 72158c1

5 files changed (+895, -58 lines)

neural_structured_learning/configs/configs.py

Lines changed: 5 additions & 0 deletions
@@ -58,12 +58,17 @@ class AdvNeighborConfig(object):
       corresponding feature.
     clip_value_max: maximum value to clip the feature after perturbation. (See
       `clip_value_min` for the structure and shape limitations.)
+    iterations: number of iterations to run the attack for. Defaults to a single
+      step, used for the Fast Gradient Sign Method (FGSM) attack.
+    epsilon: Defines radius of the epsilon ball to project back to.
   """
   feature_mask = attr.ib(default=None)
   adv_step_size = attr.ib(default=0.001)
   adv_grad_norm = attr.ib(converter=NormType, default='l2')
   clip_value_min = attr.ib(default=None)
   clip_value_max = attr.ib(default=None)
+  iterations = attr.ib(default=1)  # 1 is the FGSM attack.
+  epsilon = attr.ib(default=None)


 @attr.s
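With these two new fields, multi-step PGD is configured entirely through `AdvNeighborConfig`. A minimal sketch (the values are illustrative; `nsl.configs.AdvNeighborConfig` is the public name referenced later in this commit, and `'infinity'` is assumed to be an accepted `adv_grad_norm` value):

```python
import neural_structured_learning as nsl

# Defaults: iterations=1 and epsilon=None, i.e. the original
# single-step FGSM-style behavior is preserved.
fgsm_config = nsl.configs.AdvNeighborConfig(adv_step_size=0.01)

# Multi-step PGD: 10 gradient steps, each iterate projected back onto an
# L-infinity ball of radius 0.1 centered at the original input.
pgd_config = nsl.configs.AdvNeighborConfig(
    adv_step_size=0.01,
    adv_grad_norm='infinity',
    iterations=10,
    epsilon=0.1)
```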

neural_structured_learning/lib/adversarial_neighbor.py

Lines changed: 119 additions & 51 deletions
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Generates adversarial neighbors.

 This file provides the class(es) and the corresponding functional interface(s)
@@ -40,12 +39,22 @@ def _apply_feature_constraints(feature, min_value, max_value):


 class _GenAdvNeighbor(abs_gen.GenNeighbor):
-  """Class for generating adversarial neighbors.
-
-  The core of this class implements the operation:
-  `adv_neighbor = input_features + adv_step_size * final_grad`
+  """Class for generating adversarial neighbors based on gradient-based methods.
+
+  The core of this class implements the projected gradient descent (PGD)
+  operation:
+  ```
+  adv_neighbor = input_features
+  iterations = 10  # Number of iterations to run PGD.
+  for _ in range(iterations):
+    grad = gradient(adv_neighbor)
+    adv_neighbor = adv_neighbor + adv_step_size * grad
+    adv_neighbor = project(adv_neighbor)
+  ```
   where `adv_step_size` is the step size (analogous to learning rate) for
-  searching/calculating adversarial neighbor.
+  searching/calculating adversarial neighbor, `gradient(x)` calculates the
+  gradient of the model at `x`, and `project(v)` projects the vector `v` onto
+  the epsilon ball.

   Attributes:
     labeled_loss: a scalar (`tf.float32`) tensor calculated from true labels (or
@@ -58,25 +67,34 @@ class _GenAdvNeighbor(abs_gen.GenNeighbor):
       `tf.DType`, like string or integer. (3) The feature is not involved in
       loss computation. If set to False, those input without gradient will be
       ignored silently and not perturbed. (default=False)
+    pgd_model_fn: the model function. Takes in the input_features and produces a
+      prediction. This is required for PGD with more than one step.
+    pgd_loss_fn: the loss function. Calculates loss between prediction and
+      ground truth.
   """

   def __init__(self,
                labeled_loss,
                adv_config,
                raise_invalid_gradient=False,
-               gradient_tape=None):
+               gradient_tape=None,
+               pgd_model_fn=None,
+               pgd_loss_fn=None):
     self._labeled_loss = labeled_loss
     self._adv_config = adv_config
     self._raise_invalid_gradient = raise_invalid_gradient
     self._gradient_tape = gradient_tape
+    self._pgd_model_fn = pgd_model_fn
+    self._pgd_loss_fn = pgd_loss_fn

-  def _compute_gradient(self, dense_features):
-    """Computes the gradient of `self._labeled_loss` w.r.t. `dense_features`."""
+  def _compute_gradient(self, loss, dense_features, gradient_tape=None):
+    """Computes the gradient given a loss and dense features."""
     feature_values = list(dense_features.values())
-    if self._gradient_tape is None:  # Assuming in graph mode, no tape required.
-      grads = tf.gradients(self._labeled_loss, feature_values)
+    if gradient_tape is None:
+      grads = tf.gradients(loss, feature_values)
     else:
-      grads = self._gradient_tape.gradient(self._labeled_loss, feature_values)
+      grads = gradient_tape.gradient(loss, feature_values)
+
     # The order of elements returned by .values() and .keys() are guaranteed
     # corresponding to each other.
     keyed_grads = dict(zip(dense_features.keys(), grads))
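The gradient helper now threads `loss` and `gradient_tape` through explicitly because PGD recomputes the loss on each iterate; only the first step can reuse the caller's `labeled_loss`. A minimal, self-contained sketch of the two code paths above (standard TensorFlow APIs; the toy loss is illustrative, not from the diff):

```python
import tensorflow as tf

x = tf.constant([[1.0, 2.0]])  # stands in for one dense feature tensor

# Eager mode: the computation must be recorded on a GradientTape that
# watches the (non-variable) input tensor.
with tf.GradientTape() as tape:
  tape.watch(x)
  loss = tf.reduce_sum(x * x)  # stand-in for the labeled loss
grad = tape.gradient(loss, x)  # -> [[2.0, 4.0]]

# Graph mode (TF1) would instead use: grads = tf.gradients(loss, [x]).
```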
@@ -131,7 +149,7 @@ def _split_dict(self, dictionary, predicate_fn):
         negatives[key] = value
     return positives, negatives

-  def gen_neighbor(self, input_features):
+  def gen_neighbor(self, input_features, pgd_labels=None):
     """Generates adversarial neighbors and the corresponding weights.

     This function perturbs only *dense* tensors to generate adversarial
@@ -148,6 +166,9 @@ def gen_neighbor(self, input_features):
         tensor(s) should be either:
         (a) pointwise samples: [batch_size, feat_len], or
         (b) sequence samples: [batch_size, seq_len, feat_len]
+      pgd_labels: the labels corresponding to each input. This should have shape
+        `[batch_size, 1]`. This is required for PGD-generated adversaries, and
+        unused otherwise.

     Returns:
       adv_neighbor: the perturbed example, with the same shape and structure as
@@ -163,41 +184,74 @@ def gen_neighbor(self, input_features):
       This error is suppressed if `raise_invalid_gradient` is set to False
       (which is the default).
     """
+    loss = self._labeled_loss
+    gradient_tape = self._gradient_tape

     # Composes both features and feature_masks to dictionaries, so that the
     # feature_masks can be looked up by key.
     features = self._compose_as_dict(input_features)
+    dense_original_features, sparse_original_features = self._split_dict(
+        features, lambda feature: isinstance(feature, tf.Tensor))
     feature_masks = self._compose_as_dict(self._adv_config.feature_mask)
     feature_min = self._compose_as_dict(self._adv_config.clip_value_min)
     feature_max = self._compose_as_dict(self._adv_config.clip_value_max)
-
-    dense_features, sparse_features = self._split_dict(
-        features, lambda feature: isinstance(feature, tf.Tensor))
-    if sparse_features:
-      sparse_keys = str(sparse_features.keys())
+    if sparse_original_features:
+      sparse_keys = str(sparse_original_features.keys())
       if self._raise_invalid_gradient:
         raise ValueError('Cannot perturb non-Tensor input: ' + sparse_keys)
       logging.log_first_n(logging.WARNING,
                           'Cannot perturb non-Tensor input: %s', 1, sparse_keys)
-
-    keyed_grads = self._compute_gradient(dense_features)
-    masked_grads = {
-        key: utils.apply_feature_mask(grad, feature_masks.get(key, None))
-        for key, grad in keyed_grads.items()
-    }
-
-    unit_perturbations = utils.maximize_within_unit_norm(
-        masked_grads, self._adv_config.adv_grad_norm)
-    perturbations = tf.nest.map_structure(
-        lambda t: t * self._adv_config.adv_step_size, unit_perturbations)
-
-    # Sparse features are copied directly without perturbation.
-    adv_neighbor = dict(sparse_features)
-    for key, feature in dense_features.items():
-      adv_neighbor[key] = tf.stop_gradient(
-          _apply_feature_constraints(
-              feature + perturbations[key] if key in perturbations else feature,
-              feature_min.get(key, None), feature_max.get(key, None)))
+    dense_features = dense_original_features
+    for t in range(self._adv_config.iterations):
+      keyed_grads = self._compute_gradient(loss, dense_features, gradient_tape)
+      masked_grads = {
+          key: utils.apply_feature_mask(grad, feature_masks.get(key, None))
+          for key, grad in keyed_grads.items()
+      }
+
+      unit_perturbations = utils.maximize_within_unit_norm(
+          masked_grads, self._adv_config.adv_grad_norm)
+      perturbations = tf.nest.map_structure(
+          lambda t: t * self._adv_config.adv_step_size, unit_perturbations)
+      # Clip perturbations into epsilon ball here. Note that this ball is
+      # centered around the original input point.
+      diff = {}
+      bounded_diff = {}
+      for key, perturb in perturbations.items():
+        # Only include features for which perturbation occurred. There is
+        # nothing to project for features without perturbations.
+        diff[key] = dense_features[key] + perturb - dense_original_features[key]
+      if self._adv_config.epsilon is not None:
+        bounded_diff = utils.project_to_ball(diff, self._adv_config.epsilon,
+                                             self._adv_config.adv_grad_norm)
+      else:
+        bounded_diff = diff
+      # Backfill the rest of the dense features.
+      for key, feature in dense_features.items():
+        if key not in bounded_diff:
+          bounded_diff[key] = feature - dense_original_features[key]
+      adv_neighbor = dict(sparse_original_features)
+      for key, feature in dense_original_features.items():
+        adv_neighbor[key] = tf.stop_gradient(
+            _apply_feature_constraints(
+                feature +
+                bounded_diff[key] if key in perturbations else feature,
+                feature_min.get(key, None), feature_max.get(key, None)))
+
+      # Update for the next iteration.
+      if t < self._adv_config.iterations - 1:
+        inputs_t = self._decompose_as(input_features, adv_neighbor)
+        # Compute the new loss to calculate gradients with.
+        features = self._compose_as_dict(inputs_t)
+        dense_features, _ = self._split_dict(
+            features, lambda feature: isinstance(feature, tf.Tensor))
+        if gradient_tape is not None:
+          with gradient_tape:
+            # Gradient calculated against dense features only.
+            gradient_tape.watch(dense_features)
+            loss = self._pgd_loss_fn(pgd_labels, self._pgd_model_fn(inputs_t))
+        else:
+          loss = self._pgd_loss_fn(pgd_labels, self._pgd_model_fn(inputs_t))

     # Converts the perturbed examples back to their original structure.
     adv_neighbor = self._decompose_as(input_features, adv_neighbor)
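`utils.project_to_ball` is defined in one of the changed files not shown on this page, so its behavior here is inferred from the call site above. A rough, hypothetical single-tensor sketch of the projection it must perform for the two norms in the commit title (the real helper takes a dict of per-feature diffs and may differ in detail):

```python
import tensorflow as tf

def project_to_ball_sketch(diff, epsilon, norm_type):
  """Hypothetical per-tensor epsilon-ball projection (not the NSL helper)."""
  if norm_type == 'infinity':
    # L-infinity ball: clip each coordinate of the diff to [-epsilon, epsilon].
    return tf.clip_by_value(diff, -epsilon, epsilon)
  elif norm_type == 'l2':
    # L2 ball: rescale the diff only when its per-example norm exceeds epsilon.
    flat = tf.reshape(diff, [tf.shape(diff)[0], -1])
    norms = tf.norm(flat, axis=-1)
    scale = tf.minimum(1.0, epsilon / tf.maximum(norms, 1e-12))
    return diff * tf.reshape(scale, [-1] + [1] * (diff.shape.rank - 1))
  raise ValueError('Unsupported norm: %s' % norm_type)
```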
@@ -212,7 +266,10 @@ def gen_adv_neighbor(input_features,
                      labeled_loss,
                      config,
                      raise_invalid_gradient=False,
-                     gradient_tape=None):
+                     gradient_tape=None,
+                     pgd_model_fn=None,
+                     pgd_loss_fn=None,
+                     pgd_labels=None):
   """Generates adversarial neighbors for the given input and loss.

   This function implements the following operation:
225282
dictionary of feature names and dense tensors. The shape of the tensor(s)
226283
should be either:
227284
(a) pointwise samples: `[batch_size, feat_len]`, or
228-
(b) sequence samples: `[batch_size, seq_len, feat_len]`.
229-
Note that only dense (`float`) tensors in `input_features` will be
230-
perturbed and all other features (`int`, `string`, or `SparseTensor`) will
231-
be kept as-is in the returning `adv_neighbor`.
285+
(b) sequence samples: `[batch_size, seq_len, feat_len]`. Note that only
286+
dense (`float`) tensors in `input_features` will be perturbed and all
287+
other features (`int`, `string`, or `SparseTensor`) will be kept as-is
288+
in the returning `adv_neighbor`.
232289
labeled_loss: A scalar tensor of floating point type calculated from true
233290
labels (or supervisions).
234291
config: A `nsl.configs.AdvNeighborConfig` object containing the following
@@ -238,15 +295,21 @@ def gen_adv_neighbor(input_features,
238295
- 'adv_grad_norm': type of tensor norm to normalize the gradient.
239296
raise_invalid_gradient: (optional) A Boolean flag indicating whether to
240297
raise an error when gradients cannot be computed on any input feature.
241-
There are three cases where this error may happen:
242-
(1) The feature is a `SparseTensor`.
243-
(2) The feature has a non-differentiable `dtype`, like string or integer.
244-
(3) The feature is not involved in loss computation.
245-
If set to `False` (default), those inputs without gradient will be ignored
246-
silently and not perturbed.
298+
There are three cases where this error may happen: (1) The feature is a
299+
`SparseTensor`. (2) The feature has a non-differentiable `dtype`, like
300+
string or integer. (3) The feature is not involved in loss computation.
301+
If set to `False` (default), those inputs without gradient will be
302+
ignored silently and not perturbed.
247303
gradient_tape: A `tf.GradientTape` object watching the calculation from
248304
`input_features` to `labeled_loss`. Can be omitted if running in graph
249305
mode.
306+
pgd_model_fn: The model to generate adversaries for. Generates predictions
307+
for a given set of inputs, in the shape of `input_features`.
308+
pgd_loss_fn: The loss function. Takes samples of labels and a model
309+
predictions.
310+
pgd_labels: labels for the input features. This should have shape
311+
`[batch_size, 1]`. Required to generate adversaries with PGD, unused
312+
otherwise.
250313
251314
Returns:
252315
adv_neighbor: The perturbed example, with the same shape and structure as
@@ -259,6 +322,11 @@ def gen_adv_neighbor(input_features,
     features cannot be perturbed. See `raise_invalid_gradient` for situations
     where this can happen.
   """
-  adv_helper = _GenAdvNeighbor(labeled_loss, config, raise_invalid_gradient,
-                               gradient_tape)
-  return adv_helper.gen_neighbor(input_features)
+  adv_helper = _GenAdvNeighbor(
+      labeled_loss,
+      config,
+      raise_invalid_gradient,
+      gradient_tape,
+      pgd_model_fn=pgd_model_fn,
+      pgd_loss_fn=pgd_loss_fn)
+  return adv_helper.gen_neighbor(input_features, pgd_labels)
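An end-to-end sketch of calling the updated entry point in eager mode. Everything beyond the argument names above is an assumption: the toy model, loss, and shapes are illustrative, `nsl.lib.gen_adv_neighbor` is assumed to be the public alias of this function, and the tape is made persistent on the assumption that each PGD step computes its own gradient from the shared tape:

```python
import neural_structured_learning as nsl
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # toy 2-class model
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

x = tf.random.normal([8, 4])  # [batch_size, feat_len]
labels = tf.random.uniform([8, 1], maxval=2, dtype=tf.int32)

config = nsl.configs.AdvNeighborConfig(
    adv_step_size=0.05, adv_grad_norm='l2', iterations=5, epsilon=0.3)

# Record the first-step loss; later steps recompute it via pgd_loss_fn.
with tf.GradientTape(persistent=True) as tape:
  tape.watch(x)
  loss = loss_fn(labels, model(x))

adv_x = nsl.lib.gen_adv_neighbor(  # see the Returns section above
    x, loss, config,
    gradient_tape=tape,
    pgd_model_fn=model,    # called as pgd_model_fn(inputs)
    pgd_loss_fn=loss_fn,   # called as pgd_loss_fn(pgd_labels, predictions)
    pgd_labels=labels)
```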
