# MIT License
#
# Copyright (C) The Adversarial Robustness Toolbox (ART) Authors 2025
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
| 18 | +""" |
| 19 | +This module implements One Pixel Shortcut attacks on Deep Neural Networks. |
| 20 | +""" |
| 21 | + |
| 22 | +import numpy as np |
| 23 | + |
| 24 | +from art.attacks.attack import PoisoningAttackBlackBox |
| 25 | + |
| 26 | +class OnePixelShortcutAttack(PoisoningAttackBlackBox): |
| 27 | + """ |
| 28 | + One-Pixel Shortcut (OPS) poisoning attack. |
| 29 | + This attack finds a single pixel (and channel value) that acts as a "shortcut" |
| 30 | + for each class by maximizing a mean-minus-variance objective over that class's |
| 31 | + images. The found pixel coordinate and color are applied to all images of the class |
| 32 | + (labels remain unchanged). Reference: Wu et al. (ICLR 2023). |
| 33 | + """ |
    attack_params: list = []  # this attack has no configurable parameters
    _estimator_requirements: tuple = ()

    def __init__(self):
        super().__init__()

    def _check_params(self) -> None:
        # No parameters to validate
        pass

    def poison(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> tuple:
        """
        Generate an OPS-poisoned dataset from clean data.

        :param x: Clean input samples as a NumPy array of shape (N, H, W, C), (N, C, H, W) or (N, H, W), with values
                  in [0, 1].
        :param y: Corresponding labels, of shape (N,) or one-hot encoded with shape (N, K). Required, since the
                  perturbation is computed per class.
        :return: Tuple (x_poison, y_poison) with one pixel modified per image and labels unchanged.
        """
        if y is None:
            raise ValueError("Labels y must be provided for the One-Pixel Shortcut attack.")
        # Copy the labels to return (labels are not changed by poisoning)
        y_poison = np.copy(y)

        # Convert the input to a numpy array (avoiding a copy where possible) and determine the channel format
        x_array = np.asarray(x)
        if x_array.ndim == 3:
            # Input shape (N, H, W): single-channel images without an explicit channel dimension
            x_orig = x_array.reshape((x_array.shape[0], x_array.shape[1], x_array.shape[2], 1)).astype(np.float32)
            channels_first = False
            grayscale = True
        elif x_array.ndim == 4:
            # Determine whether the format is NCHW or NHWC by examining the dimensions, assuming a channel count of
            # 1, 3 or 4 for the common cases (grayscale, RGB, RGBA)
            if x_array.shape[1] in (1, 3, 4) and x_array.shape[-1] not in (1, 3, 4):
                # Likely (N, C, H, W) format
                x_orig = np.transpose(x_array, (0, 2, 3, 1)).astype(np.float32)
                channels_first = True
            elif x_array.shape[-1] in (1, 3, 4) and x_array.shape[1] not in (1, 3, 4):
                # Likely (N, H, W, C) format
                x_orig = x_array.astype(np.float32)
                channels_first = False
            else:
                # Ambiguous case (e.g. tiny images where both dimensions could plausibly be channels):
                # default to treating the last dimension as channels if it matches a known channel count
                if x_array.shape[-1] in (1, 3, 4):
                    x_orig = x_array.astype(np.float32)
                    channels_first = False
                else:
                    x_orig = np.transpose(x_array, (0, 2, 3, 1)).astype(np.float32)
                    channels_first = True
            # 4-D inputs keep their explicit channel dimension on output, even when C == 1,
            # so the grayscale squeeze applied at the end only concerns 3-D inputs
            grayscale = False
        else:
            raise ValueError(f"Unsupported input tensor shape: {x_array.shape}")

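        # Examples of the heuristic above (illustrative): CIFAR-like (N, 3, 32, 32) is detected as channels-first,
        # MNIST-like (N, 28, 28, 1) as channels-last, and a fully ambiguous shape such as (N, 3, 3, 3) falls back to
        # channels-last.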
        # x_orig is now (N, H, W, C) in float32
        n, h, w, c = x_orig.shape
        # Convert the labels to integer class indices
        labels = np.asarray(y)
        if labels.ndim > 1:
            labels = labels.argmax(axis=1)
        labels = labels.astype(int)

        # Initialize the output poisoned data array
        x_poison = x_orig.copy()
        # Compute the optimal pixel for each class
        classes = np.unique(labels)
        for cls in classes:
            idx = np.where(labels == cls)[0]
            if idx.size == 0:
                continue  # skip if there are no samples for this class
            imgs_c = x_orig[idx]  # subset of images of class `cls`, shape (n_c, H, W, C)
            best_score = -np.inf
            best_coord = None
            best_color = None
            # Candidate target colors: every corner of the per-channel [0, 1] hypercube
            # (2 options for grayscale, 8 for RGB, 16 for RGBA)
            target_options = [np.array(bits, dtype=x_orig.dtype) for bits in np.ndindex(*((2,) * c))]
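            # OPS objective: for a candidate color t and pixel (i, j), let d_k(i, j) be the channel-averaged
            # distance |x_k - t| of image k at that pixel. The score mean_k(d_k) - var_k(d_k) is highest for pixels
            # that are consistently far from t across the whole class, so overwriting them with t produces a large
            # and uniform change for every class member.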
            # Evaluate each candidate color
            for target_vec in target_options:
                # Per-image distance from the target at every pixel, averaged over channels
                diffs = np.abs(imgs_c - target_vec)  # shape (n_c, H, W, C)
                per_image_diff = diffs.mean(axis=3)  # shape (n_c, H, W)
                # Score = mean - variance for each pixel position (vectorized over H x W)
                mean_diff_map = per_image_diff.mean(axis=0)  # shape (H, W)
                var_diff_map = per_image_diff.var(axis=0)  # shape (H, W)
                score_map = mean_diff_map - var_diff_map  # shape (H, W)
                # Find the pixel with the maximum score for this target color
                max_idx_flat = np.argmax(score_map)
                max_score = score_map.ravel()[max_idx_flat]
                if max_score > best_score:
                    best_score = float(max_score)
                    # Convert the flat index back to 2-D coordinates (i, j)
                    best_coord = np.unravel_index(max_idx_flat, (h, w))
                    best_color = target_vec
            # Apply the best pixel perturbation to all images of this class
            if best_coord is not None:
                i_star, j_star = best_coord
                x_poison[idx, i_star, j_star, :] = best_color

        # Restore the original data format and dtype
        if channels_first:
            x_poison = np.transpose(x_poison, (0, 3, 1, 2))
        if grayscale:
            x_poison = x_poison.reshape(n, h, w)
        x_poison = x_poison.astype(x_array.dtype)
        return x_poison, y_poison
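

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; the random data, shapes and class count below are arbitrary
    # assumptions, not part of the ART test suite):
    rng = np.random.default_rng(0)
    x_train = rng.random((32, 8, 8, 3), dtype=np.float32)  # NHWC images with values in [0, 1]
    y_train = rng.integers(0, 4, size=32)  # integer labels for 4 classes

    attack = OnePixelShortcutAttack()
    x_poisoned, y_poisoned = attack.poison(x_train, y_train)

    assert x_poisoned.shape == x_train.shape
    assert np.array_equal(y_poisoned, y_train)  # labels are untouched
    # At most one pixel per image changes (none if the chosen pixel already held the target color)
    changed = np.any(x_poisoned != x_train, axis=-1).sum(axis=(1, 2))
    assert np.all(changed <= 1)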