Fix handling of true labels with regression models

abigailgold · abigailgold · commit 4bdf0c867124 · 2022-03-17T22:18:21.000+02:00
Signed-off-by: abigailt <abigailt@il.ibm.com>
diff --git a/art/attacks/inference/attribute_inference/black_box.py b/art/attacks/inference/attribute_inference/black_box.py
@@ -150,6 +150,8 @@ def fit(self, x: np.ndarray, y: Optional[np.ndarray] = None) -> None:
         # get model's predictions for x
         if ClassifierMixin in type(self.estimator).__mro__:
             predictions = np.array([np.argmax(arr) for arr in self.estimator.predict(x)]).reshape(-1, 1)
+            if y is not None:
+                y = check_and_transform_label_format(y, return_one_hot=True)
         else:  # Regression model
             if self.scale_range is not None:
                 predictions = minmax_scale(self.estimator.predict(x).reshape(-1, 1), feature_range=self.scale_range)
@@ -159,6 +161,7 @@ def fit(self, x: np.ndarray, y: Optional[np.ndarray] = None) -> None:
                 predictions = self.estimator.predict(x).reshape(-1, 1) * self.prediction_normal_factor
                 if y is not None:
                     y = y * self.prediction_normal_factor
+            y = y.reshape(-1, 1)
 
         # get vector of attacked feature
         y_attack = x[:, self.attack_feature]
@@ -173,7 +176,6 @@ def fit(self, x: np.ndarray, y: Optional[np.ndarray] = None) -> None:
         x_train = np.concatenate((np.delete(x, self.attack_feature, 1), predictions), axis=1).astype(np.float32)
 
         if y is not None:
-            y = check_and_transform_label_format(y, return_one_hot=True)
             x_train = np.concatenate((x_train, y), axis=1)
 
         # train attack model
@@ -224,11 +226,13 @@ def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.n
                 x_test = np.concatenate((x, pred * self.prediction_normal_factor), axis=1).astype(np.float32)
                 if y is not None:
                     y = y * self.prediction_normal_factor
+            y = y.reshape(-1, 1)
         else:
             x_test = np.concatenate((x, pred), axis=1).astype(np.float32)
+            if y is not None:
+                y = check_and_transform_label_format(y, return_one_hot=True)
 
         if y is not None:
-            y = check_and_transform_label_format(y, return_one_hot=True)
             x_test = np.concatenate((x_test, y), axis=1)
 
         predictions = self.attack_model.predict(x_test).astype(np.float32)
diff --git a/art/attacks/inference/attribute_inference/true_label_baseline.py b/art/attacks/inference/attribute_inference/true_label_baseline.py
@@ -53,6 +53,7 @@ def __init__(
         attack_model_type: str = "nn",
         attack_model: Optional["CLASSIFIER_TYPE"] = None,
         attack_feature: Union[int, slice] = 0,
+        is_regression: Optional[bool] = False,
         scale_range: Optional[slice] = None,
         prediction_normal_factor: float = 1,
     ):
@@ -66,11 +67,12 @@ def __init__(
         :param attack_feature: The index of the feature to be attacked or a slice representing multiple indexes in
                                case of a one-hot encoded feature.
                                case of a one-hot encoded feature.
+        :param is_regression: Whether the model is a regression model. Default is False (classification).
         :param scale_range: If supplied, the class labels (both true and predicted) will be scaled to the given range.
-                            Only applicable when `estimator` is a regressor.
+                            Only applicable when `is_regression` is True.
         :param prediction_normal_factor: If supplied, the class labels (both true and predicted) are multiplied by the
                                          factor when used as inputs to the attack-model. Only applicable when
-                                         `estimator` is a regressor and if `scale_range` is not supplied.
+                                         `is_regression` is True and if `scale_range` is not supplied.
         """
         super().__init__(estimator=None, attack_feature=attack_feature)
 
@@ -118,6 +120,7 @@ def __init__(
 
         self.prediction_normal_factor = prediction_normal_factor
         self.scale_range = scale_range
+        self.is_regression = is_regression
         self._check_params()
 
     def fit(self, x: np.ndarray, y: np.ndarray) -> None:
@@ -144,11 +147,14 @@ def fit(self, x: np.ndarray, y: np.ndarray) -> None:
             raise ValueError("None value detected.")
 
         # create training set for attack model
-        if self.scale_range is not None:
-            normalized_labels = minmax_scale(y, feature_range=self.scale_range)
+        if self.is_regression:
+            if self.scale_range is not None:
+                normalized_labels = minmax_scale(y, feature_range=self.scale_range)
+            else:
+                normalized_labels = y * self.prediction_normal_factor
+            normalized_labels = normalized_labels.reshape(-1, 1)
         else:
-            normalized_labels = y * self.prediction_normal_factor
-        normalized_labels = check_and_transform_label_format(normalized_labels, return_one_hot=True)
+            normalized_labels = check_and_transform_label_format(y, return_one_hot=True)
         x_train = np.concatenate((np.delete(x, self.attack_feature, 1), normalized_labels), axis=1).astype(np.float32)
 
         # train attack model
@@ -177,11 +183,14 @@ def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.n
         if values is not None:
             self._values = values
 
-        if self.scale_range is not None:
-            normalized_labels = minmax_scale(y, feature_range=self.scale_range)
+        if self.is_regression:
+            if self.scale_range is not None:
+                normalized_labels = minmax_scale(y, feature_range=self.scale_range)
+            else:
+                normalized_labels = y * self.prediction_normal_factor
+            normalized_labels = normalized_labels.reshape(-1, 1)
         else:
-            normalized_labels = y * self.prediction_normal_factor
-        normalized_labels = check_and_transform_label_format(normalized_labels, return_one_hot=True)
+            normalized_labels = check_and_transform_label_format(y, return_one_hot=True)
         x_test = np.concatenate((x, normalized_labels), axis=1).astype(np.float32)
 
         predictions = self.attack_model.predict(x_test).astype(np.float32)
diff --git a/tests/attacks/inference/attribute_inference/test_black_box.py b/tests/attacks/inference/attribute_inference/test_black_box.py
@@ -261,6 +261,77 @@ def transform_feature(x):
         art_warning(e)
 
 
+@pytest.mark.skip_framework("dl_frameworks")
+@pytest.mark.parametrize("model_type", ["nn", "rf"])
+def test_black_box_regressor_label(art_warning, get_diabetes_dataset, model_type):
+    try:
+        attack_feature = 0  # age
+
+        bins = [  # 11 edges delimiting the 10 categories of the attacked feature
+            -0.96838121,
+            -0.77154309,
+            -0.57470497,
+            -0.37786684,
+            -0.18102872,
+            0.0158094,
+            0.21264752,
+            0.40948564,
+            0.60632376,
+            0.80316188,
+            1.0,
+        ]
+
+        # make attacked feature categorical (in place). NOTE(review): written labels 0/1 later re-match bins 4/9
+        def transform_feature(x):
+            for i in range(len(bins) - 1):
+                x[(x >= bins[i]) & (x <= bins[i + 1])] = i  # both bin edges are inclusive
+
+        values = list(range(len(bins) - 1))  # one candidate label per bin: 0..9
+
+        (x_train_diabetes, y_train_diabetes), (x_test_diabetes, y_test_diabetes) = get_diabetes_dataset
+        # training data without attacked feature
+        x_train_for_attack = np.delete(x_train_diabetes, attack_feature, 1)
+        # only attacked feature (copied so the binning does not touch the dataset the target model is fit on)
+        x_train_feature = x_train_diabetes[:, attack_feature].copy().reshape(-1, 1)
+        transform_feature(x_train_feature)
+        # attack training data: the binned feature re-inserted at its original column position
+        x_train = np.concatenate((x_train_for_attack[:, :attack_feature], x_train_feature), axis=1)
+        x_train = np.concatenate((x_train, x_train_for_attack[:, attack_feature:]), axis=1)
+
+        # test data without attacked feature
+        x_test_for_attack = np.delete(x_test_diabetes, attack_feature, 1)
+        # only attacked feature (binned copy, used as ground truth for the accuracy check)
+        x_test_feature = x_test_diabetes[:, attack_feature].copy().reshape(-1, 1)
+        transform_feature(x_test_feature)
+
+        from sklearn import linear_model
+
+        regr_model = linear_model.LinearRegression()
+        regr_model.fit(x_train_diabetes, y_train_diabetes)  # target model is fit on the un-binned data
+        regressor = ScikitlearnRegressor(regr_model)
+
+        attack = AttributeInferenceBlackBox(
+            regressor, attack_feature=attack_feature, prediction_normal_factor=1 / 250, attack_model_type=model_type
+        )
+        # target model's predictions on the un-binned data, handed to infer() through `pred`
+        x_train_predictions = regressor.predict(x_train_diabetes).reshape(-1, 1)
+        x_test_predictions = regressor.predict(x_test_diabetes).reshape(-1, 1)
+        # train attack model, supplying the true regression labels through `y`
+        attack.fit(x_train, y=y_train_diabetes)
+        # infer attacked feature, again supplying the true labels through `y`
+        inferred_train = attack.infer(x_train_for_attack, pred=x_train_predictions, values=values, y=y_train_diabetes)
+        inferred_test = attack.infer(x_test_for_attack, pred=x_test_predictions, values=values, y=y_test_diabetes)
+        # check accuracy against the binned attacked feature
+        train_acc = np.sum(inferred_train == x_train_feature.reshape(1, -1)) / len(inferred_train)
+        test_acc = np.sum(inferred_test == x_test_feature.reshape(1, -1)) / len(inferred_test)
+
+        assert pytest.approx(0.0258, abs=0.12) == train_acc  # accepts any accuracy up to 0.1458
+        assert pytest.approx(0.0375, abs=0.12) == test_acc  # accepts any accuracy up to 0.1575
+
+    except ARTTestException as e:
+        art_warning(e)
+
+
 @pytest.mark.skip_framework("dl_frameworks")
 def test_black_box_with_model(art_warning, decision_tree_estimator, get_iris_dataset):
     try:
diff --git a/tests/attacks/inference/attribute_inference/test_true_label_baseline.py b/tests/attacks/inference/attribute_inference/test_true_label_baseline.py
@@ -133,6 +133,46 @@ def transform_feature(x):
         art_warning(e)
 
 
+@pytest.mark.skip_framework("dl_frameworks")
+@pytest.mark.parametrize("model_type", ["nn", "rf"])
+def test_true_label_baseline_regression(art_warning, get_diabetes_dataset, model_type):
+    try:
+        attack_feature = 1  # sex
+
+        (x_train, y_train), (x_test, y_test) = get_diabetes_dataset
+        # training data without attacked feature
+        x_train_for_attack = np.delete(x_train, attack_feature, 1)
+        # only attacked feature (ground truth for the train accuracy check)
+        x_train_feature = x_train[:, attack_feature].copy().reshape(-1, 1)
+
+        # test data without attacked feature
+        x_test_for_attack = np.delete(x_test, attack_feature, 1)
+        # only attacked feature (ground truth for the test accuracy check)
+        x_test_feature = x_test[:, attack_feature].copy().reshape(-1, 1)
+
+        baseline_attack = AttributeInferenceBaselineTrueLabel(
+            attack_feature=attack_feature, attack_model_type=model_type, is_regression=True
+        )
+        # train attack model; is_regression=True appends the labels as one scaled column instead of one-hot encoding
+        baseline_attack.fit(x_train, y_train)
+        # infer attacked feature from the remaining features plus the true labels
+        baseline_inferred_train = baseline_attack.infer(x_train_for_attack, y=y_train)
+        baseline_inferred_test = baseline_attack.infer(x_test_for_attack, y=y_test)
+        # check accuracy against the held-out attacked feature column
+        baseline_train_acc = np.sum(baseline_inferred_train == x_train_feature.reshape(1, -1)) / len(
+            baseline_inferred_train
+        )
+        baseline_test_acc = np.sum(baseline_inferred_test == x_test_feature.reshape(1, -1)) / len(
+            baseline_inferred_test
+        )
+
+        assert 0.7 <= baseline_train_acc  # lower bound only
+        assert 0.6 <= baseline_test_acc  # lower bound only
+
+    except ARTTestException as e:
+        art_warning(e)
+
+
 @pytest.mark.framework_agnostic
 def test_check_params(art_warning):
     try: