change: expose num_clusters parameter for clarify shap in shapconfig (#2713)

prkrishnan1 · Pranav Krishnan · navinsoni · web-flow · commit e5e04081fb7f · 2021-10-27T10:51:17.000-07:00
Co-authored-by: Pranav Krishnan <kriprana@amazon.com>
Co-authored-by: Navin Soni <navinsoni89@gmail.com>
diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py
@@ -305,12 +305,13 @@ class SHAPConfig(ExplainabilityConfig):
 
     def __init__(
         self,
-        baseline,
-        num_samples,
-        agg_method,
+        baseline=None,
+        num_samples=None,
+        agg_method=None,
         use_logit=False,
         save_local_shap_values=True,
         seed=None,
+        num_clusters=None,
     ):
         """Initializes config for SHAP.
 
@@ -320,34 +321,49 @@ def __init__(
                 be the same as the dataset format. Each row should contain only the feature
                 columns/values and omit the label column/values. If None a baseline will be
                 calculated automatically by using K-means or K-prototypes in the input dataset.
-            num_samples (int): Number of samples to be used in the Kernel SHAP algorithm.
+            num_samples (None or int): Number of samples to be used in the Kernel SHAP algorithm.
                 This number determines the size of the generated synthetic dataset to compute the
-                SHAP values.
-            agg_method (str): Aggregation method for global SHAP values. Valid values are
+                SHAP values. If not provided then Clarify job will choose a proper value according
+                to the count of features.
+            agg_method (None or str): Aggregation method for global SHAP values. Valid values are
                 "mean_abs" (mean of absolute SHAP values for all instances),
                 "median" (median of SHAP values for all instances) and
                 "mean_sq" (mean of squared SHAP values for all instances).
+                If not provided then Clarify job uses method "mean_abs".
             use_logit (bool): Indicator of whether the logit function is to be applied to the model
                 predictions. Default is False. If "use_logit" is true then the SHAP values will
                 have log-odds units.
             save_local_shap_values (bool): Indicator of whether to save the local SHAP values
                 in the output location. Default is True.
             seed (int): seed value to get deterministic SHAP values. Default is None.
+            num_clusters (None or int): If a baseline is not provided, Clarify automatically
+                computes a baseline dataset via a clustering algorithm (K-means/K-prototypes).
+                num_clusters is a parameter for this algorithm. num_clusters will be the resulting
+                size of the baseline dataset. If not provided, Clarify job will use a default value.
         """
-        if agg_method not in ["mean_abs", "median", "mean_sq"]:
+        if agg_method is not None and agg_method not in ["mean_abs", "median", "mean_sq"]:
             raise ValueError(
                 f"Invalid agg_method {agg_method}." f" Please choose mean_abs, median, or mean_sq."
             )
-
+        if num_clusters is not None and baseline is not None:
+            raise ValueError(
+                "Baseline and num_clusters cannot be provided together. "
+                "Please specify one of the two."
+            )
         self.shap_config = {
-            "baseline": baseline,
-            "num_samples": num_samples,
-            "agg_method": agg_method,
             "use_logit": use_logit,
             "save_local_shap_values": save_local_shap_values,
         }
+        if baseline is not None:
+            self.shap_config["baseline"] = baseline
+        if num_samples is not None:
+            self.shap_config["num_samples"] = num_samples
+        if agg_method is not None:
+            self.shap_config["agg_method"] = agg_method
         if seed is not None:
             self.shap_config["seed"] = seed
+        if num_clusters is not None:
+            self.shap_config["num_clusters"] = num_clusters
 
     def get_explainability_config(self):
         """Returns config."""
diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py
@@ -268,6 +268,42 @@ def test_shap_config():
     assert expected_config == shap_config.get_explainability_config()
 
 
+def test_shap_config_no_baseline():
+    num_samples = 100
+    agg_method = "mean_sq"
+    use_logit = True
+    seed = 123
+    shap_config = SHAPConfig(
+        num_samples=num_samples,
+        agg_method=agg_method,
+        num_clusters=2,
+        use_logit=use_logit,
+        seed=seed,
+    )
+    expected_config = {
+        "shap": {
+            "num_samples": num_samples,
+            "agg_method": agg_method,
+            "num_clusters": 2,
+            "use_logit": use_logit,
+            "save_local_shap_values": True,
+            "seed": seed,
+        }
+    }
+    assert expected_config == shap_config.get_explainability_config()
+
+
+def test_shap_config_no_parameters():
+    shap_config = SHAPConfig()
+    expected_config = {
+        "shap": {
+            "use_logit": False,
+            "save_local_shap_values": True,
+        }
+    }
+    assert expected_config == shap_config.get_explainability_config()
+
+
 def test_invalid_shap_config():
     with pytest.raises(ValueError) as error:
         SHAPConfig(
@@ -278,6 +314,12 @@ def test_invalid_shap_config():
     assert "Invalid agg_method invalid. Please choose mean_abs, median, or mean_sq." in str(
         error.value
     )
+    with pytest.raises(ValueError) as error:
+        SHAPConfig(baseline=[[1]], num_samples=1, agg_method="mean_abs", num_clusters=2)
+    assert (
+        "Baseline and num_clusters cannot be provided together. Please specify one of the two."
+        in str(error.value)
+    )
 
 
 @pytest.fixture(scope="module")