alteryx · NabilFayak · Sep 5, 2023 · Sep 5, 2023 · Sep 5, 2023 · Sep 5, 2023
diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py
@@ -0,0 +1,158 @@
+"""Data check that screens data for skewed or bimodal distributions prior to model training to ensure model performance is unaffected."""
+
+from diptest import diptest
+from scipy.stats import skew
+
+from checkmates.data_checks import (
+    DataCheck,
+    DataCheckActionCode,
+    DataCheckActionOption,
+    DataCheckMessageCode,
+    DataCheckWarning,
+)
+
+
+class DistributionDataCheck(DataCheck):
+    """Check if the overall data contains certain distributions that may need to be transformed prior to training to improve model performance. Flags skewed or bimodal numeric columns and recommends a Yeo-Johnson transformation."""
+
+    def validate(self, X, y=None):
+        """Check if any numeric column of the data has a skewed or bimodal distribution.
+
+        Args:
+            X (pd.DataFrame): Data to check. Woodwork must already be initialized on it, because columns are selected through ``X.ww``.
+            y (pd.Series, np.ndarray, optional): Target data. Currently unused by this check.
+
+        Returns:
+            list[dict]: One DataCheckWarning (as a dict) for every Integer or Double column that is flagged as skewed or bimodal.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import woodwork
+
+            Numeric features that exhibit a skewed distribution produce a warning for the user to transform the data.
+
+            >>> X = pd.DataFrame({"a": [5, 7, 8, 9, 10, 11, 12, 15, 20]})
+            >>> X.ww.init()
+            >>> data_check = DistributionDataCheck()
+            >>> messages = data_check.validate(X)
+            >>> [message["code"] for message in messages]
+            ['SKEWED_DISTRIBUTION']
+
+            Each warning carries a ``TRANSFORM_FEATURES`` action option whose metadata names the flagged column.
+
+            >>> metadata = messages[0]["action_options"][0]["metadata"]
+            >>> metadata["transformation_strategy"], metadata["columns"]
+            ('yeojohnson', ['a'])
+
+            Columns that are not Integer or Double are ignored.
+
+            >>> X = pd.DataFrame({"b": list("xyzxyzxyz")})
+            >>> X.ww.init()
+            >>> data_check.validate(X)
+            []
+        """
+        messages = []
+
+        numeric_X = X.ww.select(["Integer", "Double"])
+
+        for col in numeric_X:
+            # Iterating a DataFrame yields column names, so pass the column's values rather than its name.
+            (
+                is_skew,
+                distribution_type,
+                skew_value,
+                coef,
+            ) = _detect_skew_distribution_helper(numeric_X[col])
+
+            if is_skew:
+                details = {
+                    "distribution type": distribution_type,
+                    "Skew Value": skew_value,
+                    "Bimodal Coefficient": coef,
+                }
+                messages.append(
+                    DataCheckWarning(
+                        message="Data may have a skewed distribution.",
+                        data_check_name=self.name,
+                        message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
+                        details=details,
+                        action_options=[
+                            DataCheckActionOption(
+                                DataCheckActionCode.TRANSFORM_FEATURES,
+                                data_check_name=self.name,
+                                metadata={
+                                    "is_skew": True,
+                                    "transformation_strategy": "yeojohnson",
+                                    # A list, because consumers ``extend`` their column list with this value.
+                                    "columns": [col],
+                                },
+                            ),
+                        ],
+                    ).to_dict(),
+                )
+        return messages
+
+
+def _detect_skew_distribution_helper(X):
+    """Label ``X`` as bimodal, negatively skewed, positively skewed, or unskewed. Returns (flagged, label, skew value, second ``diptest`` output); the ``diptest`` threshold is checked before the skew thresholds."""
+    skewness = skew(X)
+    dip_value = diptest(X)[1]
+
+    if dip_value < 0.05:
+        label = "bimodal distribution"
+    elif abs(skewness) > 0.5:
+        label = "negative skew" if skewness < 0 else "positive skew"
+    else:
+        label = "no skew"
+    return label != "no skew", label, skewness, dip_value
+
+
+# Manual smoke test (kept commented out) used to confirm that skewed columns are recognized -- it was successful.
+# import numpy as np
+# import pandas as pd
+# data = {
+#     'Column1': np.random.normal(0, 1, 1000),  # Normally distributed data
+#     'Column2': np.random.exponential(1, 1000),  # Right-skewed data
+#     'Column3': np.random.gamma(2, 2, 1000)  # Right-skewed data
+# }
+
+# df = pd.DataFrame(data)
+# df.ww.init()
+# messages = []
+
+# numeric_X = df.ww.select(["Integer", "Double"])
+# print(numeric_X)
+# for col in numeric_X:
+#     (
+#         is_skew,
+#         distribution_type,
+#         skew_value,
+#         coef,
+#     ) = _detect_skew_distribution_helper(numeric_X['Column2'])
+
+#     if is_skew:
+#         details = {
+#             "distribution type": distribution_type,
+#             "Skew Value": skew_value,
+#             "Bimodal Coefficient": coef,
+#         }
+#         messages.append(
+#             DataCheckWarning(
+#                 message="Data may have a skewed distribution.",
+#                 data_check_name="Distribution Data Check",
+#                 message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
+#                 details=details,
+#                 action_options=[
+#                     DataCheckActionOption(
+#                         DataCheckActionCode.TRANSFORM_FEATURES,
+#                         data_check_name="Distribution Data Check",
+#                         metadata={
+#                             "is_skew": True,
+#                             "transformation_strategy": "yeojohnson",
+#                             "columns" : col
+#                         },
+#                     ),
+#                 ],
+#             ).to_dict(),
+#         )
+# print(messages)
diff --git a/checkmates/data_checks/datacheck_meta/data_check_action_code.py b/checkmates/data_checks/datacheck_meta/data_check_action_code.py
@@ -19,6 +19,9 @@ class DataCheckActionCode(Enum):
     TRANSFORM_TARGET = "transform_target"
     """Action code for transforming the target data."""
 
+    TRANSFORM_FEATURES = "transform_features"
+    """Action code for transforming the features data."""
+
     REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset"
     """Action code for regularizing and imputing all features and target time series data."""
 

diff --git a/checkmates/data_checks/datacheck_meta/data_check_message_code.py b/checkmates/data_checks/datacheck_meta/data_check_message_code.py
@@ -58,6 +58,9 @@ class DataCheckMessageCode(Enum):
     TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution"
     """Message code for target data with a lognormal distribution."""
 
+    SKEWED_DISTRIBUTION = "skewed_distribution"
+    """Message code for data with a skewed distribution."""
+
     HIGH_VARIANCE = "high_variance"
     """Message code for when high variance is detected for cross-validation."""
 

diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 import woodwork
+from scipy.stats import yeojohnson
 from sklearn.impute import SimpleImputer as SkImputer
 
 from checkmates.exceptions import MethodPropertyNotFoundError
@@ -83,6 +84,51 @@ def _get_feature_provenance(self):
         return {}
 
 
+"""Component that normalizes skewed distributions using the Yeo-Johnson method."""
+
+
+class SimpleNormalizer(Transformer):
+    """Normalizes skewed data according to the Yeo-Johnson method.
+
+    Args:
+        columns (list, optional): Names of the columns to normalize. Defaults to None, which normalizes nothing.
+    """
+
+    def __init__(self, columns=None):
+        super().__init__(parameters={"columns": columns})
+        self._cols_to_normalize = list(columns) if columns is not None else []
+
+    def transform(self, X, y=None):
+        """Transforms input by normalizing the distribution of the configured columns.
+
+        Args:
+            X (pd.DataFrame): Data to transform.
+            y (pd.Series, optional): Target data. Ignored.
+
+        Returns:
+            pd.DataFrame: Copy of X with the configured columns transformed; other columns are unchanged.
+        """
+        X_t = X.copy()
+        for col in self._cols_to_normalize:
+            # scipy's yeojohnson needs 1-D input and, with lmbda unset, returns (transformed, lmbda).
+            X_t[col] = yeojohnson(X_t[col].to_numpy(dtype=float))[0]
+        # Reinit woodwork so the logical types reflect the transformed values.
+        X_t.ww.init()
+        return X_t
+
+    def fit_transform(self, X, y=None):
+        """Fits on X and transforms X.
+
+        Args:
+            X (pd.DataFrame): Data to fit and transform
+            y (pd.Series, optional): Target data.
+
+        Returns:
+            pd.DataFrame: Transformed X
+        """
+        return self.fit(X, y).transform(X, y)
+
+
 """Component that imputes missing data according to a specified imputation strategy."""
 
 

diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py
@@ -15,6 +15,7 @@
     TimeSeriesRegularizer,
 )
 from checkmates.pipelines.training_validation_split import TrainingValidationSplit
+from checkmates.pipelines.transformers import SimpleNormalizer
 from checkmates.problem_types import is_classification, is_regression, is_time_series
 from checkmates.utils import infer_feature_types
 
@@ -31,6 +32,7 @@ def _make_component_list_from_actions(actions):
     components = []
     cols_to_drop = []
     indices_to_drop = []
+    cols_to_normalize = []
 
     for action in actions:
         if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET:
@@ -47,6 +49,8 @@ def _make_component_list_from_actions(actions):
             )
         elif action.action_code == DataCheckActionCode.DROP_COL:
             cols_to_drop.extend(action.metadata["columns"])
+        elif action.action_code == DataCheckActionCode.TRANSFORM_FEATURES:
+            cols_to_normalize.extend(action.metadata["columns"])
         elif action.action_code == DataCheckActionCode.IMPUTE_COL:
             metadata = action.metadata
             parameters = metadata.get("parameters", {})
@@ -65,6 +69,9 @@ def _make_component_list_from_actions(actions):
     if indices_to_drop:
         indices_to_drop = sorted(set(indices_to_drop))
         components.append(DropRowsTransformer(indices_to_drop=indices_to_drop))
+    if cols_to_normalize:
+        cols_to_normalize = sorted(set(cols_to_normalize))
+        components.append(SimpleNormalizer(columns=cols_to_normalize))
 
     return components
 

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Created ``distribution_data_check`` to screen for positive and negative skews as well as bimodal distributions :pr:`21`
     * Fixes
     * Changes
     * Documentation Changes

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
     "woodwork>=0.22.0",
     "click>=8.0.0",
     "black[jupyter]>=22.3.0",
+    "diptest>=0.5.2",
 ]
 requires-python = ">=3.8,<4.0"
 readme = "README.md"