Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions checkmates/data_checks/checks/distribution_data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""Data check that screens data for skewed or bimodal distributions prior to model training to ensure model performance is unaffected."""

from diptest import diptest
from scipy.stats import skew

from checkmates.data_checks import (
DataCheck,
DataCheckActionCode,
DataCheckActionOption,
DataCheckMessageCode,
DataCheckWarning,
)


class DistributionDataCheck(DataCheck):
    """Check whether numeric features have a skewed or bimodal distribution.

    Each ``Integer``/``Double`` column is screened with a sample-skewness test
    and a dip test. Flagged columns produce a warning whose action option
    recommends a Yeo-Johnson transformation of that column before training.
    """

    def validate(self, X, y=None):
        """Check every numeric feature for a skewed or bimodal distribution.

        Args:
            X (pd.DataFrame, np.ndarray): Features to screen. Only columns whose
                Woodwork logical type is ``Integer`` or ``Double`` are examined.
            y (pd.Series, np.ndarray, optional): Target data. Accepted for
                interface compatibility; it is not used by this check.

        Returns:
            list[dict]: One ``DataCheckWarning`` dictionary per flagged column.
            The list is empty when no column is flagged.

        Examples:
            >>> import pandas as pd

            A right-skewed feature raises a warning that recommends transforming it.

            >>> X = pd.DataFrame({"a": [5, 7, 8, 9, 10, 11, 12, 15, 20]})
            >>> data_check = DistributionDataCheck()
            >>> messages = data_check.validate(X)
            >>> messages[0]["code"]
            'SKEWED_DISTRIBUTION'
            >>> messages[0]["level"]
            'warning'
        """
        # Imported locally so the check does not depend on the module's import
        # block. NOTE(review): presumably this returns a Woodwork-initialised
        # DataFrame for DataFrame/ndarray input, as the other pipeline helpers
        # rely on -- confirm against checkmates.utils.
        from checkmates.utils import infer_feature_types

        X = infer_feature_types(X)
        numeric_X = X.ww.select(["Integer", "Double"])

        messages = []
        # Iterating a DataFrame yields column *labels*; the helper needs the
        # column's values, so look each column up explicitly.
        for col in numeric_X.columns:
            (
                is_skew,
                distribution_type,
                skew_value,
                coef,
            ) = _detect_skew_distribution_helper(numeric_X[col])

            if not is_skew:
                continue

            details = {
                "distribution type": distribution_type,
                "Skew Value": skew_value,
                "Bimodal Coefficient": coef,
            }
            messages.append(
                DataCheckWarning(
                    message="Data may have a skewed distribution.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
                    details=details,
                    action_options=[
                        DataCheckActionOption(
                            DataCheckActionCode.TRANSFORM_FEATURES,
                            data_check_name=self.name,
                            metadata={
                                "is_skew": True,
                                "transformation_strategy": "yeojohnson",
                                # A list, because the action consumer
                                # ``extend``s its column list with this value;
                                # a bare string would be split into characters.
                                "columns": [col],
                            },
                        ),
                    ],
                ).to_dict(),
            )
        return messages


def _detect_skew_distribution_helper(X):
"""Helper method to detect skewed or bimodal distribution. Returns boolean, distribution type, the skew value, and bimodal coefficient."""
skew_value = skew(X)
coef = diptest(X)[1]

if coef < 0.05:
return True, "bimodal distribution", skew_value, coef
if skew_value < -0.5:
return True, "negative skew", skew_value, coef
if skew_value > 0.5:
return True, "positive skew", skew_value, coef
return False, "no skew", skew_value, coef


# Testing Data to make sure skews are recognized-- successful
# import numpy as np
# import pandas as pd
# data = {
# 'Column1': np.random.normal(0, 1, 1000), # Normally distributed data
# 'Column2': np.random.exponential(1, 1000), # Right-skewed data
# 'Column3': np.random.gamma(2, 2, 1000) # Right-skewed data
# }

# df = pd.DataFrame(data)
# df.ww.init()
# messages = []

# numeric_X = df.ww.select(["Integer", "Double"])
# print(numeric_X)
# for col in numeric_X:
# (
# is_skew,
# distribution_type,
# skew_value,
# coef,
# ) = _detect_skew_distribution_helper(numeric_X['Column2'])

# if is_skew:
# details = {
# "distribution type": distribution_type,
# "Skew Value": skew_value,
# "Bimodal Coefficient": coef,
# }
# messages.append(
# DataCheckWarning(
# message="Data may have a skewed distribution.",
# data_check_name="Distribution Data Check",
# message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
# details=details,
# action_options=[
# DataCheckActionOption(
# DataCheckActionCode.TRANSFORM_FEATURES,
# data_check_name="Distribution Data Check",
# metadata={
# "is_skew": True,
# "transformation_strategy": "yeojohnson",
# "columns" : col
# },
# ),
# ],
# ).to_dict(),
# )
# print(messages)
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class DataCheckActionCode(Enum):
TRANSFORM_TARGET = "transform_target"
"""Action code for transforming the target data."""

TRANSFORM_FEATURES = "transform_features"
"""Action code for transforming the features data."""

REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset"
"""Action code for regularizing and imputing all features and target time series data."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class DataCheckMessageCode(Enum):
TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution"
"""Message code for target data with a lognormal distribution."""

SKEWED_DISTRIBUTION = "skewed_distribution"
"""Message code for data with a skewed distribution."""

HIGH_VARIANCE = "high_variance"
"""Message code for when high variance is detected for cross-validation."""

Expand Down
46 changes: 46 additions & 0 deletions checkmates/pipelines/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas as pd
import woodwork
from scipy.stats import yeojohnson
from sklearn.impute import SimpleImputer as SkImputer

from checkmates.exceptions import MethodPropertyNotFoundError
Expand Down Expand Up @@ -83,6 +84,51 @@ def _get_feature_provenance(self):
return {}


"""Component that normalizes skewed distributions using the Yeo-Johnson method"""


class SimpleNormalizer(Transformer):
    """Normalizes skewed columns with the Yeo-Johnson power transform.

    Args:
        columns (list, optional): Labels of the columns to normalize. A single
            string label is also accepted. Defaults to None, in which case the
            component leaves its input unchanged.
    """

    def __init__(self, columns=None):
        super().__init__(
            parameters=None,
        )
        if columns is None:
            columns = []
        elif isinstance(columns, str):
            # Guard against a lone label being iterated character by character.
            columns = [columns]
        self._cols_to_normalize = list(columns)
        # Per-column Yeo-Johnson lambda learned by ``fit``.
        self._lambdas = {}

    def fit(self, X, y=None):
        """Estimate the Yeo-Johnson lambda for each configured column.

        Args:
            X (pd.DataFrame): Data to fit on.
            y (pd.Series, optional): Target data. Ignored.

        Returns:
            self
        """
        self._lambdas = {}
        for col in self._cols_to_normalize:
            # With no ``lmbda`` argument scipy returns (transformed, lambda);
            # only the fitted lambda is kept here.
            _, fitted_lambda = yeojohnson(X[col].to_numpy(dtype=float))
            self._lambdas[col] = fitted_lambda
        return self

    def transform(self, X, y=None):
        """Transforms input by normalizing the configured columns.

        Columns that were seen by ``fit`` are transformed with their fitted
        lambda. Any other configured column has its lambda estimated from the
        data passed in. Columns that are not configured are returned unchanged.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Target data. Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        # If there are no columns to normalize, return the data untouched.
        if not self._cols_to_normalize:
            return X

        # Work on a copy so the caller's frame is not mutated.
        X_t = X.copy()
        for col in self._cols_to_normalize:
            # scipy's yeojohnson only accepts one-dimensional input, so the
            # transform is applied column by column.
            values = X_t[col].to_numpy(dtype=float)
            if col in self._lambdas:
                X_t[col] = yeojohnson(values, lmbda=self._lambdas[col])
            else:
                transformed, _ = yeojohnson(values)
                X_t[col] = transformed

        # Reinit woodwork so the schema reflects the transformed columns.
        X_t.ww.init()
        return X_t

    def fit_transform(self, X, y=None):
        """Fits on X and transforms X.

        Args:
            X (pd.DataFrame): Data to fit and transform
            y (pd.Series, optional): Target data.

        Returns:
            pd.DataFrame: Transformed X
        """
        return self.fit(X, y).transform(X, y)


"""Component that imputes missing data according to a specified imputation strategy."""


Expand Down
7 changes: 7 additions & 0 deletions checkmates/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
TimeSeriesRegularizer,
)
from checkmates.pipelines.training_validation_split import TrainingValidationSplit
from checkmates.pipelines.transformers import SimpleNormalizer
from checkmates.problem_types import is_classification, is_regression, is_time_series
from checkmates.utils import infer_feature_types

Expand All @@ -31,6 +32,7 @@ def _make_component_list_from_actions(actions):
components = []
cols_to_drop = []
indices_to_drop = []
cols_to_normalize = []

for action in actions:
if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET:
Expand All @@ -47,6 +49,8 @@ def _make_component_list_from_actions(actions):
)
elif action.action_code == DataCheckActionCode.DROP_COL:
cols_to_drop.extend(action.metadata["columns"])
elif action.action_code == DataCheckActionCode.TRANSFORM_FEATURES:
cols_to_normalize.extend(action.metadata["columns"])
elif action.action_code == DataCheckActionCode.IMPUTE_COL:
metadata = action.metadata
parameters = metadata.get("parameters", {})
Expand All @@ -65,6 +69,9 @@ def _make_component_list_from_actions(actions):
if indices_to_drop:
indices_to_drop = sorted(set(indices_to_drop))
components.append(DropRowsTransformer(indices_to_drop=indices_to_drop))
if cols_to_normalize:
cols_to_normalize = sorted(set(cols_to_normalize))
components.append(SimpleNormalizer(columns=cols_to_normalize))

return components

Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Created ``distribution_data_check`` to screen for positive and negative skews as well as bimodal distributions :pr:`21`
* Fixes
* Changes
* Documentation Changes
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [
"woodwork>=0.22.0",
"click>=8.0.0",
"black[jupyter]>=22.3.0",
"diptest>=0.5.2",
]
requires-python = ">=3.8,<4.0"
readme = "README.md"
Expand Down