Skip to content

Commit 69ca038

Browse files
authored
Merge pull request #17 from alteryx/invalid_target_data_check
invalid_target_data_check added
2 parents 6e2bc39 + ff9ff13 commit 69ca038

15 files changed

+2044
-0
lines changed

checkmates/data_checks/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
from checkmates.data_checks.checks.multicollinearity_data_check import (
5050
MulticollinearityDataCheck,
5151
)
52+
from checkmates.data_checks.checks.invalid_target_data_check import (
53+
InvalidTargetDataCheck,
54+
)
5255

5356

5457
from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code

checkmates/data_checks/checks/invalid_target_data_check.py

Lines changed: 448 additions & 0 deletions
Large diffs are not rendered by default.

checkmates/exceptions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@
33
DataCheckInitError,
44
MissingComponentError,
55
ValidationErrorCode,
6+
ObjectiveCreationError,
7+
ObjectiveNotFoundError,
68
)

checkmates/exceptions/exceptions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ class MissingComponentError(Exception):
88
pass
99

1010

11+
class ObjectiveNotFoundError(Exception):
    """Exception to raise when specified objective does not exist."""

    pass
15+
16+
17+
class ObjectiveCreationError(Exception):
    """Exception when get_objective tries to instantiate an objective and required args are not provided."""
19+
20+
1121
class DataCheckInitError(Exception):
1222
"""Exception raised when a data check can't initialize with the parameters given."""
1323

checkmates/objectives/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""General Directory for CheckMates Objectives."""
2+
3+
from checkmates.objectives.objective_base import ObjectiveBase
4+
from checkmates.objectives.regression_objective import RegressionObjective
5+
6+
from checkmates.objectives.utils import get_objective
7+
from checkmates.objectives.utils import get_default_primary_search_objective
8+
from checkmates.objectives.utils import get_non_core_objectives
9+
from checkmates.objectives.utils import get_core_objectives
10+
11+
12+
from checkmates.objectives.standard_metrics import RootMeanSquaredLogError
13+
from checkmates.objectives.standard_metrics import MeanSquaredLogError
14+
15+
from checkmates.objectives.binary_classification_objective import (
16+
BinaryClassificationObjective,
17+
)
18+
from checkmates.objectives.multiclass_classification_objective import (
19+
MulticlassClassificationObjective,
20+
)
checkmates/objectives/binary_classification_objective.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Base class for all binary classification objectives."""
2+
import numpy as np
3+
from scipy.optimize import differential_evolution
4+
5+
from checkmates.objectives.objective_base import ObjectiveBase
6+
from checkmates.problem_types import ProblemTypes
7+
8+
9+
class BinaryClassificationObjective(ObjectiveBase):
    """Base class for all binary classification objectives."""

    problem_types = [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]

    """[ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]"""

    @property
    def can_optimize_threshold(self):
        """Returns a boolean determining if we can optimize the binary classification objective threshold.

        This will be false for any objective that works directly with
        predicted probabilities, like log loss and AUC. Otherwise, it
        will be true.

        Returns:
            bool: Whether or not an objective can be optimized.
        """
        return not self.score_needs_proba

    def optimize_threshold(self, ypred_proba, y_true, X=None):
        """Learn a binary classification threshold which optimizes the current objective.

        Args:
            ypred_proba (pd.Series): The classifier's predicted probabilities
            y_true (pd.Series): The ground truth for the predictions.
            X (pd.DataFrame, optional): Any extra columns that are needed from training data.

        Returns:
            Optimal threshold for this objective.

        Raises:
            RuntimeError: If objective cannot be optimized.
        """
        ypred_proba = self._standardize_input_type(ypred_proba)
        y_true = self._standardize_input_type(y_true)
        if X is not None:
            X = self._standardize_input_type(X)

        if not self.can_optimize_threshold:
            raise RuntimeError("Trying to optimize objective that can't be optimized!")

        def cost(threshold):
            # The optimizer passes a one-element parameter vector; unwrap it.
            y_predicted = self.decision_function(
                ypred_proba=ypred_proba,
                threshold=threshold[0],
                X=X,
            )
            value = self.objective_function(y_true, y_predicted, X=X)
            # The optimizer minimizes, so negate objectives where larger is better.
            return -value if self.greater_is_better else value

        # Search thresholds within [0, 1]; the seed is fixed at 0.
        optimal = differential_evolution(cost, bounds=[(0, 1)], seed=0, maxiter=250)

        return optimal.x[0]

    def decision_function(self, ypred_proba, threshold=0.5, X=None):
        """Apply a learned threshold to predicted probabilities to get predicted classes.

        Args:
            ypred_proba (pd.Series, np.ndarray): The classifier's predicted probabilities
            threshold (float, optional): Threshold used to make a prediction. Defaults to 0.5.
            X (pd.DataFrame, optional): Any extra columns that are needed from training data.
                Not used by this base implementation.

        Returns:
            predictions: Boolean values, True where the probability is strictly greater than the threshold.
        """
        ypred_proba = self._standardize_input_type(ypred_proba)
        return ypred_proba > threshold

    def validate_inputs(self, y_true, y_predicted):
        """Validate inputs for scoring.

        Runs the base-class validation first, then checks the number of unique values.

        Args:
            y_true (pd.Series): Actual class labels of length [n_samples].
            y_predicted (pd.Series): Predicted values of length [n_samples].

        Raises:
            ValueError: If y_true has more than two unique values, or if y_predicted has
                more than two unique values and the objective does not score on probabilities.
        """
        super().validate_inputs(y_true, y_predicted)
        if len(np.unique(y_true)) > 2:
            raise ValueError("y_true contains more than two unique values")
        if len(np.unique(y_predicted)) > 2 and not self.score_needs_proba:
            raise ValueError("y_predicted contains more than two unique values")
checkmates/objectives/multiclass_classification_objective.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Base class for all multiclass classification objectives."""
2+
from checkmates.objectives.objective_base import ObjectiveBase
3+
from checkmates.problem_types import ProblemTypes
4+
5+
6+
class MulticlassClassificationObjective(ObjectiveBase):
    """Base class for all multiclass classification objectives."""

    problem_types = [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]
    """[ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]"""
checkmates/objectives/objective_base.py

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
"""Base class for all objectives."""
2+
from abc import ABC, abstractmethod
3+
4+
import numpy as np
5+
import pandas as pd
6+
7+
from checkmates.problem_types import handle_problem_types
8+
from checkmates.utils import classproperty
9+
10+
11+
class ObjectiveBase(ABC):
    """Base class for all objectives."""

    problem_types = None

    @property
    @classmethod
    @abstractmethod
    def name(cls):
        """Returns a name describing the objective."""

    @property
    @classmethod
    @abstractmethod
    def greater_is_better(cls):
        """Returns a boolean determining if a greater score indicates better model performance."""

    @property
    @classmethod
    @abstractmethod
    def score_needs_proba(cls):
        """Returns a boolean determining if the score() method needs probability estimates.

        This should be true for objectives which work with predicted
        probabilities, like log loss or AUC, and false for objectives
        which compare predicted class labels to the actual labels, like
        F1 or correlation.
        """

    @property
    @classmethod
    @abstractmethod
    def perfect_score(cls):
        """Returns the score obtained by evaluating this objective on a perfect model."""

    @property
    @classmethod
    @abstractmethod
    def is_bounded_like_percentage(cls):
        """Returns whether this objective is bounded between 0 and 1, inclusive."""

    @property
    @classmethod
    @abstractmethod
    def expected_range(cls):
        """Returns the expected range of the objective, which is not necessarily the possible ranges.

        For example, our expected R2 range is from [-1, 1], although the
        actual range is (-inf, 1].
        """

    @classmethod
    @abstractmethod
    def objective_function(
        cls,
        y_true,
        y_predicted,
        y_train=None,
        X=None,
        sample_weight=None,
    ):
        """Computes the relative value of the provided predictions compared to the actual labels, according to a specified metric.

        Args:
            y_true (pd.Series): Actual class labels of length [n_samples]
            y_predicted (pd.Series): Predicted values of length [n_samples]
            y_train (pd.Series): Observed training values of length [n_samples]
            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result

        Returns:
            Numerical value used to calculate score
        """

    @classproperty
    def positive_only(cls):
        """If True, this objective is only valid for positive data. Defaults to False."""
        return False

    def score(self, y_true, y_predicted, y_train=None, X=None, sample_weight=None):
        """Returns a numerical score indicating performance based on the differences between the predicted and actual values.

        Inputs are standardized to pandas objects and validated before the
        objective function is evaluated.

        Args:
            y_true (pd.Series): Actual class labels of length [n_samples]
            y_predicted (pd.Series): Predicted values of length [n_samples]
            y_train (pd.Series): Observed training values of length [n_samples]
            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result

        Returns:
            score
        """
        if X is not None:
            X = self._standardize_input_type(X)
        if y_train is not None:
            y_train = self._standardize_input_type(y_train)
        y_true = self._standardize_input_type(y_true)
        y_predicted = self._standardize_input_type(y_predicted)
        self.validate_inputs(y_true, y_predicted)
        return self.objective_function(
            y_true,
            y_predicted,
            y_train=y_train,
            X=X,
            sample_weight=sample_weight,
        )

    @staticmethod
    def _standardize_input_type(input_data):
        """Standardize input to pandas for scoring.

        Args:
            input_data (list, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities

        Returns:
            pd.DataFrame or pd.Series: a pd.Series, or pd.DataFrame object if predicted probabilities were provided.
            Input of any other type is not converted and None is returned.
        """
        if isinstance(input_data, (pd.Series, pd.DataFrame)):
            return input_data
        if isinstance(input_data, list):
            # Only peek at the first element when there is one: an empty list
            # becomes an empty Series instead of raising IndexError here.
            if input_data and isinstance(input_data[0], list):
                return pd.DataFrame(input_data)
            return pd.Series(input_data)
        if isinstance(input_data, np.ndarray):
            if len(input_data.shape) == 1:
                return pd.Series(input_data)
            return pd.DataFrame(input_data)
        # Unsupported types fall through unchanged from the original behavior.
        return None

    def validate_inputs(self, y_true, y_predicted):
        """Validates the input based on a few simple checks.

        Args:
            y_true (pd.Series): Actual class labels of length [n_samples].
            y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples].

        Raises:
            ValueError: If the inputs differ in length, are empty, contain NaN or
                infinity, or (for objectives that score on probabilities) contain
                predictions outside [0, 1].
        """
        if y_predicted.shape[0] != y_true.shape[0]:
            raise ValueError(
                "Inputs have mismatched dimensions: y_predicted has shape {}, y_true has shape {}".format(
                    len(y_predicted),
                    len(y_true),
                ),
            )
        if len(y_true) == 0:
            raise ValueError("Length of inputs is 0")

        # DataFrames are flattened to a 1-D array before the element-wise checks.
        if isinstance(y_true, pd.DataFrame):
            y_true = y_true.to_numpy().flatten()
        if np.isnan(y_true).any() or np.isinf(y_true).any():
            raise ValueError("y_true contains NaN or infinity")

        if isinstance(y_predicted, pd.DataFrame):
            y_predicted = y_predicted.to_numpy().flatten()
        if np.isnan(y_predicted).any() or np.isinf(y_predicted).any():
            raise ValueError("y_predicted contains NaN or infinity")
        if self.score_needs_proba and np.any([(y_predicted < 0) | (y_predicted > 1)]):
            raise ValueError(
                "y_predicted contains probability estimates not within [0, 1]",
            )

    @classmethod
    def calculate_percent_difference(cls, score, baseline_score):
        """Calculate the percent difference between scores.

        Args:
            score (float): A score. Output of the score method of this objective.
            baseline_score (float): A score. Output of the score method of this objective. In practice,
                this is the score achieved on this objective with a baseline estimator.

        Returns:
            float: The percent difference between the scores. Note that for objectives that can be interpreted
                as percentages, this will be the difference between the reference score and score. For all other
                objectives, the difference will be normalized by the reference score.
        """
        if pd.isna(score) or pd.isna(baseline_score):
            return np.nan

        if np.isclose(baseline_score - score, 0, atol=1e-10):
            return 0

        # Return inf when dividing by 0
        if (
            np.isclose(baseline_score, 0, atol=1e-10)
            and not cls.is_bounded_like_percentage
        ):
            return np.inf

        # A "decrease" means the score is worse than the baseline for this objective.
        decrease = False
        if (baseline_score > score and cls.greater_is_better) or (
            baseline_score < score and not cls.greater_is_better
        ):
            decrease = True

        difference = baseline_score - score
        change = (
            difference
            if cls.is_bounded_like_percentage
            else difference / baseline_score
        )
        # (-1) ** True == -1 and (-1) ** False == 1, so a decrease yields a negative percentage.
        return 100 * (-1) ** (decrease) * np.abs(change)

    @classmethod
    def is_defined_for_problem_type(cls, problem_type):
        """Returns whether or not an objective is defined for a problem type."""
        return handle_problem_types(problem_type) in cls.problem_types
checkmates/objectives/regression_objective.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Base class for all regression objectives."""
2+
from checkmates.objectives.objective_base import ObjectiveBase
3+
from checkmates.problem_types import ProblemTypes
4+
5+
6+
class RegressionObjective(ObjectiveBase):
    """Base class for all regression objectives."""

    problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""

0 commit comments

Comments
 (0)