Commit ba00cdc

Author: Nabil Fayak (committed)
invalid_target_data_check added with errors
1 parent 22cbd87 commit ba00cdc

File tree

12 files changed: +1784 -0 lines

checkmates/data_checks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@
 from checkmates.data_checks.checks.sparsity_data_check import SparsityDataCheck
 from checkmates.data_checks.checks.datetime_format_data_check import DateTimeFormatDataCheck
 from checkmates.data_checks.checks.multicollinearity_data_check import MulticollinearityDataCheck
+from checkmates.data_checks.checks.invalid_target_data_check import InvalidTargetDataCheck
checkmates/data_checks/checks/invalid_target_data_check.py

Lines changed: 448 additions & 0 deletions
Large diffs are not rendered by default.
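
Since the 448-line diff for this file is not rendered, the following is only a rough usage sketch. The class name comes from the import added to checkmates/data_checks/__init__.py above; the constructor arguments and the validate(X, y) interface are assumptions modeled on the package's other data checks, not confirmed by this commit.

import pandas as pd

from checkmates.data_checks import InvalidTargetDataCheck

X = pd.DataFrame({"feature": [1, 2, 3, 4]})
y = pd.Series([1.2, None, 3.4, 5.6])  # the null target value should be flagged

# Assumed signature: a problem type plus an objective used to judge target validity.
check = InvalidTargetDataCheck(problem_type="regression", objective="Root Mean Squared Log Error")
for message in check.validate(X, y):
    print(message)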

checkmates/exceptions/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,4 +3,6 @@
     DataCheckInitError,
     MissingComponentError,
     ValidationErrorCode,
+    ObjectiveCreationError,
+    ObjectiveNotFoundError
 )

checkmates/exceptions/exceptions.py

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,13 @@ class MissingComponentError(Exception):
 
     pass
 
+class ObjectiveNotFoundError(Exception):
+    """Exception to raise when specified objective does not exist."""
+
+    pass
+
+class ObjectiveCreationError(Exception):
+    """Exception when get_objective tries to instantiate an objective and required args are not provided."""
 
 class DataCheckInitError(Exception):
     """Exception raised when a data check can't initialize with the parameters given."""

checkmates/objectives/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.objectives.regression_objective import RegressionObjective
+
+from checkmates.objectives.utils import get_objective
+from checkmates.objectives.utils import get_default_primary_search_objective
+from checkmates.objectives.utils import get_non_core_objectives
+from checkmates.objectives.utils import get_core_objectives
+
+
+from checkmates.objectives.standard_metrics import RootMeanSquaredLogError
+from checkmates.objectives.standard_metrics import MeanSquaredLogError
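
The net effect of this module is a flat public API: downstream code can import the lookup helpers and the concrete metrics from the package root instead of deep module paths, e.g.:

from checkmates.objectives import (
    MeanSquaredLogError,
    RootMeanSquaredLogError,
    get_core_objectives,
    get_objective,
)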
checkmates/objectives/objective_base.py

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
+"""Base class for all objectives."""
+from abc import ABC, abstractmethod
+
+import numpy as np
+import pandas as pd
+
+from checkmates.problem_types import handle_problem_types
+from checkmates.utils import classproperty
+
+
+class ObjectiveBase(ABC):
+    """Base class for all objectives."""
+
+    problem_types = None
+
+    @property
+    @classmethod
+    @abstractmethod
+    def name(cls):
+        """Returns a name describing the objective."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def greater_is_better(cls):
+        """Returns a boolean determining if a greater score indicates better model performance."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def score_needs_proba(cls):
+        """Returns a boolean determining if the score() method needs probability estimates.
+
+        This should be true for objectives which work with predicted
+        probabilities, like log loss or AUC, and false for objectives
+        which compare predicted class labels to the actual labels, like
+        F1 or correlation.
+        """
+
+    @property
+    @classmethod
+    @abstractmethod
+    def perfect_score(cls):
+        """Returns the score obtained by evaluating this objective on a perfect model."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def is_bounded_like_percentage(cls):
+        """Returns whether this objective is bounded between 0 and 1, inclusive."""
+
+    @property
+    @classmethod
+    @abstractmethod
+    def expected_range(cls):
+        """Returns the expected range of the objective, which is not necessarily the possible range.
+
+        For example, our expected R2 range is from [-1, 1], although the
+        actual range is (-inf, 1].
+        """
+
+    @classmethod
+    @abstractmethod
+    def objective_function(
+        cls,
+        y_true,
+        y_predicted,
+        y_train=None,
+        X=None,
+        sample_weight=None,
+    ):
+        """Computes the relative value of the provided predictions compared to the actual labels, according to a specified metric.
+
+        Args:
+            y_predicted (pd.Series): Predicted values of length [n_samples]
+            y_true (pd.Series): Actual class labels of length [n_samples]
+            y_train (pd.Series): Observed training values of length [n_samples]
+            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
+            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result
+
+        Returns:
+            Numerical value used to calculate score
+        """
+
+    @classproperty
+    def positive_only(cls):
+        """If True, this objective is only valid for positive data. Defaults to False."""
+        return False
+
+    def score(self, y_true, y_predicted, y_train=None, X=None, sample_weight=None):
+        """Returns a numerical score indicating performance based on the differences between the predicted and actual values.
+
+        Args:
+            y_predicted (pd.Series): Predicted values of length [n_samples]
+            y_true (pd.Series): Actual class labels of length [n_samples]
+            y_train (pd.Series): Observed training values of length [n_samples]
+            X (pd.DataFrame or np.ndarray): Extra data of shape [n_samples, n_features] necessary to calculate score
+            sample_weight (pd.DataFrame or np.ndarray): Sample weights used in computing objective value result
+
+        Returns:
+            score
+        """
+        if X is not None:
+            X = self._standardize_input_type(X)
+        if y_train is not None:
+            y_train = self._standardize_input_type(y_train)
+        y_true = self._standardize_input_type(y_true)
+        y_predicted = self._standardize_input_type(y_predicted)
+        self.validate_inputs(y_true, y_predicted)
+        return self.objective_function(
+            y_true,
+            y_predicted,
+            y_train=y_train,
+            X=X,
+            sample_weight=sample_weight,
+        )
+
+    @staticmethod
+    def _standardize_input_type(input_data):
+        """Standardize input to pandas for scoring.
+
+        Args:
+            input_data (list, pd.DataFrame, pd.Series, or np.ndarray): A matrix of predictions or predicted probabilities
+
+        Returns:
+            pd.DataFrame or pd.Series: a pd.Series, or pd.DataFrame object if predicted probabilities were provided.
+        """
+        if isinstance(input_data, (pd.Series, pd.DataFrame)):
+            return input_data
+        if isinstance(input_data, list):
+            if isinstance(input_data[0], list):
+                return pd.DataFrame(input_data)
+            return pd.Series(input_data)
+        if isinstance(input_data, np.ndarray):
+            if len(input_data.shape) == 1:
+                return pd.Series(input_data)
+            return pd.DataFrame(input_data)
+
+    def validate_inputs(self, y_true, y_predicted):
+        """Validates the input based on a few simple checks.
+
+        Args:
+            y_predicted (pd.Series, or pd.DataFrame): Predicted values of length [n_samples].
+            y_true (pd.Series): Actual class labels of length [n_samples].
+
+        Raises:
+            ValueError: If the inputs are malformed.
+        """
+        if y_predicted.shape[0] != y_true.shape[0]:
+            raise ValueError(
+                "Inputs have mismatched dimensions: y_predicted has shape {}, y_true has shape {}".format(
+                    len(y_predicted),
+                    len(y_true),
+                ),
+            )
+        if len(y_true) == 0:
+            raise ValueError("Length of inputs is 0")
+
+        if isinstance(y_true, pd.DataFrame):
+            y_true = y_true.to_numpy().flatten()
+        if np.isnan(y_true).any() or np.isinf(y_true).any():
+            raise ValueError("y_true contains NaN or infinity")
+
+        if isinstance(y_predicted, pd.DataFrame):
+            y_predicted = y_predicted.to_numpy().flatten()
+        if np.isnan(y_predicted).any() or np.isinf(y_predicted).any():
+            raise ValueError("y_predicted contains NaN or infinity")
+        if self.score_needs_proba and np.any([(y_predicted < 0) | (y_predicted > 1)]):
+            raise ValueError(
+                "y_predicted contains probability estimates not within [0, 1]",
+            )
+
+    @classmethod
+    def calculate_percent_difference(cls, score, baseline_score):
+        """Calculate the percent difference between scores.
+
+        Args:
+            score (float): A score. Output of the score method of this objective.
+            baseline_score (float): A score. Output of the score method of this objective. In practice,
+                this is the score achieved on this objective with a baseline estimator.
+
+        Returns:
+            float: The percent difference between the scores. Note that for objectives that can be interpreted
+                as percentages, this will be the difference between the reference score and score. For all other
+                objectives, the difference will be normalized by the reference score.
+        """
+        if pd.isna(score) or pd.isna(baseline_score):
+            return np.nan
+
+        if np.isclose(baseline_score - score, 0, atol=1e-10):
+            return 0
+
+        # Return inf when dividing by 0
+        if (
+            np.isclose(baseline_score, 0, atol=1e-10)
+            and not cls.is_bounded_like_percentage
+        ):
+            return np.inf
+
+        decrease = False
+        if (baseline_score > score and cls.greater_is_better) or (
+            baseline_score < score and not cls.greater_is_better
+        ):
+            decrease = True
+
+        difference = baseline_score - score
+        change = (
+            difference
+            if cls.is_bounded_like_percentage
+            else difference / baseline_score
+        )
+        return 100 * (-1) ** (decrease) * np.abs(change)
+
+    @classmethod
+    def is_defined_for_problem_type(cls, problem_type):
+        """Returns whether or not an objective is defined for a problem type."""
+        return handle_problem_types(problem_type) in cls.problem_types
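
For orientation, here is a minimal concrete subclass sketched against the ABC above. MeanAbsoluteErrorSketch is hypothetical and not part of this commit; the pattern of overriding the abstract properties with plain class attributes mirrors the shipped metrics in standard_metrics.py below.

import numpy as np

from checkmates.objectives.objective_base import ObjectiveBase
from checkmates.problem_types import ProblemTypes


class MeanAbsoluteErrorSketch(ObjectiveBase):
    # Hypothetical example objective; not part of this commit.
    name = "Mean Absolute Error (sketch)"
    greater_is_better = False
    score_needs_proba = False
    perfect_score = 0.0
    is_bounded_like_percentage = False
    expected_range = [0, float("inf")]
    problem_types = [ProblemTypes.REGRESSION]

    def objective_function(self, y_true, y_predicted, y_train=None, X=None, sample_weight=None):
        # Weighted mean absolute error; sample_weight=None means equal weights.
        return np.average(np.abs(np.asarray(y_true) - np.asarray(y_predicted)), weights=sample_weight)


obj = MeanAbsoluteErrorSketch()
print(obj.score([1.0, 2.0, 3.0], [1.5, 2.0, 2.0]))  # 0.5
print(obj.calculate_percent_difference(score=0.5, baseline_score=1.0))  # 50.0 (error halved vs. baseline)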
checkmates/objectives/regression_objective.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+"""Base class for all regression objectives."""
+from checkmates.objectives.objective_base import ObjectiveBase
+from checkmates.problem_types import ProblemTypes
+
+
+class RegressionObjective(ObjectiveBase):
+    """Base class for all regression objectives."""
+
+    problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
+    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
checkmates/objectives/standard_metrics.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+"""Standard machine learning objective functions."""
+import numpy as np
+import pandas as pd
+from sklearn import metrics
+
+from checkmates.objectives.regression_objective import RegressionObjective
+from checkmates.utils import classproperty
+
+class RootMeanSquaredLogError(RegressionObjective):
+    """Root mean squared log error for regression.
+
+    Only valid for nonnegative inputs. Otherwise, will throw a ValueError.
+
+    Example:
+        >>> y_true = pd.Series([1.5, 2, 3, 1, 0.5, 1, 2.5, 2.5, 1, 0.5, 2])
+        >>> y_pred = pd.Series([1.5, 2.5, 2, 1, 0.5, 1, 3, 2.25, 0.75, 0.25, 1.75])
+        >>> np.testing.assert_almost_equal(RootMeanSquaredLogError().objective_function(y_true, y_pred), 0.13090204)
+    """
+
+    name = "Root Mean Squared Log Error"
+    greater_is_better = False
+    score_needs_proba = False
+    perfect_score = 0.0
+    is_bounded_like_percentage = False  # Range [0, Inf)
+    expected_range = [0, float("inf")]
+
+    def objective_function(
+        self,
+        y_true,
+        y_predicted,
+        y_train=None,
+        X=None,
+        sample_weight=None,
+    ):
+        """Objective function for root mean squared log error for regression."""
+
+        def rmsle(y_true, y_pred):
+            return np.sqrt(
+                metrics.mean_squared_log_error(
+                    y_true,
+                    y_pred,
+                    sample_weight=sample_weight,
+                ),
+            )
+
+        # Multiseries time series regression
+        if isinstance(y_true, pd.DataFrame):
+            raw_rmsles = []
+            for i in range(len(y_true.columns)):
+                y_true_i = y_true.iloc[:, i]
+                y_predicted_i = y_predicted.iloc[:, i]
+                raw_rmsles.append(rmsle(y_true_i, y_predicted_i))
+            return np.mean(raw_rmsles)
+
+        # All univariate regression
+        return rmsle(y_true, y_predicted)
+
+    @classproperty
+    def positive_only(self):
+        """If True, this objective is only valid for positive data."""
+        return True
+
+
+class MeanSquaredLogError(RegressionObjective):
+    """Mean squared log error for regression.
+
+    Only valid for nonnegative inputs. Otherwise, will throw a ValueError.
+
+    Example:
+        >>> y_true = pd.Series([1.5, 2, 3, 1, 0.5, 1, 2.5, 2.5, 1, 0.5, 2])
+        >>> y_pred = pd.Series([1.5, 2.5, 2, 1, 0.5, 1, 3, 2.25, 0.75, 0.25, 1.75])
+        >>> np.testing.assert_almost_equal(MeanSquaredLogError().objective_function(y_true, y_pred), 0.0171353)
+    """
+
+    name = "Mean Squared Log Error"
+    greater_is_better = False
+    score_needs_proba = False
+    perfect_score = 0.0
+    is_bounded_like_percentage = False  # Range [0, Inf)
+    expected_range = [0, float("inf")]
+
+    def objective_function(
+        self,
+        y_true,
+        y_predicted,
+        y_train=None,
+        X=None,
+        sample_weight=None,
+    ):
+        """Objective function for mean squared log error for regression."""
+        return metrics.mean_squared_log_error(
+            y_true,
+            y_predicted,
+            sample_weight=sample_weight,
+        )
+
+    @classproperty
+    def positive_only(self):
+        """If True, this objective is only valid for positive data."""
+        return True
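
To see that the doctest values above are consistent with the definition RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_true))^2)), the metric can be cross-checked by hand:

import numpy as np
import pandas as pd

from checkmates.objectives import RootMeanSquaredLogError

y_true = pd.Series([1.5, 2, 3, 1, 0.5, 1, 2.5, 2.5, 1, 0.5, 2])
y_pred = pd.Series([1.5, 2.5, 2, 1, 0.5, 1, 3, 2.25, 0.75, 0.25, 1.75])

# Direct computation from the definition, using log1p for log(1 + x).
by_hand = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))
by_objective = RootMeanSquaredLogError().objective_function(y_true, y_pred)
np.testing.assert_almost_equal(by_hand, by_objective)  # both ~0.13090204

Dropping the square root gives the MeanSquaredLogError value of ~0.0171353 from the second doctest.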
