Skip to content

Commit d05d032

Browse files
author
Nabil Fayak
committed
invalid_target data_check added
1 parent a0ad484 commit d05d032

File tree

5 files changed

+229
-3
lines changed

5 files changed

+229
-3
lines changed

checkmates/objectives/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@
1111

1212
from checkmates.objectives.standard_metrics import RootMeanSquaredLogError
1313
from checkmates.objectives.standard_metrics import MeanSquaredLogError
14+
15+
from checkmates.objectives.binary_classification_objective import BinaryClassificationObjective
16+
from checkmates.objectives.multiclass_classification_objective import MulticlassClassificationObjective
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Base class for all binary classification objectives."""
2+
import numpy as np
3+
from scipy.optimize import differential_evolution
4+
5+
from checkmates.objectives.objective_base import ObjectiveBase
6+
from checkmates.problem_types import ProblemTypes
7+
8+
9+
class BinaryClassificationObjective(ObjectiveBase):
    """Base class for all binary classification objectives."""

    problem_types = [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]
    """[ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY]"""

    @property
    def can_optimize_threshold(self):
        """Whether the decision threshold of this objective can be tuned.

        Objectives scored directly on predicted probabilities (e.g. log
        loss, AUC) cannot be thresholded; every other objective can.

        Returns:
            bool: Whether or not an objective can be optimized.
        """
        return not self.score_needs_proba

    def optimize_threshold(self, ypred_proba, y_true, X=None):
        """Learn a binary classification threshold which optimizes the current objective.

        Args:
            ypred_proba (pd.Series): The classifier's predicted probabilities
            y_true (pd.Series): The ground truth for the predictions.
            X (pd.DataFrame, optional): Any extra columns that are needed from training data.

        Returns:
            Optimal threshold for this objective.

        Raises:
            RuntimeError: If objective cannot be optimized.
        """
        ypred_proba = self._standardize_input_type(ypred_proba)
        y_true = self._standardize_input_type(y_true)
        X = self._standardize_input_type(X) if X is not None else X

        if not self.can_optimize_threshold:
            raise RuntimeError("Trying to optimize objective that can't be optimized!")

        def objective_cost(candidate):
            # Score the labels produced by the candidate threshold; negate
            # when larger scores are better so minimization still applies.
            labels = self.decision_function(
                ypred_proba=ypred_proba,
                threshold=candidate[0],
                X=X,
            )
            score = self.objective_function(y_true, labels, X=X)
            return -score if self.greater_is_better else score

        # Fixed seed keeps the learned threshold deterministic across runs.
        result = differential_evolution(
            objective_cost,
            bounds=[(0, 1)],
            seed=0,
            maxiter=250,
        )
        return result.x[0]

    def decision_function(self, ypred_proba, threshold=0.5, X=None):
        """Apply a learned threshold to predicted probabilities to get predicted classes.

        Args:
            ypred_proba (pd.Series, np.ndarray): The classifier's predicted probabilities
            threshold (float, optional): Threshold used to make a prediction. Defaults to 0.5.
            X (pd.DataFrame, optional): Any extra columns that are needed from training data.

        Returns:
            predictions
        """
        probabilities = self._standardize_input_type(ypred_proba)
        return probabilities > threshold

    def validate_inputs(self, y_true, y_predicted):
        """Validate inputs for scoring."""
        super().validate_inputs(y_true, y_predicted)
        if np.unique(y_true).size > 2:
            raise ValueError("y_true contains more than two unique values")
        # Probability-based objectives legitimately receive many distinct values.
        if np.unique(y_predicted).size > 2 and not self.score_needs_proba:
            raise ValueError("y_predicted contains more than two unique values")
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Base class for all multiclass classification objectives."""
2+
from checkmates.objectives.objective_base import ObjectiveBase
3+
from checkmates.problem_types import ProblemTypes
4+
5+
6+
class MulticlassClassificationObjective(ObjectiveBase):
    """Common ancestor for every objective that scores multiclass classifiers."""

    problem_types = [ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]
    """[ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS]"""

checkmates/objectives/standard_metrics.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,127 @@
55

66
from checkmates.objectives.regression_objective import RegressionObjective
77
from checkmates.utils import classproperty
8+
from checkmates.objectives.binary_classification_objective import BinaryClassificationObjective
9+
from checkmates.objectives.multiclass_classification_objective import MulticlassClassificationObjective
10+
11+
12+
class LogLossBinary(BinaryClassificationObjective):
    """Log Loss for binary classification.

    Example:
        >>> y_true = pd.Series([0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1])
        >>> y_pred = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
        >>> np.testing.assert_almost_equal(LogLossBinary().objective_function(y_true, y_pred), 19.6601745)
    """

    name = "Log Loss Binary"
    greater_is_better = False
    score_needs_proba = True
    perfect_score = 0.0
    is_bounded_like_percentage = False  # Range [0, Inf)
    # NOTE(review): [0, 1] conflicts with the unbounded range above — the
    # doctest itself evaluates to ~19.66. Sibling R2 also uses a clamped
    # "typical" range, so this may be intentional; confirm the intent.
    expected_range = [0, 1]

    def objective_function(
        self,
        y_true,
        y_predicted,
        y_train=None,
        X=None,
        sample_weight=None,
    ):
        """Objective function for log loss for binary classification."""
        return metrics.log_loss(y_true, y_predicted, sample_weight=sample_weight)
38+
39+
class LogLossMulticlass(MulticlassClassificationObjective):
    """Log Loss for multiclass classification.

    Example:
        >>> y_true = [0, 1, 2, 0, 2, 1]
        >>> y_pred = [[0.7, 0.2, 0.1],
        ...           [0.3, 0.5, 0.2],
        ...           [0.1, 0.3, 0.6],
        ...           [0.9, 0.1, 0.0],
        ...           [0.3, 0.1, 0.6],
        ...           [0.5, 0.5, 0.0]]
        >>> np.testing.assert_almost_equal(LogLossMulticlass().objective_function(y_true, y_pred), 0.4783301)
    """

    name = "Log Loss Multiclass"
    greater_is_better = False
    score_needs_proba = True
    perfect_score = 0.0
    is_bounded_like_percentage = False  # Range [0, Inf)
    # NOTE(review): [0, 1] conflicts with the unbounded range noted above;
    # confirm whether this is meant as a "typical" display range.
    expected_range = [0, 1]

    def objective_function(
        self,
        y_true,
        y_predicted,
        y_train=None,
        X=None,
        sample_weight=None,
    ):
        """Objective function for log loss for multiclass classification."""
        return metrics.log_loss(y_true, y_predicted, sample_weight=sample_weight)
70+
71+
class R2(RegressionObjective):
    """Coefficient of determination for regression.

    Example:
        >>> y_true = pd.Series([1.5, 2, 3, 1, 0.5, 1, 2.5, 2.5, 1, 0.5, 2])
        >>> y_pred = pd.Series([1.5, 2.5, 2, 1, 0.5, 1, 3, 2.25, 0.75, 0.25, 1.75])
        >>> np.testing.assert_almost_equal(R2().objective_function(y_true, y_pred), 0.7638036)
    """

    name = "R2"
    greater_is_better = True
    score_needs_proba = False
    perfect_score = 1
    is_bounded_like_percentage = False  # Range (-Inf, 1]
    # Clamped to a "typical" range; mathematically R2 is unbounded below.
    expected_range = [-1, 1]

    def objective_function(
        self,
        y_true,
        y_predicted,
        y_train=None,
        X=None,
        sample_weight=None,
    ):
        """Objective function for coefficient of determination for regression."""
        return metrics.r2_score(y_true, y_predicted, sample_weight=sample_weight)
97+
98+
class MedianAE(RegressionObjective):
    """Median absolute error for regression.

    Example:
        >>> y_true = pd.Series([1.5, 2, 3, 1, 0.5, 1, 2.5, 2.5, 1, 0.5, 2])
        >>> y_pred = pd.Series([1.5, 2.5, 2, 1, 0.5, 1, 3, 2.25, 0.75, 0.25, 1.75])
        >>> np.testing.assert_almost_equal(MedianAE().objective_function(y_true, y_pred), 0.25)
    """

    name = "MedianAE"
    greater_is_better = False
    score_needs_proba = False
    perfect_score = 0.0
    is_bounded_like_percentage = False  # Range [0, Inf)
    expected_range = [0, float("inf")]

    def objective_function(
        self,
        y_true,
        y_predicted,
        y_train=None,
        X=None,
        sample_weight=None,
    ):
        """Objective function for median absolute error for regression."""
        return metrics.median_absolute_error(
            y_true,
            y_predicted,
            sample_weight=sample_weight,
        )
128+
8129

9130

10131
class RootMeanSquaredLogError(RegressionObjective):

checkmates/objectives/utils.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Utility methods for EvalML objectives."""
1+
"""Utility methods for CheckMates objectives."""
22
from checkmates import objectives
33
from checkmates.exceptions import ObjectiveCreationError, ObjectiveNotFoundError
44
from checkmates.objectives.objective_base import ObjectiveBase
@@ -20,12 +20,20 @@ def get_non_core_objectives():
2020
objectives.RootMeanSquaredLogError,
2121
]
2222

23+
def get_all_objective_names():
    """Get a list of the names of all objectives.

    Returns:
        list (str): Objective names
    """
    # Iterating the dict yields its keys (lower-cased objective names).
    return list(_all_objectives_dict())
2331

2432
def _all_objectives_dict():
    """Map lower-cased objective names to their classes.

    Only subclasses defined inside ``checkmates.objectives`` are included;
    subclasses declared in other packages are skipped.
    """
    return {
        subclass.name.lower(): subclass
        for subclass in _get_subclasses(ObjectiveBase)
        if "checkmates.objectives" in subclass.__module__
    }
@@ -63,7 +71,7 @@ def get_objective(objective, return_instance=False, **kwargs):
6371
if objective.lower() not in all_objectives_dict:
6472
raise ObjectiveNotFoundError(
6573
f"{objective} is not a valid Objective! "
66-
"Use evalml.objectives.get_all_objective_names() "
74+
"Use checkmates.objectives.get_all_objective_names() "
6775
"to get a list of all valid objective names. ",
6876
)
6977

0 commit comments

Comments
 (0)