Skip to content

Commit e6a13f8

Browse files
Add NaN detection to the discretisers' transform method by creating a BaseDiscretiser class (#341)
* create BaseDiscretizer class * implement _check_transform_input_and_state() method for BaseDiscretizer * start implementation of transform() for BaseDiscretizer class * add init params to parent class BaseDiscretizer * add BaseDiscretiser class to init file * change ArbitraryDiscretiser, EqualFrequencyDiscretiser, and EqualWidthDiscretiser parent class to BaseDiscretiser and refactor init params * change warning text * fix init file and BaseDiscretiser imports. add errors to BaseDiscretiser init params. * edit ValueError text in init() * delete _check_transform_input_and_state method * add inheritance and fix comments in BaseDiscretiser transform() * update transform() method for base class. add corresponding docstring * delete transform() code for three discretisers. code base was moved to BaseDiscretiser class * raise ValueError if equals * create error/warning informing user where nan values are located * fix style error * create test_transform_raises_error_if_df_contains_na() and test_error_if_errors_not_permitted_value() fcns * change df used in test_error_if_input_df_contains_na_in_transform() fcn * revise test_error_if_input_df_contains_na_in_transform() and create test_error_if_not_permitted_value_is_errors() fcns * edit comments * revise test_error_if_input_df_contains_na_in_transform() and create test_error_if_not_permitted_value_is_errors() * fix style error * add back prior version of test_error_if_input_df_contains_na_in_transform for the three discretisers * edit BaseDiscretiser transform() docstring * remove transform() from the three discretisers * fix style errors * add 'return_object' and 'return_boundaries' to init fcn for each of the 3 classes * add 'errors' to init method of the 3 discretisers * fix incorrect discretiser class error * change 'self.encoder_dict_' to 'self.binner_dict_' * add init params when instantiating EqualFrequencyDiscretiser class in tests * add binning_dict values when instantiating ArbitraryDiscretiser.
Replace Boston housing price dataset code. * change boston housing dataset to california housing dataset. The sklearn documentation states that 'The Boston housing prices dataset has an ethical problem.' The California housing dataset is suggested by the sklearn maintainers. * change code based on california housing dataset features * fix style errors * fix style errors * fix style errors * fix style error * change 'msg' text * modifies docstrings in base class * adds docstring to discretisers * moves error check to arbitrary discretizer only * resets equal width and freq test to main branch * adds test to arbitrary discretizer * blacks files * fixes style tests * style fixes * minor edits to doc string and code for ArbitraryDiscretiser() * change -0 to 0 in test_arbitrary_discresir.py * add back 'X = super().transform(X)' to ArbitraryDiscretiser transform() * remove 2 print statements from arbitrary test and remove df copy() from ArbitraryDiscretiser transform() * fix style error * add back ArbitraryDiscretiser transform() docstring * fix docstring variables * revert wording change * revert wording in docstring * removes back slash from statement Co-authored-by: Soledad Galli <[email protected]>
1 parent c98b091 commit e6a13f8

File tree

5 files changed

+228
-124
lines changed

5 files changed

+228
-124
lines changed

feature_engine/discretisation/arbitrary.py

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
# Authors: Soledad Galli <[email protected]>
22
# License: BSD 3 clause
33

4+
import warnings
45
from typing import Dict, List, Optional, Union
56

67
import pandas as pd
78

8-
from feature_engine.base_transformers import BaseNumericalTransformer
9+
from feature_engine.discretisation.base_discretiser import BaseDiscretiser
910
from feature_engine.validation import _return_tags
1011

1112

12-
class ArbitraryDiscretiser(BaseNumericalTransformer):
13+
class ArbitraryDiscretiser(BaseDiscretiser):
1314
"""
1415
The ArbitraryDiscretiser() divides numerical variables into intervals which limits
1516
are determined by the user. Thus, it works only with numerical variables.
@@ -39,6 +40,12 @@ class ArbitraryDiscretiser(BaseNumericalTransformer):
3940
Whether the output, that is the bins, should be the interval boundaries. If
4041
True, it returns the interval boundaries. If False, it returns integers.
4142
43+
errors: string, default='ignore'
44+
Indicates what to do when a value is outside the limits indicated in the
45+
'binning_dict'. If 'raise', the transformation will raise an error.
46+
If 'ignore', values outside the limits are returned as NaN
47+
and a warning will be raised instead.
48+
4249
Attributes
4350
----------
4451
binner_dict_:
@@ -69,19 +76,25 @@ def __init__(
6976
binning_dict: Dict[Union[str, int], List[Union[str, int]]],
7077
return_object: bool = False,
7178
return_boundaries: bool = False,
79+
errors: str = "ignore",
7280
) -> None:
7381

7482
if not isinstance(binning_dict, dict):
7583
raise ValueError(
76-
"Please provide at a dictionary with the interval limits per variable"
84+
"binning_dict must be a dictionary with the interval limits per "
85+
f"variable. Got {binning_dict} instead."
86+
)
87+
88+
if errors not in ["ignore", "raise"]:
89+
raise ValueError(
90+
"errors only takes values 'ignore' and 'raise'. "
91+
f"Got {errors} instead."
7792
)
7893

79-
if not isinstance(return_object, bool):
80-
raise ValueError("return_object must be True or False")
94+
super().__init__(return_object, return_boundaries)
8195

8296
self.binning_dict = binning_dict
83-
self.return_object = return_object
84-
self.return_boundaries = return_boundaries
97+
self.errors = errors
8598

8699
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
87100
"""
@@ -109,34 +122,42 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
109122
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
110123
"""Sort the variable values into the intervals.
111124
112-
Parameters
113-
----------
114-
X: pandas dataframe of shape = [n_samples, n_features]
115-
The dataframe to be transformed.
125+
Parameters
126+
----------
127+
X: pandas dataframe of shape = [n_samples, n_features]
128+
The dataframe to be transformed.
116129
117-
Returns
118-
-------
119-
X_new: pandas dataframe of shape = [n_samples, n_features]
120-
The transformed data with the discrete variables.
121-
"""
130+
Returns
131+
-------
132+
X_new: pandas dataframe of shape = [n_samples, n_features]
133+
The transformed data with the discrete variables.
134+
"""
122135

123-
# check input dataframe and if class was fitted
124136
X = super().transform(X)
137+
# check if NaN values were introduced by the discretisation procedure.
138+
if X[self.variables_].isnull().sum().sum() > 0:
125139

126-
# transform variables
127-
if self.return_boundaries:
128-
for feature in self.variables_:
129-
X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
140+
# obtain the name(s) of the columns with null values
141+
nan_columns = (
142+
X[self.variables_].columns[X[self.variables_].isnull().any()].tolist()
143+
)
130144

131-
else:
132-
for feature in self.variables_:
133-
X[feature] = pd.cut(
134-
X[feature], self.binner_dict_[feature], labels=False
145+
if len(nan_columns) > 1:
146+
nan_columns_str = ", ".join(nan_columns)
147+
else:
148+
nan_columns_str = nan_columns[0]
149+
150+
if self.errors == "ignore":
151+
warnings.warn(
152+
f"During the discretisation, NaN values were introduced in "
153+
f"the feature(s) {nan_columns_str}."
135154
)
136155

137-
# return object
138-
if self.return_object:
139-
X[self.variables_] = X[self.variables_].astype("O")
156+
elif self.errors == "raise":
157+
raise ValueError(
158+
"During the discretisation, NaN values were introduced in "
159+
f"the feature(s) {nan_columns_str}."
160+
)
140161

141162
return X
142163

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Authors: Morgan Sell <[email protected]>
2+
# License: BSD 3 clause
3+
4+
import pandas as pd
5+
6+
from feature_engine.base_transformers import BaseNumericalTransformer
7+
8+
9+
class BaseDiscretiser(BaseNumericalTransformer):
10+
"""
11+
Shared set-up checks and methods across numerical discretisers.
12+
13+
Parameters
14+
----------
15+
return_object: bool, default=False
16+
Whether the discrete variable should be returned as numeric or as
17+
object. If you would like to proceed with the engineering of the variable as if
18+
it was categorical, use True. Alternatively, keep the default to False.
19+
20+
return_boundaries: bool, default=False
21+
Whether the output should be the interval boundaries. If True, it returns
22+
the interval boundaries. If False, it returns integers.
23+
24+
Methods
25+
-------
26+
transform:
27+
Sort continuous variable values into the intervals.
28+
"""
29+
30+
def __init__(
31+
self,
32+
return_object: bool = False,
33+
return_boundaries: bool = False,
34+
) -> None:
35+
36+
if not isinstance(return_object, bool):
37+
raise ValueError(
38+
"return_object must be True or False. " f"Got {return_object} instead."
39+
)
40+
41+
if not isinstance(return_boundaries, bool):
42+
raise ValueError(
43+
"return_boundaries must be True or False. "
44+
f"Got {return_boundaries} instead."
45+
)
46+
47+
self.return_object = return_object
48+
self.return_boundaries = return_boundaries
49+
50+
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
51+
"""Sort the variable values into the intervals.
52+
53+
Parameters
54+
----------
55+
X: pandas dataframe of shape = [n_samples, n_features]
56+
The data to transform.
57+
58+
Returns
59+
-------
60+
X_new: pandas dataframe of shape = [n_samples, n_features]
61+
The transformed data with the discrete variables.
62+
"""
63+
64+
# check input dataframe and if class was fitted
65+
X = super().transform(X)
66+
67+
# transform variables
68+
if self.return_boundaries:
69+
for feature in self.variables_:
70+
X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
71+
72+
else:
73+
for feature in self.variables_:
74+
X[feature] = pd.cut(
75+
X[feature], self.binner_dict_[feature], labels=False
76+
)
77+
78+
# return object
79+
if self.return_object:
80+
X[self.variables_] = X[self.variables_].astype("O")
81+
82+
return X

feature_engine/discretisation/equal_frequency.py

Lines changed: 4 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55

66
import pandas as pd
77

8-
from feature_engine.base_transformers import BaseNumericalTransformer
8+
from feature_engine.discretisation.base_discretiser import BaseDiscretiser
99
from feature_engine.variable_manipulation import _check_input_parameter_variables
1010

1111

12-
class EqualFrequencyDiscretiser(BaseNumericalTransformer):
12+
class EqualFrequencyDiscretiser(BaseDiscretiser):
1313
"""
1414
The EqualFrequencyDiscretiser() divides continuous numerical variables
1515
into contiguous equal frequency intervals, that is, intervals that contain
@@ -86,18 +86,12 @@ def __init__(
8686
) -> None:
8787

8888
if not isinstance(q, int):
89-
raise ValueError("q must be an integer")
89+
raise ValueError(f"q must be an integer. Got {q} instead.")
9090

91-
if not isinstance(return_object, bool):
92-
raise ValueError("return_object must be True or False")
93-
94-
if not isinstance(return_boundaries, bool):
95-
raise ValueError("return_boundaries must be True or False")
91+
super().__init__(return_object, return_boundaries)
9692

9793
self.q = q
9894
self.variables = _check_input_parameter_variables(variables)
99-
self.return_object = return_object
100-
self.return_boundaries = return_boundaries
10195

10296
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
10397
"""
@@ -129,37 +123,3 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
129123
self.n_features_in_ = X.shape[1]
130124

131125
return self
132-
133-
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
134-
"""Sort the variable values into the intervals.
135-
136-
Parameters
137-
----------
138-
X: pandas dataframe of shape = [n_samples, n_features]
139-
The data to transform.
140-
141-
Returns
142-
-------
143-
X_new: pandas dataframe of shape = [n_samples, n_features]
144-
The transformed data with the discrete variables.
145-
"""
146-
147-
# check input dataframe and if class was fitted
148-
X = super().transform(X)
149-
150-
# transform variables
151-
if self.return_boundaries:
152-
for feature in self.variables_:
153-
X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
154-
155-
else:
156-
for feature in self.variables_:
157-
X[feature] = pd.cut(
158-
X[feature], self.binner_dict_[feature], labels=False
159-
)
160-
161-
# return object
162-
if self.return_object:
163-
X[self.variables_] = X[self.variables_].astype("O")
164-
165-
return X

feature_engine/discretisation/equal_width.py

Lines changed: 4 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55

66
import pandas as pd
77

8-
from feature_engine.base_transformers import BaseNumericalTransformer
8+
from feature_engine.discretisation.base_discretiser import BaseDiscretiser
99
from feature_engine.variable_manipulation import _check_input_parameter_variables
1010

1111

12-
class EqualWidthDiscretiser(BaseNumericalTransformer):
12+
class EqualWidthDiscretiser(BaseDiscretiser):
1313
"""
1414
The EqualWidthDiscretiser() divides continuous numerical variables into
1515
intervals of the same width, that is, equidistant intervals. Note that the
@@ -95,18 +95,12 @@ def __init__(
9595
) -> None:
9696

9797
if not isinstance(bins, int):
98-
raise ValueError("q must be an integer")
98+
raise ValueError(f"bins must be an integer. Got {bins} instead.")
9999

100-
if not isinstance(return_object, bool):
101-
raise ValueError("return_object must be True or False")
102-
103-
if not isinstance(return_boundaries, bool):
104-
raise ValueError("return_boundaries must be True or False")
100+
super().__init__(return_object, return_boundaries)
105101

106102
self.bins = bins
107103
self.variables = _check_input_parameter_variables(variables)
108-
self.return_object = return_object
109-
self.return_boundaries = return_boundaries
110104

111105
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
112106
"""
@@ -142,38 +136,3 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
142136
self.n_features_in_ = X.shape[1]
143137

144138
return self
145-
146-
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
147-
"""
148-
Sort the variable values into the intervals.
149-
150-
Parameters
151-
----------
152-
X: pandas dataframe of shape = [n_samples, n_features]
153-
The data to transform.
154-
155-
Returns
156-
-------
157-
X_new: pandas dataframe of shape = [n_samples, n_features]
158-
The transformed data with the discrete variables.
159-
"""
160-
161-
# check input dataframe and if class was fitted
162-
X = super().transform(X)
163-
164-
# transform variables
165-
if self.return_boundaries:
166-
for feature in self.variables_:
167-
X[feature] = pd.cut(X[feature], self.binner_dict_[feature])
168-
169-
else:
170-
for feature in self.variables_:
171-
X[feature] = pd.cut(
172-
X[feature], self.binner_dict_[feature], labels=False
173-
)
174-
175-
# return object
176-
if self.return_object:
177-
X[self.variables_] = X[self.variables_].astype("O")
178-
179-
return X

0 commit comments

Comments
 (0)