|
7 | 7 | import pandas as pd |
8 | 8 | from category_encoders.one_hot import OneHotEncoder |
9 | 9 | from numpy.typing import NDArray |
| 10 | +from sklearn import utils as sku |
10 | 11 | from sklearn.base import ( |
11 | 12 | BaseEstimator, |
12 | 13 | RegressorMixin, |
|
20 | 21 | ) |
21 | 22 | from sklearn.pipeline import Pipeline |
22 | 23 | from sklearn.preprocessing import StandardScaler |
| 24 | +from sklearn.utils import InputTags |
23 | 25 | from sklearn.utils.validation import ( |
24 | | - check_array, |
25 | 26 | check_is_fitted, |
26 | | - check_X_y, |
27 | 27 | ) |
28 | 28 |
|
29 | 29 | # from typing_extensions import Self |
@@ -68,8 +68,14 @@ def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM": |
68 | 68 | Returns self. |
69 | 69 |
|
70 | 70 | """ |
71 | | - X, y = check_X_y( |
72 | | - X, y, accept_sparse=True, force_all_finite="allow-nan" |
| 71 | + X, y = sku.validation.validate_data( |
| 72 | + self, |
| 73 | + X, |
| 74 | + y, |
| 75 | + accept_sparse=False, |
| 76 | + ensure_all_finite="allow-nan", |
| 77 | + reset=True, |
| 78 | + dtype=["float", "int", "string", "categorical", "object"], |
73 | 79 | ) |
74 | 80 | self.is_fitted_ = True |
75 | 81 | self.n_features_in_ = X.shape[1] |
@@ -101,20 +107,30 @@ def predict(self, X: NDArray) -> NDArray: |
101 | 107 | Predicted target values. |
102 | 108 |
|
103 | 109 | """ |
104 | | - X = check_array(X, accept_sparse=True, force_all_finite="allow-nan") |
| 110 | + sku.validation.validate_data( |
| 111 | + self, |
| 112 | + X, |
| 113 | + accept_sparse=False, |
| 114 | + ensure_all_finite="allow-nan", |
| 115 | + reset=False, |
| 116 | + dtype=["float", "int", "string", "categorical", "object"], |
| 117 | + ) |
105 | 118 | check_is_fitted(self, "is_fitted_") |
106 | 119 | y_pred = self.model_.predict(X) |
107 | 120 | return y_pred |
108 | 121 |
|
109 | | - def _more_tags(self): |
| 122 | + def __sklearn_tags__(self): |
110 | 123 | """Indicate if the class allows inputs with categorical data and nans. |
111 | 124 |
|
112 | 125 | It modifies the behaviour of the functions checking data. |
113 | 126 | """ |
114 | | - return { |
115 | | - "X_types": ["2darray", "categorical", "string"], |
116 | | - "allow_nan": True, |
117 | | - } |
| 127 | + tags = super().__sklearn_tags__() |
| 128 | + tags.input_tags = InputTags( |
| 129 | + two_d_array=True, categorical=True, string=True, allow_nan=True |
| 130 | + ) |
| 131 | + tags.target_tags.single_output = False |
| 132 | + tags.non_deterministic = True |
| 133 | + return tags |
118 | 134 |
|
119 | 135 |
|
120 | 136 | class BinTransformer(TransformerMixin, BaseEstimator): |
@@ -146,6 +162,14 @@ def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer": |
146 | 162 | Fitted transformer. |
147 | 163 |
|
148 | 164 | """ |
| 165 | + sku.validation.validate_data( |
| 166 | + self, |
| 167 | + X, |
| 168 | + accept_sparse=False, |
| 169 | + ensure_all_finite="allow-nan", |
| 170 | + reset=False, |
| 171 | + dtype=["float", "int", "string", "categorical", "object"], |
| 172 | + ) |
149 | 173 | df = utils._validate_input(X) |
150 | 174 | self.feature_names_in_ = df.columns |
151 | 175 | self.n_features_in_ = len(df.columns) |
@@ -176,16 +200,24 @@ def transform(self, X: NDArray) -> NDArray: |
176 | 200 | Transformed input. |
177 | 201 |
|
178 | 202 | """ |
| 203 | + sku.validation.validate_data( |
| 204 | + self, |
| 205 | + X, |
| 206 | + accept_sparse=False, |
| 207 | + ensure_all_finite="allow-nan", |
| 208 | + reset=False, |
| 209 | + dtype=["float", "int", "string", "categorical", "object"], |
| 210 | + ) |
179 | 211 | df = utils._validate_input(X) |
180 | 212 | check_is_fitted(self) |
181 | | - if ( |
182 | | - not hasattr(self, "feature_names_in_") |
183 | | - or df.columns.to_list() != self.feature_names_in_.to_list() |
184 | | - ): |
185 | | - raise ValueError( |
186 | | - f"Feature names in X {df.columns} don't match with " |
187 | | - f"expected {self.feature_names_in_}" |
188 | | - ) |
| 213 | + # if ( |
| 214 | + # not hasattr(self, "feature_names_in_") |
| 215 | + # or df.columns.to_list() != self.feature_names_in_.to_list() |
| 216 | + # ): |
| 217 | + # raise ValueError( |
| 218 | + # f"Feature names in X {df.columns} don't match with " |
| 219 | + # f"expected {self.feature_names_in_}" |
| 220 | + # ) |
189 | 221 | df_out = df.copy() |
190 | 222 | for col in df: |
191 | 223 | values = df[col] |
@@ -215,15 +247,18 @@ def inverse_transform(self, X: NDArray) -> NDArray: |
215 | 247 | """ |
216 | 248 | return self.transform(X) |
217 | 249 |
|
218 | | - def _more_tags(self): |
| 250 | + def __sklearn_tags__(self): |
219 | 251 | """Indicate if the class allows inputs with categorical data and nans. |
220 | 252 |
|
221 | 253 | It modifies the behaviour of the functions checking data. |
222 | 254 | """ |
223 | | - return { |
224 | | - "X_types": ["2darray", "categorical", "string"], |
225 | | - "allow_nan": True, |
226 | | - } |
| 255 | + tags = super().__sklearn_tags__() |
| 256 | + tags.input_tags = InputTags( |
| 257 | + two_d_array=True, categorical=True, string=True, allow_nan=True |
| 258 | + ) |
| 259 | + tags.target_tags.single_output = False |
| 260 | + tags.non_deterministic = True |
| 261 | + return tags |
227 | 262 |
|
228 | 263 |
|
229 | 264 | class OneHotEncoderProjector(OneHotEncoder): |
|
0 commit comments