Skip to content

Commit 61e85bc

Browse files
Julien Roussel
authored and committed
type checking issues patched
1 parent 96e08cf commit 61e85bc

File tree

13 files changed

+287
-184
lines changed

13 files changed

+287
-184
lines changed

pyproject.toml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,17 @@ classifiers = [
3434
# DEPENDENCIES
3535

3636
[tool.poetry.dependencies]
37-
python = ">=3.8.1,<3.13"
38-
hyperopt = "0.2.7"
39-
numpy = "1.24.4"
40-
packaging = "23.1"
41-
pandas = "2.0.1"
37+
python = ">=3.9,<3.13"
38+
hyperopt = "*"
39+
numpy = ">= 1.24"
40+
pandas = ">= 2.0.1"
4241
scipy = "*"
43-
scikit-learn = "1.3.2"
42+
scikit-learn = ">= 1.6"
4443
sphinx-markdown-tables = { version = "*", optional = true }
45-
statsmodels = "0.14.0"
44+
statsmodels = ">= 0.14.0"
4645
typed-ast = { version = "*", optional = true }
4746
category-encoders = "^2.6.3"
48-
dcor = "0.6"
47+
dcor = ">= 0.6"
4948

5049
[tool.poetry.group.torch.dependencies]
5150
torch = "< 2.5"
@@ -57,6 +56,7 @@ jupyter = "1.0.0"
5756
jupyterlab = "1.2.6"
5857
jupytext = "1.14.4"
5958
matplotlib = "3.6.2"
59+
packaging = "23.1"
6060
pre-commit = "2.21.0"
6161
twine = "3.7.1"
6262
wheel = "0.37.1"

qolmat/benchmark/metrics.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,8 @@ def root_mean_squared_error(
132132
df1,
133133
df2,
134134
df_mask,
135-
skm.mean_squared_error,
135+
skm.root_mean_squared_error,
136136
type_cols="numerical",
137-
squared=False,
138137
)
139138

140139

qolmat/benchmark/missing_patterns.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,15 +371,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
371371
sample = min(min(sample, sizes_max.max()), n_masked_left)
372372
i_hole = self.rng.choice(np.where(sample <= sizes_max)[0])
373373

374-
if not (~mask[column].iloc[i_hole - sample : i_hole]).all():
374+
indices_hole = mask.index[i_hole - sample : i_hole]
375+
if not (~mask.loc[indices_hole, column]).all():
375376
raise ValueError(
376377
"The mask condition is not satisfied for "
377378
f"column={column}, "
378379
f"sample={sample}, "
379380
f"and i_hole={i_hole}."
380381
)
381382

382-
mask[column].iloc[i_hole - sample : i_hole] = True
383+
mask.loc[indices_hole, column] = True
383384
n_masked_left -= sample
384385

385386
sizes_max.iloc[i_hole - sample : i_hole] = 0

qolmat/imputations/em_sampler.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,11 @@ def fit(self, X: NDArray) -> "EM":
458458
459459
"""
460460
X = X.copy()
461+
# utils.check_dtypes(X)
462+
# sku.check_array(X, ensure_all_finite="allow-nan", dtype="float")
463+
sku.validation.validate_data(
464+
self, X, ensure_all_finite="allow-nan", dtype="float"
465+
)
461466
self.shape_original = X.shape
462467

463468
self.hash_fit = hash(X.tobytes())
@@ -506,6 +511,10 @@ def transform(self, X: NDArray) -> NDArray:
506511
"""
507512
mask_na = np.isnan(X)
508513
X = X.copy()
514+
# sku.check_array(X, ensure_all_finite="allow-nan", dtype="float")
515+
sku.validation.validate_data(
516+
self, X, ensure_all_finite="allow-nan", dtype="float", reset=False
517+
)
509518

510519
# shape_original = X.shape
511520
if hash(X.tobytes()) == self.hash_fit:

qolmat/imputations/imputers.py

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from sklearn.impute._base import _BaseImputer
1818
from statsmodels.tsa import seasonal as tsa_seasonal
1919

20-
# from typing_extensions import Self
2120
from qolmat.imputations import em_sampler, softimpute
2221
from qolmat.imputations.rpca import rpca_noisy, rpca_pcp
2322
from qolmat.utils import utils
@@ -108,15 +107,15 @@ def _check_dataframe(self, X: NDArray):
108107
if not isinstance(X, (pd.DataFrame)):
109108
raise NotDataFrame(type(X))
110109

111-
def _more_tags(self):
112-
"""Indicate this class allows inputs with categorical data and nans.
113-
114-
It modifies the behaviour of the functions checking data.
115-
"""
116-
return {
117-
"X_types": ["2darray", "categorical", "string"],
118-
"allow_nan": True,
119-
}
110+
def __sklearn_tags__(self):
111+
tags = super().__sklearn_tags__()
112+
# tags.input_tags = InputTags(
113+
# two_d_array=True, categorical=True, string=True, allow_nan=True
114+
# )
115+
tags.input_tags.allow_nan = True
116+
tags.target_tags.single_output = False
117+
tags.non_deterministic = True
118+
return tags
120119

121120
def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer":
122121
"""Fit the imputer on X.
@@ -134,6 +133,12 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer":
134133
Returns self.
135134
136135
"""
136+
sku.validation.validate_data(
137+
self,
138+
X,
139+
ensure_all_finite="allow-nan",
140+
dtype=["float", "int", "string", "categorical", "object"],
141+
)
137142
df = utils._validate_input(X)
138143
self.n_features_in_ = len(df.columns)
139144

@@ -185,6 +190,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
185190
Imputed dataframe.
186191
187192
"""
193+
sku.validation.validate_data(
194+
self,
195+
X,
196+
ensure_all_finite="allow-nan",
197+
dtype=["float", "int", "string", "categorical", "object"],
198+
reset=False,
199+
)
188200
df = utils._validate_input(X)
189201
if tuple(df.columns) != self.columns_:
190202
raise ValueError(
@@ -488,6 +500,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
488500
dataframe imputed with premasked values
489501
490502
"""
503+
sku.validation.validate_data(
504+
self,
505+
X,
506+
ensure_all_finite="allow-nan",
507+
dtype=["float", "int", "string", "categorical", "object"],
508+
reset=False,
509+
)
491510
df = utils._validate_input(X)
492511

493512
if tuple(df.columns) != self.columns_:
@@ -1905,7 +1924,7 @@ def _transform_element(
19051924

19061925

19071926
class ImputerSoftImpute(_Imputer):
1908-
"""SoftIMpute imputer.
1927+
"""SoftImpute imputer.
19091928
19101929
This class implements the Soft Impute method:
19111930
Hastie, Trevor, et al. Matrix completion and low-rank SVD via fast
@@ -2067,18 +2086,6 @@ def _transform_element(
20672086

20682087
return df_imputed
20692088

2070-
def _more_tags(self):
2071-
return {
2072-
"_xfail_checks": {
2073-
"check_fit2d_1sample": (
2074-
"This test shouldn't be running at all!"
2075-
),
2076-
"check_fit2d_1feature": (
2077-
"This test shouldn't be running at all!"
2078-
),
2079-
},
2080-
}
2081-
20822089

20832090
class ImputerEM(_Imputer):
20842091
"""EM imputer.

qolmat/imputations/imputers_pytorch.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -652,16 +652,16 @@ def __init__(
652652
self.index_datetime = index_datetime
653653
self.freq_str = freq_str
654654

655-
def _more_tags(self):
656-
return {
657-
"non_deterministic": True,
658-
"_xfail_checks": {
659-
"check_estimators_pickle": "Diffusion models can return\
660-
different outputs",
661-
"check_estimators_overwrite_params": "Diffusion models can\
662-
return different outputs",
663-
},
664-
}
655+
# def _more_tags(self):
656+
# return {
657+
# "non_deterministic": True,
658+
# "_xfail_checks": {
659+
# "check_estimators_pickle": "Diffusion models can return\
660+
# different outputs",
661+
# "check_estimators_overwrite_params": "Diffusion models can\
662+
# return different outputs",
663+
# },
664+
# }
665665

666666
def _fit_element(
667667
self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0

qolmat/imputations/preprocessing.py

Lines changed: 58 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pandas as pd
88
from category_encoders.one_hot import OneHotEncoder
99
from numpy.typing import NDArray
10+
from sklearn import utils as sku
1011
from sklearn.base import (
1112
BaseEstimator,
1213
RegressorMixin,
@@ -20,10 +21,9 @@
2021
)
2122
from sklearn.pipeline import Pipeline
2223
from sklearn.preprocessing import StandardScaler
24+
from sklearn.utils import InputTags
2325
from sklearn.utils.validation import (
24-
check_array,
2526
check_is_fitted,
26-
check_X_y,
2727
)
2828

2929
# from typing_extensions import Self
@@ -68,8 +68,14 @@ def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM":
6868
Returns self.
6969
7070
"""
71-
X, y = check_X_y(
72-
X, y, accept_sparse=True, force_all_finite="allow-nan"
71+
X, y = sku.validation.validate_data(
72+
self,
73+
X,
74+
y,
75+
accept_sparse=False,
76+
ensure_all_finite="allow-nan",
77+
reset=True,
78+
dtype=["float", "int", "string", "categorical", "object"],
7379
)
7480
self.is_fitted_ = True
7581
self.n_features_in_ = X.shape[1]
@@ -101,20 +107,30 @@ def predict(self, X: NDArray) -> NDArray:
101107
Predicted target values.
102108
103109
"""
104-
X = check_array(X, accept_sparse=True, force_all_finite="allow-nan")
110+
sku.validation.validate_data(
111+
self,
112+
X,
113+
accept_sparse=False,
114+
ensure_all_finite="allow-nan",
115+
reset=False,
116+
dtype=["float", "int", "string", "categorical", "object"],
117+
)
105118
check_is_fitted(self, "is_fitted_")
106119
y_pred = self.model_.predict(X)
107120
return y_pred
108121

109-
def _more_tags(self):
122+
def __sklearn_tags__(self):
110123
"""Indicate if the class allows inputs with categorical data and nans.
111124
112125
It modifies the behaviour of the functions checking data.
113126
"""
114-
return {
115-
"X_types": ["2darray", "categorical", "string"],
116-
"allow_nan": True,
117-
}
127+
tags = super().__sklearn_tags__()
128+
tags.input_tags = InputTags(
129+
two_d_array=True, categorical=True, string=True, allow_nan=True
130+
)
131+
tags.target_tags.single_output = False
132+
tags.non_deterministic = True
133+
return tags
118134

119135

120136
class BinTransformer(TransformerMixin, BaseEstimator):
@@ -146,6 +162,14 @@ def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer":
146162
Fitted transformer.
147163
148164
"""
165+
sku.validation.validate_data(
166+
self,
167+
X,
168+
accept_sparse=False,
169+
ensure_all_finite="allow-nan",
170+
reset=False,
171+
dtype=["float", "int", "string", "categorical", "object"],
172+
)
149173
df = utils._validate_input(X)
150174
self.feature_names_in_ = df.columns
151175
self.n_features_in_ = len(df.columns)
@@ -176,16 +200,24 @@ def transform(self, X: NDArray) -> NDArray:
176200
Transformed input.
177201
178202
"""
203+
sku.validation.validate_data(
204+
self,
205+
X,
206+
accept_sparse=False,
207+
ensure_all_finite="allow-nan",
208+
reset=False,
209+
dtype=["float", "int", "string", "categorical", "object"],
210+
)
179211
df = utils._validate_input(X)
180212
check_is_fitted(self)
181-
if (
182-
not hasattr(self, "feature_names_in_")
183-
or df.columns.to_list() != self.feature_names_in_.to_list()
184-
):
185-
raise ValueError(
186-
f"Feature names in X {df.columns} don't match with "
187-
f"expected {self.feature_names_in_}"
188-
)
213+
# if (
214+
# not hasattr(self, "feature_names_in_")
215+
# or df.columns.to_list() != self.feature_names_in_.to_list()
216+
# ):
217+
# raise ValueError(
218+
# f"Feature names in X {df.columns} don't match with "
219+
# f"expected {self.feature_names_in_}"
220+
# )
189221
df_out = df.copy()
190222
for col in df:
191223
values = df[col]
@@ -215,15 +247,18 @@ def inverse_transform(self, X: NDArray) -> NDArray:
215247
"""
216248
return self.transform(X)
217249

218-
def _more_tags(self):
250+
def __sklearn_tags__(self):
219251
"""Indicate if the class allows inputs with categorical data and nans.
220252
221253
It modifies the behaviour of the functions checking data.
222254
"""
223-
return {
224-
"X_types": ["2darray", "categorical", "string"],
225-
"allow_nan": True,
226-
}
255+
tags = super().__sklearn_tags__()
256+
tags.input_tags = InputTags(
257+
two_d_array=True, categorical=True, string=True, allow_nan=True
258+
)
259+
tags.target_tags.single_output = False
260+
tags.non_deterministic = True
261+
return tags
227262

228263

229264
class OneHotEncoderProjector(OneHotEncoder):

0 commit comments

Comments
 (0)