scikit-learn-contrib
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 8 additions & 8 deletions
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
Lines changed: 1 addition & 2 deletions
diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py
Lines changed: 3 additions & 2 deletions
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
Lines changed: 9 additions & 0 deletions
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
Lines changed: 30 additions & 23 deletions
diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py
Lines changed: 10 additions & 10 deletions
diff --git a/qolmat/imputations/preprocessing.py b/qolmat/imputations/preprocessing.py
Lines changed: 58 additions & 23 deletions
@@ -34,18 +34,17 @@ classifiers = [
 # DEPENDENCIES
 
 [tool.poetry.dependencies]
-python = ">=3.8.1,<3.13"
-hyperopt = "0.2.7"
-numpy = "1.24.4"
-packaging = "23.1"
-pandas = "2.0.1"
+python = ">=3.9,<3.13"
+hyperopt = "*"
+numpy = ">= 1.24"
+pandas = ">= 2.0.1"
 scipy = "*"
-scikit-learn = "1.3.2"
+scikit-learn = ">= 1.6"
 sphinx-markdown-tables = { version = "*", optional = true }
-statsmodels = "0.14.0"
+statsmodels = ">= 0.14.0"
 typed-ast = { version = "*", optional = true }
 category-encoders = "^2.6.3"
-dcor = "0.6"
+dcor = ">= 0.6"
 
 [tool.poetry.group.torch.dependencies]
 torch = "< 2.5"
@@ -57,6 +56,7 @@ jupyter = "1.0.0"
 jupyterlab = "1.2.6"
 jupytext = "1.14.4"
 matplotlib = "3.6.2"
+packaging = "23.1"
 pre-commit = "2.21.0"
 twine = "3.7.1"
 wheel = "0.37.1"
 
@@ -132,9 +132,8 @@ def root_mean_squared_error(
         df1,
         df2,
         df_mask,
-        skm.mean_squared_error,
+        skm.root_mean_squared_error,
         type_cols="numerical",
-        squared=False,
     )
 
 
 
@@ -371,15 +371,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
                 sample = min(min(sample, sizes_max.max()), n_masked_left)
                 i_hole = self.rng.choice(np.where(sample <= sizes_max)[0])
 
-                if not (~mask[column].iloc[i_hole - sample : i_hole]).all():
+                indices_hole = mask.index[i_hole - sample : i_hole]
+                if not (~mask.loc[indices_hole, column]).all():
                     raise ValueError(
                         "The mask condition is not satisfied for "
                         f"column={column}, "
                         f"sample={sample}, "
                         f"and i_hole={i_hole}."
                     )
 
-                mask[column].iloc[i_hole - sample : i_hole] = True
+                mask.loc[indices_hole, column] = True
                 n_masked_left -= sample
 
                 sizes_max.iloc[i_hole - sample : i_hole] = 0
 
@@ -458,6 +458,11 @@ def fit(self, X: NDArray) -> "EM":
 
         """
         X = X.copy()
+        # utils.check_dtypes(X)
+        # sku.check_array(X, ensure_all_finite="allow-nan", dtype="float")
+        sku.validation.validate_data(
+            self, X, ensure_all_finite="allow-nan", dtype="float"
+        )
         self.shape_original = X.shape
 
         self.hash_fit = hash(X.tobytes())
@@ -506,6 +511,10 @@ def transform(self, X: NDArray) -> NDArray:
         """
         mask_na = np.isnan(X)
         X = X.copy()
+        # sku.check_array(X, ensure_all_finite="allow-nan", dtype="float")
+        sku.validation.validate_data(
+            self, X, ensure_all_finite="allow-nan", dtype="float", reset=False
+        )
 
         # shape_original = X.shape
         if hash(X.tobytes()) == self.hash_fit:
 
@@ -17,7 +17,6 @@
 from sklearn.impute._base import _BaseImputer
 from statsmodels.tsa import seasonal as tsa_seasonal
 
-# from typing_extensions import Self
 from qolmat.imputations import em_sampler, softimpute
 from qolmat.imputations.rpca import rpca_noisy, rpca_pcp
 from qolmat.utils import utils
@@ -108,15 +107,15 @@ def _check_dataframe(self, X: NDArray):
         if not isinstance(X, (pd.DataFrame)):
             raise NotDataFrame(type(X))
 
-    def _more_tags(self):
-        """Indicate this class allows inputs with categorical data and nans.
-
-        It modifies the behaviour of the functions checking data.
-        """
-        return {
-            "X_types": ["2darray", "categorical", "string"],
-            "allow_nan": True,
-        }
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        # tags.input_tags = InputTags(
+        #     two_d_array=True, categorical=True, string=True, allow_nan=True
+        # )
+        tags.input_tags.allow_nan = True
+        tags.target_tags.single_output = False
+        tags.non_deterministic = True
+        return tags
 
     def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer":
         """Fit the imputer on X.
@@ -134,6 +133,12 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer":
             Returns self.
 
         """
+        sku.validation.validate_data(
+            self,
+            X,
+            ensure_all_finite="allow-nan",
+            dtype=["float", "int", "string", "categorical", "object"],
+        )
         df = utils._validate_input(X)
         self.n_features_in_ = len(df.columns)
 
@@ -185,6 +190,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
             Imputed dataframe.
 
         """
+        sku.validation.validate_data(
+            self,
+            X,
+            ensure_all_finite="allow-nan",
+            dtype=["float", "int", "string", "categorical", "object"],
+            reset=False,
+        )
         df = utils._validate_input(X)
         if tuple(df.columns) != self.columns_:
             raise ValueError(
@@ -488,6 +500,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
             dataframe imputed with premasked values
 
         """
+        sku.validation.validate_data(
+            self,
+            X,
+            ensure_all_finite="allow-nan",
+            dtype=["float", "int", "string", "categorical", "object"],
+            reset=False,
+        )
         df = utils._validate_input(X)
 
         if tuple(df.columns) != self.columns_:
@@ -1905,7 +1924,7 @@ def _transform_element(
 
 
 class ImputerSoftImpute(_Imputer):
-    """SoftIMpute imputer.
+    """SoftImpute imputer.
 
     This class implements the Soft Impute method:
     Hastie, Trevor, et al. Matrix completion and low-rank SVD via fast
@@ -2067,18 +2086,6 @@ def _transform_element(
 
         return df_imputed
 
-    def _more_tags(self):
-        return {
-            "_xfail_checks": {
-                "check_fit2d_1sample": (
-                    "This test shouldn't be running at all!"
-                ),
-                "check_fit2d_1feature": (
-                    "This test shouldn't be running at all!"
-                ),
-            },
-        }
-
 
 class ImputerEM(_Imputer):
     """EM imputer.
 
@@ -652,16 +652,16 @@ def __init__(
         self.index_datetime = index_datetime
         self.freq_str = freq_str
 
-    def _more_tags(self):
-        return {
-            "non_deterministic": True,
-            "_xfail_checks": {
-                "check_estimators_pickle": "Diffusion models can return\
-                                  different outputs",
-                "check_estimators_overwrite_params": "Diffusion models can\
-                                    return different outputs",
-            },
-        }
+    # def _more_tags(self):
+    #     return {
+    #         "non_deterministic": True,
+    #         "_xfail_checks": {
+    #             "check_estimators_pickle": "Diffusion models can return\
+    #                               different outputs",
+    #             "check_estimators_overwrite_params": "Diffusion models can\
+    #                                 return different outputs",
+    #         },
+    #     }
 
     def _fit_element(
         self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
 
@@ -7,6 +7,7 @@
 import pandas as pd
 from category_encoders.one_hot import OneHotEncoder
 from numpy.typing import NDArray
+from sklearn import utils as sku
 from sklearn.base import (
     BaseEstimator,
     RegressorMixin,
@@ -20,10 +21,9 @@
 )
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
+from sklearn.utils import InputTags
 from sklearn.utils.validation import (
-    check_array,
     check_is_fitted,
-    check_X_y,
 )
 
 # from typing_extensions import Self
@@ -68,8 +68,14 @@ def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM":
             Returns self.
 
         """
-        X, y = check_X_y(
-            X, y, accept_sparse=True, force_all_finite="allow-nan"
+        X, y = sku.validation.validate_data(
+            self,
+            X,
+            y,
+            accept_sparse=False,
+            ensure_all_finite="allow-nan",
+            reset=True,
+            dtype=["float", "int", "string", "categorical", "object"],
         )
         self.is_fitted_ = True
         self.n_features_in_ = X.shape[1]
@@ -101,20 +107,30 @@ def predict(self, X: NDArray) -> NDArray:
             Predicted target values.
 
         """
-        X = check_array(X, accept_sparse=True, force_all_finite="allow-nan")
+        sku.validation.validate_data(
+            self,
+            X,
+            accept_sparse=False,
+            ensure_all_finite="allow-nan",
+            reset=False,
+            dtype=["float", "int", "string", "categorical", "object"],
+        )
         check_is_fitted(self, "is_fitted_")
         y_pred = self.model_.predict(X)
         return y_pred
 
-    def _more_tags(self):
+    def __sklearn_tags__(self):
         """Indicate if the class allows inputs with categorical data and nans.
 
         It modifies the behaviour of the functions checking data.
         """
-        return {
-            "X_types": ["2darray", "categorical", "string"],
-            "allow_nan": True,
-        }
+        tags = super().__sklearn_tags__()
+        tags.input_tags = InputTags(
+            two_d_array=True, categorical=True, string=True, allow_nan=True
+        )
+        tags.target_tags.single_output = False
+        tags.non_deterministic = True
+        return tags
 
 
 class BinTransformer(TransformerMixin, BaseEstimator):
@@ -146,6 +162,14 @@ def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer":
             Fitted transformer.
 
         """
+        sku.validation.validate_data(
+            self,
+            X,
+            accept_sparse=False,
+            ensure_all_finite="allow-nan",
+            reset=False,
+            dtype=["float", "int", "string", "categorical", "object"],
+        )
         df = utils._validate_input(X)
         self.feature_names_in_ = df.columns
         self.n_features_in_ = len(df.columns)
@@ -176,16 +200,24 @@ def transform(self, X: NDArray) -> NDArray:
             Transformed input.
 
         """
+        sku.validation.validate_data(
+            self,
+            X,
+            accept_sparse=False,
+            ensure_all_finite="allow-nan",
+            reset=False,
+            dtype=["float", "int", "string", "categorical", "object"],
+        )
         df = utils._validate_input(X)
         check_is_fitted(self)
-        if (
-            not hasattr(self, "feature_names_in_")
-            or df.columns.to_list() != self.feature_names_in_.to_list()
-        ):
-            raise ValueError(
-                f"Feature names in X {df.columns} don't match with "
-                f"expected {self.feature_names_in_}"
-            )
+        # if (
+        #     not hasattr(self, "feature_names_in_")
+        #     or df.columns.to_list() != self.feature_names_in_.to_list()
+        # ):
+        #     raise ValueError(
+        #         f"Feature names in X {df.columns} don't match with "
+        #         f"expected {self.feature_names_in_}"
+        #     )
         df_out = df.copy()
         for col in df:
             values = df[col]
@@ -215,15 +247,18 @@ def inverse_transform(self, X: NDArray) -> NDArray:
         """
         return self.transform(X)
 
-    def _more_tags(self):
+    def __sklearn_tags__(self):
         """Indicate if the class allows inputs with categorical data and nans.
 
         It modifies the behaviour of the functions checking data.
         """
-        return {
-            "X_types": ["2darray", "categorical", "string"],
-            "allow_nan": True,
-        }
+        tags = super().__sklearn_tags__()
+        tags.input_tags = InputTags(
+            two_d_array=True, categorical=True, string=True, allow_nan=True
+        )
+        tags.target_tags.single_output = False
+        tags.non_deterministic = True
+        return tags
 
 
 class OneHotEncoderProjector(OneHotEncoder):
Original file line number	Diff line number	Diff line change
`@@ -132,9 +132,8 @@ def root_mean_squared_error(`
`132`	`132`	`df1,`
`133`	`133`	`df2,`
`134`	`134`	`df_mask,`
`135`		`- skm.mean_squared_error,`
	`135`	`+ skm.root_mean_squared_error,`
`136`	`136`	`type_cols="numerical",`
`137`		`- squared=False,`
`138`	`137`	`)`
`139`	`138`
`140`	`139`