Merge pull request #252 from basf/data_check

AnFreTh · web-flow · commit d3872b98fcdd · 2025-03-24T10:26:38.000+01:00
Data check
diff --git a/mambular/__version__.py b/mambular/__version__.py
@@ -17,4 +17,5 @@
 
 # The following line *must* be the last in the module, exactly as formatted:
 
-__version__ = "1.3.2"
+__version__ = "1.4.0"
+
diff --git a/mambular/models/utils/sklearn_base_classifier.py b/mambular/models/utils/sklearn_base_classifier.py
@@ -4,6 +4,7 @@
 import torch
 from sklearn.metrics import accuracy_score, log_loss
 from .sklearn_parent import SklearnBase
+import numpy as np
 
 
 class SklearnBaseClassifier(SklearnBase):
@@ -85,6 +86,8 @@ def build_model(
             The built classifier.
         """
 
+        num_classes = len(np.unique(y))
+
         return super()._build_model(
             X,
             y,
@@ -94,6 +97,7 @@ def build_model(
             y_val=y_val,
             embeddings=embeddings,
             embeddings_val=embeddings_val,
+            num_classes=num_classes,
             random_state=random_state,
             batch_size=batch_size,
             shuffle=shuffle,
@@ -190,6 +194,7 @@ def fit(
             The fitted classifier.
         """
 
+        num_classes = len(np.unique(y))
         return super().fit(
             X=X,
             y=y,
@@ -215,6 +220,7 @@ def fit(
             train_metrics=train_metrics,
             val_metrics=val_metrics,
             rebuild=rebuild,
+            num_classes=num_classes,
             **trainer_kwargs,
         )
 
diff --git a/mambular/models/utils/sklearn_base_regressor.py b/mambular/models/utils/sklearn_base_regressor.py
@@ -93,6 +93,7 @@ def build_model(
             y_val=y_val,
             embeddings=embeddings,
             embeddings_val=embeddings_val,
+            num_classes=1,
             random_state=random_state,
             batch_size=batch_size,
             shuffle=shuffle,
@@ -198,6 +199,7 @@ def fit(
             y_val=y_val,
             embeddings=embeddings,
             embeddings_val=embeddings_val,
+            num_classes=1,
             max_epochs=max_epochs,
             random_state=random_state,
             batch_size=batch_size,
diff --git a/mambular/models/utils/sklearn_parent.py b/mambular/models/utils/sklearn_parent.py
@@ -120,6 +120,7 @@ def _build_model(
         y_val=None,
         embeddings=None,
         embeddings_val=None,
+        num_classes: int = None,
         random_state: int = 101,
         batch_size: int = 128,
         shuffle: bool = True,
@@ -223,6 +224,7 @@ def _build_model(
             weight_decay=(
                 weight_decay if weight_decay is not None else self.config.weight_decay
             ),
+            num_classes=num_classes,
             train_metrics=train_metrics,
             val_metrics=val_metrics,
             optimizer_type=self.optimizer_type,
@@ -273,6 +275,7 @@ def fit(
         y_val=None,
         embeddings=None,
         embeddings_val=None,
+        num_classes: int = None,
         max_epochs: int = 100,
         random_state: int = 101,
         batch_size: int = 128,
@@ -357,6 +360,7 @@ def fit(
                 y_val=y_val,
                 embeddings=embeddings,
                 embeddings_val=embeddings_val,
+                num_classes=num_classes,
                 random_state=random_state,
                 batch_size=batch_size,
                 shuffle=shuffle,
diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py
@@ -27,6 +27,7 @@
     OneHotFromOrdinal,
     ToFloatTransformer,
 )
+from .utils import check_inputs
 from sklearn.base import TransformerMixin
 
 
@@ -118,6 +119,7 @@ def __init__(
         use_decision_tree_knots=True,
         knots_strategy="uniform",
         spline_implementation="sklearn",
+        min_unique_vals=5,
     ):
         self.n_bins = n_bins
         self.numerical_preprocessing = (
@@ -176,6 +178,7 @@ def __init__(
         self.use_decision_tree_knots = use_decision_tree_knots
         self.knots_strategy = knots_strategy
         self.spline_implementation = spline_implementation
+        self.min_unique_vals = min_unique_vals
 
     def get_params(self, deep=True):
         """Get parameters for the preprocessor.
@@ -307,6 +310,15 @@ def fit(self, X, y=None, embeddings=None):
         self._fit_embeddings(embeddings)
 
         numerical_features, categorical_features = self._detect_column_types(X)
+
+        check_inputs(
+            X,
+            y,
+            numerical_features,
+            categorical_features,
+            task_type=self.task,
+            min_samples=self.min_unique_vals,
+        )
         transformers = []
 
         if numerical_features:
diff --git a/mambular/preprocessing/utils.py b/mambular/preprocessing/utils.py
@@ -0,0 +1,115 @@
+import pandas as pd
+import numpy as np
+import warnings
+
+
+def check_inputs(
+    X,
+    y=None,
+    numerical_columns=None,
+    categorical_columns=None,
+    task_type=None,
+    min_samples=5,
+):
+    """
+    Validate input features and, optionally, the target before preprocessing.
+
+    Parameters
+    ----------
+    X : pd.DataFrame or dict
+        Input features. A dict is converted to a DataFrame first.
+    y : array-like, optional
+        Target values. Must be 1D and as long as X; skipped if None.
+    numerical_columns : list of str
+        Columns expected to be numerical.
+    categorical_columns : list of str
+        Columns expected to be categorical.
+    task_type : str, optional
+        Either "regression" or "classification"; other values skip the task-specific target checks.
+    min_samples : int, optional
+        Minimum number of distinct values (NaN counted) required in each numerical feature.
+
+    Raises
+    ------
+    TypeError
+        If X is not a DataFrame/dict, or a numerical feature or regression target is not numeric.
+    ValueError
+        If any other feature or target validation check fails.
+    """
+    if isinstance(X, dict):
+        X = pd.DataFrame(X)
+
+    if not isinstance(X, pd.DataFrame):
+        raise TypeError("X must be a DataFrame or a dict convertible to DataFrame.")
+
+    if X.empty:
+        raise ValueError("X must not be empty.")
+
+    if numerical_columns is None:
+        numerical_columns = []
+    if categorical_columns is None:
+        categorical_columns = []
+
+    all_cols = set(numerical_columns) | set(categorical_columns)
+    missing_cols = all_cols - set(X.columns)
+    if missing_cols:
+        raise ValueError(
+            f"The following specified columns are missing in X: {missing_cols}"
+        )
+
+    # Check numerical features
+    for col in numerical_columns:
+        series = X[col]
+        if series.nunique(dropna=False) < min_samples:
+            raise ValueError(
+                f"Numerical feature '{col}' has less than {min_samples} unique values."
+            )
+        if not np.issubdtype(series.dtype, np.number):
+            raise TypeError(f"Numerical feature '{col}' must be numeric.")
+        # NaNs are dropped before this test, so only +/-inf can trigger it.
+        if not np.all(np.isfinite(series.dropna())):
+            raise ValueError(
+                f"Numerical feature '{col}' contains infinite values (inf or -inf)."
+            )
+
+    # Check categorical features
+    for col in categorical_columns:
+        series = X[col]
+        # Test all-NaN first: such a column also has a single unique value and
+        # would otherwise be reported with the less specific message below.
+        if series.isnull().all():
+            raise ValueError(f"Categorical feature '{col}' contains only NaNs.")
+        if series.nunique(dropna=False) < 2:
+            raise ValueError(
+                f"Categorical feature '{col}' has only a single value."
+            )
+
+    # Check y
+    if y is not None:
+        y = np.array(y)
+
+        if y.ndim != 1:
+            raise ValueError("y must be a 1D array or Series.")
+
+        if len(y) != len(X):
+            raise ValueError("X and y must have the same number of samples.")
+
+        unique_targets = np.unique(y[~pd.isnull(y)])
+        n_classes = len(unique_targets)
+
+        if task_type == "regression":
+            if not np.issubdtype(y.dtype, np.number):
+                raise TypeError("For regression, target y must be numeric.")
+            if not np.all(np.isfinite(y)):
+                raise ValueError("Target y contains non-finite values.")
+
+            if n_classes <= 10:
+                warnings.warn(
+                    f"Target y has only {n_classes} unique values. "
+                    "Consider if this should be a classification problem instead of regression.",
+                    UserWarning,
+                )
+
+        elif task_type == "classification":
+            if n_classes < 2:
+                raise ValueError("Classification tasks require more than 1 class.")
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,8 @@
 [tool.poetry]
 name = "mambular"
 
-version = "1.3.2"
+version = "1.4.0"
+
 
 description = "A python package for tabular deep learning with mamba blocks."
 authors = ["Anton Thielmann", "Manish Kumar", "Christoph Weisser"]