Fix bug with unseen categorical values, accept categorical datatypes in dataframes, fix error in docs

dholzmueller · dholzmueller · commit 62e46ebb7010 · 2025-01-13T15:59:36.000+01:00
diff --git a/README.md b/README.md
@@ -135,6 +135,12 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.1.3:
+  - Fixed a bug where the categorical encoding was incorrect if categories 
+    were missing in the training or validation set. The bug affected XGBoost 
+    and potentially many other models except RealMLP.
+  - Scikit-learn interfaces now accept and auto-detect categorical datatypes
+    (category, string, object) in dataframes.
 - v1.1.2: 
   - Some compatibility improvements for scikit-learn 1.6 
     (but disabled 1.6 since skorch is not compatible with it).
diff --git a/docs/source/models/02_hpo.md b/docs/source/models/02_hpo.md
@@ -68,7 +68,7 @@ if n_cv == 1:
     _, val_idxs = train_test_split(np.arange(X_train.shape[0]), test_size=0.2, random_state=0)
     val_idxs = val_idxs[None, :]
 else:
-    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
+    skf = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=0)
     val_idxs_list = [val_idxs for train_idxs, val_idxs in skf.split(X_train, y_train)]
 
     # make sure that each validation set has the same length, so we can exploit vectorization
diff --git a/pytabkit/__about__.py b/pytabkit/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present David Holzmüller
 #
 # SPDX-License-Identifier: Apache-2.0
-__version__ = "1.1.2"
+__version__ = "1.1.3"
diff --git a/pytabkit/models/data/data.py b/pytabkit/models/data/data.py
@@ -102,9 +102,15 @@ def to_df(self) -> pd.DataFrame:
         for key in self.tensors:
             val_np = self.tensors[key].detach().cpu().numpy()
             col_names = [f'{key}_{i}' for i in range(val_np.shape[1])]
-            df = pd.DataFrame(val_np, columns=col_names)
+
             if self.tensor_infos[key].is_cat():
-                df = df.astype('category')
+                cat_sizes = self.tensor_infos[key].get_cat_sizes().numpy()
+                df = pd.DataFrame(
+                    {col_names[i]: pd.Categorical(val_np[:, i], categories=list(range(cat_sizes[i]))) for i in
+                     range(len(col_names))})
+            else:
+                df = pd.DataFrame(val_np, columns=col_names)
+
             tensor_dfs.append(df)
 
         return pd.concat(tensor_dfs, axis=1)
diff --git a/pytabkit/models/sklearn/sklearn_base.py b/pytabkit/models/sklearn/sklearn_base.py
@@ -138,7 +138,7 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
         """
 
         # do a first check; this includes checking that X and y are not None before anything else is done to them
-        check_X_y(X, y, force_all_finite='allow-nan', multi_output=True)
+        check_X_y(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None)
 
         # if X is None:
         #     raise ValueError(f'This estimator requires X to be passed, but X is None')
@@ -173,8 +173,6 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
         if X_val is not None and y_val is not None:
             if val_idxs is not None:
                 raise ValueError(f'both val_idxs and X_val, y_val were provided')
-            if n_cv != 1:
-                raise ValueError(f'X_val can only be specified for n_cv=1, but got {n_cv=}')
 
             X_val = to_normal_type(X_val)
             y_val = to_normal_type(y_val)
@@ -185,7 +183,7 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
             y = concat_arrays(y, y_val)
 
         # check again with the validation set concatenated
-        check_X_y(X, y, force_all_finite='allow-nan', multi_output=True)
+        check_X_y(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None)
 
         if self._is_classification():
             # classes_ is overridden later, but this raises an error when y is a regression target, so it is useful

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | `1` | `# SPDX-FileCopyrightText: 2024-present David Holzmüller` |
| `2` | `2` | `#` |
| `3` | `3` | `# SPDX-License-Identifier: Apache-2.0` |
| `4` | | `-__version__ = "1.1.2"` |
| | `4` | `+__version__ = "1.1.3"` |