Skip to content

Commit 62e46eb

Browse files
committed
Fix bug with unseen categorical values, accept categorical datatypes in dataframes, fix error in docs
1 parent b0c14fb commit 62e46eb

File tree

5 files changed

+18
-8
lines changed

5 files changed

+18
-8
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
135135

136136
## Releases (see git tags)
137137

138+
- v1.1.3:
139+
- Fixed a bug where the categorical encoding was incorrect if categories
140+
were missing in the training or validation set. The bug affected XGBoost
141+
and potentially many other models except RealMLP.
142+
- Scikit-learn interfaces now accept and auto-detect categorical datatypes
143+
(category, string, object) in dataframes.
138144
- v1.1.2:
139145
- Some compatibility improvements for scikit-learn 1.6
140146
(but disabled 1.6 since skorch is not compatible with it).

docs/source/models/02_hpo.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ if n_cv == 1:
6868
_, val_idxs = train_test_split(np.arange(X_train.shape[0]), test_size=0.2, random_state=0)
6969
val_idxs = val_idxs[None, :]
7070
else:
71-
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
71+
skf = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=0)
7272
val_idxs_list = [val_idxs for train_idxs, val_idxs in skf.split(X_train, y_train)]
7373

7474
# make sure that each validation set has the same length, so we can exploit vectorization

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# SPDX-FileCopyrightText: 2024-present David Holzmüller
22
#
33
# SPDX-License-Identifier: Apache-2.0
4-
__version__ = "1.1.2"
4+
__version__ = "1.1.3"

pytabkit/models/data/data.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,15 @@ def to_df(self) -> pd.DataFrame:
102102
for key in self.tensors:
103103
val_np = self.tensors[key].detach().cpu().numpy()
104104
col_names = [f'{key}_{i}' for i in range(val_np.shape[1])]
105-
df = pd.DataFrame(val_np, columns=col_names)
105+
106106
if self.tensor_infos[key].is_cat():
107-
df = df.astype('category')
107+
cat_sizes = self.tensor_infos[key].get_cat_sizes().numpy()
108+
df = pd.DataFrame(
109+
{col_names[i]: pd.Categorical(val_np[:, i], categories=list(range(cat_sizes[i]))) for i in
110+
range(len(col_names))})
111+
else:
112+
df = pd.DataFrame(val_np, columns=col_names)
113+
108114
tensor_dfs.append(df)
109115

110116
return pd.concat(tensor_dfs, axis=1)

pytabkit/models/sklearn/sklearn_base.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
138138
"""
139139

140140
# do a first check; this includes checking that neither X nor y is None before anything else is done to them
141-
check_X_y(X, y, force_all_finite='allow-nan', multi_output=True)
141+
check_X_y(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None)
142142

143143
# if X is None:
144144
# raise ValueError(f'This estimator requires X to be passed, but X is None')
@@ -173,8 +173,6 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
173173
if X_val is not None and y_val is not None:
174174
if val_idxs is not None:
175175
raise ValueError(f'both val_idxs and X_val, y_val were provided')
176-
if n_cv != 1:
177-
raise ValueError(f'X_val can only be specified for n_cv=1, but got {n_cv=}')
178176

179177
X_val = to_normal_type(X_val)
180178
y_val = to_normal_type(y_val)
@@ -185,7 +183,7 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
185183
y = concat_arrays(y, y_val)
186184

187185
# check again with the validation set concatenated
188-
check_X_y(X, y, force_all_finite='allow-nan', multi_output=True)
186+
check_X_y(X, y, force_all_finite='allow-nan', multi_output=True, dtype=None)
189187

190188
if self._is_classification():
191189
# classes_ is overridden later, but this raises an error when y is a regression target, so it is useful

0 commit comments

Comments
 (0)