dholzmueller
diff --git a/‎README.md‎
Lines changed: 13 additions & 3 deletions b/‎README.md‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎docs/requirements.txt‎
Lines changed: 3 additions & 2 deletions b/‎docs/requirements.txt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎docs/source/index.rst‎
Lines changed: 2 additions & 0 deletions b/‎docs/source/index.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/source/models/examples.md‎
Lines changed: 67 additions & 0 deletions b/‎docs/source/models/examples.md‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 37 additions & 26 deletions b/‎pyproject.toml‎
Lines changed: 37 additions & 26 deletions
diff --git a/‎pytabkit/__about__.py‎
Lines changed: 1 addition & 1 deletion b/‎pytabkit/__about__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pytabkit/bench/alg_wrappers/general.py‎
Lines changed: 1 addition & 1 deletion b/‎pytabkit/bench/alg_wrappers/general.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pytabkit/bench/alg_wrappers/interface_wrappers.py‎
Lines changed: 37 additions & 3 deletions b/‎pytabkit/bench/alg_wrappers/interface_wrappers.py‎
Lines changed: 37 additions & 3 deletions
diff --git a/‎pytabkit/bench/data/common.py‎
Lines changed: 2 additions & 0 deletions b/‎pytabkit/bench/data/common.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pytabkit/bench/eval/evaluation.py‎
Lines changed: 15 additions & 6 deletions b/‎pytabkit/bench/eval/evaluation.py‎
Lines changed: 15 additions & 6 deletions
@@ -15,17 +15,18 @@ on our benchmarks.
 
 ![Meta-test benchmark results](./figures/meta-test_benchmark_results.png)
 
-## Installation
+## Installation (new in 1.4.0: optional model dependencies)
 
 ```bash
-pip install pytabkit
+pip install pytabkit[models]
 ```
 
+- RealMLP (and TabM) can be used without the `[models]` part.
 - If you want to use **TabR**, you have to manually install
   [faiss](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md),
   which is only available on **conda**.
 - Please install torch separately if you want to control the version (CPU/GPU etc.)
-- Use `pytabkit[autogluon,extra,hpo,bench,dev]` to install additional dependencies for
+- Use `pytabkit[models,autogluon,extra,hpo,bench,dev]` to install additional dependencies for
   AutoGluon models, extra preprocessing,
   hyperparameter optimization methods beyond random search (hyperopt/SMAC),
   the benchmarking part, and testing/documentation. For the hpo part,
@@ -169,6 +170,15 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.4.0:
+    - Moved some imports to the new `models` optional dependencies
+      to have a more light-weight RealMLP installation
+    - Added GPU support for CatBoost (not guaranteed to produce exactly the same results)
+    - Ensembling now saves models after training if a path is supplied, to reduce memory usage
+    - Added more search spaces
+    - Fixed error in multiquantile output when the passed y was one-dimensional 
+      instead of having shape `(n_samples, 1)`
+    - Added some examples to the documentation
 - v1.3.0: 
     - Added multiquantile regression for RealMLP: 
       see the [documentation](https://pytabkit.readthedocs.io/en/latest/models/quantile_reg.html)
 
@@ -8,7 +8,7 @@ lightgbm>=4.1
 matplotlib>=3.0
 msgpack>=1.0
 myst_parser>=3.0
-numpy>=1.25,<2.0
+numpy>=1.25
 openml>=0.14
 openpyxl>=3.0
 pandas>=2.0
@@ -22,11 +22,12 @@ pytorch_lightning>=2.0
 pyyaml>=5.0
 ray>=2.8
 requests>=2.0
-scikit-learn>=1.3,<1.6
+scikit-learn>=1.3
 seaborn>=0.0.13
 skorch>=0.15
 sphinx>=7.0
 sphinx_rtd_theme>=2.0
+torch>=2.0
 torch>=2.0,<2.6
 torchmetrics>=1.2.1
 tqdm
 
@@ -12,11 +12,13 @@ Tabular ML models in pytabkit.models
    models/00_overview
    models/01_sklearn_interfaces
    models/02_hpo
+   models/examples
    models/nn_classes
    models/03_training_implementation
    models/quantile_reg
 
 
+
 Tabular benchmarking using pytabkit.bench
 ====================================
 
 
@@ -0,0 +1,67 @@
+# Examples
+
+## Refitting RealMLP on train+val data using the best epoch from a previous run
+
+You can refit RealMLP by simply using `n_refit=1`
+(or, better, larger values to ensemble multiple NNs). 
+But in case you want more control, you can do it manually
+(e.g., if you only want to refit the best configuration from HPO,
+but you're not using the HPO within pytabkit).
+
+```python
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+from pytabkit import RealMLP_TD_Regressor
+
+np.random.seed(0)
+
+X = np.random.randn(500, 5)
+y = np.random.randn(500)
+
+X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
+
+reg = RealMLP_TD_Regressor(verbosity=2, random_state=0)
+reg.fit(X_train, y_train, X_val, y_val)
+
+refit = RealMLP_TD_Regressor(verbosity=2, stop_epoch=list(reg.fit_params_['stop_epoch'].values())[0], val_fraction=0.0, random_state=0)
+refit.fit(X, y)
+```
+
+## Fitting again after HPO on a smaller subset
+
+Here is an example of how to run HPO on a smaller subset 
+and fit the best configuration again with validation. 
+(It might be better to just use `n_refit` in the HPO classifier/regressor instead.)
+
+```python
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+from pytabkit import LGBM_HPO_TPE_Regressor, LGBM_TD_Regressor
+
+# This is an example of how to fit an HPO method on a smaller subset of the data,
+# and then refit the best hyperparams on the full dataset
+np.random.seed(0)
+
+X = np.random.randn(500, 5)
+y = np.random.randn(500)
+
+X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.9, random_state=0)
+
+# use 90% for validation to train faster
+# if there is too much validation data, validation data might be the bottleneck, then you should pass
+model = LGBM_HPO_TPE_Regressor(val_fraction=0.9, n_hyperopt_steps=5)
+model.fit(X, y)
+
+# unfortunately params are not always called the same way, so we need to rename a few
+params = model.fit_params_['hyper_fit_params']
+params['subsample'] = params.pop('bagging_fraction')
+params['colsample_bytree'] = params.pop('feature_fraction')
+params['lr'] = params.pop('learning_rate')
+
+# unfortunately, it is hard right now to check if this is exactly the same config,
+# as this might set some default params that are not used in the HPO config
+model_refit = LGBM_TD_Regressor(**params)
+model_refit.fit(X, y)
+```
@@ -13,6 +13,7 @@ keywords = ['tabular data', 'scikit-learn', 'deep learning', 'gradient boosting'
 authors = [
     { name = "David Holzmüller" }, #, email = "[email protected]" },
     { name = "Léo Grinsztajn" }, #, email = "[email protected]" },
+    { name = "Ingo Steinwart" }, #, email = "[email protected]" },
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -26,40 +27,50 @@ classifiers = [
     "License :: OSI Approved :: Apache Software License",
 ]
 dependencies = [
+    "torch>=2.0",
+    "numpy>=1.25", # hopefully don't need <2.0 anymore?
+    "pandas>=2.0",
+    "scikit-learn>=1.3",
+    # these could be made optional with lazy imports
+    # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics used in TabR slow:
+    # https://github.com/Lightning-AI/torchmetrics/pull/2184
+    "torchmetrics>=1.2.1",
+    # can also install the newer lightning package with more dependencies instead, it will be prioritized
+    "pytorch_lightning>=2.0",
+    "psutil>=5.0",  # used for getting logical CPU count in the sklearn base and for getting process RAM usage
+    "dill",  # more powerful pickle, used for file-saving and multiprocessing
+]
+
+[project.optional-dependencies]
+models = [
     # use <2.6 for now since it can run into pickling issues with skorch if the skorch version is too old
     # see https://github.com/skorch-dev/skorch/commit/be93b7769d61aa22fb928d2e89e258c629bfeaf9
     "torch>=2.0,<2.6",
-    "numpy>=1.25,<2.0",
-    "pandas>=2.0",
-    "scikit-learn>=1.3,<1.6",
     "xgboost>=2.0",
     "catboost>=1.2",
     "lightgbm>=4.1",
-    # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics used in tabr slow:
-    # https://github.com/Lightning-AI/torchmetrics/pull/2184
-    "torchmetrics>=1.2.1",
-    # can also install the newer lightning package with more dependencies instead, it will be prioritized
-    "pytorch_lightning>=2.0",
-    "skorch>=0.15",  # for rtdl models
-    "dask[dataframe]>=2023",  # this is here because of a pandas warning:
+    # for rtdl models (MLP, ResNet) but also lightly used in TabR
+    # note that scikit-learn 1.6 needs skorch >= 1.1.0
+    "skorch>=0.15",
+    "dask[dataframe]>=2023", # this is here because of a pandas warning:
     # "Dask dataframe query planning is disabled because dask-expr is not installed"
     # "packaging",  # unclear why this is here?
-    "tqdm",  # for TabM with verbosity >= 1
-    "psutil>=5.0",
-    # more classification metrics and post-hoc calibrators (could be an optional dependency)
+
+    "tqdm", # for TabM with verbosity >= 1
+
+    # more classification metrics and post-hoc calibrators
+    # not necessary unless these things are actually used
     "probmetrics>=0.0.1",
 
-    # packages for saving objects in different formats
-    "dill",
+    # saving objects in yaml/msgpack
+    # needed if used in utils.serialize() / deserialize()
     "pyyaml>=5.0",
     "msgpack>=1.0",
     # apparently msgpack_numpy fixed some bug in using numpy arrays in msgpack?
-    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions
-    # in theory we shouldn't be using if for numpy arrays at the moment, not sure why the need for this occured
+    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions; in theory we shouldn't be using it for numpy arrays at the moment, not sure why the need for this occurred
+    # maybe it occurred because we tried to save hyperparameters that were numpy scalars instead of python scalars
     # "msgpack_numpy>=0.4",
 ]
-
-[project.optional-dependencies]
 autogluon = [
     "autogluon.tabular[all]>=1.0",
     "autogluon.multimodal>=1.0",
@@ -73,10 +84,10 @@ hpo = [
     "hyperopt>=0.2",
 ]
 bench = [
-    "fire",   # argparse utilities
-    "ray>=2.8",   # parallelization
-    "pynvml>=11.0",   # NVIDIA GPU utilization
-    "openml>=0.14",  # OpenML data download
+    "fire", # argparse utilities
+    "ray>=2.8", # parallelization
+    "pynvml>=11.0", # NVIDIA GPU utilization
+    "openml>=0.14", # OpenML data download
     # ----- UCI import ------
     "requests>=2.0",
     "patool>=1.0",
@@ -102,12 +113,12 @@ path = "pytabkit/__about__.py"
 
 [tool.hatch.envs.default]
 installer = "uv"
-features = ["bench","autogluon","extra","hpo","dev"]
+features = ["models", "bench", "autogluon", "extra", "hpo", "dev"]
 
 [tool.hatch.envs.hatch-test]
 installer = "uv"
-features = ["bench","dev"]
-#features = ["bench","autogluon","extra","hpo","dev"]
+features = ["models", "bench", "dev"]
+#features = ["models","bench","autogluon","extra","hpo","dev"]
 
 [tool.hatch.build.targets.sdist]
 package = ['pytabkit']
 
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "1.3.0"
+__version__ = "1.4.0"
@@ -33,7 +33,7 @@ def run(self, task_package: TaskPackage, logger: Logger, assigned_resources: Nod
         :param tmp_folders: Temporary folders, one for each train/test split, to save temporary data to.
         :return: A dictionary of lists of ResultManager objects.
             The dict key is the predict params name, which is used as a suffix for the alg_name,
-             and each list contains ResultManagers for each train/test split.
+            and each list contains ResultManagers for each train/test split.
         """
         raise NotImplementedError()
 
 
@@ -18,7 +18,8 @@
 from pytabkit.bench.run.results import ResultManager
 from pytabkit.models.alg_interfaces.other_interfaces import RFSubSplitInterface, SklearnMLPSubSplitInterface, \
     KANSubSplitInterface, GrandeSubSplitInterface, GBTSubSplitInterface, RandomParamsRFAlgInterface, \
-    TabPFN2SubSplitInterface, TabICLSubSplitInterface
+    TabPFN2SubSplitInterface, TabICLSubSplitInterface, RandomParamsExtraTreesAlgInterface, RandomParamsKNNAlgInterface, \
+    ExtraTreesSubSplitInterface, KNNSubSplitInterface, RandomParamsLinearModelAlgInterface, LinearModelSubSplitInterface
 from pytabkit.bench.scheduling.resources import NodeResources
 from pytabkit.models.alg_interfaces.alg_interfaces import AlgInterface, MultiSplitWrapperAlgInterface
 from pytabkit.models.alg_interfaces.base import SplitIdxs, RequiredResources
@@ -317,7 +318,7 @@ def _create_alg_interface_impl(self, task_package: TaskPackage) -> AlgInterface:
         n_splits = len(task_package.split_infos)
         return MultiSplitWrapperAlgInterface(
             single_split_interfaces=[self.create_single_alg_interface(n_cv, task_type)
-                                     for i in range(n_splits)])
+                                     for i in range(n_splits)], **self.config)
 
 
 class SubSplitInterfaceWrapper(MultiSplitAlgInterfaceWrapper):
@@ -333,7 +334,7 @@ def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
     def create_single_alg_interface(self, n_cv: int, task_type: TaskType) \
             -> AlgInterface:
         return SingleSplitWrapperAlgInterface([self.create_sub_split_interface(task_type)
-                                               for i in range(n_cv)])
+                                               for i in range(n_cv)], **self.config)
 
 
 class NNInterfaceWrapper(AlgInterfaceWrapper):
@@ -455,6 +456,21 @@ def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
         return RFSubSplitInterface(**self.config)
 
 
+class ExtraTreesInterfaceWrapper(SubSplitInterfaceWrapper):
+    def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
+        return ExtraTreesSubSplitInterface(**self.config)
+
+
+class KNNInterfaceWrapper(SubSplitInterfaceWrapper):
+    def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
+        return KNNSubSplitInterface(**self.config)
+
+
+class LinearModelInterfaceWrapper(SubSplitInterfaceWrapper):
+    def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
+        return LinearModelSubSplitInterface(**self.config)
+
+
 class GBTInterfaceWrapper(SubSplitInterfaceWrapper):
     def create_sub_split_interface(self, task_type: TaskType) -> AlgInterface:
         return GBTSubSplitInterface(**self.config)
@@ -544,3 +560,21 @@ class RandomParamsRFInterfaceWrapper(AlgInterfaceWrapper):
     def __init__(self, model_idx: int, **config):
         # model_idx should be the random search iteration (i.e. start from zero)
         super().__init__(RandomParamsRFAlgInterface, model_idx=model_idx, **config)
+
+
+class RandomParamsExtraTreesInterfaceWrapper(AlgInterfaceWrapper):
+    def __init__(self, model_idx: int, **config):
+        # model_idx should be the random search iteration (i.e. start from zero)
+        super().__init__(RandomParamsExtraTreesAlgInterface, model_idx=model_idx, **config)
+
+
+class RandomParamsKNNInterfaceWrapper(AlgInterfaceWrapper):
+    def __init__(self, model_idx: int, **config):
+        # model_idx should be the random search iteration (i.e. start from zero)
+        super().__init__(RandomParamsKNNAlgInterface, model_idx=model_idx, **config)
+
+
+class RandomParamsLinearModelInterfaceWrapper(AlgInterfaceWrapper):
+    def __init__(self, model_idx: int, **config):
+        # model_idx should be the random search iteration (i.e. start from zero)
+        super().__init__(RandomParamsLinearModelAlgInterface, model_idx=model_idx, **config)
@@ -7,6 +7,8 @@ class TaskSource:
     OPENML_CLASS_BIN_EXTRA = 'openml-class-bin-extra'
     OPENML_REGRESSION = 'openml-reg'
     AUTOML_CLASS_SMALL = 'automl-class-small'
+    TABARENA_CLASS = 'tabarena-class'
+    TABARENA_REG = 'tabarena-reg'
     CUSTOM = 'custom'
 
 
 
@@ -67,12 +67,21 @@ def select_eval_modes(self, eval_modes: List[Tuple[str, str, str]]) -> List[Tupl
             modes = [mode for mode in eval_modes if mode[0] == val]
             if len(modes) > 0:
                 # maximize n_models
-                idx = np.argmax([int(mode[1]) for mode in modes])
-                idx_min = np.argmin([int(mode[1]) for mode in modes])
-                mode = modes[idx]
-                result.append((f' [{name}-{mode[1]}]', mode))
-                if idx_min != idx:
-                    result.append((f' [{name}-{modes[idx_min][1]}]', modes[idx_min]))
+                bag_sizes = [int(mode[1]) for mode in modes]
+                max_cv = np.max(bag_sizes)
+                min_cv = np.min(bag_sizes)
+                bag_sizes = list({max_cv, min_cv})  # only have one element if they're equal
+
+                for bag_size in bag_sizes:
+                    # make sure to always select model '0' to avoid non-determinism
+                    result.append((f' [{name}-{bag_size}]', (val, str(bag_size), '0')))
+
+                # idx = np.argmax([int(mode[1]) for mode in modes])
+                # idx_min = np.argmin([int(mode[1]) for mode in modes])
+                # mode = modes[idx]
+                # result.append((f' [{name}-{mode[1]}]', mode))
+                # if idx_min != idx:
+                #     result.append((f' [{name}-{modes[idx_min][1]}]', modes[idx_min]))
 
         return result
Original file line number	Diff line number	Diff line change
`@@ -2,4 +2,4 @@`
`2`	`2`	`#`
`3`	`3`	`# SPDX-License-Identifier: Apache-2.0`
`4`	`4`
`5`		`-__version__ = "1.3.0"`
	`5`	`+__version__ = "1.4.0"`