
Commit 630470d (1 parent: 8a03aeb)

v1.4.2: updates to HPO and ensembling

File tree

9 files changed: +263 −74 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -1,12 +1,16 @@
 *.pyc
 *.pdf
 *.zip
+*.ckpt

 experiments/*/
+experiments/trace.json
 !experiments/meta_hpo
 !experiments/prototypes
 public_export
 dist
+files
+plots

 docs/build
 docs/source/modules.rst

README.md

Lines changed: 14 additions & 3 deletions
@@ -80,12 +80,13 @@ Our ML models are available in up to three variants, all with best-epoch selecti

 - library defaults (D)
 - our tuned defaults (TD)
-- random search hyperparameter optimization (HPO), sometimes also tree parzen estimator (HPO-TPE)
+- random search hyperparameter optimization (HPO),
+  sometimes also tree parzen estimator (HPO-TPE) or weighted ensembling (Ensemble)

 We provide the following ML models:

-- **RealMLP** (TD, HPO): Our new neural net models with tuned defaults (TD)
-  or random search hyperparameter optimization (HPO)
+- **RealMLP** (TD, HPO, Ensemble): Our new neural net models with tuned defaults (TD),
+  random search hyperparameter optimization (HPO), or ensembling
 - **XGB**, **LGBM**, **CatBoost** (D, TD, HPO, HPO-TPE): Interfaces for gradient-boosted
   tree libraries XGBoost, LightGBM, CatBoost
 - **MLP**, **ResNet**, **FTT** (D, HPO): Models
@@ -170,6 +171,16 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html

 ## Releases (see git tags)

+- v1.4.2:
+  - fixed handling of custom `val_metric_name` in HPO models and `Ensemble_TD_Regressor`.
+  - if `tmp_folder` is specified in HPO models,
+    save each model to disk immediately instead of holding all of them in memory.
+    This can considerably reduce RAM/VRAM usage.
+    In this case, pickled HPO models will still rely on the models stored in the `tmp_folder`.
+  - We now provide `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor`,
+    which use weighted ensembling and usually perform better than HPO
+    (but have slower inference time). We recommend using the new `hpo_space_name='tabarena'`
+    for best results.
 - v1.4.1:
   - moved dill to optional dependencies
   - updated TabM code to a newer version:
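
To illustrate the interface described in the v1.4.2 notes, here is a minimal usage sketch (not part of this commit; it assumes `RealMLP_Ensemble_Classifier` is importable alongside the existing HPO interfaces and accepts `hpo_space_name` and `tmp_folder` keyword arguments as described above):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# assumed import location, analogous to RealMLP_HPO_Classifier
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_Ensemble_Classifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# tmp_folder stores the fitted sub-models on disk instead of keeping them in RAM;
# it must not be deleted while the (possibly pickled) estimator is still in use.
clf = RealMLP_Ensemble_Classifier(hpo_space_name='tabarena', tmp_folder='tmp/realmlp_ensemble')
clf.fit(X_train, y_train)
print(clf.predict(X_test)[:5])
```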

docs/source/models/01_sklearn_interfaces.rst

Lines changed: 14 additions & 5 deletions
@@ -9,17 +9,24 @@ and categorical features in the ``fit`` method:

 .. autofunction:: pytabkit.models.sklearn.sklearn_base.AlgInterfaceEstimator.fit

+Important: For HPO and ensemble interfaces, it is recommended to set ``tmp_folder``
+to allow these methods to store fitted models on disk instead of holding them in RAM.
+This means that ``tmp_folder`` should not be deleted while the associated interface
+still exists (even when it is pickled).

 RealMLP
 -------

-For RealMLP, we provide TD (tuned default)
-and HPO (hyperparameter optimization with random search) variants:
+For RealMLP, we provide TD (tuned default),
+HPO (hyperparameter optimization with random search),
+and Ensemble (weighted ensembling of random search configurations) variants:

 - RealMLP_TD_Classifier
 - RealMLP_TD_Regressor
 - RealMLP_HPO_Classifier
 - RealMLP_HPO_Regressor
+- RealMLP_Ensemble_Classifier
+- RealMLP_Ensemble_Regressor

 While the TD variants have good defaults,
 they provide the option to override any hyperparameters.
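
For example, overriding individual hyperparameters of a TD model might look like the following sketch (not from the docs; parameter names such as `n_epochs` and `p_drop` are assumptions taken from the HPO search-space code elsewhere in this commit):

```python
from sklearn.datasets import load_iris

from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier

X, y = load_iris(return_X_y=True)

# start from the tuned defaults, but override a few hyperparameters
clf = RealMLP_TD_Classifier(n_epochs=512, p_drop=0.2)
clf.fit(X, y)
print(clf.predict(X[:5]))
```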
@@ -32,7 +39,7 @@ and ``verbosity`` may be ignored by some of the methods.

 .. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_TD_Classifier.__init__

-For the HPO variants, we currently only provide few options:
+For the HPO and Ensemble variants, we currently only provide a few options:

 .. autofunction:: pytabkit.models.sklearn.sklearn_interfaces.RealMLP_HPO_Classifier.__init__

@@ -74,8 +81,8 @@ with our scikit-learn interfaces,
 although in this case the validation sets are not used.
 The respective classes are called
 ``RF_SKL_Classifier`` and ``MLP_SKL_Classifier`` etc.
-We also provide our ``Ensemble_TD_Classifier``,
-a weighted ensemble of our TD models (and similar for regression).
+We also provide our ``Ensemble_TD_Classifier`` and ``Ensemble_HPO_Classifier``,
+weighted ensembles of our TD / HPO models (and similar for regression).

 ..
     test
@@ -97,6 +104,8 @@ can be saved using pickle-like modules.
 With standard pickling,
 a model trained on a GPU will be restored to use the same GPU,
 and fail to load if the GPU is not present.
+(Note that dill fails to save torch models in newer torch versions,
+while pickle can still save them.)

 The following code allows to load GPU-trained models to the CPU,
 but fails to run predict() due to pytorch-lightning device issues.
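
As a reference point for the pickling discussion, a minimal round-trip could look as follows (a sketch only; it uses a CPU-trained model to avoid the GPU-restore issue mentioned above, and the `device` argument is an assumption):

```python
import pickle

from sklearn.datasets import load_iris
from pytabkit.models.sklearn.sklearn_interfaces import RealMLP_TD_Classifier

X, y = load_iris(return_X_y=True)

clf = RealMLP_TD_Classifier(device='cpu')  # 'device' argument assumed; CPU-trained models restore anywhere
clf.fit(X, y)

with open('realmlp_td.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open('realmlp_td.pkl', 'rb') as f:
    clf_restored = pickle.load(f)

print(clf_restored.predict(X)[:5])
```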

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-__version__ = "1.4.1"
+__version__ = "1.4.2"

pytabkit/bench/run/results.py

Lines changed: 8 additions & 0 deletions
@@ -75,6 +75,14 @@ def load(path: Path, load_other: bool = True, load_preds: bool = True):
         rm.metrics_dict = utils.deserialize(path / 'metrics.yaml', use_yaml=True)
         if load_other:
             rm.other_dict = utils.deserialize(path / 'other.msgpack.gz', use_msgpack=True, compressed=True)
+            for mode in ['cv', 'refit']:
+                if mode in rm.other_dict and 'y_preds' in rm.other_dict[mode]:
+                    # other_dict was created by old code and still contains y_preds
+                    if mode == 'cv':
+                        rm.y_preds_cv = rm.other_dict[mode]['y_preds']
+                    else:
+                        rm.y_preds_refit = rm.other_dict[mode]['y_preds']
+
         if load_preds:
             if utils.existsFile(path / 'y_preds_cv.npz'):
                 rm.y_preds_cv = np.load(path / 'y_preds_cv.npz')['y_preds']

pytabkit/models/alg_interfaces/ensemble_interfaces.py

Lines changed: 55 additions & 38 deletions
@@ -1,16 +1,17 @@
+import copy
 from pathlib import Path
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict

 import numpy as np
 import torch

-from pytabkit.models import utils
 from pytabkit.models.alg_interfaces.alg_interfaces import SingleSplitAlgInterface, AlgInterface
 from pytabkit.models.alg_interfaces.base import SplitIdxs, InterfaceResources, RequiredResources
 from pytabkit.models.data.data import DictDataset, TaskType
 from pytabkit.models.torch_utils import cat_if_necessary
 from pytabkit.models.training.logging import Logger
 from pytabkit.models.training.metrics import Metrics
+from pytabkit.models.utils import ObjectLoadingContext


 class WeightedPrediction:
@@ -28,25 +29,6 @@ def predict_for_weights(self, weights: np.ndarray):
         return weighted_sum


-class ObjectLoadingContext:
-    def __init__(self, obj: Any, filename: Optional[Union[str, Path]] = None):
-        self.obj = obj
-        self.filename = filename
-        self.saved = False
-
-    def __enter__(self) -> Any:
-        # use pickle since it works better with torch than dill
-        if self.saved:
-            self.obj = utils.deserialize(self.filename, use_pickle=True)
-        return self.obj
-
-    def __exit__(self, type, value, traceback) -> None:
-        if self.filename is not None:
-            utils.serialize(self.filename, self.obj, use_pickle=True)
-            self.saved = True
-            del self.obj
-
-
 class CaruanaEnsembleAlgInterface(SingleSplitAlgInterface):
     """
     Following a simple variant of Caruana et al. (2004), "Ensemble selection from libraries of models"
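
The `ObjectLoadingContext` helper removed here was moved to `pytabkit.models.utils` (see the import change above). A rough usage sketch of the pattern, with the behaviour inferred from the removed code rather than from separate documentation: on exit the wrapped object is pickled to `filename` and released from memory, and the next enter reloads it from disk.

```python
from pathlib import Path

from pytabkit.models.utils import ObjectLoadingContext

# wrap some object that should not stay in RAM between uses
ctx = ObjectLoadingContext({'weights': [1, 2, 3]}, filename=Path('model_0.pkl'))

with ctx as obj:            # first enter: returns the in-memory object
    obj['weights'].append(4)
# on exit: the object is pickled to model_0.pkl and dropped from memory

with ctx as obj:            # later enters: reloaded from model_0.pkl
    print(obj['weights'])   # [1, 2, 3, 4]
```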
@@ -65,10 +47,15 @@ def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = N

     def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources,
             logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None:
+        assert len(idxs_list) == 1
+
+        # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory
         tmp_folder = tmp_folders[0]
         self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for
                               i, ai in enumerate(self.alg_interfaces)]
-        self.alg_interfaces = None  # allow not holding all of them later, to free GPU memory
+        # store copies here, but the ones that will actually be trained are in alg_contexts_
+        # this means the fitted models do not have to be held in RAM all the time
+        self.alg_interfaces = copy.deepcopy(self.alg_interfaces)

         sub_fit_params = []

@@ -94,7 +81,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         if val_metric_name is None:
             val_metric_name = Metrics.default_val_metric_name(task_type=self.task_type)

-        n_caruana_steps = self.config.get('n_caruana_steps', 40)  # default value is taken from TaskRepo paper (IIRC)
+        n_caruana_steps = self.config.get('n_caruana_steps', 40)  # default value is taken from TabRepo paper (IIRC)

         y_preds_oob_list = []
         for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
@@ -114,6 +101,8 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

         wp = WeightedPrediction(y_preds_oob_list, self.task_type)

+        allow_negative_weights = self.config.get('allow_negative_weights', False)
+
         for step_idx in range(n_caruana_steps):
             best_step_weights = None
             best_step_loss = np.inf
@@ -129,6 +118,21 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

                 weights[weight_idx] -= 1

+                # negative weights option:
+                # check sum(weights) >= 2, allowing for floating-point errors
+                if allow_negative_weights and np.sum(weights) >= 1.5:
+                    weights[weight_idx] -= 1
+
+                    y_pred_oob = wp.predict_for_weights(weights)
+                    loss = Metrics.apply(y_pred_oob, y_oob, val_metric_name).item()
+                    # print(f'{weights=}, {loss=}')
+                    if loss < best_step_loss:
+                        best_step_loss = loss
+                        best_step_weights = np.copy(weights)
+
+                    weights[weight_idx] += 1
+
             if best_step_loss < best_loss:
                 best_loss = best_step_loss
                 best_weights = np.copy(best_step_weights)
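
The loop above is a variant of greedy ensemble selection (Caruana et al. 2004). For readers unfamiliar with the method, here is a self-contained numpy sketch of the basic non-negative version, using a squared-error loss in place of the library's `val_metric_name` machinery (illustrative only, not the pytabkit implementation):

```python
import numpy as np

def greedy_ensemble_weights(y_preds_oob: list, y_oob: np.ndarray, n_steps: int = 40) -> np.ndarray:
    """Greedy Caruana-style selection: repeatedly add (with replacement) the model
    whose inclusion most improves the validation loss of the averaged prediction."""
    preds = np.stack(y_preds_oob, axis=0)           # shape (n_models, n_samples)
    weights = np.zeros(len(y_preds_oob), dtype=np.int64)
    best_weights, best_loss = np.copy(weights), np.inf

    for _ in range(n_steps):
        step_best_loss, step_best_idx = np.inf, 0
        for i in range(len(y_preds_oob)):
            weights[i] += 1
            y_pred = (weights @ preds) / weights.sum()   # weighted average prediction
            loss = np.mean((y_pred - y_oob) ** 2)        # squared-error loss as a stand-in
            if loss < step_best_loss:
                step_best_loss, step_best_idx = loss, i
            weights[i] -= 1
        weights[step_best_idx] += 1                      # greedily add the most helpful model
        if step_best_loss < best_loss:
            best_loss, best_weights = step_best_loss, np.copy(weights)

    return best_weights / best_weights.sum()

# toy usage: three "models" with different noise levels predicting the same target
rng = np.random.default_rng(0)
y = rng.normal(size=100)
preds = [y + rng.normal(scale=s, size=100) for s in (0.1, 0.5, 1.0)]
print(greedy_ensemble_weights(preds, y))  # most weight goes to the most accurate model
```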
@@ -179,13 +183,22 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
             logger: Logger, tmp_folders: List[Optional[Path]], name: str) -> None:
         assert len(idxs_list) == 1

+        # if tmp_folders is specified, then models will be saved there instead of holding all of them in memory
+        tmp_folder = tmp_folders[0]
+        self.alg_contexts_ = [ObjectLoadingContext(ai, None if tmp_folder is None else tmp_folder / f'model_{i}') for
+                              i, ai in enumerate(self.alg_interfaces)]
+        # store copies here, but the ones that will actually be trained are in alg_contexts_
+        # this means the fitted models do not have to be held in RAM all the time
+        self.alg_interfaces = copy.deepcopy(self.alg_interfaces)
+
         if self.fit_params is not None:
             # this is the refit stage, there is no validation data set to determine the best model on,
             # instead the best model index is already in fit_params
             best_alg_idx = self.fit_params[0]['best_alg_idx']
             sub_tmp_folders = [tmp_folder / str(best_alg_idx) if tmp_folder is not None else None for tmp_folder in
                                tmp_folders]
-            self.alg_interfaces[best_alg_idx].fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders,
+            with self.alg_contexts_[best_alg_idx] as alg_interface:
+                alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders,
                                   name + f'sub-alg-{best_alg_idx}')

             return
@@ -206,28 +219,32 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

         best_alg_idx = 0
         best_alg_loss = np.inf
+        best_sub_fit_params = None

-        for alg_idx, alg_interface in enumerate(self.alg_interfaces):
-            sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
-                               tmp_folders]
-            alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}')
-            y_preds = alg_interface.predict(ds)
-            # get out-of-bag predictions
-            y_pred_oob = cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]]
-                                           for j in range(idxs_list[0].val_idxs.shape[0])], dim=0)
-            loss = Metrics.apply(y_pred_oob, y_oob, val_metric_name).item()
-            if loss < best_alg_loss:
-                best_alg_loss = loss
-                best_alg_idx = alg_idx
+        for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
+            with alg_ctx as alg_interface:
+                sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
+                                   tmp_folders]
+                alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}')
+                y_preds = alg_interface.predict(ds)
+                # get out-of-bag predictions
+                y_pred_oob = cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]]
+                                               for j in range(idxs_list[0].val_idxs.shape[0])], dim=0)
+                loss = Metrics.apply(y_pred_oob, y_oob, val_metric_name).item()
+                if loss < best_alg_loss:
+                    best_alg_loss = loss
+                    best_alg_idx = alg_idx
+                    best_sub_fit_params = alg_interface.get_fit_params()[0]

         self.fit_params = [dict(best_alg_idx=best_alg_idx,
-                                sub_fit_params=self.alg_interfaces[best_alg_idx].get_fit_params()[0])]
+                                sub_fit_params=best_sub_fit_params)]
         logger.log(2, f'Best algorithm has index {best_alg_idx}')
         logger.log(2, f'Algorithm selection fit parameters: {self.fit_params[0]}')

     def predict(self, ds: DictDataset) -> torch.Tensor:
         alg_idx = self.fit_params[0]['best_alg_idx']
-        return self.alg_interfaces[alg_idx].predict(ds)
+        with self.alg_contexts_[alg_idx] as alg_interface:
+            return alg_interface.predict(ds)

     def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int,
                                split_seeds: List[int], n_train: int) -> RequiredResources:

pytabkit/models/alg_interfaces/nn_interfaces.py

Lines changed: 40 additions & 1 deletion
@@ -337,7 +337,8 @@ def __init__(self, is_classification: bool, hpo_space_name: str = 'default', **c
     def sample_params(self, seed: int) -> Dict[str, Any]:
         assert self.hpo_space_name in ['default', 'clr', 'moresigma', 'moresigmadim', 'moresigmadimreg',
                                        'moresigmadimsize', 'moresigmadimlr', 'probclass', 'probclass-mlp', 'large',
-                                       'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10']
+                                       'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10',
+                                       'tabarena']
         rng = np.random.default_rng(seed=seed)

         if self.hpo_space_name == 'probclass-mlp':
@@ -620,6 +621,43 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
                 'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
                 'p_drop_sched': 'flat_cos',
             }
+        elif self.hpo_space_name == 'tabarena':
+            # common search space
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice(["auto", True]),  # use label smoothing (will be ignored for regression)
+            }
+
+            if rng.uniform(0.0, 1.0) > 0.5:
+                # large configs
+                params['plr_hidden_1'] = rng.choice([8, 16, 32, 64])
+                params['plr_hidden_2'] = rng.choice([8, 16, 32, 64])
+                params['n_epochs'] = rng.choice([256, 512])
+                params['use_early_stopping'] = True
+
+                # set in the defaults of RealMLP in TabArena
+                params['early_stopping_multiplicative_patience'] = 3
+                params['early_stopping_additive_patience'] = 40
+            else:
+                # default values, used here to always set the same set of parameters
+                params['plr_hidden_1'] = 16
+                params['plr_hidden_2'] = 4
+                params['n_epochs'] = 256
+                params['use_early_stopping'] = False

         # print(f'{params=}')

@@ -651,6 +689,7 @@ def _create_sub_interface(self, ds: DictDataset, seed: int):
         # params = utils.update_dict(self.fit_params[0], self.config)
         if 'n_epochs' in self.config:
             params['n_epochs'] = self.config['n_epochs']
+        self.fit_params[0] = params
         return NNAlgInterface(fit_params=None, **params)

     def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources,
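
A side note on the `'tabarena'` search space added above: expressions of the form `np.exp(rng.uniform(np.log(a), np.log(b)))` draw samples log-uniformly between `a` and `b`, i.e. uniformly across orders of magnitude. A quick self-contained check (illustrative only):

```python
import numpy as np

rng = np.random.default_rng(seed=0)

# log-uniform sample between 1e-3 and 5e-2, as used for 'wd' in the 'tabarena' space:
# sample uniformly on [log(a), log(b)], then exponentiate back
samples = np.exp(rng.uniform(np.log(1e-3), np.log(5e-2), size=100_000))

print(samples.min(), samples.max())   # all values lie within [1e-3, 5e-2]
print(np.median(samples))             # close to sqrt(1e-3 * 5e-2) ≈ 7.1e-3, the geometric mean of the bounds
```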
