Commit a235758 (1 parent: 5d9e26a)

v1.6.1: updated ensembling, time limit for HPO

11 files changed: +235 -16 lines


.github/workflows/testing.yml

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [windows-latest, ubuntu-latest, macos-latest]
-        python-version: ['3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4

README.md

Lines changed: 8 additions & 3 deletions

@@ -19,8 +19,8 @@ on our benchmarks.
 
 - **To get the best possible results**:
   - Generally we recommend AutoGluon for the best possible results,
-    though it does not include all the models from pytabkit.
-    It will probably include RealMLP in the upcoming 1.4 version.
+    though it does not include all the models from pytabkit. AutoGluon 1.4
+    includes RealMLP (though not in a default configuration) and TabM (in the "extreme" preset for <= 30K samples).
   - To get the best possible results from `pytabkit`,
     we recommend using
     `Ensemble_HPO_Classifier(n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True, n_hpo_steps=50)`
@@ -32,7 +32,7 @@ on our benchmarks.
   also with `val_metric_name` as above, or the corresponding `Regressor`.
 - **Models**: [TabArena](https://github.com/AutoGluon/tabrepo)
   also includes some newer models like RealMLP and TabM
-  with more general preprocessing (missing numericals, text, etc.),
+  with more general preprocessing (missing numericals, text, etc.),
   as well as very good boosted tree implementations.
   `pytabkit` is currently still easier to use
   and supports vectorized cross-validation for RealMLP,
@@ -196,6 +196,11 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.6.1:
+  - For `n_ens>1`, changed the default behavior for classification to averaging probabilities instead of logits.
+    This can be reverted by setting `ens_av_before_softmax=True`.
+  - Implemented time limit for HPO/ensemble methods through `time_limit_s` parameter.
+  - Support `torch>=2.6` and Python 3.13.
 - v1.6.0:
   - Added support for other training losses in TabM through the `train_metric_name` parameter,
     for example, (multi)quantile regression via `train_metric_name='multi_pinball(0.05,0.95)'`.
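
For clarity on the v1.6.1 ensembling change: with `n_ens>1`, classification members are now combined by averaging probabilities (after softmax) rather than averaging logits (before softmax). A minimal numpy sketch of the two combination rules (my own illustration, not code from the commit):

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)  # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

# logits of two ensemble members for one 3-class sample
logits = np.array([[2.0, 0.0, -1.0],
                   [0.0, 2.5, -0.5]])

p_new = softmax(logits).mean(axis=0)  # v1.6.1 default: average probabilities
p_old = softmax(logits.mean(axis=0))  # old behavior (ens_av_before_softmax=True)
print(p_new, p_old)                   # the two rules generally disagree
```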

docs/requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -28,7 +28,6 @@ skorch>=0.15
 sphinx>=7.0
 sphinx_rtd_theme>=2.0
 torch>=2.0
-torch>=2.0,<2.6
 torchmetrics>=1.2.1
 tqdm
 tueplots>=0.0.12

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -22,6 +22,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
     "License :: OSI Approved :: Apache Software License",
@@ -44,7 +45,7 @@ dependencies = [
 models = [
     # use <2.6 for now since it can run into pickling issues with skorch if the skorch version is too old
     # see https://github.com/skorch-dev/skorch/commit/be93b7769d61aa22fb928d2e89e258c629bfeaf9
-    "torch>=2.0,<2.6",
+    "torch>=2.0",
     "xgboost>=2.0",
     "catboost>=1.2",
     "lightgbm>=4.1",
@@ -69,8 +70,8 @@ models = [
     "pyyaml>=5.0",
     "msgpack>=1.0",
     # apparently msgpack_numpy fixed some bug in using numpy arrays in msgpack?
-    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions# in theory we shouldn't be using if for numpy arrays at the moment, not sure why the need for this occured
-    # maybe it occured because we tried to save hyperparameters that were numpy scalars instead of python scalars
+    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions# in theory we shouldn't be using if for numpy arrays at the moment, not sure why the need for this occurred
+    # maybe it occurred because we tried to save hyperparameters that were numpy scalars instead of python scalars
     # "msgpack_numpy>=0.4",
 ]
 autogluon = [

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "1.6.0"
+__version__ = "1.6.1"

pytabkit/models/alg_interfaces/ensemble_interfaces.py

Lines changed: 12 additions & 0 deletions

@@ -1,4 +1,5 @@
 import copy
+import time
 from pathlib import Path
 from typing import List, Optional, Dict
 
@@ -92,7 +93,13 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         n_caruana_steps = self.config.get('n_caruana_steps', 40)  # default value is taken from TabRepo paper (IIRC)
 
         y_preds_oob_list = []
+
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        start_time = time.time()
+
         for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
+            if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s:
+                break
             with alg_ctx as alg_interface:
                 y_preds = alg_interface.predict(ds)
                 # get out-of-bag predictions
@@ -231,7 +238,12 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         best_alg_loss = np.inf
         best_sub_fit_params = None
 
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        start_time = time.time()
+
         for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
+            if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s:
+                break
             with alg_ctx as alg_interface:
                 sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
                                    tmp_folders]
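
The break condition is a simple budget extrapolation: after `alg_idx` members have been fitted, `(alg_idx+1)/alg_idx * elapsed` estimates the total time once one more member finishes, assuming roughly equal per-member cost; the loop stops early if that estimate exceeds `time_limit_s`, and the `alg_idx > 0` guard ensures at least one member is always fitted. A standalone sketch of the same heuristic (a hypothetical helper, not from the commit):

```python
import time
from typing import Callable, List, Optional

def fit_within_budget(fit_fns: List[Callable[[], object]],
                      time_limit_s: Optional[float] = None) -> list:
    """Run fitting functions in order; stop early when fitting one more
    is predicted to exceed the budget (uniform per-member cost assumed)."""
    start = time.time()
    results = []
    for i, fit in enumerate(fit_fns):
        elapsed = time.time() - start
        # after i members, (i + 1) / i * elapsed predicts the total after member i + 1
        if i > 0 and time_limit_s is not None and (i + 1) / i * elapsed > time_limit_s:
            break
        results.append(fit())
    return results
```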

pytabkit/models/alg_interfaces/nn_interfaces.py

Lines changed: 182 additions & 1 deletion

@@ -340,7 +340,8 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
         assert self.hpo_space_name in ['default', 'clr', 'moresigma', 'moresigmadim', 'moresigmadimreg',
                                        'moresigmadimsize', 'moresigmadimlr', 'probclass', 'probclass-mlp', 'large',
                                        'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10',
-                                       'tabarena']
+                                       'tabarena', 'alt11', 'alt12', 'alt13', 'alt14', 'alt15', 'alt16', 'alt17',
+                                       'alt18']
         rng = np.random.default_rng(seed=seed)
 
         if self.hpo_space_name == 'probclass-mlp':
@@ -660,6 +661,186 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
             params['plr_hidden_2'] = 4
             params['n_epochs'] = 256
             params['use_early_stopping'] = False
+        elif self.hpo_space_name == 'alt11':
+            # tabarena without the large configs
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+            }
+        elif self.hpo_space_name == 'alt12':
+            # alt11 with n_hidden_layers=1 in the search space
+            params = {
+                'n_hidden_layers': rng.integers(1, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+            }
+        elif self.hpo_space_name == 'alt13':
+            # alt11 with more categorical hyperparameters
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+            }
+        elif self.hpo_space_name == 'alt14':
+            # alt13 with weight_init_mode='normal'
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'weight_init_mode': 'normal',
+            }
+        elif self.hpo_space_name == 'alt15':
+            # alt13 with tuning momentum (beta1)
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'mom': 1.0 - np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),  # tune in [0.7, 0.98]
+            }
+        elif self.hpo_space_name == 'alt16':
+            # alt13 with n_ens=2
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 2,
+                'ens_av_before_softmax': True,
+            }
+        elif self.hpo_space_name == 'alt17':
+            # alt13 with n_ens=4
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 4,
+                'ens_av_before_softmax': True,
+            }
+        elif self.hpo_space_name == 'alt18':
+            # alt17 but with averaging after softmax
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 4,
+                'ens_av_before_softmax': False,
+            }
 
         # print(f'{params=}')
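
All eight new spaces sample their continuous hyperparameters log-uniformly via the same `np.exp(rng.uniform(np.log(low), np.log(high)))` idiom; alt16 to alt18 additionally vary only `n_ens` and `ens_av_before_softmax`, probing the averaging change from this release. As a reading aid, a small helper that reproduces the sampling pattern (hypothetical, not part of the commit):

```python
import numpy as np

def log_uniform(rng: np.random.Generator, low: float, high: float) -> float:
    # uniform in log-space, then exponentiate: samples are denser near `low`
    return float(np.exp(rng.uniform(np.log(low), np.log(high))))

rng = np.random.default_rng(seed=0)
lr = log_uniform(rng, 2e-2, 3e-1)            # matches 'lr' in alt11-alt18
sq_mom = 1.0 - log_uniform(rng, 5e-3, 5e-2)  # matches 'sq_mom' (kept close to 1)
```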

pytabkit/models/hyper_opt/hyper_optimizers.py

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,6 @@
 import time
 from pathlib import Path
-from typing import Callable, Tuple, Any, Dict, Union
+from typing import Callable, Tuple, Any, Dict, Union, Optional
 
 import numpy as np
 
@@ -156,7 +156,8 @@ def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> N
         else:
             raise ValueError(f'Unknown hyperopt_algo name "{algo_name}"')
         fn = HyperoptOptimizer.HyperoptFuncWrapper(f, self.fixed_params)
-        _ = hyperopt.fmin(fn=fn,
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        _ = hyperopt.fmin(fn=fn, timeout=None if time_limit_s is None else int(time_limit_s),
                           space=self.space, algo=algo, max_evals=self.n_hyperopt_steps, trials=trials,
                           rstate=np.random.default_rng(seed=seed), verbose=False, show_progressbar=False)
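
For the HPO side, `time_limit_s` is read from the config and forwarded to the `timeout` argument of `hyperopt.fmin`, which hyperopt interprets in seconds. A usage sketch with the estimator recommended in the README and synthetic data; the top-level import and the assumption that the constructor forwards `time_limit_s` into the model config are taken from the README and this diff, not verified here:

```python
import numpy as np
from pytabkit import Ensemble_HPO_Classifier  # top-level import as in the README

X = np.random.default_rng(0).standard_normal((500, 10))
y = (X[:, 0] > 0).astype(int)  # simple synthetic binary target

clf = Ensemble_HPO_Classifier(
    n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True,
    n_hpo_steps=50,
    time_limit_s=3600.0,  # new in v1.6.1: cap HPO/ensembling at one hour
)
clf.fit(X, y)
```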
