Commit a235758 (1 parent: 5d9e26a)

v1.6.1: updated ensembling, time limit for HPO

11 files changed: +235 -16 lines


.github/workflows/testing.yml

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [windows-latest, ubuntu-latest, macos-latest]
-        python-version: ['3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4

README.md

Lines changed: 8 additions & 3 deletions

@@ -19,8 +19,8 @@ on our benchmarks.
 
 - **To get the best possible results**:
   - Generally we recommend AutoGluon for the best possible results,
-    though it does not include all the models from pytabkit.
-    It will probably include RealMLP in the upcoming 1.4 version.
+    though it does not include all the models from pytabkit. AutoGluon 1.4
+    includes RealMLP (though not in a default configuration) and TabM (in the "extreme" preset for <= 30K samples).
   - To get the best possible results from `pytabkit`,
     we recommend using
     `Ensemble_HPO_Classifier(n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True, n_hpo_steps=50)`
@@ -32,7 +32,7 @@ on our benchmarks.
   also with `val_metric_name` as above, or the corresponding `Regressor`.
 - **Models**: [TabArena](https://github.com/AutoGluon/tabrepo)
   also includes some newer models like RealMLP and TabM
-  with more general preprocessing (missing numericals, text, etc.),
+  with more general preprocessing (missing numericals, text, etc.),
   as well as very good boosted tree implementations.
   `pytabkit` is currently still easier to use
   and supports vectorized cross-validation for RealMLP,
@@ -196,6 +196,11 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.6.1:
+  - For `n_ens>1`, changed the default behavior for classification to averaging probabilities instead of logits.
+    This can be reverted by setting `ens_av_before_softmax=True`.
+  - Implemented time limit for HPO/ensemble methods through `time_limit_s` parameter.
+  - Support `torch>=2.6` and Python 3.13.
 - v1.6.0:
   - Added support for other training losses in TabM through the `train_metric_name` parameter,
     for example, (multi)quantile regression via `train_metric_name='multi_pinball(0.05,0.95)'`.
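
For clarity on the v1.6.1 ensembling change: with `n_ens>1`, classification members are now combined by averaging probabilities (after softmax) rather than averaging logits (before softmax). A minimal numpy sketch of the two combination rules (my own illustration, not code from the commit):

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)  # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

# logits of two ensemble members for one 3-class sample
logits = np.array([[2.0, 0.0, -1.0],
                   [0.0, 2.5, -0.5]])

p_new = softmax(logits).mean(axis=0)  # v1.6.1 default: average probabilities
p_old = softmax(logits.mean(axis=0))  # old behavior (ens_av_before_softmax=True)
print(p_new, p_old)                   # the two rules generally disagree
```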

docs/requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -28,7 +28,6 @@ skorch>=0.15
 sphinx>=7.0
 sphinx_rtd_theme>=2.0
 torch>=2.0
-torch>=2.0,<2.6
 torchmetrics>=1.2.1
 tqdm
 tueplots>=0.0.12

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -22,6 +22,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
     "License :: OSI Approved :: Apache Software License",
@@ -44,7 +45,7 @@ dependencies = [
 models = [
     # use <2.6 for now since it can run into pickling issues with skorch if the skorch version is too old
     # see https://github.com/skorch-dev/skorch/commit/be93b7769d61aa22fb928d2e89e258c629bfeaf9
-    "torch>=2.0,<2.6",
+    "torch>=2.0",
     "xgboost>=2.0",
     "catboost>=1.2",
     "lightgbm>=4.1",
@@ -69,8 +70,8 @@ models = [
     "pyyaml>=5.0",
     "msgpack>=1.0",
     # apparently msgpack_numpy fixed some bug in using numpy arrays in msgpack?
-    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions# in theory we shouldn't be using if for numpy arrays at the moment, not sure why the need for this occured
-    # maybe it occured because we tried to save hyperparameters that were numpy scalars instead of python scalars
+    # but apparently it can also cause a bug in ray due to its monkey-patching of msgpack functions# in theory we shouldn't be using if for numpy arrays at the moment, not sure why the need for this occurred
+    # maybe it occurred because we tried to save hyperparameters that were numpy scalars instead of python scalars
     # "msgpack_numpy>=0.4",
 ]
 autogluon = [

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "1.6.0"
+__version__ = "1.6.1"

pytabkit/models/alg_interfaces/ensemble_interfaces.py

Lines changed: 12 additions & 0 deletions

@@ -1,4 +1,5 @@
 import copy
+import time
 from pathlib import Path
 from typing import List, Optional, Dict
 
@@ -92,7 +93,13 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         n_caruana_steps = self.config.get('n_caruana_steps', 40)  # default value is taken from TabRepo paper (IIRC)
 
         y_preds_oob_list = []
+
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        start_time = time.time()
+
         for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
+            if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s:
+                break
             with alg_ctx as alg_interface:
                 y_preds = alg_interface.predict(ds)
                 # get out-of-bag predictions
@@ -231,7 +238,12 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         best_alg_loss = np.inf
         best_sub_fit_params = None
 
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        start_time = time.time()
+
         for alg_idx, alg_ctx in enumerate(self.alg_contexts_):
+            if alg_idx > 0 and time_limit_s is not None and (alg_idx+1)/alg_idx*(time.time()-start_time) > time_limit_s:
+                break
             with alg_ctx as alg_interface:
                 sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
                                    tmp_folders]
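
The break condition is a simple budget extrapolation: after `alg_idx` members have been fitted, `(alg_idx+1)/alg_idx * elapsed` estimates the total time once one more member finishes, assuming roughly equal per-member cost; the loop stops early if that estimate exceeds `time_limit_s`, and the `alg_idx > 0` guard ensures at least one member is always fitted. A standalone sketch of the same heuristic (a hypothetical helper, not from the commit):

```python
import time
from typing import Callable, List, Optional

def fit_within_budget(fit_fns: List[Callable[[], object]],
                      time_limit_s: Optional[float] = None) -> list:
    """Run fitting functions in order; stop early when fitting one more
    is predicted to exceed the budget (uniform per-member cost assumed)."""
    start = time.time()
    results = []
    for i, fit in enumerate(fit_fns):
        elapsed = time.time() - start
        # after i members, (i + 1) / i * elapsed predicts the total after member i + 1
        if i > 0 and time_limit_s is not None and (i + 1) / i * elapsed > time_limit_s:
            break
        results.append(fit())
    return results
```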

pytabkit/models/alg_interfaces/nn_interfaces.py

Lines changed: 182 additions & 1 deletion

@@ -340,7 +340,8 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
         assert self.hpo_space_name in ['default', 'clr', 'moresigma', 'moresigmadim', 'moresigmadimreg',
                                        'moresigmadimsize', 'moresigmadimlr', 'probclass', 'probclass-mlp', 'large',
                                        'alt1', 'alt2', 'alt3', 'alt4', 'alt5', 'alt6', 'alt7', 'alt8', 'alt9', 'alt10',
-                                       'tabarena']
+                                       'tabarena', 'alt11', 'alt12', 'alt13', 'alt14', 'alt15', 'alt16', 'alt17',
+                                       'alt18']
         rng = np.random.default_rng(seed=seed)
 
         if self.hpo_space_name == 'probclass-mlp':
@@ -660,6 +661,186 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
             params['plr_hidden_2'] = 4
             params['n_epochs'] = 256
             params['use_early_stopping'] = False
+        elif self.hpo_space_name == 'alt11':
+            # tabarena without the large configs
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+            }
+        elif self.hpo_space_name == 'alt12':
+            # alt11 with n_hidden_layers=1 in the search space
+            params = {
+                'n_hidden_layers': rng.integers(1, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+            }
+        elif self.hpo_space_name == 'alt13':
+            # alt11 with more categorical hyperparameters
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+            }
+        elif self.hpo_space_name == 'alt14':
+            # alt13 with weight_init_mode='normal'
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'weight_init_mode': 'normal',
+            }
+        elif self.hpo_space_name == 'alt15':
+            # alt13 with tuning momentum (beta1)
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'mom': 1.0 - np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),  # tune in [0.7, 0.98]
+            }
+        elif self.hpo_space_name == 'alt16':
+            # alt13 with n_ens=2
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 2,
+                'ens_av_before_softmax': True,
+            }
+        elif self.hpo_space_name == 'alt17':
+            # alt13 with n_ens=4
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 4,
+                'ens_av_before_softmax': True,
+            }
+        elif self.hpo_space_name == 'alt18':
+            # alt17 but with averaging after softmax
+            params = {
+                'n_hidden_layers': rng.integers(2, 4, endpoint=True),
+                'hidden_sizes': 'rectangular',
+                'hidden_width': rng.choice([256, 384, 512]),
+                'p_drop': rng.uniform(0.0, 0.5),
+                'act': 'mish',
+                'plr_sigma': np.exp(rng.uniform(np.log(1e-2), np.log(50))),
+                'sq_mom': 1.0 - np.exp(rng.uniform(np.log(5e-3), np.log(5e-2))),
+                'plr_lr_factor': np.exp(rng.uniform(np.log(5e-2), np.log(3e-1))),
+                'scale_lr_factor': np.exp(rng.uniform(np.log(2.0), np.log(10.0))),
+                'first_layer_lr_factor': np.exp(rng.uniform(np.log(0.3), np.log(1.5))),
+                'ls_eps_sched': 'coslog4',
+                'ls_eps': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'p_drop_sched': 'flat_cos',
+                'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
+                'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
+                'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
+                'max_one_hot_cat_size': int(np.floor(np.exp(rng.uniform(np.log(4.0), np.log(33.0)))).item()),
+                'embedding_size': int(rng.choice([4, 8, 16])),
+                'n_ens': 4,
+                'ens_av_before_softmax': False,
+            }
 
         # print(f'{params=}')
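
All eight new spaces sample their continuous hyperparameters log-uniformly via the same `np.exp(rng.uniform(np.log(low), np.log(high)))` idiom; alt16 to alt18 additionally vary only `n_ens` and `ens_av_before_softmax`, probing the averaging change from this release. As a reading aid, a small helper that reproduces the sampling pattern (hypothetical, not part of the commit):

```python
import numpy as np

def log_uniform(rng: np.random.Generator, low: float, high: float) -> float:
    # uniform in log-space, then exponentiate: samples are denser near `low`
    return float(np.exp(rng.uniform(np.log(low), np.log(high))))

rng = np.random.default_rng(seed=0)
lr = log_uniform(rng, 2e-2, 3e-1)            # matches 'lr' in alt11-alt18
sq_mom = 1.0 - log_uniform(rng, 5e-3, 5e-2)  # matches 'sq_mom' (kept close to 1)
```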

pytabkit/models/hyper_opt/hyper_optimizers.py

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,6 @@
 import time
 from pathlib import Path
-from typing import Callable, Tuple, Any, Dict, Union
+from typing import Callable, Tuple, Any, Dict, Union, Optional
 
 import numpy as np
 
@@ -156,7 +156,8 @@ def _optimize_impl(self, f: Callable[[dict], Tuple[float, Any]], seed: int) -> N
         else:
             raise ValueError(f'Unknown hyperopt_algo name "{algo_name}"')
         fn = HyperoptOptimizer.HyperoptFuncWrapper(f, self.fixed_params)
-        _ = hyperopt.fmin(fn=fn,
+        time_limit_s: Optional[float] = self.config.get('time_limit_s', None)
+        _ = hyperopt.fmin(fn=fn, timeout=None if time_limit_s is None else int(time_limit_s),
                           space=self.space, algo=algo, max_evals=self.n_hyperopt_steps, trials=trials,
                           rstate=np.random.default_rng(seed=seed), verbose=False, show_progressbar=False)
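
For the HPO side, `time_limit_s` is read from the config and forwarded to the `timeout` argument of `hyperopt.fmin`, which hyperopt interprets in seconds. A usage sketch with the estimator recommended in the README and synthetic data; the top-level import and the assumption that the constructor forwards `time_limit_s` into the model config are taken from the README and this diff, not verified here:

```python
import numpy as np
from pytabkit import Ensemble_HPO_Classifier  # top-level import as in the README

X = np.random.default_rng(0).standard_normal((500, 10))
y = (X[:, 0] > 0).astype(int)  # simple synthetic binary target

clf = Ensemble_HPO_Classifier(
    n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True,
    n_hpo_steps=50,
    time_limit_s=3600.0,  # new in v1.6.1: cap HPO/ensembling at one hour
)
clf.fit(X, y)
```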
