dholzmueller
diff --git a/‎README.md‎
Lines changed: 40 additions & 1 deletion b/‎README.md‎
Lines changed: 40 additions & 1 deletion
diff --git a/‎pytabkit/__about__.py‎
Lines changed: 1 addition & 1 deletion b/‎pytabkit/__about__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pytabkit/models/alg_interfaces/alg_interfaces.py‎
Lines changed: 1 addition & 0 deletions b/‎pytabkit/models/alg_interfaces/alg_interfaces.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pytabkit/models/alg_interfaces/catboost_interfaces.py‎
Lines changed: 27 additions & 1 deletion b/‎pytabkit/models/alg_interfaces/catboost_interfaces.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎pytabkit/models/alg_interfaces/ensemble_interfaces.py‎
Lines changed: 5 additions & 2 deletions b/‎pytabkit/models/alg_interfaces/ensemble_interfaces.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎pytabkit/models/alg_interfaces/lightgbm_interfaces.py‎
Lines changed: 22 additions & 0 deletions b/‎pytabkit/models/alg_interfaces/lightgbm_interfaces.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎pytabkit/models/alg_interfaces/nn_interfaces.py‎
Lines changed: 3 additions & 3 deletions b/‎pytabkit/models/alg_interfaces/nn_interfaces.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pytabkit/models/alg_interfaces/sub_split_interfaces.py‎
Lines changed: 8 additions & 3 deletions b/‎pytabkit/models/alg_interfaces/sub_split_interfaces.py‎
Lines changed: 8 additions & 3 deletions
@@ -15,6 +15,31 @@ on our benchmarks.
 
 ![Meta-test benchmark results](./figures/meta-test_benchmark_results.png)
 
+## When (not) to use pytabkit
+
+- **To get the best possible results**: 
+  - Generally, we recommend AutoGluon for the best possible results,
+    though it does not include all the models from pytabkit.
+    AutoGluon will probably include RealMLP in its upcoming 1.4 version.
+  - To get the best possible results from `pytabkit`, 
+    we recommend using 
+    `Ensemble_HPO_Classifier(n_cv=8, use_full_caruana_ensembling=True, use_tabarena_spaces=True, n_hpo_steps=50)` 
+    with a `val_metric_name` corresponding to your target metric 
+    (e.g., `class_error`, `cross_entropy`, `brier`, `1-auc_ovr`), or the corresponding `Regressor`. 
+    (This might take a very long time to fit.)
+  - For only a single model, we recommend using 
+    `RealMLP_HPO_Classifier(n_cv=8, hpo_space_name='tabarena', use_caruana_ensembling=True, n_hyperopt_steps=50)`,
+    also with `val_metric_name` as above, or the corresponding `Regressor`.
+- **Models**: [TabArena](https://github.com/AutoGluon/tabrepo) 
+  also includes some newer models like RealMLP and TabM 
+  with more general preprocessing (missing numericals, text, etc.),  
+  as well as very good boosted tree implementations.
+  `pytabkit` is currently still easier to use 
+  and supports vectorized cross-validation for RealMLP, 
+  which can significantly speed up the training.
+- **Benchmarking**: While pytabkit can be useful for quick benchmarking during development,
+  we recommend [TabArena](https://github.com/AutoGluon/tabrepo) for method evaluation.
+
 ## Installation (new in 1.4.0: optional model dependencies)
 
 ```bash
@@ -171,6 +196,20 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
 
 ## Releases (see git tags)
 
+- v1.5.0:
+    - added `n_repeats` parameter to scikit-learn interfaces for repeated cross-validation
+    - HPO sklearn interfaces (the ones using random search)
+      can now alternatively do weighted ensembling by setting `use_caruana_ensembling=True`.
+      Removed the `RealMLP_Ensemble_Classifier` and `RealMLP_Ensemble_Regressor` from v1.4.2
+      since this feature makes them redundant.
+    - renamed the `space` parameter of the GBDT HPO interfaces
+      to `hpo_space_name`, so that it now also works with the non-TPE versions.
+    - Added new [TabArena](https://tabarena.ai) search spaces for boosted trees (not TPE), 
+      which should be almost equivalent to the ones from TabArena 
+      except for the early stopping logic. 
+    - TabM now supports `val_metric_name` for early stopping on different metrics.
+    - fixed issues #20 and #21 regarding HPO
+    - small updates for the ["Rethinking Early Stopping" paper](https://arxiv.org/abs/2501.19195)
 - v1.4.2:
     - fixed handling of custom `val_metric_name` HPO models and `Ensemble_TD_Regressor`.
     - if `tmp_folder` is specified in HPO models, 
@@ -246,7 +285,7 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
       Add time limit for RealMLP,
       add support for `lightning` (but also still allowing `pytorch-lightning`),
       making skorch a lazy import, removed msgpack\_numpy dependency.
-- v1.0.0: Release for the NeurIPS version and arXiv v2.
+- v1.0.0: Release for the NeurIPS version and arXiv v2+v3.
     - More baselines (MLP-PLR, FT-Transformer, TabR-HPO, RF-HPO),
       also some un-polished internal interfaces for other methods,
       esp. the ones in AutoGluon.
 
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "1.4.2"
+__version__ = "1.5.0"
@@ -347,6 +347,7 @@ def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = N
         else:
             assert self.fit_params is not None
             fit_params = self.fit_params
+        # print(f'{fit_params=}')
         alg_interface = self.create_alg_interface(n_refit,
                                                   **utils.join_dicts(self.config, fit_params[0]['hyper_fit_params']))
         # the alg_interface itself may have other hypers that have been fit
 
@@ -324,7 +324,8 @@ def __init__(self, space=None, n_hyperopt_steps: int = 50, **config):
         #     'used_ram_limit': hp.choice('used_ram_limit', [100000000000]),
         # }
         # need to add defaults as well
-
+        if space is None:
+            space = config.get('hpo_space_name', None)
         if space == 'NODE' or space == 'popov':
             # space from NODE paper:
             # Popov, Morozov, and Babenko, Neural oblivious decision ensembles for deep learning on tabular data
@@ -721,6 +722,31 @@ def _sample_params(self, is_classification: bool, seed: int, n_train: int):
                 'one_hot_max_size': rng.choice([2, 3, 5, 10]),
                 'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']),
             }
+        elif hpo_space_name == 'tabarena':
+            space = {
+                'n_estimators': 10_000,
+                'early_stopping_rounds': 300,  # probably not exactly equivalent to TabArena
+                'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+
+                'bootstrap_type': 'Bernoulli',
+                'subsample': rng.uniform(0.7, 1.0),  # can only be used with Bernoulli (or Poisson)!
+
+                'grow_policy': rng.choice(['SymmetricTree', 'Depthwise']),
+                'max_depth': rng.integers(4, 8, endpoint=True),
+
+                'colsample_bylevel': rng.uniform(0.85, 1.0),
+                'l2_leaf_reg': np.exp(rng.uniform(np.log(1e-4), np.log(5.0))),
+
+                'leaf_estimation_iterations': np.floor(np.exp(rng.uniform(np.log(1.0), np.log(21.0)))),
+
+                # categorical features
+                'one_hot_max_size': np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))),
+                'model_size_reg': np.exp(rng.uniform(np.log(0.1), np.log(1.5))),
+                'max_ctr_complexity': rng.integers(2, 5, endpoint=True),  # shrunk
+
+                'boosting_type': 'Plain',  # avoid Ordered as the default on GPU
+                'max_bin': 254,  # added this to be sure
+            }
         else:
             raise ValueError()
         return space
 
@@ -175,8 +175,11 @@ def __init__(self, alg_interfaces: List[AlgInterface], fit_params: Optional[List
 
     def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = None) -> 'AlgInterface':
         # todo: could use sub_fit_params
-        return AlgorithmSelectionAlgInterface([alg_interface.get_refit_interface(n_refit=n_refit)
-                                               for alg_interface in self.alg_interfaces],
+        refit_interfaces = []
+        for alg_context in self.alg_contexts_:
+            with alg_context as alg_interface:
+                refit_interfaces.append(alg_interface.get_refit_interface(n_refit=n_refit))
+        return AlgorithmSelectionAlgInterface(refit_interfaces,
                                               fit_params=fit_params or self.fit_params)
 
     def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources: InterfaceResources,
 
@@ -276,6 +276,8 @@ def __init__(self, space=None, n_hyperopt_steps: int = 50, opt_method: str = 'hy
         from hyperopt import hp
         default_config = {}
         max_config = dict()
+        if space is None:
+            space = config.get('hpo_space_name', None)
         if space == 'catboost_quality_benchmarks':
             # space from catboost quality benchmarks,
             # https://github.com/catboost/benchmarks/blob/master/quality_benchmarks/lightgbm_experiment.py
@@ -672,6 +674,26 @@ def _sample_params(self, is_classification: bool, seed: int, n_train: int):
                 'num_leaves': rng.integers(16, 255, endpoint=True),
                 'extra_trees': rng.choice([False, True]),
             }
+        elif hpo_space_name == 'tabarena':
+            space = {
+                'early_stopping_rounds': 300,  # not exactly equivalent, probably
+                'n_estimators': 10_000,
+                'learning_rate': np.exp(rng.uniform(np.log(5e-3), np.log(1e-1))),
+                'feature_fraction': rng.uniform(0.4, 1),
+                'bagging_fraction': rng.uniform(0.7, 1),
+                'bagging_freq': 1,  # already the default here but not in original LightGBM
+                'num_leaves': np.floor(np.exp(rng.uniform(np.log(2.0), np.log(201)))),
+                'min_data_in_leaf': np.floor(np.exp(rng.uniform(np.log(1.0), np.log(65)))),
+                'extra_trees': rng.choice([False, True]),
+
+                'min_data_per_group': np.floor(np.exp(rng.uniform(np.log(2.0), np.log(101)))),
+                'cat_l2': np.exp(rng.uniform(np.log(5e-3), np.log(2.0))),
+                'cat_smooth': np.exp(rng.uniform(np.log(1e-3), np.log(100.0))),
+                'max_cat_to_onehot': np.floor(np.exp(rng.uniform(np.log(8.0), np.log(101.0)))),
+
+                'lambda_l1': np.exp(rng.uniform(np.log(1e-5), np.log(1.0))),
+                'lambda_l2': np.exp(rng.uniform(np.log(1e-5), np.log(2.0))),
+            }
         else:
             raise ValueError()
         return space
 
@@ -644,9 +644,9 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
 
             if rng.uniform(0.0, 1.0) > 0.5:
                 # large configs
-                params['plr_hidden_1'] = rng.choice([8, 16, 32, 64])
-                params['plr_hidden_2'] = rng.choice([8, 16, 32, 64])
-                params['n_epochs'] = rng.choice([256, 512])
+                params['plr_hidden_1'] = rng.choice([8, 16, 32, 64]).item()
+                params['plr_hidden_2'] = rng.choice([8, 16, 32, 64]).item()
+                params['n_epochs'] = rng.choice([256, 512]).item()
                 params['use_early_stopping'] = True
 
                 # set in the defaults of RealMLP in TabArena
 
@@ -38,8 +38,11 @@ def get_refit_interface(self, n_refit: int, fit_params: Optional[List[Dict]] = N
 
         config = utils.join_dicts(self.sub_split_interfaces[0].config, self.config)
         if config.get('use_best_mean_iteration_for_refit', True):
+            sub_fit_params = [utils.update_dict(fit_params[0], remove_keys='sub_fit_params')]
             return SingleSplitWrapperAlgInterface(
-                [self.sub_split_interfaces[0].get_refit_interface(n_refit=1, fit_params=fit_params) for i in
+                [self.sub_split_interfaces[0].get_refit_interface(
+                    n_refit=1, fit_params=sub_fit_params)
+                 for i in
                  range(n_refit)], fit_params=fit_params)
         else:
             if n_refit != len(self.sub_split_interfaces):
@@ -114,7 +117,9 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
                     for ssi in self.sub_split_interfaces:
                         ssi.fit_params = self.fit_params
             else:
-                self.fit_params = [dict(sub_fit_params=[(ssi.fit_params[0] if ssi.fit_params is not None else None) for ssi in self.sub_split_interfaces])]
+                self.fit_params = [dict(
+                    sub_fit_params=[(ssi.fit_params[0] if ssi.fit_params is not None else None) for ssi in
+                                    self.sub_split_interfaces])]
 
         return None
 
@@ -338,7 +343,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
                 self.fit_params = [dict(n_estimators=len(val_errors))]
 
             if isinstance(val_errors, dict):
-                return None   # not implemented
+                return None  # not implemented
             else:
                 return [[[(dict(n_estimators=i + 1), err) for i, err in enumerate(val_errors)]]]
Original file line number	Diff line number	Diff line change
`@@ -2,4 +2,4 @@`
`2`	`2`	`#`
`3`	`3`	`# SPDX-License-Identifier: Apache-2.0`
`4`	`4`
`5`		`-__version__ = "1.4.2"`
	`5`	`+__version__ = "1.5.0"`