Commit 774944a

v1.6.0: TabM training metrics, RealMLP ensembles
1 parent: c8be0b2

File tree

11 files changed: +181 −69 lines

README.md

Lines changed: 10 additions & 1 deletion
@@ -51,7 +51,7 @@ pip install pytabkit[models]
   [faiss](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md),
   which is only available on **conda**.
 - Please install torch separately if you want to control the version (CPU/GPU etc.)
-- Use `pytabkit[models,autogluon,extra,hpo,bench,dev]` to install additional dependencies for
+- Use `pytabkit[models,autogluon,extra,hpo,bench,dev]` to install additional dependencies for the other models,
   AutoGluon models, extra preprocessing,
   hyperparameter optimization methods beyond random search (hyperopt/SMAC),
   the benchmarking part, and testing/documentation. For the hpo part,
@@ -196,6 +196,15 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html

 ## Releases (see git tags)

+- v1.6.0:
+  - Added support for other training losses in TabM through the `train_metric_name` parameter,
+    for example, (multi)quantile regression via `train_metric_name='multi_pinball(0.05,0.95)'`.
+  - RealMLP-TD now adds the `n_ens` hyperparameter, which can be set to values >1
+    to train ensembles per train-validation split (called PackedEnsemble in the TabM paper).
+    This is especially useful when using holdout validation instead of cross-validation ensembles,
+    and to get more reliable validation predictions and scores for tuning/ensembling.
+  - fixed RealMLP TabArena search space (`hpo_space_name='tabarena'`) for classification
+    (allow no label smoothing through `use_ls=False` instead of `use_ls="auto"`).
 - v1.5.2: fixed more device bugs for HPO and ensembling
 - v1.5.1: fixed a device bug in TabM for GPU
 - v1.5.0:
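Note (illustration, not part of the diff): a minimal usage sketch of the new `train_metric_name` option through the sklearn-style interface. The estimator name `TabM_D_Regressor` and the top-level import path are assumptions here; only the parameter itself is introduced by this commit.

```python
# Hedged sketch: TabM_D_Regressor and the import path are assumed, not shown in this commit.
from sklearn.datasets import make_regression
from pytabkit import TabM_D_Regressor  # assumed import

X, y = make_regression(n_samples=500, n_features=8, noise=10.0, random_state=0)

# Train TabM with a pinball loss for the 5% and 95% quantiles instead of the default MSE.
model = TabM_D_Regressor(train_metric_name='multi_pinball(0.05,0.95)')
model.fit(X, y)
pred = model.predict(X[:5])  # expected: one output column per requested quantile
print(pred.shape)
```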

pytabkit/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-__version__ = "1.5.2"
+__version__ = "1.6.0"

pytabkit/models/alg_interfaces/ensemble_interfaces.py

Lines changed: 12 additions & 4 deletions
@@ -64,7 +64,15 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
             with alg_ctx as alg_interface:
                 sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
                                    tmp_folders]
-                alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}')
+                if self.config.get('diversify_seeds', False):
+                    sub_idxs_list = [SplitIdxs(train_idxs=idxs.train_idxs, val_idxs=idxs.val_idxs,
+                                               test_idxs=idxs.test_idxs, split_seed=idxs.split_seed + alg_idx,
+                                               sub_split_seeds=[sss + alg_idx for sss in idxs.sub_split_seeds],
+                                               split_id=idxs.split_id) for idxs in idxs_list]
+                else:
+                    sub_idxs_list = idxs_list
+                alg_interface.fit(ds, sub_idxs_list, interface_resources, logger, sub_tmp_folders,
+                                  name + f'sub-alg-{alg_idx}')
                 sub_fit_params.append(alg_interface.get_fit_params()[0])

         if self.fit_params is not None:
@@ -132,7 +140,6 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

                 weights[weight_idx] += 1

-
             if best_step_loss < best_loss:
                 best_loss = best_step_loss
                 best_weights = np.copy(best_step_weights)
@@ -202,7 +209,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
                            tmp_folders]
         with self.alg_contexts_[best_alg_idx] as alg_interface:
             alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders,
-                        name + f'sub-alg-{best_alg_idx}')
+                              name + f'sub-alg-{best_alg_idx}')

         return

@@ -228,7 +235,8 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
             with alg_ctx as alg_interface:
                 sub_tmp_folders = [tmp_folder / str(alg_idx) if tmp_folder is not None else None for tmp_folder in
                                    tmp_folders]
-                alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders, name + f'sub-alg-{alg_idx}')
+                alg_interface.fit(ds, idxs_list, interface_resources, logger, sub_tmp_folders,
+                                  name + f'sub-alg-{alg_idx}')
                 y_preds = alg_interface.predict(ds)
                 # get out-of-bag predictions
                 y_pred_oob = cat_if_necessary([y_preds[j, idxs_list[0].val_idxs[j]]
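Note (illustration, not part of the diff): the `diversify_seeds` branch above simply offsets every seed in each `SplitIdxs` by the sub-algorithm index, so that each ensemble member trains with different randomness. A stripped-down sketch of the same idea:

```python
# Simplified stand-in for SplitIdxs; the real class lives in pytabkit and has more fields.
from dataclasses import dataclass, replace
from typing import List

@dataclass
class SplitIdxsSketch:
    split_seed: int
    sub_split_seeds: List[int]

def diversify(idxs_list: List[SplitIdxsSketch], alg_idx: int) -> List[SplitIdxsSketch]:
    # Shift all seeds by the sub-algorithm index so member alg_idx sees different randomness.
    return [replace(idxs, split_seed=idxs.split_seed + alg_idx,
                    sub_split_seeds=[s + alg_idx for s in idxs.sub_split_seeds])
            for idxs in idxs_list]

print(diversify([SplitIdxsSketch(split_seed=0, sub_split_seeds=[10, 11])], alg_idx=2))
# [SplitIdxsSketch(split_seed=2, sub_split_seeds=[12, 13])]
```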

pytabkit/models/alg_interfaces/nn_interfaces.py

Lines changed: 5 additions & 3 deletions
@@ -167,7 +167,7 @@ def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_spl
         static_tensor_infos = static_fitter.forward_tensor_infos(tensor_infos)
         n_params = fitter.get_n_params(tensor_infos)
         n_forward = fitter.get_n_forward(tensor_infos)
-        n_parallel = max(n_cv, n_refit) * n_splits
+        n_parallel = max(n_cv, n_refit) * n_splits * self.config.get('n_ens', 1)
         batch_size = self.config.get('batch_size', 256)
         if batch_size == 'auto':
             batch_size = get_realmlp_auto_batch_size(n_train)
@@ -192,6 +192,8 @@ def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_spl
         init_ram_gb = min(init_ram_gb_max, init_ram_gb_full)
         # init_ram_gb = 1.5

+        # print(f'{ds_ram_gb=}, {pass_memory/(1024**3)=}, {param_memory/(1024**3)=}, {init_ram_gb=}')
+
         factor = 1.2  # to go safe on ram
         gpu_ram_gb = fixed_ram_gb + ds_ram_gb + max(init_ram_gb,
                                                     factor * (n_parallel * (pass_memory + param_memory)) / (1024 ** 3))
@@ -639,7 +641,7 @@ def sample_params(self, seed: int) -> Dict[str, Any]:
             'p_drop_sched': 'flat_cos',
             'lr': np.exp(rng.uniform(np.log(2e-2), np.log(3e-1))),
             'wd': np.exp(rng.uniform(np.log(1e-3), np.log(5e-2))),
-            'use_ls': rng.choice(["auto", True]),  # use label smoothing (will be ignored for regression)
+            'use_ls': rng.choice([False, True]),  # use label smoothing (will be ignored for regression)
         }

         if rng.uniform(0.0, 1.0) > 0.5:
@@ -685,7 +687,7 @@ def _create_sub_interface(self, ds: DictDataset, seed: int):
         is_classification = not ds.tensor_infos['y'].is_cont()
         self.fit_params = [RealMLPParamSampler(is_classification, **self.config).sample_params(hparam_seed)]
         # todo: need epoch for refit
-        params = utils.update_dict(self.config, self.fit_params[0])
+        params = utils.join_dicts(self.config, self.fit_params[0], self.config.get('override_params', dict()) or dict())
         # params = utils.update_dict(self.fit_params[0], self.config)
         if 'n_epochs' in self.config:
             params['n_epochs'] = self.config['n_epochs']

pytabkit/models/alg_interfaces/tabm_interface.py

Lines changed: 51 additions & 15 deletions
@@ -1,3 +1,4 @@
+import functools
 import math
 import random
 from pathlib import Path
@@ -76,6 +77,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
         # set default to True for backward compatibility
         share_training_batches = self.config.get("share_training_batches", False)
         val_metric_name = self.config.get('val_metric_name', None)
+        train_metric_name = self.config.get('train_metric_name', None)

         weight_decay = self.config.get('weight_decay', 0.0)
         gradient_clipping_norm = self.config.get('gradient_clipping_norm', None)
@@ -145,9 +147,11 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

         Y_train = ds_parts['train'].tensors['y'].clone()
         if task_type == 'regression':
-            assert ds.tensor_infos['y'].get_n_features() == 1
-            self.y_mean_ = ds_parts['train'].tensors['y'].mean().item()
-            self.y_std_ = ds_parts['train'].tensors['y'].std(correction=0).item()
+            assert Y_train.shape[-1] == 1
+            self.y_mean_ = ds_parts['train'].tensors['y'].mean(dim=0, keepdim=True).item()
+            self.y_std_ = ds_parts['train'].tensors['y'].std(dim=0, keepdim=True, correction=0).item()
+            self.y_max_ = ds_parts['train'].tensors['y'].max().item()
+            self.y_min_ = ds_parts['train'].tensors['y'].min().item()

             Y_train = (Y_train - self.y_mean_) / (self.y_std_ + 1e-30)

@@ -170,7 +174,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
             else None
         )
         # Changing False to True will result in faster training on compatible hardware.
-        amp_enabled = allow_amp and amp_dtype is not None
+        amp_enabled = allow_amp and amp_dtype is not None and device.type == 'cuda'
         grad_scaler = torch.cuda.amp.GradScaler() if amp_dtype is torch.float16 else None  # type: ignore

         # fmt: off
@@ -186,11 +190,14 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:

         # TabM
         bins = None if num_emb_type != 'pwl' or n_cont_features == 0 else rtdl_num_embeddings.compute_bins(data['train']['x_cont'], n_bins=num_emb_n_bins)
+        d_out = n_classes if n_classes > 0 else 1
+        if train_metric_name is not None and train_metric_name.startswith('multi_pinball'):
+            d_out = train_metric_name.count(',')+1

         model = Model(
             n_num_features=n_cont_features,
             cat_cardinalities=cat_cardinalities,
-            n_classes=n_classes if n_classes > 0 else None,
+            n_classes=d_out,
             backbone={
                 'type': 'MLP',
                 'n_blocks': n_blocks if n_blocks != 'auto' else (3 if bins is None else 2),
@@ -212,6 +219,27 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
             k=tabm_k,
             share_training_batches=share_training_batches,
         ).to(device)
+
+        # import tabm
+        # num_embeddings = None if bins is None else rtdl_num_embeddings.PiecewiseLinearEmbeddings(
+        #     bins=bins,
+        #     d_embedding=d_embedding,
+        #     activation=False,
+        #     version='B',
+        # )
+        # model = tabm.TabM(
+        #     n_num_features=n_cont_features,
+        #     cat_cardinalities=cat_cardinalities,
+        #     d_out = n_classes if n_classes > 0 else 1,
+        #     num_embeddings = num_embeddings,
+        #     n_blocks=n_blocks if n_blocks != 'auto' else (3 if bins is None else 2),
+        #     d_block=d_block,
+        #     dropout=dropout,
+        #     arch_type=arch_type,
+        #     k=tabm_k,
+        #     # todo: can introduce activation
+        #     share_training_batches=share_training_batches,  # todo: disappeared?
+        # )
         optimizer = torch.optim.AdamW(make_parameter_groups(model), lr=lr, weight_decay=weight_decay)


@@ -231,11 +259,17 @@ def apply_model(part: str, idx: torch.Tensor) -> torch.Tensor:
                 data[part]['x_cont'][idx],
                 data[part]['x_cat'][idx] if 'x_cat' in data[part] else None,
             )
-            .squeeze(-1)  # Remove the last dimension for regression tasks.
             .float()
         )

-        base_loss_fn = torch.nn.functional.mse_loss if task_type == 'regression' else torch.nn.functional.cross_entropy
+        if train_metric_name is None:
+            base_loss_fn = torch.nn.functional.mse_loss if self.n_classes_ == 0 else torch.nn.functional.cross_entropy  # defaults
+        elif train_metric_name == 'mse':
+            base_loss_fn = torch.nn.functional.mse_loss
+        elif train_metric_name == 'cross_entropy':
+            base_loss_fn = torch.nn.functional.cross_entropy
+        else:
+            base_loss_fn = functools.partial(Metrics.apply, metric_name=train_metric_name)

         def loss_fn(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
             # TabM produces k predictions per object. Each of them must be trained separately.
@@ -244,7 +278,7 @@ def loss_fn(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
             k = y_pred.shape[1]
             return base_loss_fn(
                 y_pred.flatten(0, 1),
-                y_true.repeat_interleave(k) if model.share_training_batches else y_true.squeeze(-1),
+                y_true.repeat_interleave(k) if model.share_training_batches else y_true,
             )

         @evaluation_mode()
@@ -261,7 +295,7 @@ def evaluate(part: str) -> float:
                         eval_batch_size
                     )
                 ]
-            ).cpu()
+            )
         )
         if task_type == 'regression':
             # Transform the predictions back to the original label space.
@@ -278,6 +312,8 @@ def evaluate(part: str) -> float:
             y_pred = y_pred.mean(dim=1)

         y_true = data[part]['y'].cpu()
+        y_pred = y_pred.cpu()
+
         if task_type == 'regression' and len(y_true.shape) == 1:
             y_true = y_true.unsqueeze(-1)
         if task_type == 'regression' and len(y_pred.shape) == 1:
@@ -390,7 +426,6 @@ def predict(self, ds: DictDataset) -> torch.Tensor:
                     ds.tensors['x_cont'][idx],
                     ds.tensors['x_cat'][idx] if not ds.tensor_infos['x_cat'].is_empty() else None,
                 )
-                .squeeze(-1)  # Remove the last dimension for regression tasks.
                 .float()
                 for idx in torch.arange(ds.n_samples, device=self.device_).split(
                     eval_batch_size
@@ -400,9 +435,10 @@ def predict(self, ds: DictDataset) -> torch.Tensor:
         )
         if self.task_type_ == 'regression':
             # Transform the predictions back to the original label space.
-            y_pred = y_pred * self.y_std_ + self.y_mean_
             y_pred = y_pred.mean(1)
-            y_pred = y_pred.unsqueeze(-1)  # add extra "features" dimension
+            y_pred = y_pred * self.y_std_ + self.y_mean_
+            if self.config.get('clamp_output', False):
+                y_pred = torch.clamp(y_pred, self.y_min_, self.y_max_)
         else:
             average_logits = self.config.get('average_logits', False)
             if average_logits:
@@ -411,7 +447,7 @@ def predict(self, ds: DictDataset) -> torch.Tensor:
                 # For classification, the mean must be computed in the probability space.
                 y_pred = torch.log(torch.softmax(y_pred, dim=-1).mean(1) + 1e-30)

-        return y_pred[None]  # add n_models dimension
+        return y_pred[None].cpu()  # add n_models dimension

     def get_required_resources(self, ds: DictDataset, n_cv: int, n_refit: int, n_splits: int,
                                split_seeds: List[int], n_train: int) -> RequiredResources:
@@ -440,7 +476,7 @@ def _sample_params(self, is_classification: bool, seed: int, n_train: int):
             params = {
                 "batch_size": "auto",
                 "patience": 16,
-                "amp": True,
+                "allow_amp": True,
                 "arch_type": "tabm-mini",
                 "tabm_k": 32,
                 "gradient_clipping_norm": 1.0,
@@ -461,7 +497,7 @@ def _sample_params(self, is_classification: bool, seed: int, n_train: int):
             params = {
                 "batch_size": "auto",
                 "patience": 16,
-                "amp": False,  # only for GPU, maybe we should change it to True?
+                "allow_amp": False,  # only for GPU, maybe we should change it to True?
                 "arch_type": "tabm-mini",
                 "tabm_k": 32,
                 "gradient_clipping_norm": 1.0,

pytabkit/models/sklearn/sklearn_base.py

Lines changed: 3 additions & 0 deletions
@@ -346,6 +346,9 @@ def fit(self, X, y, X_val: Optional = None, y_val: Optional = None, val_idxs: Op
         if val_idxs.shape[1] == 0:
             val_idxs = None  # no validation set

+        # print(f'{val_idxs=}')
+        # print(f'{np.mean(X / (1e-8 + np.linalg.norm(X, axis=0, keepdims=True)))=}')
+
         idxs_list = [SplitIdxs(train_idxs=train_idxs, val_idxs=val_idxs, test_idxs=None, split_seed=split_seed,
                                sub_split_seeds=sub_split_seeds, split_id=0)]


pytabkit/models/sklearn/sklearn_interfaces.py

Lines changed: 13 additions & 0 deletions
@@ -86,6 +86,8 @@ def __init__(self, device: Optional[str] = None, random_state: Optional[Union[in
                  calibration_method: Optional[str] = None,
                  sort_quantile_predictions: Optional[bool] = None,
                  stop_epoch: Optional[int] = None,
+                 use_best_mean_epoch_for_cv: Optional[bool] = None,
+                 n_ens: Optional[int] = None,
                  ):
         """
         Constructor for RealMLP, using the default parameters from RealMLP-TD.
@@ -251,6 +253,11 @@ def __init__(self, device: Optional[str] = None, random_state: Optional[Union[in
             Epoch at which training should be stopped (for refitting).
             The total length of training used for the schedules will be determined by n_epochs,
             but the stopping epoch will be min(stop_epoch, n_epochs).
+        :param use_best_mean_epoch_for_cv: If training an ensemble,
+            whether they should all use a checkpoint from the same epoch with the best average loss,
+            instead of using the best individual epochs (default=False).
+        :param n_ens: Number of ensemble members that should be used per train-validation split (default=1).
+            For best-epoch selection, the validation scores of averaged predictions will be used.
         """
         super().__init__()  # call the constructor of the other superclass for multiple inheritance
         self.device = device
@@ -323,6 +330,8 @@ def __init__(self, device: Optional[str] = None, random_state: Optional[Union[in
         self.calibration_method = calibration_method
         self.sort_quantile_predictions = sort_quantile_predictions
         self.stop_epoch = stop_epoch
+        self.use_best_mean_epoch_for_cv = use_best_mean_epoch_for_cv
+        self.n_ens = n_ens


 class RealMLP_TD_Classifier(RealMLPConstructorMixin, AlgInterfaceClassifier):
@@ -1762,6 +1771,7 @@ def __init__(self, device: Optional[str] = None, random_state: Optional[Union[in
                  calibration_method: Optional[str] = None,
                  share_training_batches: Optional[bool] = None,
                  val_metric_name: Optional[str] = None,
+                 train_metric_name: Optional[str] = None,
                  ):
         """

@@ -1826,6 +1836,9 @@ def __init__(self, device: Optional[str] = None, random_state: Optional[Union[in
         :param val_metric_name: Name of the validation metric used for early stopping.
             For classification, the default is 'class_error' but could be 'cross_entropy', 'brier', '1-auc_ovr' etc.
             For regression, the default is 'rmse' but could be 'mae'.
+        :param train_metric_name: Name of the metric (loss) used for training.
+            For classification, the default is 'cross_entropy'.
+            For regression, it is 'mse' but could be set to something like 'multi_pinball(0.05,0.95)'.
         """
         self.device = device
         self.random_state = random_state
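Note (illustration, not part of the diff): the new RealMLP constructor arguments documented above could be used as in the sketch below. The import path and the data setup are assumptions; only `n_ens` and `use_best_mean_epoch_for_cv` come from this commit, and `RealMLP_TD_Classifier` appears in the diff itself.

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from pytabkit import RealMLP_TD_Classifier  # assumed import

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Train a packed ensemble of 4 RealMLP-TD members on the single train-validation split;
# with use_best_mean_epoch_for_cv=True, all members use the checkpoint from the epoch
# with the best average validation loss instead of their individual best epochs.
clf = RealMLP_TD_Classifier(n_ens=4, use_best_mean_epoch_for_cv=True)
clf.fit(X_train, y_train)
print(clf.predict_proba(X_test)[:3])
```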
