1 change: 1 addition & 0 deletions .github/workflows/testing.yml
@@ -4,6 +4,7 @@ on:
push:
branches:
- "main"
- "dev"
pull_request:
branches:
- '*'
10 changes: 8 additions & 2 deletions README.md
@@ -121,6 +121,7 @@ If you use this repository for research purposes, please cite our [paper](https:
- Léo Grinsztajn (deep learning baselines, plotting)
- Ingo Steinwart (UCI dataset download)
- Katharina Strecker (PyTorch-Lightning interface)
- Lennart Purucker (some features/fixes)
- Jérôme Dockès (deployment, continuous integration)

## Acknowledgements
@@ -142,12 +143,17 @@ and https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html
by initializing the random state of quantile (and KDI)
preprocessing transforms.
- n_threads parameter is not ignored by NNs anymore.
- Changes by [Lennart Purucker](https://github.com/LennartPurucker):
added a time limit for RealMLP,
added support for `lightning` (while still allowing `pytorch-lightning`; a sketch of the import pattern follows this list),
made skorch a lazy import,
and removed the msgpack\_numpy dependency.
- v1.0.0: Release for the NeurIPS version and arXiv v2.
- More baselines (MLP-PLR, FT-Transformer, TabR-HPO, RF-HPO),
also some un-polished internal interfaces for other methods,
esp. the ones in AutoGluon
esp. the ones in AutoGluon.
- Updated benchmarking code (configurations, plots)
including the new version of the Grinsztajn et al. benchmark
- Updated fit() parameters in scikit-learn interfaces, etc.
- v0.0.1: First release for arXiv v1. Code and data are archived at [DaRUS](https://doi.org/10.18419/darus-4255).
- v0.0.1: First release for arXiv v1.
Code and data are archived at [DaRUS](https://doi.org/10.18419/darus-4255).
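The `lightning` support mentioned in the changelog is implemented as a guarded import, as the file diffs below show; a minimal sketch of the pattern (the `Trainer` arguments are illustrative):

```python
# Prefer the newer unified "lightning" package; fall back to the standalone
# "pytorch-lightning" distribution if it is not installed. Both expose the
# same Trainer API under the alias "pl".
try:
    import lightning.pytorch as pl
except ImportError:
    import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs=10)
```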

15 changes: 11 additions & 4 deletions pyproject.toml
@@ -28,24 +28,31 @@ classifiers = [
dependencies = [
"torch>=2.0",
"numpy>=1.25,<2.0",
"dill",
"pandas>=2.0",
"scikit-learn>=1.3",
"xgboost>=2.0",
"catboost>=1.2",
"lightgbm>=4.1",
# older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics used in tabr slow:
# https://github.com/Lightning-AI/torchmetrics/pull/2184
"torchmetrics>=1.2.1",
"pyyaml>=5.0",
# one can also install the newer lightning package (with more dependencies) instead; it will be prioritized if present
"pytorch_lightning>=2.0",
"msgpack>=1.0",
"msgpack_numpy>=0.4", # apparently fixed some bug in using numpy arrays in msgpack?
"skorch>=0.15", # for rtdl models
"dask[dataframe]>=2023", # this is here because of a pandas warning:
# "Dask dataframe query planning is disabled because dask-expr is not installed"
# "packaging", # unclear why this is here?
"tqdm", # for TabM with verbosity >= 1
"psutil>=5.0",

# packages for saving objects in different formats
"dill",
"pyyaml>=5.0",
"msgpack>=1.0",
# apparently msgpack_numpy fixed some bug with numpy arrays in msgpack,
# but it can also cause a bug in ray due to its monkey-patching of msgpack functions;
# in theory we shouldn't be using it for numpy arrays at the moment, and it is unclear why the need for this occurred
# "msgpack_numpy>=0.4",
]

[project.optional-dependencies]
3 changes: 2 additions & 1 deletion pytabkit/models/alg_interfaces/base.py
@@ -62,9 +62,10 @@ class InterfaceResources:
"""
Simple class representing resources that a method is allowed to use (number of threads and GPUs).
"""
def __init__(self, n_threads: int, gpu_devices: List[str]):
def __init__(self, n_threads: int, gpu_devices: List[str], time_in_seconds: Optional[int] = None):
self.n_threads = n_threads
self.gpu_devices = gpu_devices
self.time_in_seconds = time_in_seconds


class RequiredResources:
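A hedged usage sketch of the extended constructor; the signature comes from the diff above, while the concrete values and call site are illustrative:

```python
from pytabkit.models.alg_interfaces.base import InterfaceResources

# One hour of wall-clock budget on a single GPU; time_in_seconds=None
# (the default) keeps the previous unlimited behavior.
resources = InterfaceResources(n_threads=8, gpu_devices=['cuda:0'],
                               time_in_seconds=3600)
```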
14 changes: 12 additions & 2 deletions pytabkit/models/alg_interfaces/nn_interfaces.py
@@ -5,9 +5,15 @@

import numpy as np
import torch
import pytorch_lightning as pl
try:
import lightning.pytorch as pl
except ImportError:
import pytorch_lightning as pl

import logging

from datetime import timedelta

from pytabkit.models import utils
from pytabkit.models.data.data import DictDataset
from pytabkit.models.hyper_opt.hyper_optimizers import HyperoptOptimizer, SMACOptimizer
@@ -48,7 +54,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
split_id=idxs.split_id) for idxs in idxs_list]

# https://stackoverflow.com/questions/74364944/how-to-get-rid-of-info-logging-messages-in-pytorch-lightning
log = logging.getLogger("pytorch_lightning")
log = logging.getLogger("lightning")
log.propagate = False
log.setLevel(logging.ERROR)

@@ -81,7 +87,10 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
else:
raise ValueError(f'Unknown device "{self.device}"')

max_time = None if interface_resources.time_in_seconds is None else timedelta(seconds=interface_resources.time_in_seconds)

self.trainer = pl.Trainer(
max_time=max_time,
accelerator=pl_accelerator,
devices=pl_devices,
callbacks=self.model.create_callbacks(),
@@ -106,6 +115,7 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
# self.model.to('cpu') # to allow serialization without GPU issues, but doesn't work

# print(f'Importances (sorted):', self.get_importances().sort()[0]) # todo
self.trainer.max_time = None

def predict(self, ds: DictDataset) -> torch.Tensor:
old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
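The time limit is enforced through Lightning's `max_time` argument; a minimal standalone sketch of the conversion used in `fit()` above, assuming a one-hour budget:

```python
from datetime import timedelta

try:
    import lightning.pytorch as pl
except ImportError:
    import pytorch_lightning as pl

time_in_seconds = 3600  # would come from InterfaceResources in the code above
# None means "no limit"; otherwise the Trainer stops fitting once the budget is spent.
max_time = None if time_in_seconds is None else timedelta(seconds=time_in_seconds)
trainer = pl.Trainer(max_time=max_time, max_epochs=1000)
```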
12 changes: 7 additions & 5 deletions pytabkit/models/alg_interfaces/rtdl_interfaces.py
@@ -7,8 +7,6 @@
import torch
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from skorch.dataset import Dataset
from skorch.helper import predefined_split

from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor
from pytabkit.models import utils
@@ -19,9 +17,6 @@
from pytabkit.models.data.data import DictDataset
from pytabkit.models.sklearn.default_params import DefaultParams
from pytabkit.models.training.logging import Logger
from pytabkit.models.nn_models.rtdl_resnet import create_mlp_classifier_skorch, create_mlp_regressor_skorch, \
create_resnet_classifier_skorch, create_resnet_regressor_skorch, create_ft_transformer_classifier_skorch, \
create_ft_transformer_regressor_skorch
from pytabkit.models.training.metrics import insert_missing_class_columns


@@ -41,6 +36,9 @@ def allow_single_underscore(params_config: List[Tuple]) -> List[Tuple]:
class SkorchSubSplitInterface(SklearnSubSplitInterface):
def _fit_sklearn(self, x_df: pd.DataFrame, y: np.ndarray, val_idxs: np.ndarray,
cat_col_names: Optional[List[str]] = None):
from skorch.helper import predefined_split
from skorch.dataset import Dataset

# set number of classes
if self.n_classes > 0: # classification
self.model.set_n_classes(self.n_classes)
@@ -158,6 +156,7 @@ def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str
params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0]
if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None:
params['checkpoint_dir'] = './rtdl_checkpoints'
from pytabkit.models.nn_models.rtdl_resnet import create_mlp_classifier_skorch, create_mlp_regressor_skorch
if self.n_classes > 0:
return create_mlp_classifier_skorch(**params)
else:
@@ -216,6 +215,8 @@ def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str
params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0]
if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None:
params['checkpoint_dir'] = './rtdl_checkpoints'

from pytabkit.models.nn_models.rtdl_resnet import create_resnet_classifier_skorch, create_resnet_regressor_skorch
if self.n_classes > 0:
return create_resnet_classifier_skorch(**params)
else:
@@ -275,6 +276,7 @@ def _create_sklearn_model(self, seed: int, n_threads: int, gpu_devices: List[str
params['device'] = 'cpu' if len(gpu_devices) == 0 else gpu_devices[0]
if 'checkpoint_dir' not in params or params['checkpoint_dir'] is None:
params['checkpoint_dir'] = './rtdl_checkpoints'
from pytabkit.models.nn_models.rtdl_resnet import create_ft_transformer_classifier_skorch, create_ft_transformer_regressor_skorch
if self.n_classes > 0:
return create_ft_transformer_classifier_skorch(**params)
else:
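Moving the skorch imports into the factory methods makes skorch a lazy dependency: importing pytabkit no longer pulls in skorch unless one of these models is actually built. A small illustration of the idea, with a hypothetical factory name (`params` must include the torch module and training settings skorch expects):

```python
def create_model(n_classes: int, **params):
    # Imported only when the factory runs, not when the enclosing
    # module is imported, so skorch stays optional at import time.
    from skorch import NeuralNetClassifier, NeuralNetRegressor

    cls = NeuralNetClassifier if n_classes > 0 else NeuralNetRegressor
    return cls(**params)
```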
97 changes: 52 additions & 45 deletions pytabkit/models/alg_interfaces/tabr_interface.py
@@ -6,8 +6,6 @@
import torch
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from skorch.dataset import Dataset
from skorch.helper import predefined_split

from pytabkit.models.alg_interfaces.resource_computation import ResourcePredictor
from pytabkit.models import utils
@@ -26,8 +24,14 @@
from pytabkit.models.training.metrics import insert_missing_class_columns

import torch.utils.data
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
try:
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
except ImportError:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint




class ExceptionPrintingCallback(pl.callbacks.Callback):
@@ -37,47 +41,6 @@ def on_exception(self, trainer, pl_module, exception):
traceback.print_exception(exception)


class TabrDataset(Dataset):
def __init__(self, X_num, X_bin, X_cat, Y):
self.data = {
"Y": Y.reshape(-1)
}
if X_num.shape[1] > 0:
self.data["X_num"] = X_num.float()
if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long()
if X_cat.shape[1] > 0:
self.data["X_cat"] = X_cat.long()
self.size = len(Y)

def __len__(self):
return self.size

def __getitem__(self, idx):
return {"indices": idx}


class TabrDatasetTest(Dataset):
def __init__(self, X_num, X_bin, X_cat):
self.data = {}
if X_num.shape[1] > 0:
self.data["X_num"] = X_num.float()
self.size = len(X_num)
if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long()
self.size = len(X_bin)
if X_cat.shape[1] > 0:
self.data["X_cat"] = X_cat.long()
self.size = len(X_cat)
def __len__(self):
return self.size
def __getitem__(self, idx):
return {
key: self.data[key][idx]
for key in self.data
}


class TabRSubSplitInterface(AlgInterface):
def __init__(self, **config):
super().__init__(**config)
@@ -245,6 +208,28 @@ def fit(self, ds: DictDataset, idxs_list: List[SplitIdxs], interface_resources:
X_bin_val = X_bin_val - 1
X_bin_val[X_bin_val == -1] = 0.5

from skorch.dataset import Dataset

class TabrDataset(Dataset):
def __init__(self, X_num, X_bin, X_cat, Y):
self.data = {
"Y": Y.reshape(-1)
}
if X_num.shape[1] > 0:
self.data["X_num"] = X_num.float()
if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long()
if X_cat.shape[1] > 0:
self.data["X_cat"] = X_cat.long()
self.size = len(Y)

def __len__(self):
return self.size

def __getitem__(self, idx):
return {"indices": idx}


train_dataset = TabrDataset(
X_num,
X_bin,
@@ -398,6 +383,28 @@ def predict(self, ds: DictDataset) -> torch.Tensor:
# replace -1 by 0.5
X_bin[X_bin == -1] = 0.5

from skorch.dataset import Dataset

class TabrDatasetTest(Dataset):
def __init__(self, X_num, X_bin, X_cat):
self.data = {}
if X_num.shape[1] > 0:
self.data["X_num"] = X_num.float()
self.size = len(X_num)
if X_bin.shape[1] > 0:
self.data["X_bin"] = X_bin.long()
self.size = len(X_bin)
if X_cat.shape[1] > 0:
self.data["X_cat"] = X_cat.long()
self.size = len(X_cat)

def __len__(self):
return self.size

def __getitem__(self, idx):
return {
key: self.data[key][idx]
for key in self.data
}

test_dataset = TabrDatasetTest(
X_num,
X_bin,
6 changes: 5 additions & 1 deletion pytabkit/models/nn_models/tabr.py
@@ -5,7 +5,6 @@
from functools import partial

import numpy as np
import pytorch_lightning as pl
import torch
from torch import Tensor
import torch.optim as optim
@@ -16,6 +15,11 @@
from torchmetrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError, AUROC, MeanAbsoluteError
from typing import Any, Optional, Union, Literal, Callable

try:
import lightning.pytorch as pl
except ImportError:
import pytorch_lightning as pl


class NTPLinearLayer(nn.Module):
def __init__(self, in_features: int, out_features: int, bias: bool = True, bias_factor: float = 0.1, linear_init_type: str = 'default'):
6 changes: 5 additions & 1 deletion pytabkit/models/nn_models/tabr_context_freeze.py
@@ -4,7 +4,6 @@
import math
from functools import partial

import pytorch_lightning as pl
import torch
from torch import Tensor
import torch.optim as optim
@@ -16,6 +15,11 @@
from typing import Any, Optional, Union, Literal, Callable, NamedTuple
from tqdm import tqdm

try:
import lightning.pytorch as pl
except ImportError:
import pytorch_lightning as pl

from pytabkit.models.nn_models.tabr import ParametricMishActivationLayer, ParametricReluActivationLayer, ScalingLayer, \
bce_with_logits_and_label_smoothing
