
Commit db8af12

Innixma and atschalz authored
Tabarena 2025 refinement (#251)
* Initialize with AG LightGBM
* Add new model and search space
* Update PrepGBM: Add residual logic and preprocessors, switch to use LGBModel as superclass; Adapt search space
* Adjust to new tabarena structure
* Adjust to new tabarena structure
* Initialize with AG LightGBM
* Adjust to new tabarena structure
* Make preprocessors and residuals use the correct random seeds
* Add explorative search space
* Remove unnecessary old code
* Remove unnecessary old code
* Undo example changes
* Add functions for linear residual boosting
* Add memory estimation, switch to using AG preprocessors
* Fix bug in memory estimation for OOF-TE
* Adjust search space
* Bug fix in search space
* Add skrub to dependencies
* Minor updates
* Extend preprocessing logic to TabM, TabPFN2.5, CatBoost and XGBoost; Move preprocessing models to a separate directory
* Small bug fix
* Update
* Streamline prep logic
* Refactor prep logic
* Update RealTabPFN-2.5 to new prep logic
* Minor update
* Clean up torch memory and gc
* Add `memory_usage_estimate` tracking
* Stop saving training job logs to S3
* Add missing __init__.py
* Cleanup
* Reduce ModelAgnosticPrepMixin._estimate_memory_usage runtime by 4x
* Minor fix
* Use BulkFeatureGenerator and parallel stage preprocessing, >2x inference speedup
* Minor improvements to linear residual
* Update memory estimation to use n_numeric and n_categorical estimates from preprocessors
* Use `remove_unused_features="false_recursive"` and `post_drop_duplicates=True`
* Update passthrough logic
* Add passthrough_types
* Update
* Update
* Fix pyproject.toml
* Fix YAML serialization
* Update prep_mixin to work with lists instead of tuples
* Update prep_mixin to be faster and fix crashes
* Switch to AG SquashingScaler
* Update CVSplitter import
* Update
* tmp commit
* Update pytabkit version
* Remove debugging code
* Various TabArena updates
* Remove Prep code
* Update to AG 1.5

---------

Co-authored-by: atschalz <[email protected]>
1 parent e860e0b commit db8af12


17 files changed (+146, -29 lines)


tabarena/pyproject.toml

Lines changed: 3 additions & 2 deletions
@@ -15,7 +15,7 @@ requires-python = ">=3.10"
 # uv pip install --prerelease=allow .
 dependencies = [
     # TODO: To use `uv`, you need to do `uv pip install --prerelease=allow .` so it recognizes pre-release AutoGluon
-    "autogluon>=1.4.1b20250910,<1.6", # TODO: Remove after moving `benchmark` code elsewhere
+    "autogluon>=1.5,<1.6", # TODO: Remove after moving `benchmark` code elsewhere
     "bencheval",
     "openml>=0.14.1", # consider making optional
     "pyyaml",
@@ -50,6 +50,7 @@ tabm = ["torch"]
 modernnca = ["category_encoders"]
 xrfm = ["xrfm[cu12]"]
 sap-rpt-oss = ["sap_rpt_oss @ git+https://github.com/SAP-samples/sap-rpt-1-oss.git@a323a0aff976fda4ac43c3196a92406de7689aaa"]
+tabprep = []
 
 # union of all above extras (mirrors your "benchmark" extra)
 benchmark = [
@@ -62,7 +63,7 @@ benchmark = [
     "torch",
     "category_encoders",
     "xrfm[cu12]",
-    "sap_rpt_oss @ git+https://github.com/SAP-samples/sap-rpt-1-oss.git@a323a0aff976fda4ac43c3196a92406de7689aaa"
+    "sap_rpt_oss @ git+https://github.com/SAP-samples/sap-rpt-1-oss.git@a323a0aff976fda4ac43c3196a92406de7689aaa",
 ]
 
 [project.urls]

tabarena/tabarena/benchmark/experiment/experiment_constructor.py

Lines changed: 26 additions & 5 deletions
@@ -534,7 +534,30 @@ def from_yaml(cls, path: str, context=None) -> list[Experiment]:
 
         experiments = []
         for experiment in yaml_out:
-            experiments.append(YamlSingleExperimentSerializer.parse_method(experiment, context=context))
+            experiments.append(
+                YamlSingleExperimentSerializer.parse_method(
+                    experiment, context=context
+                )
+            )
+
+        return experiments
+
+    @classmethod
+    def from_yaml_str(cls, yaml_str: str, context=None) -> list[Experiment]:
+        """
+        Parse a YAML string containing multiple experiment definitions
+        and return a list of Experiment instances.
+        """
+        yaml_out = yaml.safe_load(yaml_str)
+        methods = yaml_out["methods"]
+
+        experiments = []
+        for experiment in methods:
+            experiments.append(
+                YamlSingleExperimentSerializer.parse_method(
+                    experiment, context=context
+                )
+            )
 
         return experiments
 
@@ -562,7 +585,5 @@ def to_yaml_str(cls, experiments: list[Experiment]) -> str:
     def _to_yaml_format(cls, experiments: list[Experiment]) -> dict[str, list[dict]]:
         yaml_lst = []
         for experiment in experiments:
-            yaml_dict = experiment.to_yaml_dict()
-            yaml_lst.append(yaml_dict)
-        yaml_out = {"methods": yaml_lst}
-        return yaml_out
+            yaml_lst.append(experiment.to_yaml_dict())
+        return {"methods": yaml_lst}
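As a usage sketch, the YAML layout that the new from_yaml_str expects mirrors what _to_yaml_format produces: a document with a top-level methods list, each entry handed to YamlSingleExperimentSerializer.parse_method. The enclosing serializer class name is not visible in this hunk, so the commented call below uses an assumed name, and the entry fields are placeholders rather than the real experiment schema.

import yaml

# Minimal document in the expected layout; fields are illustrative only.
yaml_str = """\
methods:
  - name: Dummy_config  # hypothetical entry
"""

parsed = yaml.safe_load(yaml_str)
assert isinstance(parsed["methods"], list)  # the key from_yaml_str reads

# With real entries, parsing would look like (class name assumed):
# experiments = YamlExperimentSerializer.from_yaml_str(yaml_str)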

tabarena/tabarena/benchmark/models/ag/tabdpt/tabdpt_model.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
 import pandas as pd
 
 
+# FIXME: Add CPU loading support (.to(device))
 class TabDPTModel(AbstractModel):
     ag_key = "TA-TABDPT"
     ag_name = "TA-TabDPT"

tabarena/tabarena/benchmark/models/ag/tabm/tabm_model.py

Lines changed: 4 additions & 1 deletion
@@ -273,7 +273,10 @@ def get_tabm_auto_batch_size(cls, n_samples: int) -> int:
 
     @classmethod
     def _class_tags(cls):
-        return {"can_estimate_memory_usage_static": True}
+        return {
+            "can_estimate_memory_usage_static": True,
+            "reset_torch_threads": True,
+        }
 
     def _more_tags(self) -> dict:
         # TODO: Need to add train params support, track best epoch

tabarena/tabarena/benchmark/models/ag/tabpfnv2_5/tabpfnv2_5_model.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def _fit(
                 "Please switch to CPU usage instead.",
             )
 
-        X = self.preprocess(X, is_train=True)
+        X = self.preprocess(X, y=y, is_train=True)
 
         hps = self._get_model_params()
         hps["device"] = device
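Passing y into preprocess at train time lets target-aware transformers (the commit message above mentions out-of-fold target encoding) fit on labels, while inference-time preprocessing stays label-free. A toy illustration of that split, using scikit-learn's TargetEncoder as a stand-in rather than the model's actual preprocessing code:

import pandas as pd
from sklearn.preprocessing import TargetEncoder  # illustrative encoder only


class TargetAwarePreprocessor:
    """Toy stand-in: needs y when fitting, but not when transforming at inference."""

    def __init__(self) -> None:
        self._encoder = TargetEncoder()

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        # Train-time path, analogous to preprocess(X, y=y, is_train=True).
        return pd.DataFrame(self._encoder.fit_transform(X, y), index=X.index)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Inference-time path: no labels required.
        return pd.DataFrame(self._encoder.transform(X), index=X.index)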

tabarena/tabarena/benchmark/models/wrapper/AutoGluon_class.py

Lines changed: 19 additions & 4 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import copy
+import gc
 import shutil
 from typing import Type
 
@@ -73,6 +74,15 @@ def get_metric_error_val(self) -> float:
 
     def cleanup(self):
         shutil.rmtree(self.predictor.path, ignore_errors=True)
+        gc.collect()
+        try:
+            import torch
+        except ImportError:
+            pass
+        else:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
 
 class AGSingleWrapper(AGWrapper):
@@ -153,10 +163,13 @@ def model_cls(self) -> Type["AbstractModel"]:
         model_cls = ag_model_registry.key_to_cls(key=self._model_cls)
         return model_cls
 
-    def _load_model(self):
+    def _load_model(self, assert_single_model: bool = True):
         model_names = self.predictor.model_names(can_infer=True)
-        assert len(model_names) == 1
-        model_name = self.predictor.model_names()[0]
+        if assert_single_model:
+            assert len(model_names) == 1
+            model_name = self.predictor.model_names()[0]
+        else:
+            model_name = self.predictor.model_best
         return self.predictor._trainer.load_model(model_name)
 
     def get_metadata_init(self) -> dict:
@@ -172,14 +185,16 @@ def get_metadata_init(self) -> dict:
 
     def get_metadata_fit(self) -> dict:
        metadata = {}
-        model = self._load_model()
+        model = self._load_model(assert_single_model=False)
         metadata["info"] = model.get_info(include_feature_metadata=False)
         metadata["disk_usage"] = model.disk_usage()
         metadata["num_cpus"] = model.fit_num_cpus
         metadata["num_gpus"] = model.fit_num_gpus
         metadata["num_cpus_child"] = model.fit_num_cpus_child
         metadata["num_gpus_child"] = model.fit_num_gpus_child
         metadata["fit_metadata"] = model.get_fit_metadata()
+        if hasattr(model, "_memory_usage_estimate"):
+            metadata["memory_usage_estimate"] = model._memory_usage_estimate
         return metadata
 
     def get_metadata_failure(self) -> dict:
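The cleanup pattern added above, collect garbage and then release the CUDA caching allocator when torch is importable and a GPU is present, also works as a standalone helper. A minimal sketch (the function name is illustrative; only gc.collect, torch.cuda.is_available, and torch.cuda.empty_cache are taken from the diff):

import gc


def release_memory() -> None:
    """Free Python garbage and, if torch with CUDA is available, the CUDA cache."""
    gc.collect()
    try:
        import torch
    except ImportError:
        return
    if torch.cuda.is_available():
        torch.cuda.empty_cache()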

tabarena/tabarena/benchmark/models/wrapper/abstract_class.py

Lines changed: 15 additions & 1 deletion
@@ -104,7 +104,9 @@ def _fit_custom(self, X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame) -> dict:
         dict
             Returns predictions, probabilities, fit time and inference time
         """
-        with (Timer() as timer_fit):
+        from tabarena.utils.memory_utils import CpuMemoryTracker, GpuMemoryTracker
+
+        with CpuMemoryTracker() as cpu_tracker, GpuMemoryTracker(device=0) as gpu_tracker, Timer() as timer_fit:
             self.fit(X, y)
 
         self.post_fit(X=X, y=y, X_test=X_test)
@@ -125,6 +127,18 @@
             "time_infer_s": timer_predict.duration,
         }
 
+        out["memory_usage"] = dict(
+            peak_mem_cpu=cpu_tracker.peak_rss,
+            min_mem_cpu=cpu_tracker.min_rss,
+
+            peak_mem_gpu=gpu_tracker.peak_allocated,
+            peak_mem_gpu_reserved=gpu_tracker.peak_reserved,
+            min_mem_gpu=gpu_tracker.min_allocated,
+            min_mem_gpu_reserved=gpu_tracker.min_reserved,
+
+            gpu_tracking_enabled=gpu_tracker.enabled,
+        )
+
         return out
 
     def fit(self, X: pd.DataFrame, y: pd.Series, X_val=None, y_val=None):
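The CpuMemoryTracker and GpuMemoryTracker classes come from tabarena.utils.memory_utils, which is not part of this diff. As a rough illustration of the interface consumed above (context-manager protocol plus peak_rss/min_rss attributes), a simplified CPU-only sketch could look like the following; it samples RSS only on enter and exit rather than continuously, so it is an assumption about the interface, not the actual implementation. A GPU counterpart would typically build on torch.cuda.max_memory_allocated() and torch.cuda.max_memory_reserved().

import psutil


class SimpleCpuMemoryTracker:
    """Illustrative stand-in for CpuMemoryTracker: records RSS at enter and exit only."""

    def __init__(self) -> None:
        self._process = psutil.Process()
        self.peak_rss: int | None = None
        self.min_rss: int | None = None

    def _sample(self) -> None:
        rss = self._process.memory_info().rss
        self.peak_rss = rss if self.peak_rss is None else max(self.peak_rss, rss)
        self.min_rss = rss if self.min_rss is None else min(self.min_rss, rss)

    def __enter__(self) -> "SimpleCpuMemoryTracker":
        self._sample()
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self._sample()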

tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata.py

Lines changed: 3 additions & 1 deletion
@@ -34,7 +34,8 @@
 from tabarena.nips2025_utils.artifacts._tabarena_method_metadata_2025_11_12 import realtabpfn25_metadata, contexttab_metadata
 
 from tabarena.nips2025_utils.artifacts._tabarena_method_metadata_misc import (
-    gbm_aio_0808_metadata
+    gbm_aio_0808_metadata,
+    # prep_gbm_v6_metadata,
 )
 
 methods_2025_09_03: list[MethodMetadata] = [
@@ -71,6 +72,7 @@
 
 methods_misc: list[MethodMetadata] = [
     gbm_aio_0808_metadata,
+    # prep_gbm_v6_metadata,
 ]
 
 replaced_methods = [

tabarena/tabarena/nips2025_utils/artifacts/_tabarena_method_metadata_misc.py

Lines changed: 23 additions & 0 deletions
@@ -25,3 +25,26 @@
     s3_prefix="cache_aio",
     verified=False,
 )
+
+# LightGBM w/ custom preprocessing pipeline (only first 3 repeats)
+# s3 cache = "cache_aio"
+prep_gbm_v6_metadata = MethodMetadata(
+    method="prep_LightGBM_v6",
+    artifact_name="prep_LightGBM_v6",
+    method_type="config",
+    compute="cpu",
+    date="2025-12-16",
+    ag_key="prep_GBM",
+    model_key="prep_GBM_v6",
+    config_default="prep_LightGBM_v6_c1_BAG_L1",
+    name_suffix=None,
+    has_raw=True,
+    has_processed=True,
+    has_results=True,
+    upload_as_public=True,
+    can_hpo=True,
+    is_bag=True,
+    s3_bucket="tabarena",
+    s3_prefix="cache_aio",
+    verified=True,
+)

tabarena/tabarena/nips2025_utils/end_to_end.py

Lines changed: 2 additions & 0 deletions
@@ -418,6 +418,7 @@ def compare_on_tabarena(
         leaderboard_kwargs: dict | None = None,
         tabarena_context_kwargs: dict | None = None,
         extra_results: pd.DataFrame = None,
+        remove_imputed: bool = False,
     ) -> pd.DataFrame:
         """Compare results on TabArena leaderboard.
 
@@ -451,6 +452,7 @@
             average_seeds=average_seeds,
             leaderboard_kwargs=leaderboard_kwargs,
             tabarena_context_kwargs=tabarena_context_kwargs,
+            remove_imputed=remove_imputed,
         )
 
     def get_results(