
Commit d8c92f7

add heuristic to subsample too large datasets
1 parent f255a0f commit d8c92f7


2 files changed (+122, -2 lines)


autosklearn/automl.py

Lines changed: 64 additions & 1 deletion
@@ -54,7 +54,8 @@
 from autosklearn.util.hash import hash_array_or_matrix
 from autosklearn.metrics import f1_macro, accuracy, r2
 from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \
-    REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION
+    REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \
+    CLASSIFICATION_TASKS
 from autosklearn.pipeline.components.classification import ClassifierChoice
 from autosklearn.pipeline.components.regression import RegressorChoice
 from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
@@ -459,6 +460,15 @@ def fit(
             raise ValueError('Target value shapes do not match: %s vs %s'
                              % (y.shape, y_test.shape))

+        X, y = self.subsample_if_too_large(
+            X=X,
+            y=y,
+            logger=self._logger,
+            seed=self._seed,
+            memory_limit=self._memory_limit,
+            task=self._task,
+        )
+
         # Reset learnt stuff
         self.models_ = None
         self.cv_models_ = None
@@ -779,6 +789,59 @@ def fit(

         return self

+    @staticmethod
+    def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
+        if isinstance(X, np.ndarray):
+            if X.dtype == np.float32:
+                multiplier = 4
+            elif X.dtype in (np.float64, np.float):
+                multiplier = 8
+            elif X.dtype == np.float128:
+                multiplier = 16
+            else:
+                # Just assuming some value - very unlikely
+                multiplier = 8
+                logger.warning('Unknown dtype for X: %s, assuming it takes 8 bytes/number',
+                               str(X.dtype))
+            megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
+            if memory_limit <= megabytes * 10:
+                new_num_samples = int(
+                    memory_limit / (10 * X.shape[1] * multiplier / 1024 / 1024)
+                )
+                logger.warning(
+                    'Dataset too large for memory limit %dMB, reducing number of samples from '
+                    '%d to %d.',
+                    memory_limit,
+                    X.shape[0],
+                    new_num_samples,
+                )
+                if task in CLASSIFICATION_TASKS:
+                    try:
+                        X, _, y, _ = sklearn.model_selection.train_test_split(
+                            X, y,
+                            train_size=new_num_samples,
+                            random_state=seed,
+                            stratify=y,
+                        )
+                    except Exception:
+                        logger.warning(
+                            'Could not sample dataset in stratified manner, resorting to random '
+                            'sampling',
+                            exc_info=True
+                        )
+                        X, _, y, _ = sklearn.model_selection.train_test_split(
+                            X, y,
+                            train_size=new_num_samples,
+                            random_state=seed,
+                        )
+                else:
+                    X, _, y, _ = sklearn.model_selection.train_test_split(
+                        X, y,
+                        train_size=new_num_samples,
+                        random_state=seed,
+                    )
+        return X, y
+
     def refit(self, X, y):

         # Make sure input data is valid
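
The heuristic above treats X as too large once its estimated footprint exceeds a tenth of the memory limit, and it chooses the subsample size so that the reduced X occupies roughly that tenth. A minimal standalone sketch of the same arithmetic (the helper name and the example figures are illustrative, not part of the commit):

def estimate_new_num_samples(n_samples, n_features, bytes_per_value, memory_limit_mb):
    # Estimated footprint of X in megabytes, mirroring the dtype multiplier above.
    megabytes = n_samples * n_features * bytes_per_value / 1024 / 1024
    # Subsample only if X would occupy more than a tenth of the memory limit.
    if memory_limit_mb <= megabytes * 10:
        # Pick the sample count whose footprint is one tenth of the limit.
        return int(memory_limit_mb / (10 * n_features * bytes_per_value / 1024 / 1024))
    return n_samples

# sklearn's digits data: 1797 samples x 64 float64 features, about 0.88MB.
print(estimate_new_num_samples(1797, 64, 8, 1))   # 204  -> subsampled
print(estimate_new_num_samples(1797, 64, 8, 10))  # 1797 -> left untouched

These two values reappear below as the MULTICLASS_CLASSIFICATION entries of the test fixture.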

test/test_automl/test_automl.py

Lines changed: 58 additions & 1 deletion
@@ -1,4 +1,5 @@
 # -*- encoding: utf-8 -*-
+import itertools
 import os
 import pickle
 import sys
@@ -19,7 +20,15 @@
 from autosklearn.data.xy_data_manager import XYDataManager
 from autosklearn.metrics import accuracy, log_loss, balanced_accuracy
 import autosklearn.pipeline.util as putil
-from autosklearn.constants import MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, REGRESSION
+from autosklearn.constants import (
+    MULTICLASS_CLASSIFICATION,
+    BINARY_CLASSIFICATION,
+    MULTILABEL_CLASSIFICATION,
+    REGRESSION,
+    MULTIOUTPUT_REGRESSION,
+    CLASSIFICATION_TASKS,
+    REGRESSION_TASKS,
+)
 from smac.tae import StatusType

 sys.path.append(os.path.dirname(__file__))
@@ -651,3 +660,51 @@ def test_fail_if_dtype_changes_automl(backend, dask_client):
         X_train.to_numpy(), y_train,
         task=BINARY_CLASSIFICATION,
     )
+
+
+@pytest.mark.parametrize(
+    'memory_limit,task',
+    [
+        (memory_limit, task)
+        for task in itertools.chain(CLASSIFICATION_TASKS, REGRESSION_TASKS)
+        for memory_limit in (1, 10)
+    ]
+)
+def test_subsample_if_too_large(memory_limit, task):
+    fixture = {
+        BINARY_CLASSIFICATION: {1: 436, 10: 569},
+        MULTICLASS_CLASSIFICATION: {1: 204, 10: 1797},
+        MULTILABEL_CLASSIFICATION: {1: 204, 10: 1797},
+        REGRESSION: {1: 1310, 10: 1326},
+        MULTIOUTPUT_REGRESSION: {1: 1310, 10: 1326}
+    }
+    mock = unittest.mock.Mock()
+    if task == BINARY_CLASSIFICATION:
+        X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
+    elif task == MULTICLASS_CLASSIFICATION:
+        X, y = sklearn.datasets.load_digits(return_X_y=True)
+    elif task == MULTILABEL_CLASSIFICATION:
+        X, y_ = sklearn.datasets.load_digits(return_X_y=True)
+        y = np.zeros((X.shape[0], 10))
+        for i, j in enumerate(y_):
+            y[i, j] = 1
+    elif task == REGRESSION:
+        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+        X = np.vstack((X, X, X))
+        y = np.vstack((y.reshape((-1, 1)), y.reshape((-1, 1)), y.reshape((-1, 1))))
+    elif task == MULTIOUTPUT_REGRESSION:
+        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+        y = np.vstack((y, y)).transpose()
+        X = np.vstack((X, X, X))
+        y = np.vstack((y, y, y))
+    else:
+        raise ValueError(task)
+
+    assert X.shape[0] == y.shape[0]
+
+    X_new, y_new = AutoML.subsample_if_too_large(X, y, mock, 1, memory_limit, task)
+    assert X_new.shape[0] == fixture[task][memory_limit]
+    if memory_limit == 1:
+        assert mock.warning.call_count == 1
+    else:
+        assert mock.warning.call_count == 0
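
The fixture values follow from the formula in subsample_if_too_large rather than from hand-tuning: for the breast cancer data (569 samples, 30 float64 features, about 0.13MB) a 1MB limit gives int(1 / (10 * 30 * 8 / 1024 / 1024)) = 436, while a 10MB limit is above ten times the footprint, so all 569 samples stay. A quick replay of the BINARY_CLASSIFICATION row, assuming only scikit-learn and the same seed and stratification as the test:

import sklearn.datasets
import sklearn.model_selection

# Breast cancer data: shape (569, 30), float64, so 8 bytes per value.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
new_num_samples = int(1 / (10 * X.shape[1] * 8 / 1024 / 1024))
print(new_num_samples)  # 436, matching fixture[BINARY_CLASSIFICATION][1]

# Classification tasks are subsampled with a stratified split first.
X_small, _, y_small, _ = sklearn.model_selection.train_test_split(
    X, y, train_size=new_num_samples, random_state=1, stratify=y,
)
print(X_small.shape)  # (436, 30)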
