
Commit d8c92f7

add heuristic to subsample too large datasets
1 parent f255a0f commit d8c92f7


2 files changed (+122, -2 lines)


autosklearn/automl.py

Lines changed: 64 additions & 1 deletion
@@ -54,7 +54,8 @@
 from autosklearn.util.hash import hash_array_or_matrix
 from autosklearn.metrics import f1_macro, accuracy, r2
 from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \
-    REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION
+    REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \
+    CLASSIFICATION_TASKS
 from autosklearn.pipeline.components.classification import ClassifierChoice
 from autosklearn.pipeline.components.regression import RegressorChoice
 from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
@@ -459,6 +460,15 @@ def fit(
             raise ValueError('Target value shapes do not match: %s vs %s'
                              % (y.shape, y_test.shape))

+        X, y = self.subsample_if_too_large(
+            X=X,
+            y=y,
+            logger=self._logger,
+            seed=self._seed,
+            memory_limit=self._memory_limit,
+            task=self._task,
+        )
+
         # Reset learnt stuff
         self.models_ = None
         self.cv_models_ = None
@@ -779,6 +789,59 @@ def fit(

         return self

+    @staticmethod
+    def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
+        if isinstance(X, np.ndarray):
+            if X.dtype == np.float32:
+                multiplier = 4
+            elif X.dtype in (np.float64, np.float):
+                multiplier = 8
+            elif X.dtype == np.float128:
+                multiplier = 16
+            else:
+                # Just assuming some value - very unlikely
+                multiplier = 8
+                logger.warning('Unknown dtype for X: %s, assuming it takes 8 bytes/number',
+                               str(X.dtype))
+            megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
+            if memory_limit <= megabytes * 10:
+                new_num_samples = int(
+                    memory_limit / (10 * X.shape[1] * multiplier / 1024 / 1024)
+                )
+                logger.warning(
+                    'Dataset too large for memory limit %dMB, reducing number of samples from '
+                    '%d to %d.',
+                    memory_limit,
+                    X.shape[0],
+                    new_num_samples,
+                )
+                if task in CLASSIFICATION_TASKS:
+                    try:
+                        X, _, y, _ = sklearn.model_selection.train_test_split(
+                            X, y,
+                            train_size=new_num_samples,
+                            random_state=seed,
+                            stratify=y,
+                        )
+                    except Exception:
+                        logger.warning(
+                            'Could not sample dataset in stratified manner, resorting to random '
+                            'sampling',
+                            exc_info=True
+                        )
+                        X, _, y, _ = sklearn.model_selection.train_test_split(
+                            X, y,
+                            train_size=new_num_samples,
+                            random_state=seed,
+                        )
+                else:
+                    X, _, y, _ = sklearn.model_selection.train_test_split(
+                        X, y,
+                        train_size=new_num_samples,
+                        random_state=seed,
+                    )
+        return X, y
+
     def refit(self, X, y):

         # Make sure input data is valid
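
The heuristic above treats X as too large once its estimated footprint exceeds a tenth of the memory limit, and it chooses the subsample size so that the reduced X occupies roughly that tenth. A minimal standalone sketch of the same arithmetic (the helper name and the example figures are illustrative, not part of the commit):

def estimate_new_num_samples(n_samples, n_features, bytes_per_value, memory_limit_mb):
    # Estimated footprint of X in megabytes, mirroring the dtype multiplier above.
    megabytes = n_samples * n_features * bytes_per_value / 1024 / 1024
    # Subsample only if X would occupy more than a tenth of the memory limit.
    if memory_limit_mb <= megabytes * 10:
        # Pick the sample count whose footprint is one tenth of the limit.
        return int(memory_limit_mb / (10 * n_features * bytes_per_value / 1024 / 1024))
    return n_samples

# sklearn's digits data: 1797 samples x 64 float64 features, about 0.88MB.
print(estimate_new_num_samples(1797, 64, 8, 1))   # 204  -> subsampled
print(estimate_new_num_samples(1797, 64, 8, 10))  # 1797 -> left untouched

These two values reappear below as the MULTICLASS_CLASSIFICATION entries of the test fixture.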

test/test_automl/test_automl.py

Lines changed: 58 additions & 1 deletion
@@ -1,4 +1,5 @@
 # -*- encoding: utf-8 -*-
+import itertools
 import os
 import pickle
 import sys
@@ -19,7 +20,15 @@
 from autosklearn.data.xy_data_manager import XYDataManager
 from autosklearn.metrics import accuracy, log_loss, balanced_accuracy
 import autosklearn.pipeline.util as putil
-from autosklearn.constants import MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, REGRESSION
+from autosklearn.constants import (
+    MULTICLASS_CLASSIFICATION,
+    BINARY_CLASSIFICATION,
+    MULTILABEL_CLASSIFICATION,
+    REGRESSION,
+    MULTIOUTPUT_REGRESSION,
+    CLASSIFICATION_TASKS,
+    REGRESSION_TASKS,
+)
 from smac.tae import StatusType

 sys.path.append(os.path.dirname(__file__))
@@ -651,3 +660,51 @@ def test_fail_if_dtype_changes_automl(backend, dask_client):
         X_train.to_numpy(), y_train,
         task=BINARY_CLASSIFICATION,
     )
+
+
+@pytest.mark.parametrize(
+    'memory_limit,task',
+    [
+        (memory_limit, task)
+        for task in itertools.chain(CLASSIFICATION_TASKS, REGRESSION_TASKS)
+        for memory_limit in (1, 10)
+    ]
+)
+def test_subsample_if_too_large(memory_limit, task):
+    fixture = {
+        BINARY_CLASSIFICATION: {1: 436, 10: 569},
+        MULTICLASS_CLASSIFICATION: {1: 204, 10: 1797},
+        MULTILABEL_CLASSIFICATION: {1: 204, 10: 1797},
+        REGRESSION: {1: 1310, 10: 1326},
+        MULTIOUTPUT_REGRESSION: {1: 1310, 10: 1326}
+    }
+    mock = unittest.mock.Mock()
+    if task == BINARY_CLASSIFICATION:
+        X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
+    elif task == MULTICLASS_CLASSIFICATION:
+        X, y = sklearn.datasets.load_digits(return_X_y=True)
+    elif task == MULTILABEL_CLASSIFICATION:
+        X, y_ = sklearn.datasets.load_digits(return_X_y=True)
+        y = np.zeros((X.shape[0], 10))
+        for i, j in enumerate(y_):
+            y[i, j] = 1
+    elif task == REGRESSION:
+        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+        X = np.vstack((X, X, X))
+        y = np.vstack((y.reshape((-1, 1)), y.reshape((-1, 1)), y.reshape((-1, 1))))
+    elif task == MULTIOUTPUT_REGRESSION:
+        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+        y = np.vstack((y, y)).transpose()
+        X = np.vstack((X, X, X))
+        y = np.vstack((y, y, y))
+    else:
+        raise ValueError(task)
+
+    assert X.shape[0] == y.shape[0]
+
+    X_new, y_new = AutoML.subsample_if_too_large(X, y, mock, 1, memory_limit, task)
+    assert X_new.shape[0] == fixture[task][memory_limit]
+    if memory_limit == 1:
+        assert mock.warning.call_count == 1
+    else:
+        assert mock.warning.call_count == 0
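
The fixture values follow from the formula in subsample_if_too_large rather than from hand-tuning: for the breast cancer data (569 samples, 30 float64 features, about 0.13MB) a 1MB limit gives int(1 / (10 * 30 * 8 / 1024 / 1024)) = 436, while a 10MB limit is above ten times the footprint, so all 569 samples stay. A quick replay of the BINARY_CLASSIFICATION row, assuming only scikit-learn and the same seed and stratification as the test:

import sklearn.datasets
import sklearn.model_selection

# Breast cancer data: shape (569, 30), float64, so 8 bytes per value.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
new_num_samples = int(1 / (10 * X.shape[1] * 8 / 1024 / 1024))
print(new_num_samples)  # 436, matching fixture[BINARY_CLASSIFICATION][1]

# Classification tasks are subsampled with a stratified split first.
X_small, _, y_small, _ = sklearn.model_selection.train_test_split(
    X, y, train_size=new_num_samples, random_state=1, stratify=y,
)
print(X_small.shape)  # (436, 30)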
