
Commit 674eee4

Add precision reduction (#1178)
* Add precision reduction in case of data being too large
* reduce scipy dependency
1 parent c86928f commit 674eee4

File tree

2 files changed: +64 -20 lines changed


autosklearn/automl.py

Lines changed: 25 additions & 2 deletions
```diff
@@ -849,17 +849,40 @@ def subsample_if_too_large(
         task: int,
     ):
         if memory_limit and isinstance(X, np.ndarray):
+
             if X.dtype == np.float32:
                 multiplier = 4
-            elif X.dtype in (np.float64, np.float):
+            elif X.dtype in (np.float64, float):
                 multiplier = 8
-            elif X.dtype == np.float128:
+            elif (
+                # In spite of the names, np.float96 and np.float128
+                # provide only as much precision as np.longdouble,
+                # that is, 80 bits on most x86 machines and 64 bits
+                # in standard Windows builds.
+                (hasattr(np, 'float128') and X.dtype == np.float128)
+                or (hasattr(np, 'float96') and X.dtype == np.float96)
+            ):
                 multiplier = 16
             else:
                 # Just assuming some value - very unlikely
                 multiplier = 8
                 logger.warning('Unknown dtype for X: %s, assuming it takes 8 bit/number',
                                str(X.dtype))
+
+            megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
+            if memory_limit <= megabytes * 10 and X.dtype != np.float32:
+                cast_to = {
+                    8: np.float32,
+                    16: np.float64,
+                }.get(multiplier, np.float32)
+                logger.warning(
+                    'Dataset too large for memory limit %dMB, reducing the precision from %s to %s',
+                    memory_limit,
+                    X.dtype,
+                    cast_to,
+                )
+                X = X.astype(cast_to)
+
             megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
             if memory_limit <= megabytes * 10:
                 new_num_samples = int(
```
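In plain terms, the new block estimates the array's in-memory footprint from its dtype and, whenever that footprint exceeds a tenth of the memory limit, casts the data one precision level down (float64 to float32, float128 to float64) before the pre-existing row subsampling runs. A minimal standalone sketch of the heuristic, assuming a bare `logging` setup; the `reduce_precision` name is illustrative and not part of auto-sklearn's API:

```python
import logging

import numpy as np

logger = logging.getLogger(__name__)


def reduce_precision(X: np.ndarray, memory_limit: int) -> np.ndarray:
    """Illustrative sketch of the heuristic above, not auto-sklearn's API."""
    # Bytes per element for the float dtypes the diff expects to see.
    if X.dtype == np.float32:
        multiplier = 4
    elif X.dtype in (np.float64, float):
        multiplier = 8
    elif ((hasattr(np, 'float128') and X.dtype == np.float128)
          or (hasattr(np, 'float96') and X.dtype == np.float96)):
        multiplier = 16
    else:
        multiplier = 8  # fallback guess for unknown dtypes

    megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
    # The factor of 10 is a safety margin: reduce precision as soon as the
    # raw data alone would occupy more than a tenth of the memory limit.
    if memory_limit <= megabytes * 10 and X.dtype != np.float32:
        cast_to = {8: np.float32, 16: np.float64}.get(multiplier, np.float32)
        logger.warning('Reducing precision from %s to %s', X.dtype, cast_to)
        X = X.astype(cast_to)
    return X
```

Note that after the cast the diff recomputes `megabytes` with the unchanged `multiplier`, so the subsequent subsampling decision is still sized for the original dtype; the updated test fixtures below reflect that.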

test/test_automl/test_automl.py

Lines changed: 39 additions & 18 deletions
```diff
@@ -660,24 +660,45 @@ def test_fail_if_feat_type_on_pandas_input(backend, dask_client):
 
 
 @pytest.mark.parametrize(
-    'memory_limit,task',
+    'memory_limit,precision,task',
     [
-        (memory_limit, task)
+        (memory_limit, precision, task)
         for task in itertools.chain(CLASSIFICATION_TASKS, REGRESSION_TASKS)
-        for memory_limit in (1, 10, None)
+        for precision in (float, np.float32, np.float64, np.float128)
+        for memory_limit in (1, 100, None)
     ]
 )
-def test_subsample_if_too_large(memory_limit, task):
+def test_subsample_if_too_large(memory_limit, precision, task):
     fixture = {
-        BINARY_CLASSIFICATION: {1: 436, 10: 569, None: 569},
-        MULTICLASS_CLASSIFICATION: {1: 204, 10: 1797, None: 1797},
-        MULTILABEL_CLASSIFICATION: {1: 204, 10: 1797, None: 1797},
-        REGRESSION: {1: 1310, 10: 1326, None: 1326},
-        MULTIOUTPUT_REGRESSION: {1: 1310, 10: 1326, None: 1326}
+        BINARY_CLASSIFICATION: {
+            1: {float: 1310, np.float32: 2621, np.float64: 1310, np.float128: 655},
+            100: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000},
+            None: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000},
+        },
+        MULTICLASS_CLASSIFICATION: {
+            1: {float: 204, np.float32: 409, np.float64: 204, np.float128: 102},
+            100: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
+            None: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
+        },
+        MULTILABEL_CLASSIFICATION: {
+            1: {float: 204, np.float32: 409, np.float64: 204, np.float128: 102},
+            100: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
+            None: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
+        },
+        REGRESSION: {
+            1: {float: 655, np.float32: 1310, np.float64: 655, np.float128: 327},
+            100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
+            None: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
+        },
+        MULTIOUTPUT_REGRESSION: {
+            1: {float: 655, np.float32: 1310, np.float64: 655, np.float128: 327},
+            100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
+            None: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
+        }
     }
     mock = unittest.mock.Mock()
     if task == BINARY_CLASSIFICATION:
-        X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
+        X, y = sklearn.datasets.make_hastie_10_2()
     elif task == MULTICLASS_CLASSIFICATION:
         X, y = sklearn.datasets.load_digits(return_X_y=True)
     elif task == MULTILABEL_CLASSIFICATION:
@@ -686,22 +707,22 @@ def test_subsample_if_too_large(memory_limit, task):
         for i, j in enumerate(y_):
             y[i, j] = 1
     elif task == REGRESSION:
-        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
-        X = np.vstack((X, X, X))
-        y = np.vstack((y.reshape((-1, 1)), y.reshape((-1, 1)), y.reshape((-1, 1))))
+        X, y = sklearn.datasets.make_friedman1(n_samples=5000, n_features=20)
     elif task == MULTIOUTPUT_REGRESSION:
-        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+        X, y = sklearn.datasets.make_friedman1(n_samples=5000, n_features=20)
         y = np.vstack((y, y)).transpose()
-        X = np.vstack((X, X, X))
-        y = np.vstack((y, y, y))
     else:
         raise ValueError(task)
+    X = X.astype(precision)
 
     assert X.shape[0] == y.shape[0]
 
     X_new, y_new = AutoML.subsample_if_too_large(X, y, mock, 1, memory_limit, task)
-    assert X_new.shape[0] == fixture[task][memory_limit]
+    assert X_new.shape[0] == fixture[task][memory_limit][precision]
     if memory_limit == 1:
-        assert mock.warning.call_count == 1
+        if precision in (np.float128, np.float64, float):
+            assert mock.warning.call_count == 2
+        else:
+            assert mock.warning.call_count == 1
     else:
         assert mock.warning.call_count == 0
```
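The fixture values can be checked by hand. `sklearn.datasets.load_digits` returns a 1797 x 64 array; in float64 that is 1797 * 64 * 8 / 1024^2, roughly 0.88 MB, so a 1 MB limit trips the `memory_limit <= megabytes * 10` guard, emits the precision warning, and then subsamples. A hedged sketch of that arithmetic, assuming the subsample keeps a `memory_limit / (10 * megabytes)` fraction of the rows; that formula is consistent with every fixture entry but lies outside the hunks shown above:

```python
import numpy as np


def expected_rows(n_samples: int, n_features: int, dtype, memory_limit: int) -> int:
    """Reproduce the fixture arithmetic.

    Hypothetical helper: the subsample fraction memory_limit / (10 * megabytes)
    is inferred from the fixture values, not taken from the diff above.
    """
    multiplier = {np.dtype(np.float32): 4, np.dtype(np.float64): 8}.get(
        np.dtype(dtype), 16  # float96/float128 fall through to 16
    )
    megabytes = n_samples * n_features * multiplier / 1024 / 1024
    if memory_limit > megabytes * 10:
        return n_samples  # fits comfortably; no subsampling
    # Precision reduction happens first, but the row estimate still uses
    # the original dtype's multiplier, hence float64 keeps half as many
    # rows as float32.
    return int(memory_limit / (megabytes * 10) * n_samples)


assert expected_rows(1797, 64, np.float64, 1) == 204    # digits, float64
assert expected_rows(1797, 64, np.float32, 1) == 409    # digits, float32
assert expected_rows(12000, 10, np.float64, 1) == 1310  # hastie_10_2, float64
```

This also explains the warning counts at the end of the test: float, float64, and float128 inputs trigger both the precision warning and the subsampling warning, while float32 inputs trigger only the latter.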
