|
1 | 1 | # -*- encoding: utf-8 -*- |
| 2 | +import itertools |
2 | 3 | import os |
3 | 4 | import pickle |
4 | 5 | import sys |
|
19 | 20 | from autosklearn.data.xy_data_manager import XYDataManager |
20 | 21 | from autosklearn.metrics import accuracy, log_loss, balanced_accuracy |
21 | 22 | import autosklearn.pipeline.util as putil |
22 | | -from autosklearn.constants import MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, REGRESSION |
| 23 | +from autosklearn.constants import ( |
| 24 | + MULTICLASS_CLASSIFICATION, |
| 25 | + BINARY_CLASSIFICATION, |
| 26 | + MULTILABEL_CLASSIFICATION, |
| 27 | + REGRESSION, |
| 28 | + MULTIOUTPUT_REGRESSION, |
| 29 | + CLASSIFICATION_TASKS, |
| 30 | + REGRESSION_TASKS, |
| 31 | +) |
23 | 32 | from smac.tae import StatusType |
24 | 33 |
|
25 | 34 | sys.path.append(os.path.dirname(__file__)) |
@@ -651,3 +660,51 @@ def test_fail_if_dtype_changes_automl(backend, dask_client): |
651 | 660 | X_train.to_numpy(), y_train, |
652 | 661 | task=BINARY_CLASSIFICATION, |
653 | 662 | ) |
| 663 | + |
| 664 | + |
@pytest.mark.parametrize(
    'memory_limit,task',
    [
        (mem, task_type)
        for task_type in itertools.chain(CLASSIFICATION_TASKS, REGRESSION_TASKS)
        for mem in (1, 10)
    ]
)
def test_subsample_if_too_large(memory_limit, task):
    """Check AutoML.subsample_if_too_large across all task types.

    For each task type and memory limit, the method must reduce the dataset to
    the expected number of rows, and it must log a warning exactly once when
    subsampling actually takes place (here: when memory_limit == 1).
    """
    # Expected number of rows after subsampling, keyed by task and memory limit.
    expected_rows = {
        BINARY_CLASSIFICATION: {1: 436, 10: 569},
        MULTICLASS_CLASSIFICATION: {1: 204, 10: 1797},
        MULTILABEL_CLASSIFICATION: {1: 204, 10: 1797},
        REGRESSION: {1: 1310, 10: 1326},
        MULTIOUTPUT_REGRESSION: {1: 1310, 10: 1326}
    }
    logger = unittest.mock.Mock()

    # Build a dataset whose shape matches the requested task type.
    if task == BINARY_CLASSIFICATION:
        X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    elif task == MULTICLASS_CLASSIFICATION:
        X, y = sklearn.datasets.load_digits(return_X_y=True)
    elif task == MULTILABEL_CLASSIFICATION:
        X, labels = sklearn.datasets.load_digits(return_X_y=True)
        # One-hot encode the digit labels to obtain a multilabel target.
        y = np.zeros((X.shape[0], 10))
        for row, label in enumerate(labels):
            y[row, label] = 1
    elif task == REGRESSION:
        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
        # Triple the data so it is large enough to trigger subsampling.
        X = np.vstack((X, X, X))
        y = np.vstack((y.reshape((-1, 1)), y.reshape((-1, 1)), y.reshape((-1, 1))))
    elif task == MULTIOUTPUT_REGRESSION:
        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
        # Two stacked copies of the target give a 2-column multioutput target.
        y = np.vstack((y, y)).transpose()
        # Triple the data so it is large enough to trigger subsampling.
        X = np.vstack((X, X, X))
        y = np.vstack((y, y, y))
    else:
        raise ValueError(task)

    assert X.shape[0] == y.shape[0]

    X_new, y_new = AutoML.subsample_if_too_large(X, y, logger, 1, memory_limit, task)
    assert X_new.shape[0] == expected_rows[task][memory_limit]
    # Exactly one warning when subsampling happened, none otherwise.
    assert logger.warning.call_count == (1 if memory_limit == 1 else 0)
0 commit comments