doc/manual.rst

Supported Inputs
================

* Multioutput Regression

You can provide feature and target training pairs (X_train/y_train) to *auto-sklearn* to fit an ensemble of pipelines as described in the next section. This X_train/y_train dataset must belong to one of the supported formats: np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix and Python lists.

Optionally, you can measure the ability of this fitted model to generalize to unseen data by providing an optional testing pair (X_test/y_test). For further details, please refer to the example `Train and Test inputs <examples/40_advanced/example_pandas_train_test.html>`_. Supported formats for these training and testing pairs are: np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix and Python lists.
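
The following is a minimal sketch of this workflow; the dataset and the ``time_left_for_this_task`` budget are illustrative choices, not part of the manual:

.. code:: python

    import autosklearn.classification
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    # np.ndarray features and targets; any of the supported formats works.
    X, y = load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,  # total search budget in seconds
    )
    # X_test/y_test are optional and only used to measure generalization.
    automl.fit(X_train, y_train, X_test=X_test, y_test=y_test)
    print(automl.score(X_test, y_test))
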
If your data contains categorical values (in the features or targets), *auto-sklearn* will automatically encode your data using a `sklearn.preprocessing.LabelEncoder <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_ for unidimensional data and a `sklearn.preprocessing.OrdinalEncoder <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html>`_ for multidimensional data.

Regarding the features, there are two methods to guide *auto-sklearn* to properly encode categorical columns (a sketch of both follows the list):

* Providing an X_train/X_test numpy array together with the optional ``feat_type`` flag. For further details, you can check the example `Feature Types <examples/40_advanced/example_feature_types.html>`_.

* You can provide a pandas DataFrame with properly formatted columns. If a column has a numerical dtype, *auto-sklearn* will not encode it and it will be passed directly to scikit-learn. If the column has a categorical/boolean dtype, it will be encoded. If the column is of any other type (Object or Timeseries), an error will be raised. For further details on how to properly encode your data, you can check the example `Working with categorical data <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_. If you are working with time series, it is recommended that you follow this approach `Working with time data <https://stats.stackexchange.com/questions/311494/>`_.
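
A hedged sketch of both methods; the column names, toy data and the ``"Categorical"``/``"Numerical"`` labels are illustrative:

.. code:: python

    import numpy as np
    import pandas as pd
    import autosklearn.classification

    y = np.array([0, 1, 0, 1])

    # Method 1: numpy array plus an explicit feat_type list, one entry per column.
    X = np.array([[0, 1.5], [1, 2.0], [0, 3.5], [2, 1.0]])
    automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=60)
    automl.fit(X, y, feat_type=["Categorical", "Numerical"])

    # Method 2: pandas DataFrame with properly typed columns; "category" and
    # numerical dtypes are recognized automatically, no feat_type needed.
    X_df = pd.DataFrame({
        "color": pd.Series(["red", "blue", "red", "green"], dtype="category"),
        "size": [1.5, 2.0, 3.5, 1.0],
    })
    automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=60)
    automl.fit(X_df, y)
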
Regarding the targets (y_train/y_test), if the task involves a classification problem, the targets will be automatically encoded. It is recommended to provide both y_train and y_test during fit, so that a common encoding is created between these splits (if only y_train is provided during fit, the categorical encoder will not be able to handle new classes that are exclusive to y_test). If the task is regression, no encoding happens on the targets.
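
A short sketch of this recommendation; the string labels and toy data are illustrative:

.. code:: python

    import pandas as pd
    import autosklearn.classification

    X_train = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0]})
    y_train = pd.Series(["cat", "dog", "cat", "dog"])
    X_test = pd.DataFrame({"f1": [1.5, 3.5]})
    y_test = pd.Series(["dog", "bird"])  # "bird" appears only in the test split

    automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=60)
    # Passing y_test at fit time lets auto-sklearn build a common encoding
    # that also covers classes exclusive to the test split.
    automl.fit(X_train, y_train, X_test=X_test, y_test=y_test)
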
... obtained by running *auto-sklearn*. It additionally prints the number of both successful and unsuccessful algorithm runs.

The results obtained from the final ensemble can be printed by calling ``show_models()``. The *auto-sklearn* ensemble is composed of scikit-learn models that can be inspected as exemplified by the `model inspection example <examples/40_advanced/example_get_pipeline_components.html>`_.
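
A short sketch of such an inspection, assuming ``automl`` is an estimator that has already been fit; the ``sprint_statistics()`` call is an assumption about the statistics helper referred to above:

.. code:: python

    # automl is a fitted AutoSklearnClassifier / AutoSklearnRegressor.
    print(automl.sprint_statistics())  # run statistics, incl. successful and failed runs
    print(automl.show_models())        # the final ensemble of scikit-learn pipelines
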
Parallel computation
====================

In its default mode, *auto-sklearn* already uses two cores. The first one is used for model building, the second for building an ensemble every time a new machine learning model has finished training. An example of how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in the `sequential auto-sklearn example <examples/60_search/example_sequential.html>`_.
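
A hedged sketch of that sequential pattern, following the referenced example; the budget and ensemble size are illustrative, and ``X_train``/``y_train`` are assumed from earlier snippets:

.. code:: python

    import autosklearn.classification

    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        ensemble_size=0,  # skip ensemble building during the search
    )
    automl.fit(X_train, y_train)                     # search, one task at a time
    automl.fit_ensemble(y_train, ensemble_size=50)   # build the ensemble afterwards
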
Nevertheless, *auto-sklearn* also supports parallel Bayesian optimization via `Dask.distributed <https://distributed.dask.org/>`_. By providing the argument ``n_jobs`` to the estimator construction, one can control the number of cores available to *auto-sklearn* (as exemplified in the `parallel n_jobs example <examples/60_search/example_parallel_n_jobs.html>`_). Distributed processes are also supported by providing a custom client object to *auto-sklearn*, as in the `parallel manual spawning example <examples/60_search/example_parallel_manual_spawning_python.html>`_. When multiple cores are available, *auto-sklearn* will create a worker per core, and use the available workers to both search for better machine learning models and build an ensemble from them until the time resource is exhausted.
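
A sketch of both options; the worker counts are illustrative, and the parameter names follow the auto-sklearn parallel examples:

.. code:: python

    import dask.distributed
    import autosklearn.classification

    # Option 1: local parallelism, one worker per requested core.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        n_jobs=4,
    )

    # Option 2: hand auto-sklearn an existing Dask client, e.g. for a
    # manually spawned or distributed cluster.
    client = dask.distributed.Client(n_workers=4, threads_per_worker=1)
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        dask_client=client,
    )
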
**Note:** *auto-sklearn* requires all workers to have access to a shared file system for storing training data and models.

Furthermore, depending on the installation of scikit-learn and numpy, the model building procedure may use up to all cores. Such behaviour is