Commit 9343373

FIX_1061 (#1063)
* FIX_1061
* Fix type of target
* Moving to classes_
* classes_ should be np.ndarray
* Force float before nan
1 parent 425fbac commit 9343373

File tree

5 files changed (+88, −13 lines)


autosklearn/data/target_validator.py

Lines changed: 17 additions & 0 deletions
@@ -386,3 +386,20 @@ def _check_data(
                 self.type_of_target,
                 supported_output_types
             ))
+
+    @property
+    def classes_(self) -> np.ndarray:
+        """
+        Complies with the scikit-learn classes_ attribute,
+        which consists of an ndarray of shape (n_classes,)
+        where n_classes is the number of classes seen while fitting
+        an encoder to the targets.
+        Returns
+        -------
+        classes_: np.ndarray
+            The unique classes seen during encoding of a classifier
+        """
+        if self.encoder is None:
+            return np.array([])
+        else:
+            return self.encoder.categories_[0]
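
The new property simply exposes the categories learned by the validator's fitted encoder. As a rough illustration of what it returns — a sketch using scikit-learn's OrdinalEncoder directly (which is what the categories_[0] access suggests), not the auto-sklearn validator itself:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Fitting an encoder on the targets records the unique classes in categories_
encoder = OrdinalEncoder()
encoder.fit(np.array(['a', 'a', 'b', 'c', 'a']).reshape(-1, 1))

print(encoder.categories_[0])  # array(['a', 'b', 'c'], dtype='<U1')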

autosklearn/estimators.py

Lines changed: 18 additions & 12 deletions
@@ -5,7 +5,7 @@
 import dask.distributed
 import joblib
 import numpy as np
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
 from sklearn.utils.multiclass import type_of_target
 
 from autosklearn.automl import AutoMLClassifier, AutoMLRegressor, AutoML
@@ -219,11 +219,11 @@ def __init__(
         :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
         Metrics`_.
         If None is provided, a default metric is selected depending on the task.
-
+
     scoring_functions : List[Scorer], optional (None)
-        List of scorers which will be calculated for each pipeline and results will be
+        List of scorers which will be calculated for each pipeline and results will be
         available via ``cv_results``
-
+
     load_models : bool, optional (True)
         Whether to load the models after fitting Auto-sklearn.
 
@@ -266,13 +266,14 @@ def __init__(
         self.smac_scenario_args = smac_scenario_args
         self.logging_config = logging_config
         self.metadata_directory = metadata_directory
-        self._metric = metric
-        self._scoring_functions = scoring_functions
-        self._load_models = load_models
+        self.metric = metric
+        self.scoring_functions = scoring_functions
+        self.load_models = load_models
 
         self.automl_ = None # type: Optional[AutoML]
         # n_jobs after conversion to a number (b/c default is None)
         self._n_jobs = None
+
         super().__init__()
 
     def __getstate__(self):
@@ -323,8 +324,8 @@ def build_automl(
             smac_scenario_args=smac_scenario_args,
             logging_config=self.logging_config,
             metadata_directory=self.metadata_directory,
-            metric=self._metric,
-            scoring_functions=self._scoring_functions
+            metric=self.metric,
+            scoring_functions=self.scoring_functions
         )
 
         return automl
@@ -353,7 +354,7 @@ def fit(self, **kwargs):
             tmp_folder=self.tmp_folder,
             output_folder=self.output_folder,
         )
-        self.automl_.fit(load_models=self._load_models, **kwargs)
+        self.automl_.fit(load_models=self.load_models, **kwargs)
 
         return self
 
@@ -516,7 +517,7 @@ def get_configuration_space(self, X, y):
         return self.automl_.configuration_space
 
 
-class AutoSklearnClassifier(AutoSklearnEstimator):
+class AutoSklearnClassifier(AutoSklearnEstimator, ClassifierMixin):
     """
     This class implements the classification task.
 
@@ -597,6 +598,11 @@ def fit(self, X, y,
             dataset_name=dataset_name,
         )
 
+        # After fit, a classifier is expected to define classes_:
+        # a list of class labels known to the classifier, mapping each label
+        # to a numerical index used in the model representation of our output.
+        self.classes_ = self.automl_.InputValidator.target_validator.classes_
+
         return self
 
     def predict(self, X, batch_size=None, n_jobs=1):
@@ -656,7 +662,7 @@ def _get_automl_class(self):
         return AutoMLClassifier
 
 
-class AutoSklearnRegressor(AutoSklearnEstimator):
+class AutoSklearnRegressor(AutoSklearnEstimator, RegressorMixin):
     """
     This class implements the regression task.
 
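
The rename from self._metric, self._scoring_functions and self._load_models to their public names follows scikit-learn's estimator contract: BaseEstimator.get_params() reads attributes named exactly like the __init__ parameters, so clone() and the parameter-immutability check in the tests below only work if constructor arguments are stored verbatim under those names. A minimal sketch with a hypothetical ToyEstimator (not part of auto-sklearn):

from sklearn.base import BaseEstimator, clone

class ToyEstimator(BaseEstimator):
    def __init__(self, metric=None, load_models=True):
        # Constructor arguments are stored unmodified, under the same names,
        # so get_params()/set_params()/clone() can round-trip them.
        self.metric = metric
        self.load_models = load_models

est = ToyEstimator(metric='accuracy')
print(clone(est).get_params())  # {'load_models': True, 'metric': 'accuracy'}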

test/test_automl/test_estimators.py

Lines changed: 46 additions & 0 deletions
@@ -1,5 +1,7 @@
+import copy
 import glob
 import os
+import inspect
 import pickle
 import re
 import sys
@@ -15,6 +17,10 @@
 import sklearn
 import sklearn.dummy
 import sklearn.datasets
+from sklearn.base import clone
+from sklearn.base import ClassifierMixin, RegressorMixin
+from sklearn.base import is_classifier
+
 
 from autosklearn.data.validation import InputValidator
 import autosklearn.pipeline.util as putil
@@ -160,6 +166,10 @@ def test_type_of_target(mock_estimator):
     ])
 
     cls = AutoSklearnClassifier(ensemble_size=0)
+    cls.automl_ = unittest.mock.Mock()
+    cls.automl_.InputValidator = unittest.mock.Mock()
+    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()
+
     # Illegal target types for classification: continuous,
     # multiclass-multioutput, continuous-multioutput.
     expected_msg = r".*Classification with data of type"
@@ -253,6 +263,10 @@ def test_cv_results(tmp_dir, output_dir):
                                 ensemble_size=0,
                                 scoring_functions=[autosklearn.metrics.precision,
                                                    autosklearn.metrics.roc_auc])
+
+    params = cls.get_params()
+    original_params = copy.deepcopy(params)
+
     cls.fit(X_train, Y_train)
     cv_results = cls.cv_results_
     assert isinstance(cv_results, dict), type(cv_results)
@@ -275,6 +289,27 @@ def test_cv_results(tmp_dir, output_dir):
                        cv_results.items() if key.startswith('param_')]
     assert all(cv_result_items), cv_results.items()
 
+    # Compare the state of the model parameters with the original parameters
+    new_params = clone(cls).get_params()
+    for param_name, original_value in original_params.items():
+        new_value = new_params[param_name]
+
+        # Taken from Sklearn code:
+        # We should never change or mutate the internal state of input
+        # parameters by default. To check this we use the joblib.hash function
+        # that introspects recursively any subobjects to compute a checksum.
+        # The only exception to this rule of immutable constructor parameters
+        # is possible RandomState instance but in this check we explicitly
+        # fixed the random_state params recursively to be integer seeds.
+        assert joblib.hash(new_value) == joblib.hash(original_value), (
+            "Estimator %s should not change or mutate "
+            " the parameter %s from %s to %s during fit."
+            % (cls, param_name, original_value, new_value))
+
+    # Comply with https://scikit-learn.org/dev/glossary.html#term-classes
+    is_classifier(cls)
+    assert hasattr(cls, 'classes_')
+
 
 @unittest.mock.patch('autosklearn.estimators.AutoSklearnEstimator.build_automl')
 def test_fit_n_jobs_negative(build_automl_patch):
@@ -614,3 +649,14 @@ def test_autosklearn2_classification_methods_returns_self(dask_client):
     ) >= 2 / 3, print_debug_information(automl)
 
     pickle.dumps(automl_fitted)
+
+
+@pytest.mark.parametrize("class_", [AutoSklearnClassifier, AutoSklearnRegressor,
+                                    AutoSklearn2Classifier])
+def test_check_estimator_signature(class_):
+    # Make sure the signature is stored in self
+    expected_subclass = ClassifierMixin if 'Classifier' in str(class_) else RegressorMixin
+    assert issubclass(class_, expected_subclass)
+    estimator = class_()
+    for expected in list(inspect.signature(class_).parameters):
+        assert hasattr(estimator, expected)
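
The new assertions rely on the mixins added in estimators.py: ClassifierMixin sets _estimator_type to "classifier", which is what sklearn.base.is_classifier inspects, and RegressorMixin does the same for regressors. A short sketch of the behaviour being tested (constructor arguments as used in the tests above; only a sketch, not an exhaustive check):

from sklearn.base import is_classifier, is_regressor
from autosklearn.estimators import AutoSklearnClassifier, AutoSklearnRegressor

# The mixins set _estimator_type, which these helpers check
print(is_classifier(AutoSklearnClassifier(ensemble_size=0)))  # True
print(is_regressor(AutoSklearnRegressor(ensemble_size=0)))    # True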

test/test_data/test_feature_validator.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def input_data_featuretest(request):
             ['a', 'b', 'd', 'r', 'b', 'c'],
         ])
     elif request.param == 'numpy_categoricalonly_nan':
-        array = np.random.randint(10, size=(100, 10))
+        array = np.random.randint(10, size=(100, 10)).astype('float')
        array[50, 0:5] = np.nan
        return array
    elif request.param == 'numpy_numericalonly_nan':
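
This is the "Force float before nan" part of the commit message: a NumPy array with an integer dtype cannot hold NaN, so the fixture must cast to float before injecting missing values. A minimal reproduction of the underlying NumPy behaviour:

import numpy as np

arr = np.random.randint(10, size=(3, 3))  # integer dtype: cannot represent NaN
# arr[0, 0] = np.nan                      # would raise ValueError: cannot convert float NaN to integer

arr = arr.astype('float')                 # cast first, as the fixture now does
arr[0, 0] = np.nan                        # fine once the dtype is float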

test/test_data/test_target_validator.py

Lines changed: 6 additions & 0 deletions
@@ -450,6 +450,8 @@ def test_targetvalidator_inversetransform():
     y_decoded = validator.inverse_transform(y)
     assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()
 
+    assert validator.classes_.tolist() == ['a', 'b', 'c']
+
     validator = TargetValidator(is_classification=True)
     multi_label = pd.DataFrame(
         np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
@@ -461,6 +463,10 @@ def test_targetvalidator_inversetransform():
     y_decoded = validator.inverse_transform(y)
     np.testing.assert_array_almost_equal(y, y_decoded)
 
+    # Multilabel classification is not encoded
+    # For this reason, classes_ attribute does not contain a class
+    np.testing.assert_array_almost_equal(validator.classes_, np.array([]))
+
 
 # Actual checks for the targets
 @pytest.mark.parametrize(
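
The two assertions capture the intended contract: single-output classification targets are encoded, so classes_ lists the labels, while multilabel indicator targets are passed through unencoded and classes_ stays empty. A rough sketch of how the two target shapes are distinguished, using sklearn's type_of_target (the same helper estimators.py imports); the trailing comments describe the behaviour asserted in this test:

import numpy as np
from sklearn.utils.multiclass import type_of_target

print(type_of_target(['a', 'a', 'b', 'c', 'a']))               # 'multiclass' -> encoded, classes_ == ['a', 'b', 'c']
print(type_of_target(np.array([[1, 0, 0, 1], [0, 0, 1, 1]])))  # 'multilabel-indicator' -> not encoded, classes_ empty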

0 commit comments
