Commit ec7ba12

Merge branch 'development' into master
2 parents: bd48092 + dbc7170

File tree

97 files changed: +3581 additions, -2119 deletions


.github/workflows/docs.yml

Lines changed: 4 additions & 0 deletions

@@ -17,6 +17,10 @@ jobs:
       run: |
         cd doc
         make html
+    - name: Check links
+      run: |
+        cd doc
+        make linkcheck
     - name: Pull latest gh-pages
       if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
       run: |

.github/workflows/pytest.yml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8, 3.9]
         use-conda: [true, false]
         use-dist: [false]
         include:

.pre-commit-config.yaml

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,10 @@ repos:
         args: [--show-error-codes]
         name: mypy auto-sklearn-evaluation
         files: autosklearn/evaluation
+      - id: mypy
+        args: [--show-error-codes]
+        name: mypy auto-sklearn-datapreprocessing
+        files: autosklearn/pipeline/components/data_preprocessing/
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.3
     hooks:

README.md

Lines changed: 45 additions & 9 deletions

@@ -1,10 +1,19 @@
 # auto-sklearn
 
-auto-sklearn is an automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.
+**auto-sklearn** is an automated machine learning toolkit and a drop-in replacement for a [scikit-learn](https://scikit-learn.org) estimator.
 
-Find the documentation [here](http://automl.github.io/auto-sklearn/)
+Find the documentation **[here](http://automl.github.io/auto-sklearn/)**. Quick links:
+  * [Installation Guide](https://automl.github.io/auto-sklearn/master/installation.html)
+  * [Releases](https://automl.github.io/auto-sklearn/master/releases.html)
+  * [Manual](https://automl.github.io/auto-sklearn/master/manual.html)
+  * [Examples](https://automl.github.io/auto-sklearn/master/examples/index.html)
+  * [API](https://automl.github.io/auto-sklearn/master/api.html)
 
-## Automated Machine Learning in four lines of code
+## auto-sklearn in one image
+
+![image](doc/images/askl_pipeline.png)
+
+## auto-sklearn in four lines of code
 
 ```python
 import autosklearn.classification

@@ -15,12 +24,39 @@ predictions = cls.predict(X_test)
 
 ## Relevant publications
 
-Efficient and Robust Automated Machine Learning
-Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
+If you use auto-sklearn in a scientific publications, we would appreciate citations.
+
+**Efficient and Robust Automated Machine Learning**
+*Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter*
 Advances in Neural Information Processing Systems 28 (2015)
-http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
 
-Auto-Sklearn 2.0: The Next Generation
-Authors: Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius Lindauer and Frank Hutter
+[Link](http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf) to publication.
+```
+@inproceedings{feurer-neurips15a,
+    title = {Efficient and Robust Automated Machine Learning},
+    author = {Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter},
+    booktitle = {Advances in Neural Information Processing Systems 28 (2015)},
+    pages = {2962--2970},
+    year = {2015}
+}
+```
+
+----------------------------------------
+
+**Auto-Sklearn 2.0: The Next Generation**
+*Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius Lindauer and Frank Hutter**
 arXiv:2007.04074 [cs.LG], 2020
-https://arxiv.org/abs/2007.04074
+
+[Link](https://arxiv.org/abs/2007.04074) to publication.
+```
+@inproceedings{feurer-arxiv20a,
+    title = {Auto-Sklearn 2.0: The Next Generation},
+    author = {Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius Lindauer and Frank Hutter},
+    booktitle = {arXiv:2007.04074 [cs.LG]},
+    year = {2020}
+}
+```
+
+----------------------------------------
+
+Also, have a look at the blog on [automl.org](automl.org) where we regularly release blogposts.
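
Note: the README hunk above shows only the opening of the four-line example. For reference, a minimal sketch of the complete snippet, assuming the standard `AutoSklearnClassifier` interface; the data loading shown here is illustrative and not part of the README:

```python
import autosklearn.classification
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Illustrative data only; the README assumes pre-split X_train/X_test arrays.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

cls = autosklearn.classification.AutoSklearnClassifier()
cls.fit(X_train, y_train)          # run the AutoML search on the training split
predictions = cls.predict(X_test)  # predict with the fitted ensemble
```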

autosklearn/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 if os.name != 'posix':
     raise ValueError(
         'Detected unsupported operating system: %s. Please check '
-        'the compability information of auto-sklearn: http://automl.github.io'
+        'the compability information of auto-sklearn: https://automl.github.io'
         '/auto-sklearn/stable/installation.html#windows-osx-compability' %
         sys.platform
     )

autosklearn/automl.py

Lines changed: 76 additions & 40 deletions

@@ -45,7 +45,7 @@
 )
 from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
 from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
-from autosklearn.evaluation.train_evaluator import _fit_with_budget
+from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget
 from autosklearn.metrics import calculate_metric
 from autosklearn.util.backend import Backend
 from autosklearn.util.stopwatch import StopWatch

@@ -139,13 +139,13 @@ def __init__(self,
                  smac_scenario_args=None,
                  logging_config=None,
                  metric=None,
-                 scoring_functions=None
+                 scoring_functions=None,
+                 get_trials_callback=None
                  ):
         super(AutoML, self).__init__()
         self.configuration_space = None
         self._backend = backend
         # self._tmp_dir = tmp_dir
-        # self._output_dir = output_dir
         self._time_for_task = time_left_for_this_task
         self._per_run_time_limit = per_run_time_limit
         self._initial_configurations_via_metalearning = \

@@ -165,32 +165,6 @@ def __init__(self,
         self._scoring_functions = scoring_functions if scoring_functions is not None else []
         self._resampling_strategy_arguments = resampling_strategy_arguments \
             if resampling_strategy_arguments is not None else {}
-        if self._resampling_strategy not in ['holdout',
-                                             'holdout-iterative-fit',
-                                             'cv',
-                                             'cv-iterative-fit',
-                                             'partial-cv',
-                                             'partial-cv-iterative-fit',
-                                             ] \
-                and not issubclass(self._resampling_strategy, BaseCrossValidator)\
-                and not issubclass(self._resampling_strategy, _RepeatedSplits)\
-                and not issubclass(self._resampling_strategy, BaseShuffleSplit):
-            raise ValueError('Illegal resampling strategy: %s' %
-                             self._resampling_strategy)
-
-        if self._resampling_strategy in ['partial-cv',
-                                         'partial-cv-iterative-fit',
-                                         ] \
-                and self._ensemble_size != 0:
-            raise ValueError("Resampling strategy %s cannot be used "
-                             "together with ensembles." % self._resampling_strategy)
-        if self._resampling_strategy in ['partial-cv',
-                                         'cv',
-                                         'cv-iterative-fit',
-                                         'partial-cv-iterative-fit',
-                                         ]\
-                and 'folds' not in self._resampling_strategy_arguments:
-            self._resampling_strategy_arguments['folds'] = 5
         self._n_jobs = n_jobs
         self._dask_client = dask_client
 
@@ -208,6 +182,7 @@ def __init__(self,
                 "'disable_evaluator_output' must be one "
                 "of " + str(allowed_elements))
         self._get_smac_object_callback = get_smac_object_callback
+        self._get_trials_callback = get_trials_callback
         self._smac_scenario_args = smac_scenario_args
         self.logging_config = logging_config

@@ -254,9 +229,6 @@ def __init__(self,
         # By default try to use the TCP logging port or get a new port
         self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
 
-        # After assigning and checking variables...
-        # self._backend = Backend(self._output_dir, self._tmp_dir)
-
         # Num_run tell us how many runs have been launched
         # It can be seen as an identifier for each configuration
         # saved to disk

@@ -427,7 +399,7 @@ def _do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int:
             self._logger.error(
                 "Dummy prediction failed with run state %s. "
                 "The error suggests that the provided memory limits were too tight. Please "
-                "increase the 'ml_memory_limit' and try again. If this does not solve your "
+                "increase the 'memory_limit' and try again. If this does not solve your "
                 "problem, please open an issue and paste the additional output. "
                 "Additional output: %s.",
                 str(status), str(additional_info),

@@ -436,7 +408,7 @@ def _do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int:
             raise ValueError(
                 "Dummy prediction failed with run state %s. "
                 "The error suggests that the provided memory limits were too tight. Please "
-                "increase the 'ml_memory_limit' and try again. If this does not solve your "
+                "increase the 'memory_limit' and try again. If this does not solve your "
                 "problem, please open an issue and paste the additional output. "
                 "Additional output: %s." %
                 (str(status), str(additional_info)),

@@ -510,6 +482,15 @@ def fit(
             task=self._task,
         )
 
+        # Check the re-sampling strategy
+        try:
+            self._check_resampling_strategy(
+                X=X, y=y, task=task,
+            )
+        except Exception as e:
+            self._fit_cleanup()
+            raise e
+
         # Reset learnt stuff
         self.models_ = None
         self.cv_models_ = None

@@ -537,10 +518,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)
 
-        if feat_type is None and self.InputValidator.feature_validator.feat_type:
-            self._feat_type = self.InputValidator.feature_validator.feat_type
-        elif feat_type is not None:
-            self._feat_type = feat_type
+        # Take the feature types from the validator
+        self._feat_type = self.InputValidator.feature_validator.feat_type
 
         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')

@@ -573,7 +552,6 @@ def fit(
             raise ValueError('Unable to read requirement: %s' % requirement)
         self._logger.debug('Done printing environment information')
         self._logger.debug('Starting to print arguments to auto-sklearn')
-        self._logger.debug('  output_folder: %s', self._backend.context._output_directory)
         self._logger.debug('  tmp_folder: %s', self._backend.context._temporary_directory)
         self._logger.debug('  time_left_for_this_task: %f', self._time_for_task)
         self._logger.debug('  per_run_time_limit: %f', self._per_run_time_limit)

@@ -782,6 +760,7 @@ def fit(
             port=self._logger_port,
             pynisher_context=self._multiprocessing_context,
             ensemble_callback=proc_ensemble,
+            trials_callback=self._get_trials_callback
         )
 
         try:

@@ -839,6 +818,63 @@ def _fit_cleanup(self):
         self._clean_logger()
         return
 
+    def _check_resampling_strategy(
+        self,
+        X: SUPPORTED_FEAT_TYPES,
+        y: SUPPORTED_TARGET_TYPES,
+        task: int,
+    ) -> None:
+        """
+        This method centralizes the checks for resampling strategies
+
+        Parameters
+        ----------
+        X: (SUPPORTED_FEAT_TYPES)
+            Input features for the given task
+        y: (SUPPORTED_TARGET_TYPES)
+            Input targets for the given task
+        task: (task)
+            Integer describing a supported task type, like BINARY_CLASSIFICATION
+        """
+        is_split_object = isinstance(
+            self._resampling_strategy,
+            (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit)
+        )
+
+        if self._resampling_strategy not in [
+            'holdout',
+            'holdout-iterative-fit',
+            'cv',
+            'cv-iterative-fit',
+            'partial-cv',
+            'partial-cv-iterative-fit',
+        ] and not is_split_object:
+            raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy)
+
+        elif is_split_object:
+            TrainEvaluator.check_splitter_resampling_strategy(
+                X=X, y=y, task=task,
+                groups=self._resampling_strategy_arguments.get('groups', None),
+                resampling_strategy=self._resampling_strategy,
+            )
+
+        elif self._resampling_strategy in [
+            'partial-cv',
+            'partial-cv-iterative-fit',
+        ] and self._ensemble_size != 0:
+            raise ValueError("Resampling strategy %s cannot be used "
+                             "together with ensembles." % self._resampling_strategy)
+
+        elif self._resampling_strategy in [
+            'partial-cv',
+            'cv',
+            'cv-iterative-fit',
+            'partial-cv-iterative-fit',
+        ] and 'folds' not in self._resampling_strategy_arguments:
+            self._resampling_strategy_arguments['folds'] = 5
+
+        return
+
     @staticmethod
     def subsample_if_too_large(
         X: SUPPORTED_FEAT_TYPES,

@@ -1022,7 +1058,7 @@ def fit_pipeline(
             attributes will be automatically One-Hot encoded. The values
             used for a categorical attribute must be integers, obtained for
             example by `sklearn.preprocessing.LabelEncoder
-            <http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
+            <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
 
         Returns
         -------
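
Note: the new `_check_resampling_strategy` accepts scikit-learn split objects (instances of `BaseCrossValidator`, `_RepeatedSplits`, or `BaseShuffleSplit`) and delegates their validation to `TrainEvaluator.check_splitter_resampling_strategy`. As a usage-level sketch, assuming the public estimator forwards `resampling_strategy` to `AutoML` as in released versions (the budget value is illustrative):

```python
import autosklearn.classification
from sklearn.model_selection import StratifiedKFold

# A scikit-learn splitter instance is a legal resampling strategy alongside
# string names such as 'holdout' or 'cv'; it is checked by the new code path.
cls = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,                      # illustrative budget (seconds)
    resampling_strategy=StratifiedKFold(n_splits=5),  # validated via TrainEvaluator
)
```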
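Note: the new `get_trials_callback` argument is stored on the instance and forwarded to the SMAC layer as `trials_callback`. The diff does not show the expected signature; a hypothetical sketch, assuming it follows SMAC's run-result callback convention in which returning `False` stops the optimization loop (all names below are illustrative):

```python
# Hypothetical sketch: the exact signature is defined by SMAC's callback
# interface, not by this diff.
def stop_on_good_run(smbo, run_info, result, time_left):
    # Ask the optimizer to stop once a configuration reaches a low enough cost.
    if result.cost < 0.05:
        return False

# Passed through the new constructor argument (other arguments omitted):
#   AutoML(..., get_trials_callback=stop_on_good_run)
```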

autosklearn/data/abstract_data_manager.py

Lines changed: 3 additions & 32 deletions

@@ -1,41 +1,12 @@
 import abc
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, Union
 
 import numpy as np
 
 import scipy.sparse
 
 from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
     import DataPreprocessor
-from autosklearn.util.data import predict_RAM_usage
-
-
-def perform_one_hot_encoding(
-    sparse: bool,
-    categorical: List[bool],
-    data: List
-) -> Tuple[List, bool]:
-    predicted_RAM_usage = float(
-        predict_RAM_usage(data[0], categorical)) / 1024 / 1024
-
-    if predicted_RAM_usage > 1000:
-        sparse = True
-
-    rvals = []
-    if any(categorical):
-        encoder = DataPreprocessor(
-            categorical_features=categorical, force_sparse_output=sparse)
-        rvals.append(encoder.fit_transform(data[0]))
-        for d in data[1:]:
-            rvals.append(encoder.transform(d))
-
-        if not sparse and scipy.sparse.issparse(rvals[0]):
-            for i in range(len(rvals)):
-                rvals[i] = rvals[i].todense()
-    else:
-        rvals = data
-
-    return rvals, sparse
 
 
 class AbstractDataManager():

@@ -60,11 +31,11 @@ def info(self) -> Dict[str, Any]:
         return self._info
 
     @property
-    def feat_type(self) -> List[str]:
+    def feat_type(self) -> Dict[Union[str, int], str]:
         return self._feat_type
 
     @feat_type.setter
-    def feat_type(self, value: List[str]) -> None:
+    def feat_type(self, value: Dict[Union[str, int], str]) -> None:
         self._feat_type = value
 
     @property
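
Note: the property above narrows `feat_type` from a plain list of type strings to a mapping keyed by column. A hypothetical illustration of the new shape, assuming keys are column names or positional indices and values are the per-column type labels; the exact accepted values are not shown in this diff:

```python
from typing import Dict, Union

# Hypothetical values: each key identifies a column (by name or index) and
# each value declares that column's type.
feat_type: Dict[Union[str, int], str] = {
    0: "numerical",
    1: "categorical",
    "city": "categorical",
}
```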
