11from __future__ import annotations
22
3- from typing import TYPE_CHECKING , Any , Iterable , Mapping , Sequence
3+ from abc import ABC , abstractmethod
4+ from typing import Any , Generic , Iterable , Sequence , TypeVar
45
56import warnings
67
1213from scipy .sparse import spmatrix
1314from sklearn .base import BaseEstimator , ClassifierMixin , RegressorMixin
1415from sklearn .ensemble import VotingClassifier , VotingRegressor
16+ from sklearn .exceptions import NotFittedError
17+ from sklearn .model_selection ._split import (
18+ BaseCrossValidator ,
19+ BaseShuffleSplit ,
20+ _RepeatedSplits ,
21+ )
1522from sklearn .utils .multiclass import type_of_target
1623from smac .runhistory .runhistory import RunInfo , RunValue
1724from typing_extensions import Literal , TypeAlias
2835from autosklearn .pipeline .base import BasePipeline
2936from autosklearn .util .smac_wrap import SMACCallback
3037
31- if TYPE_CHECKING :
32- from sklearn . model_selection . _split import (
33- BaseCrossValidator ,
34- BaseShuffleSplit ,
35- _RepeatedSplits ,
36- )
38+ # Used to indicate what type the underlying AutoML instance is (string bound: forward reference)
39+ T_AutoML = TypeVar ( "T_AutoML" , bound = "AutoML" )
40+
41+ # Used to return self and give correct type information from subclasses ,
42+ # see `fit(self: Self) -> Self` ; the bound is a string because the class is defined below
43+ Self = TypeVar ( "Self" , bound = "AutoSklearnEstimator" )
3744
38- ResampleOptions : TypeAlias = Literal [
39- "holdout" ,
40- "cv" ,
41- "holdout-iterative-fit" ,
42- "cv-iterative-fit" ,
43- "partial-cv" ,
44- ]
45- DisableEvaluatorOptions : TypeAlias = Literal ["y_optimization" , "model" ]
45+ ResampleOptions : TypeAlias = Literal [
46+ "holdout" ,
47+ "cv" ,
48+ "holdout-iterative-fit" ,
49+ "cv-iterative-fit" ,
50+ "partial-cv" ,
51+ ]
52+ DisableEvaluatorOptions : TypeAlias = Literal ["y_optimization" , "model" ]
4653
4754
48- class AutoSklearnEstimator (BaseEstimator ):
55+ class AutoSklearnEstimator (ABC , BaseEstimator , Generic [ T_AutoML ] ):
4956 def __init__ (
5057 self ,
5158 time_left_for_this_task : int = 3600 ,
5259 per_run_time_limit : int | None = None , # TODO: allow percentage
5360 initial_configurations_via_metalearning : int = 25 , # TODO validate
5461 ensemble_size : int | None = None ,
5562 ensemble_class : type [AbstractEnsemble ] | None = EnsembleSelection ,
56- ensemble_kwargs : Mapping [str , Any ] | None = None ,
63+ ensemble_kwargs : dict [str , Any ] | None = None ,
5764 ensemble_nbest : int = 50 ,
5865 max_models_on_disc : int = 50 ,
5966 seed : int = 1 ,
6067 memory_limit : int | None = 3072 ,
61- include : Mapping [str , Sequence [str ]] | None = None ,
62- exclude : Mapping [str , Sequence [str ]] | None = None ,
68+ include : dict [str , list [str ]] | None = None ,
69+ exclude : dict [str , list [str ]] | None = None ,
6370 resampling_strategy : ResampleOptions
6471 | BaseCrossValidator
6572 | _RepeatedSplits
6673 | BaseShuffleSplit = "holdout" ,
67- resampling_strategy_arguments : Mapping [str , Any ] | None = None ,
74+ resampling_strategy_arguments : dict [str , Any ] | None = None ,
6875 tmp_folder : str | None = None , # TODO support path
6976 delete_tmp_folder_after_terminate : bool = True ,
7077 n_jobs : int = 1 ,
7178 dask_client : dask .distributed .Client | None = None ,
7279 disable_evaluator_output : bool
7380 | Sequence [DisableEvaluatorOptions ] = False , # TODO fill in
7481 get_smac_object_callback : SMACCallback | None = None ,
75- smac_scenario_args : Mapping [str , Any ] | None = None ,
76- logging_config : Mapping [str , Any ] | None = None ,
82+ smac_scenario_args : dict [str , Any ] | None = None ,
83+ logging_config : dict [str , Any ] | None = None ,
7784 metadata_directory : str | None = None , # TODO Update for path
7885 metric : Scorer | Sequence [Scorer ] | None = None ,
7986 scoring_functions : Sequence [Scorer ] | None = None ,
8087 load_models : bool = True ,
8188 get_trials_callback : SMACCallback | None = None ,
82- dataset_compression : bool | Mapping [str , Any ] = True ,
89+ dataset_compression : bool | dict [str , Any ] = True ,
8390 allow_string_features : bool = True ,
8491 ):
8592 """
@@ -122,7 +129,7 @@ def __init__(
122129 for the default ensemble autosklearn builds or use ``SingleBest``
123130 to use only the single best model instead of an ensemble.
124131
125- ensemble_kwargs : Mapping [str, Any] | None = None
132+ ensemble_kwargs : dict [str, Any] | None = None
126133 Keyword arguments that are passed to the ensemble class upon
127134 initialization.
128135
@@ -157,7 +164,7 @@ def __init__(
157164 ``n_jobs x memory_limit``.
158165 * The memory limit also applies to the ensemble creation process.
159166
160- include : Mapping [str, Sequence[str]] = None
167+ include : dict [str, list[str]] | None = None
161168 If None, all possible algorithms are used.
162169
163170 Otherwise, specifies a step and the components that are included in search.
@@ -182,7 +189,7 @@ def __init__(
182189 'feature_preprocessor': ["no_preprocessing"]
183190 }
184191
185- exclude : Mapping [str, Sequence[str]]] = None
192+ exclude : dict [str, list[str]] | None = None
186193 If None, all possible algorithms are used.
187194
188195 Otherwise, specifies a step and the components that are excluded from search.
@@ -226,7 +233,7 @@ def __init__(
226233 and ensure that ``"subsample"`` is not included in the applied compression
227234 ``"methods"`` or disable it entirely with ``False``.
228235
229- resampling_strategy_arguments : Mapping [str, Any] | None = None
236+ resampling_strategy_arguments : dict [str, Any] | None = None
230237 Additional arguments for ``resampling_strategy``, this is required if
231238 using a ``cv`` based strategy. The default arguments if left as ``None``
232239 are:
@@ -287,13 +294,13 @@ def __init__(
287294 This is an advanced feature. Use only if you are familiar with
288295 `SMAC <https://automl.github.io/SMAC3/master/index.html>`_.
289296
290- smac_scenario_args : Mapping [str, Any] | None = None
297+ smac_scenario_args : dict [str, Any] | None = None
291298 Additional arguments inserted into the scenario of SMAC. See the
292299 `SMAC documentation <https://automl.github.io/SMAC3/master/pages/details/scenario.html>`_
293300 for a list of available arguments.
294301
295- logging_config : Mapping [str, Any] | None = None
296- Mapping object specifying the logger configuration.
302+ logging_config : dict [str, Any] | None = None
303+ dict object specifying the logger configuration.
297304 If None, the default **logging.yaml** file is used, which can be found in
298305 the directory ``util/logging.yaml`` relative to the installation.
299306
@@ -330,7 +337,7 @@ def __init__(
330337 See the example:
331338 :ref:`Early Stopping And Callbacks <sphx_glr_examples_40_advanced_example_early_stopping_and_callbacks.py>`.
332339
333- dataset_compression: bool | Mapping [str, Any] = True
340+ dataset_compression: bool | dict [str, Any] = True
334341 We compress datasets so that they fit into some predefined amount of memory.
335342 Currently this does not apply to dataframes or sparse arrays, only to raw
336343 numpy arrays.
@@ -482,23 +489,27 @@ def __init__(
482489
483490 # Handle the number of jobs and the time for them
484491 # Made private by `_n_jobs` to keep with sklearn compliance
485- self ._n_jobs = None
486492 if n_jobs == - 1 :
487493 self ._n_jobs = joblib .cpu_count ()
488494 else :
489495 self ._n_jobs = n_jobs
490496
491- super ().__init__ ()
492-
493- def __getstate__ (self ):
494- # Cannot serialize a client!
495- self .dask_client = None
496- return self .__dict__
497+ @property
498+ @abstractmethod
499+ def automl (self ) -> T_AutoML :
500+ """Get the underlying AutoML instance
497501
498- def build_automl (self ):
502+ Returns
503+ -------
504+ AutoML
505+ The underlying AutoML instance
506+ """
507+ if self .automl_ is not None :
508+ return self .automl_
499509
500510 initial_configs = self .initial_configurations_via_metalearning
501- automl = self ._get_automl_class ()(
511+ cls = self ._get_automl_class ()
512+ automl = cls (
502513 temporary_directory = self .tmp_folder ,
503514 delete_tmp_folder_after_terminate = self .delete_tmp_folder_after_terminate ,
504515 time_left_for_this_task = self .time_left_for_this_task ,
@@ -528,17 +539,21 @@ def build_automl(self):
528539 allow_string_features = self .allow_string_features ,
529540 )
530541
531- return automl
542+ self .automl_ = automl
543+ return self .automl_
532544
533- def fit (self , ** kwargs ):
545+ def __getstate__ (self ) -> dict [str , Any ]:
546+ # Cannot serialize a client!
547+ self .dask_client = None
548+ return self .__dict__
534549
550+ def fit (self : Self , ** kwargs : Any ) -> Self :
535551 # Automatically set the cutoff time per task
552+ # TODO this should probably live in automl
536553 if self .per_run_time_limit is None :
537554 self .per_run_time_limit = self ._n_jobs * self .time_left_for_this_task // 10
538555
539- if self .automl_ is None :
540- self .automl_ = self .build_automl ()
541- self .automl_ .fit (load_models = self .load_models , ** kwargs )
556+ self .automl .fit (load_models = self .load_models , ** kwargs )
542557
543558 return self
544559
@@ -668,7 +683,7 @@ def fit_ensemble(
668683 is independent of the ``ensemble_class`` argument and this
669684 pruning step is done prior to constructing an ensemble.
670685
671- ensemble_class : Type [AbstractEnsemble], optional (default=EnsembleSelection)
686+ ensemble_class : type [AbstractEnsemble], optional (default=EnsembleSelection)
672687 Class implementing the post-hoc ensemble algorithm. Set to
673688 ``None`` to disable ensemble building or use ``SingleBest``
673688 to use only the single best model instead of an
@@ -749,7 +764,6 @@ def refit(self, X, y):
749764
750765 Parameters
751766 ----------
752-
753767 X : array-like or sparse matrix of shape = [n_samples, n_features]
754768 The training input samples.
755769
@@ -758,7 +772,6 @@ def refit(self, X, y):
758772
759773 Returns
760774 -------
761-
762775 self
763776
764777 """
@@ -880,10 +893,6 @@ def cv_results_(self):
880893 def trajectory_ (self ):
881894 return self .automl_ .trajectory_
882895
883- @property
884- def fANOVA_input_ (self ):
885- return self .automl_ .fANOVA_input_
886-
887896 def sprint_statistics (self ):
888897 """Return the following statistics of the training result:
889898
@@ -1212,6 +1221,8 @@ def additional_info_has_key(rv, key):
12121221
12131222 # Decide on the sort order depending on what it gets sorted by
12141223 descending_columns = ["ensemble_weight" , "duration" ]
1224+
1225+ ascending_param : bool | list [bool ]
12151226 if sort_order == "auto" :
12161227 ascending_param = [
12171228 False if sby in descending_columns else True for sby in sort_by
@@ -1301,8 +1312,10 @@ def _leaderboard_columns(
13011312 detailed = all
13021313 return {"all" : all , "detailed" : detailed , "simple" : simple }
13031314
1304- def _get_automl_class (self ):
1305- raise NotImplementedError ()
1315+ @classmethod
1316+ @abstractmethod
1317+ def _get_automl_class (cls ) -> type [AutoML ]:
1318+ ...
13061319
13071320 def get_configuration_space (
13081321 self ,
@@ -1351,11 +1364,22 @@ def get_configuration_space(
13511364 def get_pareto_set (self ) -> Sequence [VotingClassifier | VotingRegressor ]:
13521365 return self .automl_ ._load_pareto_set ()
13531366
1367+ def __sklearn_is_fitted__ (self ) -> bool :
1368+ return self .automl_ is not None and self .automl_ .fitted
1369+
13541370
1355- class AutoSklearnClassifier (AutoSklearnEstimator , ClassifierMixin ):
1371+ class AutoSklearnClassifier (AutoSklearnEstimator [ AutoMLClassifier ] , ClassifierMixin ):
13561372 """This class implements the classification task."""
13571373
1358- def fit (self , X , y , X_test = None , y_test = None , feat_type = None , dataset_name = None ):
1374+ def fit (
1375+ self : Self ,
1376+ X ,
1377+ y ,
1378+ X_test = None ,
1379+ y_test = None ,
1380+ feat_type = None ,
1381+ dataset_name = None ,
1382+ ) -> Self :
13591383 """Fit *auto-sklearn* to given training set (X, y).
13601384
13611385 Fit both optimizes the machine learning models and builds an ensemble
@@ -1483,21 +1507,28 @@ def _get_automl_class(self):
14831507 return AutoMLClassifier
14841508
14851509
1486- class AutoSklearnRegressor (AutoSklearnEstimator , RegressorMixin ):
1510+ class AutoSklearnRegressor (AutoSklearnEstimator [ AutoMLRegressor ] , RegressorMixin ):
14871511 """
14881512 This class implements the regression task.
14891513
14901514 """
14911515
1492- def fit (self , X , y , X_test = None , y_test = None , feat_type = None , dataset_name = None ):
1516+ def fit (
1517+ self : Self ,
1518+ X ,
1519+ y ,
1520+ X_test = None ,
1521+ y_test = None ,
1522+ feat_type = None ,
1523+ dataset_name = None ,
1524+ ) -> Self :
14931525 """Fit *Auto-sklearn* to given training set (X, y).
14941526
14951527 Fit both optimizes the machine learning models and builds an ensemble
14961528 out of them.
14971529
14981530 Parameters
14991531 ----------
1500-
15011532 X : array-like or sparse matrix of shape = [n_samples, n_features]
15021533 The training input samples.
15031534
@@ -1581,5 +1612,6 @@ def predict(self, X, batch_size=None, n_jobs=1):
15811612 """
15821613 return super ().predict (X , batch_size = batch_size , n_jobs = n_jobs )
15831614
1584- def _get_automl_class (self ):
1615+ @classmethod
1616+ def _get_automl_class (cls ) -> type [AutoMLRegressor ]:
15851617 return AutoMLRegressor
0 commit comments