Commit 25a8439

committed: progress
1 parent b8f40bb commit 25a8439

File tree

3 files changed

+106
-72
lines changed

autosklearn/automl.py

Lines changed: 5 additions & 5 deletions
@@ -212,7 +212,7 @@ class AutoML(BaseEstimator):
     def __init__(
         self,
         time_left_for_this_task: int,
-        per_run_time_limit: int,
+        per_run_time_limit: int | None = None,
         temporary_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
         initial_configurations_via_metalearning: int = 25,
@@ -221,7 +221,7 @@ def __init__(
         ensemble_nbest: int = 1,
         max_models_on_disc: int = 1,
         seed: int = 1,
-        memory_limit: int = 3072,
+        memory_limit: int | None = 3072,
         metadata_directory: Optional[str] = None,
         include: Optional[dict[str, list[str]]] = None,
         exclude: Optional[dict[str, list[str]]] = None,
@@ -231,11 +231,11 @@ def __init__(
         dask_client: Optional[Client] = None,
         precision: Literal[16, 32, 64] = 32,
         disable_evaluator_output: bool | Iterable[str] = False,
-        get_smac_object_callback: Optional[Callable] = None,
-        smac_scenario_args: Optional[Mapping] = None,
+        get_smac_object_callback: Callable | None = None,
+        smac_scenario_args: Mapping[str, Any] | None = None,
         logging_config: Optional[Mapping] = None,
         metrics: Sequence[Scorer] | None = None,
-        scoring_functions: Optional[list[Scorer]] = None,
+        scoring_functions: Optional[Sequence[Scorer]] = None,
         get_trials_callback: SMACCallback | None = None,
         dataset_compression: bool | Mapping[str, Any] = True,
         allow_string_features: bool = True,
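
The substantive change in this file is that per_run_time_limit may now be left as None; as the estimators.py hunks further down show, fit() then derives a per-run cutoff from the overall budget. A minimal sketch of that fallback, written as a hypothetical standalone helper (the function name is illustrative, not part of the codebase):

from __future__ import annotations

# Sketch only: mirrors the fallback applied in AutoSklearnEstimator.fit below,
# where a missing per-run limit is derived from the total budget.
def default_per_run_time_limit(
    time_left_for_this_task: int, n_jobs: int, per_run_time_limit: int | None
) -> int:
    if per_run_time_limit is None:
        # one tenth of the total budget, scaled by the number of parallel jobs
        return n_jobs * time_left_for_this_task // 10
    return per_run_time_limit

# e.g. a 3600s budget with 2 jobs and no explicit limit -> 720s per run
assert default_per_run_time_limit(3600, 2, None) == 720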

autosklearn/estimators.py

Lines changed: 92 additions & 60 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence
+from abc import ABC, abstractmethod
+from typing import Any, Generic, Iterable, Sequence, TypeVar

 import warnings

@@ -12,6 +13,12 @@
 from scipy.sparse import spmatrix
 from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
 from sklearn.ensemble import VotingClassifier, VotingRegressor
+from sklearn.exceptions import NotFittedError
+from sklearn.model_selection._split import (
+    BaseCrossValidator,
+    BaseShuffleSplit,
+    _RepeatedSplits,
+)
 from sklearn.utils.multiclass import type_of_target
 from smac.runhistory.runhistory import RunInfo, RunValue
 from typing_extensions import Literal, TypeAlias
@@ -28,58 +35,58 @@
 from autosklearn.pipeline.base import BasePipeline
 from autosklearn.util.smac_wrap import SMACCallback

-if TYPE_CHECKING:
-    from sklearn.model_selection._split import (
-        BaseCrossValidator,
-        BaseShuffleSplit,
-        _RepeatedSplits,
-    )
+# Used to indicate what type the underlying AutoML instance is
+T_AutoML = TypeVar("T_AutoML", bound=type[AutoML])
+
+# Used to return self and give correct type information from subclasses,
+# see `fit(self: Self) -> Self`
+Self = TypeVar("Self", bound=AutoSklearnEstimator)

-ResampleOptions: TypeAlias = Literal[
-    "holdout",
-    "cv",
-    "holdout-iterative-fit",
-    "cv-iterative-fit",
-    "partial-cv",
-]
-DisableEvaluatorOptions: TypeAlias = Literal["y_optimization", "model"]
+ResampleOptions: TypeAlias = Literal[
+    "holdout",
+    "cv",
+    "holdout-iterative-fit",
+    "cv-iterative-fit",
+    "partial-cv",
+]
+DisableEvaluatorOptions: TypeAlias = Literal["y_optimization", "model"]


-class AutoSklearnEstimator(BaseEstimator):
+class AutoSklearnEstimator(ABC, BaseEstimator, Generic[T_AutoML]):
     def __init__(
         self,
         time_left_for_this_task: int = 3600,
         per_run_time_limit: int | None = None,  # TODO: allow percentage
         initial_configurations_via_metalearning: int = 25,  # TODO validate
         ensemble_size: int | None = None,
         ensemble_class: type[AbstractEnsemble] | None = EnsembleSelection,
-        ensemble_kwargs: Mapping[str, Any] | None = None,
+        ensemble_kwargs: dict[str, Any] | None = None,
         ensemble_nbest: int = 50,
         max_models_on_disc: int = 50,
         seed: int = 1,
         memory_limit: int | None = 3072,
-        include: Mapping[str, Sequence[str]] | None = None,
-        exclude: Mapping[str, Sequence[str]] | None = None,
+        include: dict[str, list[str]] | None = None,
+        exclude: dict[str, list[str]] | None = None,
         resampling_strategy: ResampleOptions
         | BaseCrossValidator
         | _RepeatedSplits
         | BaseShuffleSplit = "holdout",
-        resampling_strategy_arguments: Mapping[str, Any] | None = None,
+        resampling_strategy_arguments: dict[str, Any] | None = None,
         tmp_folder: str | None = None,  # TODO support path
         delete_tmp_folder_after_terminate: bool = True,
         n_jobs: int = 1,
         dask_client: dask.distributed.Client | None = None,
         disable_evaluator_output: bool
         | Sequence[DisableEvaluatorOptions] = False,  # TODO fill in
         get_smac_object_callback: SMACCallback | None = None,
-        smac_scenario_args: Mapping[str, Any] | None = None,
-        logging_config: Mapping[str, Any] | None = None,
+        smac_scenario_args: dict[str, Any] | None = None,
+        logging_config: dict[str, Any] | None = None,
         metadata_directory: str | None = None,  # TODO Update for path
         metric: Scorer | Sequence[Scorer] | None = None,
         scoring_functions: Sequence[Scorer] | None = None,
         load_models: bool = True,
         get_trials_callback: SMACCallback | None = None,
-        dataset_compression: bool | Mapping[str, Any] = True,
+        dataset_compression: bool | dict[str, Any] = True,
         allow_string_features: bool = True,
     ):
         """
@@ -122,7 +129,7 @@ def __init__(
             for the default ensemble autosklearn builds or use ``SingleBest``
             to obtain only use the single best model instead of an ensemble.

-        ensemble_kwargs : Mapping[str, Any] | None = None
+        ensemble_kwargs : dict[str, Any] | None = None
             Keyword arguments that are passed to the ensemble class upon
             initialization.

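For context on the @@ -28,58 hunk above: the base estimator is now abstract and generic, with T_AutoML tying each subclass to the AutoML variant it wraps and a Self TypeVar letting fit() return the concrete subclass type. The following standalone sketch shows the same idiom with hypothetical Engine and Wrapper classes standing in for AutoML and AutoSklearnEstimator; it illustrates the typing pattern and is not the project's actual code:

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, TypeVar


class Engine:  # stand-in for AutoML
    def run(self) -> None:
        print("running", type(self).__name__)


class ClassifierEngine(Engine):  # stand-in for AutoMLClassifier
    pass


T_Engine = TypeVar("T_Engine", bound=Engine)
Self = TypeVar("Self", bound="Wrapper")


class Wrapper(ABC, Generic[T_Engine]):  # stand-in for AutoSklearnEstimator
    def __init__(self) -> None:
        self.engine_: T_Engine | None = None

    @abstractmethod
    def _engine_class(self) -> type[T_Engine]:
        ...

    def fit(self: Self) -> Self:
        # lazily build the underlying engine, then return self so the
        # caller keeps the concrete subclass type
        if self.engine_ is None:
            self.engine_ = self._engine_class()()
        self.engine_.run()
        return self


class ClassifierWrapper(Wrapper[ClassifierEngine]):  # stand-in for AutoSklearnClassifier
    def _engine_class(self) -> type[ClassifierEngine]:
        return ClassifierEngine


wrapped = ClassifierWrapper().fit()  # type checkers see ClassifierWrapper, not Wrapper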
@@ -157,7 +164,7 @@ def __init__(
             ``n_jobs x memory_limit``.
             * The memory limit also applies to the ensemble creation process.

-        include : Mapping[str, Sequence[str]] = None
+        include : dict[str, Sequence[str]] = None
             If None, all possible algorithms are used.

             Otherwise, specifies a step and the components that are included in search.
@@ -182,7 +189,7 @@ def __init__(
                     'feature_preprocessor': ["no_preprocessing"]
                 }

-        exclude : Mapping[str, Sequence[str]]] = None
+        exclude : dict[str, Sequence[str]]] = None
             If None, all possible algorithms are used.

             Otherwise, specifies a step and the components that are excluded from search.
@@ -226,7 +233,7 @@ def __init__(
             and ensure that ``"subsample"`` is not included in the applied compression
             ``"methods"`` or disable it entirely with ``False``.

-        resampling_strategy_arguments : Mapping[str, Any] | None = None
+        resampling_strategy_arguments : dict[str, Any] | None = None
             Additional arguments for ``resampling_strategy``, this is required if
             using a ``cv`` based strategy. The default arguments if left as ``None``
             are:
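
The include, exclude, and resampling_strategy_arguments parameters documented above are plain dictionaries after this change. A hedged construction sketch combining them follows; the component name "random_forest" and the "folds" key are assumptions drawn from auto-sklearn's published examples rather than from this diff, while "no_preprocessing" appears in the docstring excerpt above:

# Illustrative sketch only: restrict the search space and switch to CV.
import autosklearn.classification

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,            # illustrative budget
    include={
        "classifier": ["random_forest"],            # assumed component name
        "feature_preprocessor": ["no_preprocessing"],
    },
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 5},     # "folds" key assumed
)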
@@ -287,13 +294,13 @@ def __init__(
             This is an advanced feature. Use only if you are familiar with
             `SMAC <https://automl.github.io/SMAC3/master/index.html>`_.

-        smac_scenario_args : Mapping[str, Any] | None = None
+        smac_scenario_args : dict[str, Any] | None = None
             Additional arguments inserted into the scenario of SMAC. See the
             `SMAC documentation <https://automl.github.io/SMAC3/master/pages/details/scenario.html>`_
             for a list of available arguments.

-        logging_config : Mapping[str, Any] | None = None
-            Mapping object specifying the logger configuration.
+        logging_config : dict[str, Any] | None = None
+            dict object specifying the logger configuration.
             If None, the default **logging.yaml** file is used, which can be found in
             the directory ``util/logging.yaml`` relative to the installation.
@@ -330,7 +337,7 @@ def __init__(
             See the example:
             :ref:`Early Stopping And Callbacks <sphx_glr_examples_40_advanced_example_early_stopping_and_callbacks.py>`.

-        dataset_compression: bool | Mapping[str, Any] = True
+        dataset_compression: bool | dict[str, Any] = True
             We compress datasets so that they fit into some predefined amount of memory.
             Currently this does not apply to dataframes or sparse arrays, only to raw
             numpy arrays.
@@ -482,23 +489,27 @@ def __init__(

         # Handle the number of jobs and the time for them
         # Made private by `_n_jobs` to keep with sklearn compliance
-        self._n_jobs = None
         if n_jobs == -1:
             self._n_jobs = joblib.cpu_count()
         else:
             self._n_jobs = n_jobs

-        super().__init__()
-
-    def __getstate__(self):
-        # Cannot serialize a client!
-        self.dask_client = None
-        return self.__dict__
+    @property
+    @abstractmethod
+    def automl(self) -> T_AutoML:
+        """Get the underlying Automl instance

-    def build_automl(self):
+        Returns
+        -------
+        AutoML
+            The underlying AutoML instanec
+        """
+        if self.automl_ is not None:
+            return self.automl_

         initial_configs = self.initial_configurations_via_metalearning
-        automl = self._get_automl_class()(
+        cls = self._get_automl_class()
+        automl = cls(
             temporary_directory=self.tmp_folder,
             delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
             time_left_for_this_task=self.time_left_for_this_task,
@@ -528,17 +539,21 @@ def build_automl(self):
             allow_string_features=self.allow_string_features,
         )

-        return automl
+        self.automl_ = automl
+        return self.automl_

-    def fit(self, **kwargs):
+    def __getstate__(self) -> dict[str, Any]:
+        # Cannot serialize a client!
+        self.dask_client = None
+        return self.__dict__

+    def fit(self: Self, **kwargs: Any) -> Self:
         # Automatically set the cutoff time per task
+        # TODO this should probably live in automl
         if self.per_run_time_limit is None:
             self.per_run_time_limit = self._n_jobs * self.time_left_for_this_task // 10

-        if self.automl_ is None:
-            self.automl_ = self.build_automl()
-        self.automl_.fit(load_models=self.load_models, **kwargs)
+        self.automl.fit(load_models=self.load_models, **kwargs)

         return self

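Taken together, the two hunks above replace build_automl() with a lazily cached accessor: the first access constructs and stores the AutoML object on automl_, and fit() simply reuses it. A reduced sketch of that caching shape, using hypothetical names:

# Sketch of the lazy, cached accessor pattern used above (hypothetical names).
class LazyHolder:
    def __init__(self) -> None:
        self.automl_ = None  # nothing is built until first access

    @property
    def automl(self):
        if self.automl_ is not None:
            return self.automl_          # reuse the cached instance
        self.automl_ = object()          # stand-in for the AutoML(...) construction
        return self.automl_


holder = LazyHolder()
assert holder.automl is holder.automl    # constructed once, then cached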
@@ -668,7 +683,7 @@ def fit_ensemble(
             is independent of the ``ensemble_class`` argument and this
             pruning step is done prior to constructing an ensemble.

-        ensemble_class : Type[AbstractEnsemble], optional (default=EnsembleSelection)
+        ensemble_class : type[AbstractEnsemble], optional (default=EnsembleSelection)
             Class implementing the post-hoc ensemble algorithm. Set to
             ``None`` to disable ensemble building or use ``SingleBest``
             to obtain only use the single best model instead of an
@@ -749,7 +764,6 @@ def refit(self, X, y):

         Parameters
         ----------
-
         X : array-like or sparse matrix of shape = [n_samples, n_features]
             The training input samples.

@@ -758,7 +772,6 @@ def refit(self, X, y):

         Returns
         -------
-
         self

         """
@@ -880,10 +893,6 @@ def cv_results_(self):
     def trajectory_(self):
         return self.automl_.trajectory_

-    @property
-    def fANOVA_input_(self):
-        return self.automl_.fANOVA_input_
-
     def sprint_statistics(self):
         """Return the following statistics of the training result:

@@ -1212,6 +1221,8 @@ def additional_info_has_key(rv, key):

         # Decide on the sort order depending on what it gets sorted by
         descending_columns = ["ensemble_weight", "duration"]
+
+        ascending_param: bool | list[bool]
         if sort_order == "auto":
             ascending_param = [
                 False if sby in descending_columns else True for sby in sort_by
@@ -1301,8 +1312,10 @@ def _leaderboard_columns(
         detailed = all
         return {"all": all, "detailed": detailed, "simple": simple}

-    def _get_automl_class(self):
-        raise NotImplementedError()
+    @classmethod
+    @abstractmethod
+    def _get_automl_class(cls) -> type[AutoML]:
+        ...

     def get_configuration_space(
         self,
@@ -1351,11 +1364,22 @@ def get_configuration_space(
     def get_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]:
         return self.automl_._load_pareto_set()

+    def __sklearn_is_fitted__(self) -> bool:
+        return self.automl_ is not None and self.automl_.fitted
+

-class AutoSklearnClassifier(AutoSklearnEstimator, ClassifierMixin):
+class AutoSklearnClassifier(AutoSklearnEstimator[AutoMLClassifier], ClassifierMixin):
     """This class implements the classification task."""

-    def fit(self, X, y, X_test=None, y_test=None, feat_type=None, dataset_name=None):
+    def fit(
+        self: Self,
+        X,
+        y,
+        X_test=None,
+        y_test=None,
+        feat_type=None,
+        dataset_name=None,
+    ) -> Self:
         """Fit *auto-sklearn* to given training set (X, y).

         Fit both optimizes the machine learning models and builds an ensemble
@@ -1483,21 +1507,28 @@ def _get_automl_class(self):
         return AutoMLClassifier


-class AutoSklearnRegressor(AutoSklearnEstimator, RegressorMixin):
+class AutoSklearnRegressor(AutoSklearnEstimator[AutoMLRegressor], RegressorMixin):
     """
     This class implements the regression task.

     """

-    def fit(self, X, y, X_test=None, y_test=None, feat_type=None, dataset_name=None):
+    def fit(
+        self: Self,
+        X,
+        y,
+        X_test=None,
+        y_test=None,
+        feat_type=None,
+        dataset_name=None,
+    ) -> Self:
         """Fit *Auto-sklearn* to given training set (X, y).

         Fit both optimizes the machine learning models and builds an ensemble
         out of them.

         Parameters
         ----------
-
         X : array-like or sparse matrix of shape = [n_samples, n_features]
             The training input samples.

@@ -1581,5 +1612,6 @@ def predict(self, X, batch_size=None, n_jobs=1):
         """
         return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)

-    def _get_automl_class(self):
+    @classmethod
+    def _get_automl_class(cls) -> type[AutoMLRegressor]:
         return AutoMLRegressor
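
Since fit() is now annotated to return the concrete subclass, the usual chained call keeps its precise type. A brief usage sketch with synthetic data and an illustrative time budget (not taken from this commit):

# Usage sketch: fit returns the estimator itself, so calls can be chained
# and the result is still typed as AutoSklearnRegressor.
import numpy as np
from autosklearn.regression import AutoSklearnRegressor

X = np.random.rand(100, 4)
y = np.random.rand(100)

reg = AutoSklearnRegressor(
    time_left_for_this_task=60,   # illustrative budget
    per_run_time_limit=None,      # now optional; a default is derived in fit
).fit(X, y)

print(reg.predict(X[:5]))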
