11# -*- encoding: utf-8 -*-
22
3- from typing import Optional , Dict , List
3+ from typing import Optional , Dict , List , Tuple , Union
44
5+ from ConfigSpace .configuration_space import Configuration
56import dask .distributed
67import joblib
78import numpy as np
89from sklearn .base import BaseEstimator , ClassifierMixin , RegressorMixin
910from sklearn .utils .multiclass import type_of_target
11+ from smac .runhistory .runhistory import RunInfo , RunValue
1012
13+ from autosklearn .data .validation import (
14+ SUPPORTED_FEAT_TYPES ,
15+ SUPPORTED_TARGET_TYPES ,
16+ )
17+ from autosklearn .pipeline .base import BasePipeline
1118from autosklearn .automl import AutoMLClassifier , AutoMLRegressor , AutoML
1219from autosklearn .metrics import Scorer
1320from autosklearn .util .backend import create
@@ -271,8 +278,15 @@ def __init__(
271278 self .load_models = load_models
272279
273280 self .automl_ = None # type: Optional[AutoML]
274- # n_jobs after conversion to a number (b/c default is None)
281+
282+ # Handle the number of jobs and the time for them
275283 self ._n_jobs = None
284+ if self .n_jobs is None or self .n_jobs == 1 :
285+ self ._n_jobs = 1
286+ elif self .n_jobs == - 1 :
287+ self ._n_jobs = joblib .cpu_count ()
288+ else :
289+ self ._n_jobs = self .n_jobs
276290
277291 super ().__init__ ()
278292
@@ -281,35 +295,24 @@ def __getstate__(self):
281295 self .dask_client = None
282296 return self .__dict__
283297
284- def build_automl (
285- self ,
286- seed : int ,
287- ensemble_size : int ,
288- initial_configurations_via_metalearning : int ,
289- tmp_folder : str ,
290- output_folder : str ,
291- smac_scenario_args : Optional [Dict ] = None ,
292- ):
298+ def build_automl (self ):
293299
294300 backend = create (
295- temporary_directory = tmp_folder ,
296- output_directory = output_folder ,
301+ temporary_directory = self . tmp_folder ,
302+ output_directory = self . output_folder ,
297303 delete_tmp_folder_after_terminate = self .delete_tmp_folder_after_terminate ,
298304 delete_output_folder_after_terminate = self .delete_output_folder_after_terminate ,
299305 )
300306
301- if smac_scenario_args is None :
302- smac_scenario_args = self .smac_scenario_args
303-
304307 automl = self ._get_automl_class ()(
305308 backend = backend ,
306309 time_left_for_this_task = self .time_left_for_this_task ,
307310 per_run_time_limit = self .per_run_time_limit ,
308- initial_configurations_via_metalearning = initial_configurations_via_metalearning ,
309- ensemble_size = ensemble_size ,
311+ initial_configurations_via_metalearning = self . initial_configurations_via_metalearning ,
312+ ensemble_size = self . ensemble_size ,
310313 ensemble_nbest = self .ensemble_nbest ,
311314 max_models_on_disc = self .max_models_on_disc ,
312- seed = seed ,
315+ seed = self . seed ,
313316 memory_limit = self .memory_limit ,
314317 include_estimators = self .include_estimators ,
315318 exclude_estimators = self .exclude_estimators ,
@@ -321,7 +324,7 @@ def build_automl(
321324 dask_client = self .dask_client ,
322325 get_smac_object_callback = self .get_smac_object_callback ,
323326 disable_evaluator_output = self .disable_evaluator_output ,
324- smac_scenario_args = smac_scenario_args ,
327+ smac_scenario_args = self . smac_scenario_args ,
325328 logging_config = self .logging_config ,
326329 metadata_directory = self .metadata_directory ,
327330 metric = self .metric ,
@@ -332,32 +335,82 @@ def build_automl(
332335
def fit(self, **kwargs):
    """Run the AutoML search on the data supplied via ``**kwargs``.

    Lazily constructs the backing AutoML object on first use, derives a
    per-run cutoff when none was configured, and delegates the search to
    ``self.automl_.fit``.

    Returns
    -------
    self
    """
    # No explicit cutoff per evaluation: default to a tenth of the total
    # budget, scaled by the number of parallel workers. This must happen
    # before build_automl(), which reads the attribute.
    if self.per_run_time_limit is None:
        self.per_run_time_limit = self.time_left_for_this_task * self._n_jobs // 10

    # Construct the AutoML backend only once; later fit() calls reuse it.
    if self.automl_ is None:
        self.automl_ = self.build_automl()
    self.automl_.fit(load_models=self.load_models, **kwargs)

    return self
360347
def fit_pipeline(
    self,
    X: "SUPPORTED_FEAT_TYPES",
    y: "SUPPORTED_TARGET_TYPES",
    config: "Union[Configuration, Dict[str, Union[str, float, int]]]",
    dataset_name: "Optional[str]" = None,
    X_test: "Optional[SUPPORTED_FEAT_TYPES]" = None,
    y_test: "Optional[SUPPORTED_TARGET_TYPES]" = None,
    feat_type: "Optional[List[str]]" = None,
    *args,
    **kwargs: "Dict",
) -> "Tuple[Optional[BasePipeline], RunInfo, RunValue]":
    """Fit an individual pipeline configuration and return the result.

    The estimator constraints are honored, for example the resampling
    strategy or memory constraints, unless directly provided to the
    method. By default this method supports the same signature as
    ``fit()``, and any extra arguments are redirected to the TAE
    evaluation function, which allows for further customization while
    building a pipeline.

    Any additional argument provided is directly passed to the worker
    exercising the run.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        The features used for training.
    y : array-like
        The labels used for training.
    config : Union[Configuration, Dict[str, Union[str, float, int]]]
        A configuration object used to define the pipeline steps.
        If a dictionary is passed, a configuration is created based on
        this dictionary.
    dataset_name : Optional[str]
        Name that will be used to tag and identify the Auto-Sklearn run.
    X_test : Optional array-like, shape = (n_samples, n_features)
        If provided, the testing performance will be tracked on these
        features.
    y_test : Optional array-like
        If provided, the testing performance will be tracked on these
        labels.
    feat_type : list, optional (default=None)
        List of str of ``len(X.shape[1])`` describing the attribute type.
        Possible types are ``Categorical`` and ``Numerical``.
        ``Categorical`` attributes will be automatically one-hot encoded.
        The values used for a categorical attribute must be integers,
        obtained for example by ``sklearn.preprocessing.LabelEncoder``.

    Returns
    -------
    pipeline : Optional[BasePipeline]
        The fitted pipeline. In case of failure while fitting the
        pipeline, None is returned.
    run_info : RunInfo
        A named tuple that contains the configuration launched.
    run_value : RunValue
        A named tuple that contains the result of the run.
    """
    # Build the backing AutoML object lazily so repeated calls reuse it.
    if self.automl_ is None:
        self.automl_ = self.build_automl()
    # Forward positional *args ahead of the keyword arguments (Python
    # binds them first anyway); keeps the call unambiguous to read.
    return self.automl_.fit_pipeline(
        *args,
        X=X,
        y=y,
        dataset_name=dataset_name,
        config=config,
        feat_type=feat_type,
        X_test=X_test,
        y_test=y_test,
        **kwargs,
    )
361414 def fit_ensemble (self , y , task = None , precision = 32 ,
362415 dataset_name = None , ensemble_nbest = None ,
363416 ensemble_size = None ):
@@ -401,17 +454,9 @@ def fit_ensemble(self, y, task=None, precision=32,
401454 """
402455 if self .automl_ is None :
403456 # Build a dummy automl object to call fit_ensemble
404- self .automl_ = self .build_automl (
405- seed = self .seed ,
406- ensemble_size = (
407- ensemble_size
408- if ensemble_size is not None else
409- self .ensemble_size
410- ),
411- initial_configurations_via_metalearning = 0 ,
412- tmp_folder = self .tmp_folder ,
413- output_folder = self .output_folder ,
414- )
457+ # The ensemble size is honored in the .automl_.fit_ensemble
458+ # call
459+ self .automl_ = self .build_automl ()
415460 self .automl_ .fit_ensemble (
416461 y = y ,
417462 task = task ,
@@ -513,8 +558,40 @@ def sprint_statistics(self):
513558 def _get_automl_class (self ):
514559 raise NotImplementedError ()
515560
def get_configuration_space(
    self,
    X: "SUPPORTED_FEAT_TYPES",
    y: "SUPPORTED_TARGET_TYPES",
    X_test: "Optional[SUPPORTED_FEAT_TYPES]" = None,
    y_test: "Optional[SUPPORTED_TARGET_TYPES]" = None,
    dataset_name: "Optional[str]" = None,
):
    """Return the configuration space for this dataset and task.

    Returns the Configuration Space object from which Auto-Sklearn
    will sample configurations and build pipelines.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Array with the training features, used to get characteristics
        like data sparsity.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array with the problem labels.
    X_test : array-like or sparse matrix of shape = [n_samples, n_features]
        Array with features used for performance estimation.
    y_test : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array with the problem labels for the testing split.
    dataset_name : Optional[str]
        A string to tag the Auto-Sklearn run.

    Returns
    -------
    ConfigurationSpace
        The space of pipeline configurations for this data.
    """
    if self.automl_ is None:
        self.automl_ = self.build_automl()
    # Already computed on a previous call or fit: return the cached space.
    if self.automl_.configuration_space is not None:
        return self.automl_.configuration_space
    # The space is only known after the data has been inspected, so run
    # fit() in the mode that stops after building the configuration space.
    return self.automl_.fit(
        X, y,
        X_test=X_test,
        y_test=y_test,
        dataset_name=dataset_name,
        only_return_configuration_space=True,
    )
518595
519596
520597class AutoSklearnClassifier (AutoSklearnEstimator , ClassifierMixin ):
0 commit comments