automl
diff --git a/‎autosklearn/automl.py‎
Lines changed: 2 additions & 1 deletion b/‎autosklearn/automl.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎autosklearn/ensemble_builder.py‎
Lines changed: 27 additions & 24 deletions b/‎autosklearn/ensemble_builder.py‎
Lines changed: 27 additions & 24 deletions
diff --git a/‎autosklearn/ensembles/ensemble_selection.py‎
Lines changed: 39 additions & 3 deletions b/‎autosklearn/ensembles/ensemble_selection.py‎
Lines changed: 39 additions & 3 deletions
diff --git a/‎autosklearn/evaluation/abstract_evaluator.py‎
Lines changed: 2 additions & 2 deletions b/‎autosklearn/evaluation/abstract_evaluator.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎autosklearn/metalearning/metafeatures/metafeatures.py‎
Lines changed: 3 additions & 1 deletion b/‎autosklearn/metalearning/metafeatures/metafeatures.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎autosklearn/pipeline/base.py‎
Lines changed: 1 addition & 5 deletions b/‎autosklearn/pipeline/base.py‎
Lines changed: 1 addition & 5 deletions
@@ -32,6 +32,7 @@
 import joblib
 import sklearn.utils
 from scipy.sparse import spmatrix
+from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_is_fitted
 from sklearn.metrics._classification import type_of_target
 from sklearn.dummy import DummyClassifier, DummyRegressor
@@ -1165,7 +1166,7 @@ def refit(self, X, y):
         if self.ensemble_ is None:
             raise ValueError("Refit can only be called if 'ensemble_size != 0'")
 
-        random_state = np.random.RandomState(self._seed)
+        random_state = check_random_state(self._seed)
         for identifier in self.models_:
             model = self.models_[identifier]
             # this updates the model inplace, it can then later be used in
 
@@ -19,7 +19,6 @@
 import numpy as np
 import pandas as pd
 import pynisher
-from sklearn.utils.validation import check_random_state
 from smac.callbacks import IncorporateRunResultCallback
 from smac.optimizer.smbo import SMBO
 from smac.runhistory.runhistory import RunInfo, RunValue
@@ -57,7 +56,7 @@ def __init__(
         max_iterations: Optional[int],
         read_at_most: int,
         ensemble_memory_limit: Optional[int],
-        random_state: int,
+        random_state: Union[int, np.random.RandomState],
         logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
         pynisher_context: str = 'fork',
     ):
@@ -228,7 +227,7 @@ def build_ensemble(
                     precision=self.precision,
                     memory_limit=self.ensemble_memory_limit,
                     read_at_most=self.read_at_most,
-                    random_state=self.seed,
+                    random_state=self.random_state,
                     end_at=self.start_time + self.time_left_for_ensembles,
                     iteration=self.iteration,
                     return_predictions=False,
@@ -266,15 +265,15 @@ def fit_and_return_ensemble(
     max_models_on_disc: Union[float, int],
     seed: int,
     precision: int,
-    memory_limit: Optional[int],
     read_at_most: int,
-    random_state: int,
     end_at: float,
     iteration: int,
     return_predictions: bool,
     pynisher_context: str,
     logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
     unit_test: bool = False,
+    memory_limit: Optional[int] = None,
+    random_state: Optional[Union[int, np.random.RandomState]] = None,
 ) -> Tuple[
         List[Tuple[int, float, float, float]],
         int,
@@ -318,8 +317,6 @@ def fit_and_return_ensemble(
             random seed
         precision: [16,32,64,128]
             precision of floats to read the predictions
-        memory_limit: Optional[int]
-            memory limit in mb. If ``None``, no memory limit is enforced.
         read_at_most: int
             read at most n new prediction files in each iteration
         end_at: float
@@ -329,13 +326,17 @@ def fit_and_return_ensemble(
             The current iteration
         pynisher_context: str
             Context to use for multiprocessing, can be either fork, spawn or forkserver.
-        logger_port: int
+        logger_port: int = DEFAULT_TCP_LOGGING_PORT
             The port where the logging server is listening to.
-        unit_test: bool
+        unit_test: bool = False
             Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError.
             Having this is very bad coding style, but I did not find a way to make
             unittest.mock work through the pynisher with all spawn contexts. If you know a
             better solution, please let us know by opening an issue.
+        memory_limit: Optional[int] = None
+            memory limit in mb. If ``None``, no memory limit is enforced.
+        random_state: Optional[int | RandomState] = None
+            A random state used for the ensemble selection process.
 
     Returns
     -------
@@ -376,15 +377,15 @@ def __init__(
         task_type: int,
         metric: Scorer,
         ensemble_size: int = 10,
-        ensemble_nbest: int = 100,
+        ensemble_nbest: Union[int, float] = 100,
         max_models_on_disc: int = 100,
         performance_range_threshold: float = 0,
         seed: int = 1,
         precision: int = 32,
         memory_limit: Optional[int] = 1024,
         read_at_most: int = 5,
-        random_state: Optional[Union[int, np.random.RandomState]] = None,
         logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+        random_state: Optional[Union[int, np.random.RandomState]] = None,
         unit_test: bool = False,
     ):
         """
@@ -400,14 +401,14 @@ def __init__(
                 type of ML task
             metric: str
                 name of metric to compute the loss of the given predictions
-            ensemble_size: int
+            ensemble_size: int = 10
                 maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection)
-            ensemble_nbest: int/float
+            ensemble_nbest: int | float = 100
                 if int: consider only the n best prediction
                 if float: consider only this fraction of the best models
-                Both wrt to validation predictions
+                Both with respect to the validation predictions
                 If performance_range_threshold > 0, might return less models
-            max_models_on_disc: int
+            max_models_on_disc: int = 100
                Defines the maximum number of models that are kept in the disc.
                If int, it must be greater or equal than 1, and dictates the max number of
                models to keep.
@@ -417,23 +418,25 @@ def __init__(
                Models and predictions of the worst-performing models will be deleted then.
                If None, the feature is disabled.
                It defines an upper bound on the models that can be used in the ensemble.
-            performance_range_threshold: float
+            performance_range_threshold: float = 0
                 Keep only models that are better than:
                     dummy + (best - dummy)*performance_range_threshold
                 E.g dummy=2, best=4, thresh=0.5 --> only consider models with loss > 3
                 Will at most return the minimum between ensemble_nbest models,
                 and max_models_on_disc. Might return less
-            seed: int
-                random seed
-            precision: [16,32,64,128]
+            seed: int = 1
+                random seed that is used as part of the filename
+            precision: int in [16,32,64,128] = 32
                 precision of floats to read the predictions
-            memory_limit: Optional[int]
+            memory_limit: Optional[int] = 1024
                 memory limit in mb. If ``None``, no memory limit is enforced.
-            read_at_most: int
+            read_at_most: int = 5
                 read at most n new prediction files in each iteration
-            logger_port: int
+            logger_port: int = DEFAULT_TCP_LOGGING_PORT
                 port that receives logging records
-            unit_test: bool
+            random_state: Optional[int | RandomState] = None
+                An int or RandomState object used for generating the ensemble.
+            unit_test: bool = False
                 Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError.
                 Having this is very bad coding style, but I did not find a way to make
                 unittest.mock work through the pynisher with all spawn contexts. If you know a
@@ -475,7 +478,7 @@ def __init__(
         self.precision = precision
         self.memory_limit = memory_limit
         self.read_at_most = read_at_most
-        self.random_state = check_random_state(random_state)
+        self.random_state = random_state
         self.unit_test = unit_test
 
         # Setup the logger
 
@@ -1,9 +1,11 @@
 import random
 from collections import Counter
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import numpy as np
 
+from sklearn.utils import check_random_state
+
 from autosklearn.constants import TASK_TYPES
 from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
 from autosklearn.metrics import Scorer, calculate_loss
@@ -16,15 +18,46 @@ def __init__(
         ensemble_size: int,
         task_type: int,
         metric: Scorer,
-        random_state: np.random.RandomState,
         bagging: bool = False,
         mode: str = 'fast',
+        random_state: Optional[Union[int, np.random.RandomState]] = None,
     ) -> None:
+        """ An ensemble of selected algorithms
+
+        Fitting an EnsembleSelection generates an ensemble from the models
+        generated during the search process. Can be further used for prediction.
+
+        Parameters
+        ----------
+        task_type: int
+            An identifier indicating which task is being performed.
+        metric: Scorer
+            The metric used to evaluate the models
+        bagging: bool = False
+            Whether to use bagging in ensemble selection
+        mode: str in ['fast', 'slow'] = 'fast'
+            Which kind of ensemble generation to use
+            *   'slow' - The original method used in Rich Caruana's ensemble selection.
+            *   'fast' - A faster version of Rich Caruana's ensemble selection.
+
+        random_state: Optional[int | RandomState] = None
+            The random_state used for ensemble selection.
+            *   None - Uses numpy's default RandomState object
+            *   int - Successive calls to fit will produce the same results
+            *   RandomState - Truly random, each call to fit will produce
+                              different results, even with the same object.
+        """
         self.ensemble_size = ensemble_size
         self.task_type = task_type
         self.metric = metric
         self.bagging = bagging
         self.mode = mode
+
+        # Behaviour similar to sklearn
+        #   int - Deterministic with successive calls to fit
+        #   RandomState - Successive calls to fit will produce differences
+        #   None - Uses numpy's global singleton RandomState
+        # https://scikit-learn.org/stable/common_pitfalls.html#controlling-randomness
         self.random_state = random_state
 
     def __getstate__(self) -> Dict[str, Any]:
@@ -84,6 +117,7 @@ def _fast(
     ) -> None:
         """Fast version of Rich Caruana's ensemble selection method."""
         self.num_input_models_ = len(predictions)
+        rand = check_random_state(self.random_state)
 
         ensemble = []  # type: List[np.ndarray]
         trajectory = []
@@ -143,7 +177,9 @@ def _fast(
                 )
 
             all_best = np.argwhere(losses == np.nanmin(losses)).flatten()
-            best = self.random_state.choice(all_best)
+
+            best = rand.choice(all_best)
+
             ensemble.append(predictions[best])
             trajectory.append(losses[best])
             order.append(best)
 
@@ -46,7 +46,7 @@ class MyDummyClassifier(DummyClassifier):
     def __init__(
         self,
         config: Configuration,
-        random_state: np.random.RandomState,
+        random_state: Optional[Union[int, np.random.RandomState]],
         init_params: Optional[Dict[str, Any]] = None,
         dataset_properties: Dict[str, Any] = {},
         include: Optional[List[str]] = None,
@@ -102,7 +102,7 @@ class MyDummyRegressor(DummyRegressor):
     def __init__(
         self,
         config: Configuration,
-        random_state: np.random.RandomState,
+        random_state: Optional[Union[int, np.random.RandomState]],
         init_params: Optional[Dict[str, Any]] = None,
         dataset_properties: Dict[str, Any] = {},
         include: Optional[List[str]] = None,
 
@@ -1106,9 +1106,11 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
                 X_transformed = check_array(X_transformed,
                                             force_all_finite=True,
                                             accept_sparse='csr')
-                rs = np.random.RandomState(42)
                 indices = np.arange(X_transformed.shape[0])
+
+                rs = np.random.RandomState(42)
                 rs.shuffle(indices)
+
                 # TODO Shuffle inplace
                 X_transformed = X_transformed[indices]
                 y_transformed = y[indices]
 
@@ -8,7 +8,6 @@
 import scipy.sparse
 
 from sklearn.pipeline import Pipeline
-from sklearn.utils.validation import check_random_state
 
 from .components.base import AutoSklearnChoice, AutoSklearnComponent
 import autosklearn.pipeline.create_searchspace_util
@@ -43,6 +42,7 @@ def __init__(self, config=None, steps=None, dataset_properties=None,
         self.exclude = exclude if exclude is not None else {}
         self.dataset_properties = dataset_properties if \
             dataset_properties is not None else {}
+        self.random_state = random_state
 
         if steps is None:
             self.steps = self._get_pipeline_steps(dataset_properties=dataset_properties)
@@ -73,10 +73,6 @@ def __init__(self, config=None, steps=None, dataset_properties=None,
 
         self.set_hyperparameters(self.config, init_params=init_params)
 
-        if random_state is None:
-            self.random_state = check_random_state(1)
-        else:
-            self.random_state = check_random_state(random_state)
         super().__init__(steps=self.steps)
 
         self._additional_run_info = {}