diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 4fecec7d..3d52b250 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -2,7 +2,7 @@ name: Test Pull Requests -on: [push, pull_request] +on: [push] jobs: Tests: @@ -11,36 +11,42 @@ jobs: strategy: matrix: include: - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" RUN_CODECOV: true - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true USE_SINGULARITY: false - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" - - python-version: 3.7 + - python-version: "3.7" DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false - - python-version: 3.8 + - python-version: "3.8" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true SINGULARITY_VERSION: "3.8" - - python-version: 3.9 + - python-version: "3.9" + DISPLAY_NAME: "Singularity Tests" + RUN_TESTS: true + USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + + - python-version: "3.10" DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true @@ -63,7 +69,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: "${{ matrix.python-version }}" - name: Set up Go for Singularity if: matrix.USE_SINGULARITY == true uses: actions/setup-go@v2 @@ -78,4 +84,4 @@ jobs: python -m pip install --upgrade pip chmod +x ci_scripts/install.sh && source ./ci_scripts/install.sh - name: Run Tests - run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh \ No newline at end of file + run: chmod +x ci_scripts/script.sh && source ./ci_scripts/script.sh diff --git a/README.md b/README.md index ec0a442e..96dd406d 100644 --- a/README.md +++ b/README.md @@ -149,3 +149,19 @@ See whether in `~/.singularity/instances/sing/$HOSTNAME/*/` there is a file that **Note:** If you are looking for a different or older version of our benchmarking library, you might be looking for [HPOlib1.5](https://github.com/automl/HPOlib1.5) + +## Reference + +If you use HPOBench, please cite the following paper: + +```bibtex +@inproceedings{ + eggensperger2021hpobench, + title={{HPOB}ench: A Collection of Reproducible Multi-Fidelity Benchmark Problems for {HPO}}, + author={Katharina Eggensperger and Philipp M{\"u}ller and Neeratyoy Mallik and Matthias Feurer and Rene Sass and Aaron Klein and Noor Awad and Marius Lindauer and Frank Hutter}, + booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + year={2021}, + url={https://openreview.net/forum?id=1k4rJYEwda-} +} +``` + diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 2d229f74..d361600d 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -4,14 +4,24 @@ install_packages="" if [[ "$RUN_TESTS" == "true" ]]; then echo "Install tools for testing" - install_packages="${install_packages}xgboost,pytest,test_paramnet,test_tabular_datamanager," + install_packages="${install_packages}pytest,test_tabular_datamanager," pip install codecov - # The param net benchmark does not work with a scikit-learn version != 0.23.2. 
(See notes in the benchmark) - # To make sure that no newer version is installed, we install it before the other requirements. - # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. - echo "Install the right scikit-learn function for the param net tests." - pip install --upgrade scikit-learn==0.23.2 + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # The param net benchmark does not work with a scikit-learn version != 0.23.2. (See notes in the benchmark) + # To make sure that no newer version is installed, we install it before the other requirements. + # Since we are not using a "--upgrade" option later on, pip skips to install another scikit-learn version. + echo "Install the right scikit-learn function for the param net tests." + pip install --upgrade scikit-learn==0.23.2 + install_packages="${install_packages}xgboost,test_paramnet," + else + echo "Skip installing the extra paramnet tests." + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. + install_packages="${install_packages}xgboost_310," + fi + else echo "Skip installing tools for testing" fi @@ -35,7 +45,16 @@ if [[ "$RUN_LOCAL_EXAMPLES" == "true" ]]; then echo "Install packages for local examples" echo "Install swig" sudo apt-get update && sudo apt-get install -y build-essential swig - install_packages="${install_packages}xgboost," + + PYVERSION=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [[ "${PYVERSION}" != "310" ]]; then + # For 3.10, we need a different pandas version - this comes as a requirement for the old xgboost benchmark. + # building pandas<=1.5.0 does not work with 3.10 anymore. -> install a different version. 
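For reference, the `sed` pipeline above reduces the interpreter version to its major and minor digits ("37", "38", "39", "310"), which is what the `xgboost` vs `xgboost_310` branch keys on. The snippet below is only an illustrative Python equivalent of that check, not part of the CI scripts.

```python
# Illustrative equivalent of the PYVERSION computed by the `sed` expression above:
# major and minor version digits concatenated, e.g. "310" on Python 3.10.
import sys

pyversion = f"{sys.version_info.major}{sys.version_info.minor}"
print(pyversion)            # e.g. "310"
print(pyversion != "310")   # mirrors the branch choosing `xgboost` vs `xgboost_310`
```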
+ install_packages="${install_packages}xgboost," + else + install_packages="${install_packages}xgboost_310," + fi + else echo "Skip installing packages for local examples" fi diff --git a/extra_requirements/nasbench_1shot1.json b/extra_requirements/nasbench_1shot1.json index 7523d0f2..b008c789 100644 --- a/extra_requirements/nasbench_1shot1.json +++ b/extra_requirements/nasbench_1shot1.json @@ -1,3 +1,3 @@ { - "nasbench_1shot1": ["tensorflow==1.15.0","matplotlib","seaborn", "networkx", "tqdm"] + "nasbench_1shot1": ["protobuf==3.20.1", "tensorflow==1.15.0", "matplotlib", "seaborn", "networkx", "tqdm"] } \ No newline at end of file diff --git a/extra_requirements/tests.json b/extra_requirements/tests.json index 6c27be97..b25d6755 100644 --- a/extra_requirements/tests.json +++ b/extra_requirements/tests.json @@ -2,5 +2,5 @@ "codestyle": ["pycodestyle","flake8","pylint"], "pytest": ["pytest>=4.6","pytest-cov"], "test_paramnet": ["tqdm", "scikit-learn==0.23.2"], - "test_tabular_datamanager": ["pyarrow", "fastparquet"] + "test_tabular_datamanager": ["tqdm","pyarrow", "fastparquet"] } \ No newline at end of file diff --git a/extra_requirements/xgboost.json b/extra_requirements/xgboost.json index 2789d2ef..eefc920c 100644 --- a/extra_requirements/xgboost.json +++ b/extra_requirements/xgboost.json @@ -1,3 +1,4 @@ { - "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"] + "xgboost": ["xgboost==0.90","pandas>=1.0.0,<1.1.5","openml==0.10.2","scikit-learn>=0.18.1"], + "xgboost_310": ["xgboost","pandas","openml==0.10.2","scikit-learn>=0.18.1"] } \ No newline at end of file diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json index 77bea14d..10f4e390 100644 --- a/extra_requirements/yahpo_gym.json +++ b/extra_requirements/yahpo_gym.json @@ -1,3 +1,4 @@ { - "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"] + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"], + "yahpo_gym_raw": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym", "rpy2>=3.5.0", "openml==0.10.2", "gitpython>=3.1"] } diff --git a/hpobench/abstract_benchmark.py b/hpobench/abstract_benchmark.py index 57e837c5..6a2942af 100644 --- a/hpobench/abstract_benchmark.py +++ b/hpobench/abstract_benchmark.py @@ -1,20 +1,20 @@ """ Base-class of all benchmarks """ import abc -from typing import Union, Dict, List, Tuple import functools - import logging +from typing import Union, Dict, List, Tuple + import ConfigSpace import numpy as np - from ConfigSpace.util import deactivate_inactive_hyperparameters + from hpobench.util import rng_helper logger = logging.getLogger('AbstractBenchmark') -class AbstractBenchmark(abc.ABC, metaclass=abc.ABCMeta): +class _BaseAbstractBenchmark(abc.ABC, metaclass=abc.ABCMeta): def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs): """ @@ -34,7 +34,7 @@ def __init__(self, rng: Union[int, np.random.RandomState, None] = None, **kwargs np.random.RandomState with seed `rng` is created. If type is None, create a new random state. 
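The `rng` handling described in this docstring is delegated to `hpobench.util.rng_helper.get_rng`. The following is a minimal sketch of the behaviour the docstring describes (int seeds a new RandomState, a RandomState is used as-is, None creates a fresh one); the real helper lives in `rng_helper` and may cover additional cases.

```python
from typing import Union

import numpy as np


def get_rng_sketch(rng: Union[int, np.random.RandomState, None] = None) -> np.random.RandomState:
    """Sketch of the rng semantics described above; not the actual hpobench helper."""
    if isinstance(rng, np.random.RandomState):
        return rng                          # use the passed random state directly
    if isinstance(rng, int):
        return np.random.RandomState(rng)   # seed a new random state with `rng`
    return np.random.RandomState()          # rng is None -> create a new random state
```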
""" - + super(_BaseAbstractBenchmark, self).__init__(**kwargs) self.rng = rng_helper.get_rng(rng=rng) self.configuration_space = self.get_configuration_space(self.rng.randint(0, 10000)) self.fidelity_space = self.get_fidelity_space(self.rng.randint(0, 10000)) @@ -210,20 +210,14 @@ def _check_and_cast_fidelity(fidelity: Union[dict, ConfigSpace.Configuration, No fidelity_space.check_configuration(fidelity) return fidelity - @staticmethod - def _check_return_values(return_values: Dict) -> Dict: - """ - The return values should contain the fields `function_value` and `cost`. - """ - assert 'function_value' in return_values.keys() - assert 'cost' in return_values.keys() - - return return_values - def __call__(self, configuration: Dict, **kwargs) -> float: """ Provides interface to use, e.g., SciPy optimizers """ return self.objective_function(configuration, **kwargs)['function_value'] + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + raise NotImplementedError() + @staticmethod @abc.abstractmethod def get_configuration_space(seed: Union[int, None] = None) -> ConfigSpace.ConfigurationSpace: @@ -269,74 +263,39 @@ def get_meta_information() -> Dict: raise NotImplementedError() -class AbstractMultiObjectiveBenchmark(AbstractBenchmark): +class AbstractSingleObjectiveBenchmark(_BaseAbstractBenchmark): """ - Abstract Benchmark class for multi-objective benchmarks. - The only purpose of this class is to point out to users that this benchmark returns multiple - objective function values. + Abstract Benchmark class for single-objective benchmarks. + This corresponds to the old AbstractBenchmark class. + + The only purpose of this class is to point out to users that this benchmark returns only a single + objective function value. When writing a benchmark, please make sure to inherit from the correct abstract class. """ - @abc.abstractmethod - def objective_function(self, configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - Objective function. - - Override this function to provide your multi-objective benchmark function. This - function will be called by one of the evaluate functions. For - flexibility, you have to return a dictionary with the only mandatory - key being `function_values`, the objective function values for the - `configuration` which was passed. By convention, all benchmarks are - minimization problems. - `function_value` is a dictionary that contains all available criteria. + @staticmethod + def _check_return_values(return_values: Dict) -> Dict: + """ + The return values should contain the fields `function_value` and `cost`. + """ + assert 'function_value' in return_values.keys() + assert 'cost' in return_values.keys() + return return_values - Parameters - ---------- - configuration : Dict - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - It might be useful to pass a `rng` argument to the function call to - bypass the default "seed" generator. Only using the default random - state (`self.rng`) could lead to an overfitting towards the - `self.rng`'s seed. - Returns - ------- - Dict - Must contain at least the key `function_value` and `cost`. - Note that `function_value` should be a Dict here. 
- """ - raise NotImplementedError() +# Ensure compatibility with older versions of the HPOBench +AbstractBenchmark = AbstractSingleObjectiveBenchmark - @abc.abstractmethod - def objective_function_test(self, configuration: Union[ConfigSpace.Configuration, Dict], - fidelity: Union[Dict, ConfigSpace.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - If there is a different objective function for offline testing, e.g - testing a machine learning on a hold extra test set instead - on a validation set override this function here. - Parameters - ---------- - configuration : Dict - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - see :py:func:`~HPOBench.abstract_benchmark.objective_function` +class AbstractMultiObjectiveBenchmark(_BaseAbstractBenchmark): + """ + Abstract Benchmark class for multi-objective benchmarks. + The only purpose of this class is to point out to users that this benchmark returns multiple + objective function values. - Returns - ------- - Dict - Must contain at least the key `function_value` and `cost`. - """ - raise NotImplementedError() + When writing a benchmark, please make sure to inherit from the correct abstract class. + """ @staticmethod def _check_return_values(return_values: Dict) -> Dict: diff --git a/hpobench/benchmarks/ml/__init__.py b/hpobench/benchmarks/ml/__init__.py index 64e399cd..e69de29b 100644 --- a/hpobench/benchmarks/ml/__init__.py +++ b/hpobench/benchmarks/ml/__init__.py @@ -1,22 +0,0 @@ -from hpobench.benchmarks.ml.histgb_benchmark import HistGBBenchmark, HistGBBenchmarkBB, HistGBBenchmarkMF -from hpobench.benchmarks.ml.lr_benchmark import LRBenchmark, LRBenchmarkBB, LRBenchmarkMF -from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark, NNBenchmarkBB, NNBenchmarkMF -from hpobench.benchmarks.ml.rf_benchmark import RandomForestBenchmark, RandomForestBenchmarkBB, \ - RandomForestBenchmarkMF -from hpobench.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF -from hpobench.benchmarks.ml.tabular_benchmark import TabularBenchmark - -try: - from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF -except ImportError: - pass - - -__all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', - 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', - 'NNBenchmark', 'NNBenchmarkBB', 'NNBenchmarkMF', - 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', - 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', - 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', - ] diff --git a/hpobench/benchmarks/ml/lr_benchmark.py b/hpobench/benchmarks/ml/lr_benchmark.py index 8c317111..aa7aa162 100644 --- a/hpobench/benchmarks/ml/lr_benchmark.py +++ b/hpobench/benchmarks/ml/lr_benchmark.py @@ -4,30 +4,38 @@ 0.0.1: * First implementation of the LR Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. 
""" - +import time from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.linear_model import SGDClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class LRBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - - super(LRBenchmark, self).__init__(task_id, rng, valid_size, data_path) - self.cache_size = 500 + """ Multi-multi-fidelity Logisitic Regression Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(LRBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -44,7 +52,8 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp ]) return cs - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - iterations + data subsample @@ -53,17 +62,11 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: """Fidelity space available --- specifies the fidelity dimensions - - For SVM, only a single fidelity exists, i.e., subsample fraction. - if fidelity_choice == 0 - uses the entire data (subsample=1), reflecting the black-box setup - else - parameterizes the fraction of data to subsample - """ - assert iter_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -79,14 +82,16 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype 'subsample', lower=0.1, upper=1.0, default_value=1.0, log=False ) ) - iter = fidelity1[iter_choice] subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng @@ -103,13 +108,185 @@ def init_model(self, config: Union[CS.Configuration, Dict], learning_rate="adaptive", tol=None, random_state=rng, - ) return model + def get_model_size(self, model: SGDClassifier = None) -> float: + """ Returns the dimensionality as a proxy for the number of model parameters + + Logistic Regression models have a fixed number of parameters given a dataset. Model size is + being approximated as the number of beta parameters required as the model support plus the + intercept. This depends on the dataset and not on the trained model. 
+ + Parameters + ---------- + model : SGDClassifier + Trained LR model. This parameter is required to maintain function signature. + + Returns + ------- + float + """ + ndims = self.train_X.shape[1] + # accounting for the intercept + ndims += 1 + return ndims + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
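The training loop below checkpoints the learning curve at iteration counts returned by `self._get_lc_spacing(model.max_iter, lc_every_k)`. That helper is not part of this hunk; the sketch below is a hypothetical implementation consistent with how it is used (a checkpoint every `k` iterations, always ending at `max_iter`).

```python
# Hypothetical sketch of the `_get_lc_spacing` helper used below (not shown in this diff).
def _get_lc_spacing_sketch(max_iter: int, k: int = 1):
    spacing = list(range(k, max_iter + 1, k))
    if not spacing or spacing[-1] != max_iter:
        spacing.append(max_iter)   # make sure the final model state is always recorded
    return spacing


print(_get_lc_spacing_sketch(10, 3))   # [3, 6, 9, 10]
```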
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + # IMPORTANT to allow partial_fit + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) + # adding all partial fit times + model_fit_time += time.time() - start + iter_start = iter_end + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + # sums the time taken to evaluate and collect data for the learning curves + lc_time += time.time() - lc_start + else: + # default training as per the base benchmark template + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, 
model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class LRBenchmarkBB(LRBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the LRBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -119,7 +296,10 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class LRBenchmarkMF(LRBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the LRBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/nn_benchmark.py b/hpobench/benchmarks/ml/nn_benchmark.py index 06634661..4263278f 100644 --- a/hpobench/benchmarks/ml/nn_benchmark.py +++ b/hpobench/benchmarks/ml/nn_benchmark.py @@ -4,28 +4,39 @@ 0.0.1: * First implementation of the NN Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.neural_network import MLPClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class NNBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(NNBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Multi-Layer Perceptron Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(NNBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -63,8 +74,11 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidelity_space @staticmethod - def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: - + def _get_fidelity_choices( + iter_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: + """Fidelity space available --- specifies the fidelity dimensions + """ fidelity1 = dict( fixed=CS.Constant('iter', value=243), variable=CS.UniformIntegerHyperparameter( @@ -81,11 +95,13 @@ def _get_fidelity_choices(iter_choice: str, subsample_choice: str) -> Tuple[Hype subsample = fidelity2[subsample_choice] return iter, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] 
= None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ -99,6 +115,7 @@ def init_model(self, config: Union[CS.Configuration, Dict], config.pop("depth") config.pop("width") hidden_layers = [width] * depth + # TODO: check for iteration length and edit n_iter_no_change maybe model = MLPClassifier( **config, hidden_layer_sizes=hidden_layers, @@ -109,9 +126,175 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: MLPClassifier) -> float: + """ Returns the total number of trained parameters in the MLP model + + Parameters + ---------- + model : MLPClassifier + Trained MLP model. + + Returns + ------- + float + """ + nparams = 0 + for layer in model.coefs_: + nparams += layer.shape[0] * layer.shape[1] + for layer in model.intercepts_: + nparams += layer.shape[0] + return nparams + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
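Assuming the ML benchmark template forwards extra keyword arguments from `objective_function` down to `_train_objective` (as the `**kwargs` in the signature above suggests), collecting learning curves might look like the sketch below. The task id and fidelity values are placeholders.

```python
# Hedged usage sketch: assumes objective_function forwards `get_learning_curve`,
# `lc_every_k` and `record_stats` to _train_objective. Task id 31 is a placeholder.
from hpobench.benchmarks.ml.nn_benchmark import NNBenchmark

benchmark = NNBenchmark(task_id=31, rng=1)
config = benchmark.get_configuration_space(seed=1).sample_configuration()
result = benchmark.objective_function(
    config,
    fidelity={'iter': 81, 'subsample': 0.5},
    get_learning_curve=True,   # record validation/test loss at intermediate iterations
    lc_every_k=9,              # evaluate the curve every 9 iterations
    record_stats=False,        # skip evaluation on the training split to save time
)
print(result['function_value'], result['cost'])
```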
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + # IMPORTANT to allow partial_fit + model.warm_start = True + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + lc_spacings = self._get_lc_spacing(model.max_iter, lc_every_k) + iter_start = 0 + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # trains model for k steps + for j in range(iter_end - iter_start): + model.partial_fit( + train_X[train_idx], + train_y.iloc[train_idx], + np.unique(train_y.iloc[train_idx]) + ) + # adding all partial fit times + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class NNBenchmarkBB(NNBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the NNBenchmark + """ + @staticmethod + def 
get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) @@ -121,7 +304,10 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS class NNBenchmarkMF(NNBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the NNBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - iterations diff --git a/hpobench/benchmarks/ml/rbv2_benchmark.py b/hpobench/benchmarks/ml/rbv2_benchmark.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/ml/rf_benchmark.py b/hpobench/benchmarks/ml/rf_benchmark.py index 596f03b6..b6874788 100644 --- a/hpobench/benchmarks/ml/rf_benchmark.py +++ b/hpobench/benchmarks/ml/rf_benchmark.py @@ -4,28 +4,39 @@ 0.0.1: * First implementation of the RF Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ +import time from copy import deepcopy from typing import Union, Tuple, Dict import ConfigSpace as CS import numpy as np +import pandas as pd from ConfigSpace.hyperparameters import Hyperparameter from sklearn.ensemble import RandomForestClassifier +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class RandomForestBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(RandomForestBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity Random Forest Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(RandomForestBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -54,12 +65,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -70,7 +85,6 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'n_estimators', lower=16, upper=512, default_value=512, log=False ) ) - fidelity2 = dict( fixed=CS.Constant('subsample', value=1), variable=CS.UniformFloatHyperparameter( @@ -81,11 +95,13 @@ def 
_get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): config = config.get_dictionary() @@ -103,23 +119,194 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: RandomForestClassifier) -> float: + """ Returns the total number of decision nodes in the entire Random Forest model + + Parameters + ---------- + model : RandomForestClassifier + Trained RF model. + + Returns + ------- + float + """ + nodes = 0 + for tree in model.estimators_: + # total number of nodes in the tree (internal + leaf) + nodes += tree.tree_.node_count + return nodes + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. Optionally, the learning curves are collected. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + Enabling True, implies that the for each iteration, the model will be evaluated on both + the validation and test sets, optionally on the training set also. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
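The learning-curve branch below grows the forest incrementally by raising `n_estimators` with `warm_start=True`, so each `fit` call only adds new trees. A standalone scikit-learn illustration of that pattern (dataset and tree counts are arbitrary):

```python
# Standalone illustration of the warm_start pattern used in the training loop below.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
model = RandomForestClassifier(n_estimators=0, warm_start=True, random_state=0)
for n_trees in (8, 16, 32):            # checkpoints at which a learning curve could be read
    model.n_estimators = n_trees       # total number of trees after this fit() call
    model.fit(X, y)                    # only the missing trees are fitted
    print(n_trees, len(model.estimators_))   # grows: 8, 16, 32
```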
+ """ + if rng is not None: + rng = get_rng(rng, self.rng) + + # initializing model + model = self.init_model(config, fidelity, rng) + + # preparing data + if evaluation == "valid": + train_X = self.train_X + train_y = self.train_y + elif evaluation == "test": + train_X = np.vstack((self.train_X, self.valid_X)) + train_y = pd.concat((self.train_y, self.valid_y)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx + + # shuffling data + if shuffle: + train_idx = self.shuffle_data_idx(train_idx, rng) + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] + train_y = train_y.iloc[train_idx] + + # subsample here: + # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 + if self.lower_bound_train_size is None: + self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) + subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) + train_idx = self.rng.choice( + np.arange(len(train_X)), size=int( + subsample * len(train_X) + ) + ) + # fitting the model with subsampled data + if get_learning_curve: + lc_spacings = self._get_lc_spacing(model.n_estimators, lc_every_k) + # IMPORTANT to allow refitting with more estimators + model.warm_start = True + model.n_estimators = 0 + lc_time = 0.0 + model_fit_time = 0.0 + learning_curves = dict(train=[], valid=[], test=[]) + iter_start = 0 + # for i in range(fidelity['n_estimators']): + for i in range(len(lc_spacings)): + iter_end = lc_spacings[i] + start = time.time() + # adds k new estimators to the model for training + model.n_estimators += iter_end - iter_start + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time += time.time() - start + lc_start = time.time() + if record_stats: + train_pred = model.predict(train_X) + train_loss = 1 - self.scorers['acc']( + train_y, train_pred, **self.scorer_args['acc'] + ) + learning_curves['train'].append(train_loss) + val_pred = model.predict(self.valid_X) + val_loss = 1 - self.scorers['acc']( + self.valid_y, val_pred, **self.scorer_args['acc'] + ) + learning_curves['valid'].append(val_loss) + test_pred = model.predict(self.test_X) + test_loss = 1 - self.scorers['acc']( + self.test_y, test_pred, **self.scorer_args['acc'] + ) + learning_curves['test'].append(test_loss) + lc_time += time.time() - lc_start + else: + learning_curves = None + lc_time = None + start = time.time() + model.fit(train_X[train_idx], train_y.iloc[train_idx]) + model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start + # computing statistics on training data + scores = dict() + score_cost = dict() + for k, v in self.scorers.items(): + scores[k] = 0.0 + score_cost[k] = 0.0 + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time + train_loss = 1 - scores["acc"] + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time + class 
RandomForestBenchmarkBB(RandomForestBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the RandomForestBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space class RandomForestBenchmarkMF(RandomForestBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the RandomForestBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - RandomForestBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + RandomForestBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space diff --git a/hpobench/benchmarks/ml/svm_benchmark.py b/hpobench/benchmarks/ml/svm_benchmark.py index 9462442f..c7b6a816 100644 --- a/hpobench/benchmarks/ml/svm_benchmark.py +++ b/hpobench/benchmarks/ml/svm_benchmark.py @@ -4,6 +4,10 @@ 0.0.1: * First implementation of the new SVM Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ from typing import Union, Dict @@ -15,18 +19,21 @@ from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class SVMBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(SVMBenchmark, self).__init__(task_id, rng, valid_size, data_path) - - self.cache_size = 200 + """ Multi-multi-fidelity SVM Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(SVMBenchmark, self).__init__(task_id, valid_size, rng, data_path) + self.cache_size = 1024 # in MB @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -54,7 +61,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: - + """Fidelity space available --- specifies the fidelity dimensions + """ assert subsample_choice in ['fixed', 'variable'] fidelity = dict( @@ -64,12 +72,14 @@ def _get_fidelity_choices(subsample_choice: str) -> Hyperparameter: ) ) subsample = fidelity[subsample_choice] - return subsample - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): # initializing model rng = self.rng if rng is None else rng if isinstance(config, CS.Configuration): @@ 
-81,9 +91,27 @@ def init_model(self, config: Union[CS.Configuration, Dict], ) return model + def get_model_size(self, model: SVC) -> float: + """ Returns the number of support vectors in the SVM model + + Parameters + ---------- + model : SVC + Trained SVM model. + + Returns + ------- + float + """ + nsupport = model.support_.shape[0] + return nsupport + class SVMBenchmarkBB(SVMBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the SVMBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameter( # uses the entire data (subsample=1), reflecting the black-box setup diff --git a/hpobench/benchmarks/ml/svm_benchmark_old.py b/hpobench/benchmarks/ml/svm_benchmark_old.py deleted file mode 100644 index 9aad5e44..00000000 --- a/hpobench/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,354 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Standardize the structure of the meta information - -0.0.1: -* First implementation - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -from scipy import sparse -from sklearn import pipeline -from sklearn import svm -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('SVMBenchmark') - - -class SupportVectorMachine(AbstractBenchmark): - """ - Hyperparameter optimization task to optimize the regularization - parameter C and the kernel parameter gamma of a support vector machine. - Both hyperparameters are optimized on a log scale in [-10, 10]. - The X_test data set is only used for a final offline evaluation of - a configuration. For that the validation and training data is - concatenated to form the whole training data set. - """ - - def __init__(self, task_id: Union[int, None] = None, - rng: Union[np.random.RandomState, int, None] = None): - """ - Parameters - ---------- - task_id : int, None - rng : np.random.RandomState, int, None - """ - super(SupportVectorMachine, self).__init__(rng=rng) - - self.task_id = task_id - self.cache_size = 200 # Cache for the SVC in MB - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # Sort data (Categorical + numerical) so that categorical and continous are not mixed. 
- categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM model - fidelity: Dict, None - Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : training loss - fidelity : used fidelities in this evaluation - """ - start_time = time.time() - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - # Split of dataset subset - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_size = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_size = fidelity['dataset_fraction'] - - train_size = int(train_size * len(self.train_idx)) - train_idx = self.train_idx[:train_size] - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - # Train support vector machine - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(self.x_train[train_idx], self.y_train[train_idx]) - - # Compute validation error - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - - cost = time.time() - start_time - - return {'function_value': float(val_loss), - "cost": cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity}} - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a SVM model with a given configuration on both the X_train - and validation data set and evaluates the model on the X_test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the SVM Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. 
- kwargs - - Returns - ------- - Dict - - function_value : X_test loss - cost : time to X_train and evaluate the model - info : Dict - train_valid_loss: Loss on the train+valid data set - fidelity : used fidelities in this evaluation - """ - assert np.isclose(fidelity['dataset_fraction'], 1), \ - f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start_time = time.time() - - # Concatenate training and validation dataset - if isinstance(self.x_train, sparse.csr.csr_matrix) or isinstance(self.x_valid, sparse.csr.csr_matrix): - data = sparse.vstack((self.x_train, self.x_valid)) - else: - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - # Transform hyperparameters to linear scale - hp_c = np.exp(float(configuration['C'])) - hp_gamma = np.exp(float(configuration['gamma'])) - - model = self.get_pipeline(hp_c, hp_gamma) - model.fit(data, targets) - - # Compute validation error - train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) - - # Compute test error - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - - cost = time.time() - start_time - - return {'function_value': float(test_loss), - "cost": cost, - 'info': {'train_valid_loss': float(train_valid_loss), - 'fidelity': fidelity}} - - def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - - model = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", MinMaxScaler(feature_range=(0, 1)), ~self.categorical_data)])), - ('svm', - svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) - ]) - return model - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the SVM Model - - For a detailed explanation of the hyperparameters: - https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), - CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), - ]) - # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the SupportVector Benchmark - - Fidelities - ---------- - dataset_fraction: float - [0.1, 1] - fraction of training data set to use - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None 
else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - ]) - return fidel_space - - def get_meta_information(self): - """ Returns the meta information for the benchmark """ - return {'name': 'Support Vector Machine', - 'references': ["@InProceedings{pmlr-v54-klein17a", - "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " - "Frank Hutter}, " - "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " - "Large Datasets}}" - "pages = {528--536}, year = {2017}," - "editor = {Aarti Singh and Jerry Zhu}," - "volume = {54}," - "series = {Proceedings of Machine Learning Research}," - "address = {Fort Lauderdale, FL, USA}," - "month = {20--22 Apr}," - "publisher = {PMLR}," - "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " - "url = {http://proceedings.mlr.press/v54/klein17a.html}, " - ], - 'code': 'https://github.com/automl/HPOlib1.5/blob/container/hpolib/benchmarks/ml/svm_benchmark.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } diff --git a/hpobench/benchmarks/ml/tabular_benchmark.py b/hpobench/benchmarks/ml/tabular_benchmark.py index 72e5fb31..342766b4 100644 --- a/hpobench/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/benchmarks/ml/tabular_benchmark.py @@ -4,6 +4,10 @@ 0.0.1: * First implementation of the Tabular Benchmark. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. """ from pathlib import Path @@ -17,7 +21,7 @@ from hpobench.dependencies.ml.ml_benchmark_template import metrics from hpobench.util.data_manager import TabularDataManager -__version__ = '0.0.1' +__version__ = '0.0.3' class TabularBenchmark(AbstractBenchmark): @@ -145,8 +149,8 @@ def _search_dataframe(self, row_dict, df): for i, param in enumerate(df.drop("result", axis=1).columns): mask *= df[param].values == row_dict[param] idx = np.where(mask) - assert len(idx) == 1, 'The query has resulted into mulitple matches. This should not happen. ' \ - f'The Query was {row_dict}' + assert len(idx) == 1, 'The query has resulted into mulitple matches. ' \ + 'This should not happen. The Query was {row_dict}' idx = idx[0][0] result = df.iloc[idx]["result"] return result @@ -163,7 +167,7 @@ def _objective( metric_str = ', '.join(list(metrics.keys())) assert metric in list(metrics.keys()), f"metric not found among: {metric_str}" score_key = f"{evaluation}_scores" - cost_key = f"{evaluation}_scores" + cost_key = f"{evaluation}_costs" key_path = dict() for name in self.configuration_space.get_hyperparameter_names(): diff --git a/hpobench/benchmarks/ml/xgboost_benchmark.py b/hpobench/benchmarks/ml/xgboost_benchmark.py index ae554628..234c2cee 100644 --- a/hpobench/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/benchmarks/ml/xgboost_benchmark.py @@ -4,7 +4,12 @@ 0.0.1: * First implementation of the new XGB Benchmarks. +0.0.2: +* Restructuring for consistency and to match ML Benchmark Template updates. +0.0.3: +* Adding Learning Curve support. 
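The one-character `cost_key` fix in the `tabular_benchmark.py` hunk above is easy to miss: previously the benchmark read its runtime from the same `*_scores` table that feeds the objective. A minimal stand-alone sketch of the lookup; the record layout and the metric name are illustrative assumptions, not the exact stored schema:

```python
# Illustrative record shaped like a stored tabular result (keys follow the hunk above).
result = {
    "val_scores":  {"acc": 0.91, "bal_acc": 0.89},
    "val_costs":   {"acc": 12.3, "bal_acc": 12.9},
    "test_scores": {"acc": 0.90, "bal_acc": 0.88},
    "test_costs":  {"acc": 13.1, "bal_acc": 13.4},
}

evaluation, metric = "val", "acc"
score_key = f"{evaluation}_scores"
cost_key = f"{evaluation}_costs"        # was f"{evaluation}_scores" before the fix

score = result[score_key][metric]       # feeds the returned objective value
cost = result[cost_key][metric]         # now a runtime rather than a second accuracy
print(score, cost)                      # 0.91 12.3
```

Before the fix, any runtime-aware comparison on the tabular benchmarks silently compared accuracies instead of costs.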
""" + from typing import Union, Tuple, Dict import ConfigSpace as CS @@ -12,18 +17,23 @@ import xgboost as xgb from ConfigSpace.hyperparameters import Hyperparameter +from hpobench.util.rng_helper import get_rng from hpobench.dependencies.ml.ml_benchmark_template import MLBenchmark -__version__ = '0.0.1' +__version__ = '0.0.3' class XGBoostBenchmark(MLBenchmark): - def __init__(self, - task_id: int, - rng: Union[np.random.RandomState, int, None] = None, - valid_size: float = 0.33, - data_path: Union[str, None] = None): - super(XGBoostBenchmark, self).__init__(task_id, rng, valid_size, data_path) + """ Multi-multi-fidelity XGBoost Benchmark + """ + def __init__( + self, + task_id: int, + valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, + data_path: Union[str, None] = None + ): + super(XGBoostBenchmark, self).__init__(task_id, valid_size, rng, data_path) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -52,12 +62,16 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-multi-fidelity) - ntrees + data subsample - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='variable') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='variable' + ) ) return fidelity_space @staticmethod - def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + def _get_fidelity_choices( + n_estimators_choice: str, subsample_choice: str + ) -> Tuple[Hyperparameter, Hyperparameter]: assert n_estimators_choice in ['fixed', 'variable'] assert subsample_choice in ['fixed', 'variable'] @@ -74,28 +88,31 @@ def _get_fidelity_choices(n_estimators_choice: str, subsample_choice: str) -> Tu 'subsample', lower=0.1, upper=1, default_value=1, log=False ) ) - n_estimators = fidelity1[n_estimators_choice] subsample = fidelity2[subsample_choice] return n_estimators, subsample - def init_model(self, - config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): - """ Function that returns the model initialized based on the configuration and fidelity - """ + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): + # initializing model + rng = self.rng if rng is None else get_rng(rng) + # xgb.XGBClassifier when trainied using the scikit-learn API of `fit`, requires + # random_state to be an integer and doesn't accept a RandomState + seed = rng.randint(1, 10**6) + if isinstance(config, CS.Configuration): config = config.get_dictionary() if isinstance(fidelity, CS.Configuration): fidelity = fidelity.get_dictionary() - - rng = rng if (rng is None or isinstance(rng, int)) else self.seed extra_args = dict( booster="gbtree", n_estimators=fidelity['n_estimators'], objective="binary:logistic", - random_state=rng, + random_state=seed, subsample=1 ) if self.n_classes > 2: @@ -108,23 +125,48 @@ def init_model(self, ) return model + def get_model_size(self, model: xgb.XGBClassifier) -> float: + """ Returns the total number of decision nodes in the sequence of Gradient Boosted trees + + Parameters + ---------- + model : xgb.XGBClassifier + Trained XGB model. 
+ + Returns + ------- + float + """ + nodes = model.get_booster().trees_to_dataframe().shape[0] + return nodes + class XGBoostBenchmarkBB(XGBoostBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Black-box version of the XGBoostBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # black-box setting (full fidelity) - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='fixed', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='fixed', subsample_choice='fixed' + ) ) return fidelity_space class XGBoostBenchmarkMF(XGBoostBenchmark): - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Multi-fidelity version of the XGBoostBenchmark + """ + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters( # gray-box setting (multi-fidelity) - ntrees - XGBoostBenchmark._get_fidelity_choices(n_estimators_choice='variable', subsample_choice='fixed') + XGBoostBenchmark._get_fidelity_choices( + n_estimators_choice='variable', subsample_choice='fixed' + ) ) return fidelity_space diff --git a/hpobench/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index f8730f52..00000000 --- a/hpobench/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,430 +0,0 @@ -""" - -Changelog: -========== -0.0.3 -* New container release due to a general change in the communication between container and HPOBench. - Works with HPOBench >= v0.0.8 - -0.0.2: -* Change the search space definiton to match the paper: (https://arxiv.org/pdf/1802.09596.pdf) - eta: [1e-5, 1] (def: 0.3) -> [2**-10, 1] (def: 0.3) - min_child_weight: [0,05, 10] (def: 1) -> [1, 2**7] (def: 1) - colsample_bytree: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - colsample_bylevel: [0,05, 1] (def: 1) -> [0.01, 1] (def: 1) - reg_lambda: [1e-5, 2] (def: 1) -> [2**-10, 2**10] (def: 1) - reg_alpha: [1e-5, 2] (def: 1e-5) -> [2**-10, 2**10] (def: 1) - max_depth: - -> [1, 15] (def: 6) - subsample_per_it: - -> [0.01, 1] (def: 1) - [booster: - -> [gbtree, gblinear, dart] (def: gbtree)] *) - - *) This parameter is only in the XGBoostExtendedBenchmark. Not in the XGBoostBenchmark class. - -* Increase the fidelity `n_estimators` - n_estimators [2, 128] (def: 128) -> [1, 256] (def: 256) - -* Add class to optimize also the used booster method: (gbtree, gblinear or dart) - We have introduced a new class, which adds the used booster as parameter to the configuration space. To read more - about booster, please take a look in the official XGBoost-documentation (https://xgboost.readthedocs.io/en/latest). - - -0.0.1: -* First implementation of a XGBoost Benchmark. 
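Stepping back to the new `xgboost_benchmark.py` above (before the removal of the legacy file continues): the three public classes differ only in which fidelities remain tunable, and since `get_fidelity_space` is static in all of them after this change, the spaces can be compared without instantiating a benchmark or downloading data. A hedged sketch, assuming HPOBench is installed with its ML extras:

```python
# Compare the fidelity spaces of the three XGBoost variants (illustration only).
from hpobench.benchmarks.ml.xgboost_benchmark import (
    XGBoostBenchmark,      # gray-box: n_estimators and subsample both variable
    XGBoostBenchmarkMF,    # multi-fidelity: only n_estimators variable
    XGBoostBenchmarkBB,    # black-box: both fidelities fixed
)

for cls in (XGBoostBenchmark, XGBoostBenchmarkMF, XGBoostBenchmarkBB):
    print(cls.__name__)
    print(cls.get_fidelity_space(seed=1))
```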
- - -""" - -import logging -import time -from typing import Union, Tuple, Dict, List - -import ConfigSpace as CS -import numpy as np -import xgboost as xgb -from sklearn import pipeline -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, make_scorer -from sklearn.preprocessing import OneHotEncoder - -import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.openml_data_manager import OpenMLHoldoutDataManager - -__version__ = '0.0.3' - -logger = logging.getLogger('XGBBenchmark') - - -class XGBoostBenchmark(AbstractBenchmark): - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - """ - - Parameters - ---------- - task_id : int, None - n_threads : int, None - rng : np.random.RandomState, int, None - """ - - super(XGBoostBenchmark, self).__init__(rng=rng) - self.n_threads = n_threads - self.task_id = task_id - self.accuracy_scorer = make_scorer(accuracy_score) - - self.x_train, self.y_train, self.x_valid, self.y_valid, self.x_test, self.y_test, variable_types = \ - self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) - - # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. - categorical_idx = np.argwhere(self.categorical_data) - continuous_idx = np.argwhere(~self.categorical_data) - sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() - self.categorical_data = self.categorical_data[sorting] - self.x_train = self.x_train[:, sorting] - self.x_valid = self.x_valid[:, sorting] - self.x_test = self.x_test[:, sorting] - - nan_columns = np.all(np.isnan(self.x_train), axis=0) - self.categorical_data = self.categorical_data[~nan_columns] - - self.x_train, self.x_valid, self.x_test, self.categories = \ - OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.x_train, self.x_valid, self.x_test, - is_categorical=self.categorical_data) - - # Determine the number of categories in the labels. - # In case of binary classification ``self.num_class`` has to be 1 for xgboost. - self.num_class = len(np.unique(np.concatenate([self.y_train, self.y_test, self.y_valid]))) - self.num_class = 1 if self.num_class == 2 else self.num_class - - self.train_idx = self.rng.choice(a=np.arange(len(self.x_train)), - size=len(self.x_train), - replace=False) - - # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] - # (https://arxiv.org/pdf/1605.07079.pdf), - # use 10 time the number of classes as lower bound for the dataset fraction - n_classes = np.unique(self.y_train).shape[0] - self.lower_bound_train_size = (10 * n_classes) / self.x_train.shape[0] - - def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: - """ Loads the data given a task or another source. """ - - assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' - 'overwrite the get_data method.') - - data_manager = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) - x_train, y_train, x_val, y_val, x_test, y_test = data_manager.load() - - return x_train, y_train, x_val, y_val, x_test, y_test, data_manager.variable_types - - def shuffle_data(self, rng=None): - """ Reshuffle the training data. 
If 'rng' is None, the training idx are shuffled according to the - class-random-state""" - random_state = rng_helper.get_rng(rng, self.rng) - random_state.shuffle(self.train_idx) - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model given a hyperparameter configuration and - evaluates the model on the validation set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost model - fidelity: Dict, None - Fidelity parameters for the XGBoost model, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. By default the class level random seed. - - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : validation loss - cost : time to train and evaluate the model - info : Dict - train_loss : trainings loss - fidelity : used fidelities in this evaluation - """ - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - if self.lower_bound_train_size > fidelity['dataset_fraction']: - train_data_fraction = self.lower_bound_train_size - logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' - f'Increase the fidelity from {fidelity["dataset_fraction"]:.8f} to ' - f'{self.lower_bound_train_size:.8f}') - else: - train_data_fraction = fidelity['dataset_fraction'] - - train_idx = self.train_idx[:int(len(self.train_idx) * train_data_fraction)] - - model = self._get_pipeline(n_estimators=fidelity["n_estimators"], **configuration) - model.fit(X=self.x_train[train_idx], y=self.y_train[train_idx]) - - train_loss = 1 - self.accuracy_scorer(model, self.x_train[train_idx], self.y_train[train_idx]) - val_loss = 1 - self.accuracy_scorer(model, self.x_valid, self.y_valid) - cost = time.time() - start - - return {'function_value': float(val_loss), - 'cost': cost, - 'info': {'train_loss': float(train_loss), - 'fidelity': fidelity} - } - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - """ - Trains a XGBoost model with a given configuration on both the train - and validation data set and evaluates the model on the test data set. - - Parameters - ---------- - configuration : Dict, CS.Configuration - Configuration for the XGBoost Model - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - shuffle : bool - If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. - Defaults to ``False``. - rng : np.random.RandomState, int, None, - Random seed for benchmark. 
By default the class level random seed. - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - kwargs - - Returns - ------- - Dict - - function_value : test loss - cost : time to train and evaluate the model - info : Dict - fidelity : used fidelities in this evaluation - """ - default_dataset_fraction = self.get_fidelity_space().get_hyperparameter('dataset_fraction').default_value - if fidelity['dataset_fraction'] != default_dataset_fraction: - raise NotImplementedError(f'Test error can not be computed for dataset_fraction <= ' - f'{default_dataset_fraction}') - - self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) - - if shuffle: - self.shuffle_data(self.rng) - - start = time.time() - - # Impute potential nan values with the feature- - data = np.concatenate((self.x_train, self.x_valid)) - targets = np.concatenate((self.y_train, self.y_valid)) - - model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=data, y=targets) - - test_loss = 1 - self.accuracy_scorer(model, self.x_test, self.y_test) - cost = time.time() - start - - return {'function_value': float(test_loss), - 'cost': cost, - 'info': {'fidelity': fidelity}} - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all parameters for - the XGBoost Model - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - cs.add_hyperparameters([ - CS.UniformFloatHyperparameter('eta', lower=2**-10, upper=1., default_value=0.3, log=True), - CS.UniformIntegerHyperparameter('max_depth', lower=1, upper=15, default_value=6, log=False), - CS.UniformFloatHyperparameter('min_child_weight', lower=1., upper=2**7., default_value=1., log=True), - CS.UniformFloatHyperparameter('colsample_bytree', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('colsample_bylevel', lower=0.01, upper=1., default_value=1.), - CS.UniformFloatHyperparameter('reg_lambda', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('reg_alpha', lower=2**-10, upper=2**10, default_value=1, log=True), - CS.UniformFloatHyperparameter('subsample_per_it', lower=0.1, upper=1, default_value=1, log=False) - ]) - - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the XGBoost Benchmark - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformFloatHyperparameter("dataset_fraction", lower=0.0, upper=1.0, default_value=1.0, log=False), - CS.UniformIntegerHyperparameter("n_estimators", lower=1, upper=256, default_value=256, log=False) - ]) - - return fidel_space - - def get_meta_information(self) -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'XGBoost', - 'references': 
['@article{probst2019tunability,' - 'title={Tunability: Importance of hyperparameters of machine learning algorithms.},' - 'author={Probst, Philipp and Boulesteix, Anne-Laure and Bischl, Bernd},' - 'journal={J. Mach. Learn. Res.},' - 'volume={20},' - 'number={53},' - 'pages={1--32},' - 'year={2019}' - '}'], - 'code': 'https://github.com/automl/HPOlib1.5/blob/development/hpolib/benchmarks/ml/' - 'xgboost_benchmark_old.py', - 'shape of train data': self.x_train.shape, - 'shape of test data': self.x_test.shape, - 'shape of valid data': self.x_valid.shape, - 'initial random seed': self.rng, - 'task_id': self.task_id - } - - def _get_pipeline(self, max_depth: int, eta: float, min_child_weight: int, - colsample_bytree: float, colsample_bylevel: float, reg_lambda: int, reg_alpha: int, - n_estimators: int, subsample_per_it: float) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier( - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it)) - ]) - return clf - - -class XGBoostExtendedBenchmark(XGBoostBenchmark): - """ - Similar to XGBoostBenchmark but enables also the optimization of the used booster. - """ - - def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, - rng: Union[np.random.RandomState, int, None] = None): - super(XGBoostExtendedBenchmark, self).__init__(task_id=task_id, n_threads=n_threads, rng=rng) - - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - cs = XGBoostBenchmark.get_configuration_space(seed) - hp_booster = CS.CategoricalHyperparameter('booster', choices=['gbtree', 'gblinear', 'dart'], - default_value='gbtree') - cs.add_hyperparameter(hp_booster) - - # XGBoost with 'gblinear' can not use some - # parameters. Exclude them from the configuration space by introducing a condition. - hps = ['colsample_bylevel', 'colsample_bytree', 'max_depth', 'min_child_weight', 'subsample_per_it'] - - # The NotEqualsCondition means: "Make parameter X active if hp_booster is not equal to gblinear." 
- conditions = [CS.NotEqualsCondition(cs.get_hyperparameter(hp), hp_booster, 'gblinear') for hp in hps] - cs.add_conditions(conditions) - return cs - - # noinspection PyMethodOverriding - # pylint: disable=arguments-differ - def _get_pipeline(self, n_estimators: int, booster: str, reg_lambda: int, reg_alpha: int, eta: float, - min_child_weight: int = None, max_depth: int = None, colsample_bytree: float = None, - colsample_bylevel: float = None, subsample_per_it: float = None) \ - -> pipeline.Pipeline: - """ Create the scikit-learn (training-)pipeline """ - objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - - configuration = dict(booster=booster, - max_depth=max_depth, - learning_rate=eta, - min_child_weight=min_child_weight, - colsample_bytree=colsample_bytree, - colsample_bylevel=colsample_bylevel, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - n_estimators=n_estimators, - objective=objective, - n_jobs=self.n_threads, - random_state=self.rng.randint(1, 100000), - num_class=self.num_class, - subsample=subsample_per_it) - - configuration = {k: v for k, v in configuration.items() if v is not None} - - clf = pipeline.Pipeline([ - ('preprocess_impute', - ColumnTransformer([ - ("categorical", "passthrough", self.categorical_data), - ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), - ('preprocess_one_hot', - ColumnTransformer([ - ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), - ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', - xgb.XGBClassifier(**configuration)) - ]) - return clf diff --git a/hpobench/benchmarks/ml/yahpo_benchmark.py b/hpobench/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..d06d23fc --- /dev/null +++ b/hpobench/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,317 @@ +""" +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: 1) Install Conda +=============================== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +Prerequisites: 2) Install R +=========================== + +Install R (4.0.5 - IMPORTANT!) and the required dependencies: # works also with higher R versions(?) 
+ +``` bash +Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")' + +# Install OpenML dependencies +Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' \ + +# Install rbv2 dependencies +Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("glmnet", version = "2.0-16", upgrade = "never", repos = "http://cran.r-project.o")' \ +&& Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-projt.org")' \ +&& Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never")' \ +&& Rscript -e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \ +&& Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never")' +``` +Prerequisites: 3) Install rpy2 +============================== +Installing the connector between R and python might be a little bit tricky. +Official installation guide: https://rpy2.github.io/doc/latest/html/introduction.html + +We received in some cases the error: "/opt/R/4.0.5/lib/R/library/methods/libs/methods.so: undefined symbol". +To solve this error, we had to execute the following command: +``` +export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH} +``` + +1. Download data: +================= +Normally, the data will be downloaded automatically. + +If you want to download the data on your own, you can download the data with the following command: + +``` bash +git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git +``` + +Later, you have to give yahpo the link to the data. + +```python +from yahpo_gym import local_config +local_config.init_config() +local_config.set_data_path("path-to-data") +``` + +The data consist of surrogates for different data sets. Each surrogate is a compressed ONNX neural network. + + +2. 
Install HPOBench: +==================== +``` +git clone HPOBench +cd /path/to/HPOBench +pip install .[yahpo_gym_raw] +``` + +Changelog: +========== +0.0.1: +* First implementation +""" # noqa: E501 + +import logging +from pathlib import Path +from typing import Union, Dict, List + +import pandas as pd +import ConfigSpace as CS +import numpy as np +import rpy2.robjects as robjects +from rpy2.robjects.packages import importr +from yahpo_gym.benchmark_set import BenchmarkSet + +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPO-Raw') + + +class YAHPOGymMORawBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None, + data_dir: Union[Path, str, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the learner. Must be one of [ + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_xgboost", "rbv2_svm", "rbv2_aknn", "rbv2_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost" + ] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + rng : np.random.RandomState, int, None + """ + + assert scenario.startswith('rbv2_') or scenario.startswith('iaml_'), \ + 'Currently, we only support the experiments with rbv2_ and iaml from yahpo. ' \ + f'The scenario has to start with either rbv2_ or iaml_, but was {scenario}' + + from hpobench.util.data_manager import YAHPODataManager + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMORawBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # Cast python dict to R list: + parameters = {**configuration, **fidelity} + r_list = YAHPOGymMORawBenchmark._cast_dict_to_rlist(parameters) + + # Call the random bot evaluation method + if self.scenario.startswith('rbv2_'): + + # Establish a connection to the R package + rbv2pkg = importr('rbv2') + + learner = self.scenario.replace('rbv2_', 'classif.') + r_out = rbv2pkg.eval_config( + learner=learner, task_id=int(configuration['task_id']), configuration=r_list + ) + # Extract the run data frame via replications and cast the R list (result) back to a python dictionary + result_r_df = r_out[0][0][0][4] + result_dict = YAHPOGymMORawBenchmark._cast_to_dict(result_r_df) + result_df = pd.DataFrame(result_dict) + result = result_df.mean(axis=0) + result = result.to_dict() + time_cols = [col for col in result_df.columns if 'time' in col] + times = {col: result_df.loc[:, col].sum() for col in time_cols} + 
result.update(times) + + elif self.scenario.startswith('iaml_'): + + iaml = importr('iaml') + out = iaml.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + elif self.scenario.startswith('fair_'): + + fair_pkg = importr('fair') + out = fair_pkg.eval_yahpo(scenario=robjects.StrVector([self.scenario]), configuration=r_list) + result = YAHPOGymMORawBenchmark._cast_to_dict(out) + + else: + raise NotImplementedError() + + objectives = {target: value for target, value in result.items() if target in self.benchset.config.y_names} + additional = {target: value for target, value in result.items() if target not in self.benchset.config.y_names} + + return { + 'function_value': objectives, + 'cost': result['timetrain'], + 'info': {'fidelity': fidelity, 'additional_info': additional} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity Benchmark ' + ' for Hyperparameter Optimization},', + 'author={Florian Pfisterer and Lennart Schneider and Julia Moosbauer ' + ' and Martin Binder and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year={2021}}'], + 'code': ['https://github.com/pfistfl/yahpo_gym/yahpo_gym', + 'https://github.com/pfistfl/rbv2/', + 'https://github.com/sumny/iaml', + 'https://github.com/sumny/fair'] + } + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def _cast_dict_to_rlist(py_dict): + """ Convert a python dictionary to a RPy2 ListVector""" + pairs = [f'{key} = {value}' if not isinstance(value, str) else f'{key} = \"{value}\"' + for key, value in py_dict.items()] + pairs = ",".join(pairs) + str_list = f"list({pairs})" + r_list = robjects.r(str_list) + return r_list + + @staticmethod + def _cast_to_dict(r_list_object) -> Dict: + """ + Convert an RPy2 ListVector to a Python dict. + Source: https://ogeek.cn/qa/?qa=815151/ + """ + result = {} + for i, name in enumerate(r_list_object.names): + if isinstance(r_list_object[i], robjects.ListVector): + result[name] = YAHPOGymMORawBenchmark._cast_to_dict(r_list_object[i]) + elif len(r_list_object[i]) == 1: + result[name] = r_list_object[i][0] + else: + result[name] = r_list_object[i] + return result + + +class YAHPOGymRawBenchmark(AbstractBenchmark): + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + https://slds-lmu.github.io/yahpo_gym/scenarios.html#instances + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. 
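A hedged usage sketch for this single-objective wrapper, showing the `objective=None` behaviour described for the parameter above (the first entry of `y_names` is picked) and where the full multi-objective result is kept. The scenario and instance strings are placeholders, and running this requires the R/rpy2 stack described at the top of the file:

```python
# Illustration only: pick a valid scenario/instance via self.benchset.instances.
from hpobench.benchmarks.ml.yahpo_benchmark import YAHPOGymRawBenchmark

bench = YAHPOGymRawBenchmark(scenario="iaml_rpart", instance="40981")   # objective=None
config = bench.get_configuration_space(seed=0).sample_configuration()

result = bench.objective_function(config)
print(result['function_value'])        # value of the first objective in y_names
print(result['cost'])                  # 'timetrain' reported by the MO backbone
print(result['info']['objectives'])    # the complete multi-objective dict is preserved
```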
+ rng : np.random.RandomState, int, None + """ + self.backbone = YAHPOGymMORawBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + super(YAHPOGymRawBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, + 'additional_info': mo_results['info']['additional_info'], + 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMORawBenchmark.get_meta_information() diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py index a12e8a70..30631cae 100644 --- a/hpobench/benchmarks/mo/adult_benchmark.py +++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -1,6 +1,8 @@ """ Changelog: ========== +0.0.2: +* Change the objective value from accuracy to misclassification rate. (1 - accuracy) 0.0.1: * First implementation of the Multi-Objective Fair Adult Benchmark. @@ -127,7 +129,7 @@ def get_meta_information() -> Dict: @staticmethod def get_objective_names() -> List[str]: """Get a list of objectives evaluated in the objective_function. 
""" - return ['accuracy', 'DSP', 'DEO', 'DFP'] + return ['misclassification_rate', 'DSP', 'DEO', 'DFP'] @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], @@ -165,7 +167,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - validation metrics after training on train - accuracy: float + misclassification_rate: float: 1 - validation accuracy DSO: float DEO: float DFP: float @@ -247,7 +249,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - ts_start - return {'function_value': {'accuracy': float(val_accuracy), + return {'function_value': {'misclassification_rate': 1 - float(val_accuracy), 'DSO': float(val_statistical_disparity), 'DEO': float(val_unequal_opportunity), 'DFP': float(val_unequalized_odds) @@ -310,7 +312,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - test metrics reported after training on (train+valid) - accuracy: float + misclassification_rate: float: 1 - test accuracy DSO: float DEO: float DFP: float @@ -381,7 +383,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") - return {'function_value': {'accuracy': float(test_accuracy), + return {'function_value': {'misclassification_rate': 1 - float(test_accuracy), 'DSO': float(test_statistical_disparity), 'DEO': float(test_unequal_opportunity), 'DFP': float(test_unequalized_odds) diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py index d8bfd939..516b459a 100644 --- a/hpobench/benchmarks/mo/cnn_benchmark.py +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -1,6 +1,9 @@ """ Changelog: ========== +0.0.2: +* Rename the returned function value + 'negative_accuracy' -> 'misclassification_rate' 0.0.1: * First implementation of the Multi-Objective CNN Benchmark. 
@@ -22,7 +25,7 @@ from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import CNNDataManager -__version__ = '0.0.1' +__version__ = '0.0.2' logger = logging.getLogger('MO_CNN') @@ -284,7 +287,7 @@ def get_meta_information() -> Dict: @staticmethod def get_objective_names() -> List[str]: """Get the names of the objectives reported in the objective function.""" - return ['accuracy', 'model_size'] + return ['misclassification_rate', 'model_size'] def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: """ @@ -361,7 +364,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - negative_accuracy: float + misclassification_rate: float 1 - validation accuracy log_model_size: float log10 of the number of parameters @@ -435,7 +438,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - time_in - return {'function_value': {'negative_accuracy': 1 - val_accuracy, + return {'function_value': {'misclassification_rate': 1 - val_accuracy, 'log_model_size': float(np.log10(num_params))}, 'cost': float(training_runtime), 'info': {'train_accuracy': train_accuracy, @@ -479,7 +482,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ------- Dict - function_value : Dict - negative_accuracy: float + misclassification_rate: float 1 - test accuracy log_model_size: float log10 of the number of parameters @@ -546,7 +549,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], elapsed_time = time.time() - time_in - return {'function_value': {'negative_accuracy': 1 - test_accuracy, + return {'function_value': {'misclassification_rate': 1 - test_accuracy, 'log_model_size': float(np.log10(num_params))}, 'cost': training_runtime, 'info': {'train_accuracy': train_accuracy, diff --git a/hpobench/benchmarks/nas/nasbench_101.py b/hpobench/benchmarks/nas/nasbench_101.py index f7ee1b20..c0f80737 100644 --- a/hpobench/benchmarks/nas/nasbench_101.py +++ b/hpobench/benchmarks/nas/nasbench_101.py @@ -42,6 +42,11 @@ Changelog: ========== +0.0.5 +* ADD Multi Objective version. Introduce objectives: + - misclassification_rate (0, 1) - lower is better + - trainable_parameters (0, 10**8) - lower is better + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. 
Works with HPOBench >= v0.0.8 @@ -61,23 +66,22 @@ """ import logging - from pathlib import Path -from typing import Union, Dict, Any, Tuple, List +from typing import Union, Dict, Any, Tuple, List, Type import ConfigSpace as CS import numpy as np -from tabular_benchmarks.nas_cifar10 import NASCifar10 from nasbench import api from nasbench.api import OutOfDomainError from nasbench.lib import graph_util +from tabular_benchmarks.nas_cifar10 import NASCifar10 -from hpobench import config_file import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench import config_file +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_101DataManager -__version__ = '0.0.4' +__version__ = '0.0.5' logger = logging.getLogger('NasBench101') MAX_EDGES = 9 @@ -85,17 +89,19 @@ DEFAULT_API_FILE = config_file.data_dir / "nasbench_101" -class NASCifar10BaseBenchmark(AbstractBenchmark): - def __init__(self, benchmark: NASCifar10, +class _NAS101BaseBenchmark: + def __init__(self, + benchmark_type: Type[NASCifar10], data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): + rng: Union[np.random.RandomState, int, None] = None, + **kwargs): """ Baseclass for the tabular benchmarks https://github.com/automl/nas_benchmarks/tree/master/tabular_benchmarks. Please install the benchmark first. Place the data under ``data_path``. Parameters ---------- - benchmark : NASCifar10 + benchmark_type : Type[NASCifar10] Type of the benchmark to use. Don't call this class directly. Instantiate via subclasses (see below). data_path : str, Path, None Path to the folder, which contains the downloaded file nasbench_full.tfrecord. 
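Across the multi-objective benchmarks touched in this diff (Adult, CNN, and the NAS-101 MO variants announced in the 0.0.5 changelog above), accuracy-style objectives are replaced by `misclassification_rate`, so every entry of `function_value` is minimised. A small self-contained sketch of what that uniform direction buys a generic optimizer; the numbers are made up:

```python
# With all objectives "lower is better", Pareto logic needs no per-objective sign flips.
def dominates(a: dict, b: dict) -> bool:
    """True if a is at least as good as b everywhere and strictly better somewhere."""
    return all(a[k] <= b[k] for k in a) and any(a[k] < b[k] for k in a)

candidate = {'misclassification_rate': 0.18, 'trainable_parameters': 8_555_530}
other     = {'misclassification_rate': 0.21, 'trainable_parameters': 9_102_874}
print(dominates(candidate, other))   # True
```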
@@ -103,21 +109,76 @@ def __init__(self, benchmark: NASCifar10, Random seed for the benchmarks """ - super(NASCifar10BaseBenchmark, self).__init__(rng=rng) - - self.benchmark = benchmark + data_path = self._try_download_api_file(data_path) self.data_path = data_path + self.rng = rng + self.benchmark: NASCifar10 = benchmark_type(data_dir=str(data_path), multi_fidelity=True) + super(_NAS101BaseBenchmark, self).__init__(rng=rng, **kwargs) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: raise NotImplementedError - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + raise NotImplementedError + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', + 'references': ['@article{klein2019tabular,' + 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' + 'author = {Klein, Aaron and Hutter, Frank},' + 'journal = {arXiv preprint arXiv:1905.04970},' + 'year = {2019}}', + 'https://arxiv.org/abs/1905.04970', + ], + 'code': 'https://github.com/automl/nas_benchmarks', + } + + @staticmethod + def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ Helper function to pass a seed to the configuration space """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = benchmark.get_configuration_space() + cs.seed(seed) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 101. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) + ]) + + return fidel_space + + @staticmethod + def _try_download_api_file(save_to: Union[Path, str, None]): + data_manager = NASBench_101DataManager(save_to) + data_manager.download() + return data_manager.save_dir + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS-benchmark using a given configuration and a epoch (=budget). 
@@ -144,7 +205,12 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : validation error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation @@ -176,6 +242,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_accuracies = [] training_times = [] additional = {} + failure = False for run_id in run_index: data = self._query_benchmark(config=configuration, budget=fidelity['budget'], run_index=run_id) @@ -186,25 +253,31 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], training_times.append(data['training_time']) # Since those information are the same for all run ids, just store one of them. - additional = {'trainable_parameters': data['trainable_parameters'], + # Also, if the configuration is invalid, set the number of parameters to its upper limit. + trainable_parameters = data['trainable_parameters'] + failure = trainable_parameters == 0 + trainable_parameters = 10**8 if trainable_parameters == 0 else trainable_parameters + + additional = {'trainable_parameters': trainable_parameters, 'module_operations': data['module_operations']} - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, 'valid_accuracies': valid_accuracies, 'test_accuracies': test_accuracies, 'training_times': training_times, + 'failure': 1 if failure else 0, 'data': additional } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def _mo_objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Validate a configuration on the maximum available budget. 
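The failure handling added above is worth spelling out: an invalid cell reports zero trainable parameters, which would otherwise look like the best possible second objective, so it is pushed to the upper bound of 10**8 and flagged in `info`. A compact stand-alone sketch of the aggregation over `run_index`, with made-up per-seed numbers:

```python
import numpy as np

# Per-seed results as returned by _query_benchmark (illustrative values).
valid_accuracies = [0.941, 0.938, 0.944]    # averaged over the run indices
training_times = [870.2, 865.9, 872.4]      # summed, not averaged
trainable_parameters = 8_555_530            # a value of 0 would signal an invalid cell

failure = trainable_parameters == 0
if failure:
    trainable_parameters = 10**8            # invalid cells get the worst possible size

result = {
    'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)),
                       'trainable_parameters': trainable_parameters},
    'cost': float(np.sum(training_times)),
    'info': {'failure': int(failure)},
}
print(result)
```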
@@ -222,83 +295,29 @@ def objective_function_test(self, configuration: Union[Dict, CS.Configuration], Returns ------- Dict - - function_value : test error + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + cost : runtime info : Dict fidelity : used fidelities in this evaluation """ - result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng + ) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError - - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'Tabular Benchmarks for Hyperparameter Optimization and Neural Architecture Search', - 'references': ['@article{klein2019tabular,' - 'title = {Tabular benchmarks for joint architecture and hyperparameter optimization},' - 'author = {Klein, Aaron and Hutter, Frank},' - 'journal = {arXiv preprint arXiv:1905.04970},' - 'year = {2019}}', - 'https://arxiv.org/abs/1905.04970', - ], - 'code': 'https://github.com/automl/nas_benchmarks', - } - - @staticmethod - def _get_configuration_space(benchmark: Any, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ Helper function to pass a seed to the configuration space """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = benchmark.get_configuration_space() - cs.seed(seed) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 101. 
- - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.OrdinalHyperparameter('budget', sequence=[4, 12, 36, 108], default_value=108) - ]) - - return fidel_space - - @staticmethod - def _try_download_api_file(save_to: Union[Path, str, None]): - data_manager = NASBench_101DataManager(save_to) - data_manager.download() - return data_manager.save_dir - - -class NASCifar10ABenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) +class _QueryA(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10A - benchmark = NASCifar10A(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10ABenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryA, self).__init__(benchmark_type=NASCifar10A) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -315,7 +334,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10A - return NASCifar10BBenchmark._get_configuration_space(NASCifar10A, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10A, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -372,15 +391,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10BBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryB(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10B - benchmark = NASCifar10B(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10BBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryB, self).__init__(benchmark_type=NASCifar10B, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -397,9 +411,10 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10B - return NASCifar10BBenchmark._get_configuration_space(NASCifar10B, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10B, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: + """ Copied from the 'objective_function' from nas_cifar10.py We adapted the file in such a way, that the complete result is returned. The original implementation returns @@ -408,6 +423,8 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D Parameters ---------- config : Dict + run_index : int + Specifies the seed to use. Can be one of 0, 1, 2. budget : int The number of epochs. Must be one of: 4 12 36 108. Otherwise a accuracy of 0 is returned. 
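The `_QueryA`/`_QueryB`/`_QueryC` classes above only inject the concrete `NASCifar10` type; the shared base resolves the data path, builds the instance, and forwards the remaining keyword arguments up the MRO, which is what later allows the same query classes to be combined with the single- and multi-objective bases. A self-contained toy version of the pattern, using dummy classes rather than the real `tabular_benchmarks` ones:

```python
from typing import Type

class DummyNASCifar10:
    """Stand-in for tabular_benchmarks.nas_cifar10.NASCifar10 (illustration only)."""
    def __init__(self, data_dir: str, multi_fidelity: bool = True):
        self.data_dir, self.multi_fidelity = data_dir, multi_fidelity

class _Base:
    def __init__(self, benchmark_type: Type[DummyNASCifar10], data_path: str = '/tmp/nb101', **kwargs):
        # the real base additionally triggers the API-file download before this point
        self.benchmark = benchmark_type(data_dir=data_path, multi_fidelity=True)
        super().__init__(**kwargs)

class _QueryA(_Base):
    def __init__(self, **kwargs):
        super().__init__(benchmark_type=DummyNASCifar10, **kwargs)

print(_QueryA(data_path='/data/nasbench_101').benchmark.data_dir)   # /data/nasbench_101
```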
@@ -415,6 +432,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D ------- Dict """ + failure = {"test_accuracy": 0, "train_accuracy": 0, "validation_accuracy": 0, "training_time": 0, "info": "failure", "trainable_parameters": 0, "module_operations": 0} @@ -439,6 +457,7 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D labeling = [config["op_node_%d" % i] for i in range(5)] labeling = ['input'] + list(labeling) + ['output'] model_spec = api.ModelSpec(matrix, labeling) + try: data = modified_query(self.benchmark, run_index=run_index, model_spec=model_spec, epochs=budget) except api.OutOfDomainError: @@ -453,15 +472,10 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data -class NASCifar10CBenchmark(NASCifar10BaseBenchmark): - def __init__(self, data_path: Union[Path, str, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - - data_path = self._try_download_api_file(data_path) - +class _QueryC(_NAS101BaseBenchmark): + def __init__(self, **kwargs): from tabular_benchmarks.nas_cifar10 import NASCifar10C - benchmark = NASCifar10C(data_dir=str(data_path), multi_fidelity=True) - super(NASCifar10CBenchmark, self).__init__(benchmark=benchmark, data_path=data_path, rng=rng, **kwargs) + super(_QueryC, self).__init__(benchmark_type=NASCifar10C, **kwargs) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -478,7 +492,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp """ from tabular_benchmarks.nas_cifar10 import NASCifar10C - return NASCifar10BBenchmark._get_configuration_space(NASCifar10C, seed) + return _NAS101BaseBenchmark._get_configuration_space(NASCifar10C, seed) def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> Dict: """ @@ -538,6 +552,221 @@ def _query_benchmark(self, config: Dict, run_index: int, budget: int = 108) -> D return data +class _NASCifar10BaseMOBenchmark(_NAS101BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. 
+ If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + return self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + trainable_parameters: int [0, 10**8] (lower is better) + Number of trainable parameters in the network + + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + + return self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class _NASCifar10BaseSOBenchmark(_NAS101BaseBenchmark, AbstractBenchmark): + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS-benchmark using a given configuration and a epoch (=budget). + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : + misclassification_rate: float [0,1] (lower is better) + 1-accuracy on validation set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, run_index=run_index, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : + equals misclassification_rate: float [0,1] (lower is better) + 1-accuracy on test set + cost : runtime + info : Dict + fidelity : used fidelities in this evaluation + """ + result_dict = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + # swap function_value dict to value + result_dict['function_value'] = result_dict['function_value']['misclassification_rate'] + return result_dict + + +class NASCifar10ABenchmark(_QueryA, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10ABenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10AMOBenchmark(_QueryA, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10AMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BBenchmark(_QueryB, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10BMOBenchmark(_QueryB, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10BMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CBenchmark(_QueryC, _NASCifar10BaseSOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + +class NASCifar10CMOBenchmark(_QueryC, _NASCifar10BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(NASCifar10CMOBenchmark, self).__init__(data_path=data_path, rng=rng, **kwargs) + + def 
modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfway=False): """ NOTE: @@ -607,3 +836,11 @@ def modified_query(benchmark, model_spec, run_index: int, epochs=108, stop_halfw benchmark.dataset.total_epochs_spent += epochs return data + + +__all__ = ["NASCifar10ABenchmark", + "NASCifar10AMOBenchmark", + "NASCifar10BBenchmark", + "NASCifar10BMOBenchmark", + "NASCifar10CBenchmark", + "NASCifar10CMOBenchmark"] diff --git a/hpobench/benchmarks/nas/nasbench_1shot1.py b/hpobench/benchmarks/nas/nasbench_1shot1.py index 4d8231a0..5d94631e 100644 --- a/hpobench/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/benchmarks/nas/nasbench_1shot1.py @@ -34,7 +34,7 @@ pip install .[nasbench_1shot1] pip install git+https://github.com/google-research/nasbench.git@master -git clone https://github.com/automl/nasbench-1shot1/tree/master/nasbench_analysis/ +git clone https://github.com/automl/nasbench-1shot1 3. Environment setup ==================== @@ -46,6 +46,9 @@ Changelog: ========== +0.0.5 +* Add MO Version + 0.0.4 * New container release due to a general change in the communication between container and HPOBench. Works with HPOBench >= v0.0.8 @@ -62,34 +65,33 @@ """ import logging - +from ast import literal_eval from pathlib import Path from typing import Union, Dict, Any, Tuple, List -from ast import literal_eval import ConfigSpace as CS import numpy as np from nasbench import api from nasbench.api import OutOfDomainError - -from hpobench.abstract_benchmark import AbstractBenchmark -from hpobench.util.data_manager import NASBench_101DataManager -from hpobench.util import rng_helper - from nasbench_analysis.search_spaces.search_space_1 import SearchSpace1 # noqa from nasbench_analysis.search_spaces.search_space_2 import SearchSpace2 # noqa from nasbench_analysis.search_spaces.search_space_3 import SearchSpace3 # noqa from nasbench_analysis.utils import INPUT, OUTPUT, CONV1X1, CONV3X3, MAXPOOL3X3 # noqa -__version__ = '0.0.4' +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark +from hpobench.util import rng_helper +from hpobench.util.data_manager import NASBench_101DataManager + +__version__ = '0.0.5' logger = logging.getLogger('NasBench1shot1') -class NASBench1shot1BaseBenchmark(AbstractBenchmark): +class _NASBench1shot1BaseBenchmark: + def __init__(self, data_path: Union[Path, str, None] = None, rng: Union[np.random.RandomState, int, None] = None): """ - Baseclass for the nasbench 1shot1 benchmarks. + Baseclass for the all nasbench 1shot1 benchmarks. Please install the benchmark first. Place the data under ``data_path``. 
Parameters @@ -99,18 +101,18 @@ def __init__(self, data_path: Union[Path, str, None] = None, rng : np.random.RandomState, int, None Random seed for the benchmarks """ - super(NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + data_manager = NASBench_101DataManager(data_path) self.api = data_manager.load() self.search_space = None - - # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - run_index: Union[int, Tuple, List, None] = (0, 1, 2), - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + self.rng = rng + super(_NASBench1shot1BaseBenchmark, self).__init__(rng=rng) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). Only data for the budgets 4, 12, 36, 108 are available. @@ -171,7 +173,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'module_operations': data['module_operations']} failure = failure or ('info' in data and data['info'] == 'failure') - return {'function_value': float(1 - np.mean(valid_accuracies)), + return {'function_value': {'misclassification_rate': float(1 - np.mean(valid_accuracies)), + 'trainable_parameters': additional['trainable_parameters']}, 'cost': float(np.sum(training_times)), 'info': {'fidelity': fidelity, 'train_accuracies': train_accuracies, @@ -179,50 +182,24 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'test_accuracies': test_accuracies, 'training_times': training_times, 'data': additional, - 'failure': 'False' if not failure else 'True' + 'failure': 0 if not failure else 1 } } - @AbstractBenchmark.check_parameters - def objective_function_test(self, configuration: Union[Dict, CS.Configuration], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: - """ - Validate a configuration on the maximum available budget (108) and on all three seeds. - - Parameters - ---------- - configuration : Dict, CS.Configuration - fidelity: Dict, None - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. - rng : np.random.RandomState, int, None - Random seed to use in the benchmark. To prevent overfitting on a single seed, it is - possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this - function. If this parameter is not given, the default random state is used. - kwargs + def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: - Returns - ------- - Dict - - function_value : test error on largest fidelity. - cost : runtime - info : Dict - train_accuracies - test_accuracies - valid_accuracies - training_times - fidelity : used fidelities in this evaluation - data : additional data such as trainable parameters and used operations - """ assert fidelity['budget'] == 108, 'Only test data for the 108th epoch is available.' 
- result = self.objective_function(configuration=configuration, fidelity=fidelity, run_index=(0, 1, 2), rng=rng) - result['function_value'] = float(1 - np.mean(result['info']['test_accuracies'])) + result = self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=(0, 1, 2), rng=rng, **kwargs) + result['function_value']['misclassification_rate'] = float(1 - np.mean(result['info']['test_accuracies'])) return result @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - raise NotImplementedError + raise NotImplementedError() @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -264,7 +241,6 @@ def get_meta_information() -> Dict: } def _check_run_index(self, run_index): - if isinstance(run_index, int): assert 0 <= run_index <= 2, f'run_index must be in [0, 2], not {run_index}' run_index = (run_index, ) @@ -426,7 +402,223 @@ def _get_configuration_space(search_space: Any, seed: Union[int, None] = None) - return cs -class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseBenchmark): +class NASBench1shot1BaseMOBenchmark(_NASBench1shot1BaseBenchmark, AbstractMultiObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, + run_index=run_index, rng=rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. 
+ + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, + rng=rng, **kwargs) + + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'trainable_parameters'] + + +class NASBench1shot1BaseSOBenchmark(_NASBench1shot1BaseBenchmark, AbstractSingleObjectiveBenchmark): + + # pylint: disable=arguments-differ + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + run_index: Union[int, Tuple, List, None] = (0, 1, 2), + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Query the NAS1shot1-benchmark using a given configuration and an epoch (=budget). + Only data for the budgets 4, 12, 36, 108 are available. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + run_index : int, Tuple, None + The nas benchmark has for each configuration-budget-pair results from 3 different runs. + - If multiple `run_id`s are given as Tuple/List, the benchmark returns the mean over the given runs. + - By default (no parameter is specified) all runs are used. A specific run can be chosen by setting the + `run_id` to a value from [0, 3]. While the performance is averaged across the `run_index`, the costs are + the sum of the runtime per `run_index`. + - When this value is explicitly set to `None`, the function will use a random seed. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : validation error + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + result = self._mo_objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, run_index=run_index, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + @AbstractSingleObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Validate a configuration on the maximum available budget (108) and on all three seeds. + + Parameters + ---------- + configuration : Dict, CS.Configuration + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. To prevent overfitting on a single seed, it is + possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this + function. If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : test error on largest fidelity. + cost : runtime + info : Dict + train_accuracies + test_accuracies + valid_accuracies + training_times + fidelity : used fidelities in this evaluation + data : additional data such as trainable parameters and used operations + """ + + result = self._mo_objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + result['info'].update(result['function_value']) + result['function_value'] = result['function_value']['misclassification_rate'] + return result + + +class NASBench1shot1SearchSpace1MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace1() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed) + + +class NASBench1shot1SearchSpace2MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace2() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed) + + +class NASBench1shot1SearchSpace3MOBenchmark(NASBench1shot1BaseMOBenchmark): + def __init__(self, data_path: Union[Path, str, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(data_path=data_path, rng=rng) + self.search_space = SearchSpace3() + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), 
seed)
+
+
+class NASBench1shot1SearchSpace1Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace1Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -434,10 +626,10 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace1(), seed)
 
 
-class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseBenchmark):
+class NASBench1shot1SearchSpace2Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace2Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -445,10 +637,10 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace2(), seed)
 
 
-class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseBenchmark):
+class NASBench1shot1SearchSpace3Benchmark(NASBench1shot1BaseSOBenchmark):
     def __init__(self, data_path: Union[Path, str, None] = None,
                  rng: Union[np.random.RandomState, int, None] = None):
         super(NASBench1shot1SearchSpace3Benchmark, self).__init__(data_path=data_path, rng=rng)
@@ -456,4 +648,14 @@ def __init__(self, data_path: Union[Path, str, None] = None,
 
     @staticmethod
     def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace:
-        return NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed)
+        return _NASBench1shot1BaseBenchmark._get_configuration_space(SearchSpace3(), seed)
+
+
+__all__ = [
+    "NASBench1shot1SearchSpace1Benchmark",
+    "NASBench1shot1SearchSpace2Benchmark",
+    "NASBench1shot1SearchSpace3Benchmark",
+    "NASBench1shot1SearchSpace1MOBenchmark",
+    "NASBench1shot1SearchSpace2MOBenchmark",
+    "NASBench1shot1SearchSpace3MOBenchmark",
+]
diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py
index 0c2324c2..1ca0beb3 100644
--- a/hpobench/benchmarks/nas/nasbench_201.py
+++ b/hpobench/benchmarks/nas/nasbench_201.py
@@ -30,6 +30,8 @@
 0.0.6
 * Add the multiobjective version of this benchmark by returning flops, model size, latency and missclassification rate
 * Integrate #138: Improve the docstrings about the seeds.
+* Scale the returned misclassification rate from range [0, 100] to [0, 1].
+* Improve naming in the result object ("*_precision" -> "*_misclassification_rate")
 0.0.5
 * Add for each benchmark a new one with a different fidelity space.
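The preceding hunks add single- and multi-objective variants of the NAS-Bench-1shot1 benchmarks and export them via `__all__`. As a usage illustration (not part of the patch itself), the sketch below shows how one of the new multi-objective classes might be queried; it assumes the NAS-Bench-101 data file and the `nasbench`/`nasbench-1shot1` dependencies mentioned in the file header are installed, and the result keys follow the `_mo_objective_function` return dict in this patch.

```python
# Minimal sketch: query a multi-objective 1shot1 benchmark and read both objectives.
from hpobench.benchmarks.nas.nasbench_1shot1 import NASBench1shot1SearchSpace1MOBenchmark

benchmark = NASBench1shot1SearchSpace1MOBenchmark(rng=0)
config = benchmark.get_configuration_space(seed=0).sample_configuration()

# Budgets 4, 12, 36 and 108 are available; 108 is the default (max) fidelity.
result = benchmark.objective_function(configuration=config, fidelity={'budget': 108})

# 'function_value' is now a dict with one entry per objective,
# cf. get_objective_names() -> ['misclassification_rate', 'trainable_parameters'].
print(result['function_value']['misclassification_rate'],
      result['function_value']['trainable_parameters'],
      result['cost'])
```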
@@ -51,25 +53,23 @@ * First implementation """ import logging -from typing import Union, Dict, List, Text, Tuple from copy import deepcopy +from typing import Union, Dict, List, Text, Tuple import ConfigSpace as CS import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark - +from hpobench.abstract_benchmark import AbstractSingleObjectiveBenchmark, AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import NASBench_201Data - __version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseMOBenchmark(AbstractMultiObjectiveBenchmark): +class _NasBench201BaseBenchmark: def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -153,13 +153,12 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. """ # noqa: E501 - super(NasBench201BaseMOBenchmark, self).__init__(rng=rng) - data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = _NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + super(_NasBench201BaseBenchmark, self).__init__(rng=rng, **kwargs) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -167,76 +166,115 @@ def dataset_mapping(self, dataset): 'cifar100': ('ori-test', 'x-test')} return mapping[dataset] - # pylint: disable=arguments-differ - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[Dict, CS.Configuration, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - data_seed: Union[List, Tuple, int, None] = (777, 888, 999), - **kwargs) -> Dict: + @staticmethod + def config_to_structure_func(max_nodes: int): + """ + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] """ - Objective function for the NASBench201 benchmark. - This functions sends a query to NASBench201 and evaluates the configuration. - As already explained in the class definition, different data sets are trained on different splits. - The table above gives a detailed summary over the available splits, epochs, and which identifier are used per - dataset. + def config_to_structure(config): + genotypes = [] + for i in range(1, max_nodes): + x_list = [] + for j in range(i): + node_str = f'{i}<-{j}' + op_name = config[node_str] + x_list.append((op_name, j)) + genotypes.append(tuple(x_list)) + return _NasBench201BaseMOBenchmark._Structure(genotypes) + + return config_to_structure + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] Parameters ---------- - configuration - fidelity: Dict, None - epoch: int - Values: [1, 200] - Number of epochs an architecture was trained. - Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + seed : int, None + Random seed for the configuration space. - Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. 
- rng : np.random.RandomState, int, None - Random seed to use in the benchmark. + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) - To prevent overfitting on a single seed, it is possible to pass a - parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. - If this parameter is not given, the default random state is used. - data_seed : List, Tuple, None, int - The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. - The user can specify which seed to use. If more than one seed is given, the results are averaged - across the seeds but then the training time is the sum of the costs per seed. - When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + search_space = _NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') + hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] + cs.add_hyperparameters(hps) + return cs - Note: - For some architectures (configurations) no run was available. We've set missing values to an - available value from another seed. Therefore, it is possible that run results are exactly the same for - different seeds. + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. - kwargs + Fidelities + ---------- + epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace Returns ------- - Dict - - function_value : Dict - misclassification_rate : float - 1 - validation accuracy - num_flops : float - Number of floating point operations in M - model_size : float - Model size in MB - latency : float - Time to evaluate a configuration in seconds - cost : time to train the network - info : Dict - train_precision : float - train_losses : float - train_cost : float - Time needed to train the network for 'epoch' many epochs. If more than one seed is given, - this field is the sum of the training time per network - eval_precision : float - eval_losses : float - eval_cost : float - Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the - evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network - fidelity : Dict - used fidelities in this evaluation + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) + ]) + return fidel_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', + 'references': ['@article{dong2020bench,' + 'title = {Nas-bench-201: Extending the scope of reproducible neural ' + ' architecture search},' + 'author = {Dong, Xuanyi and Yang, Yi},' + 'journal = {arXiv preprint arXiv:2001.00326},' + 'year = {2020}}', + 'https://openreview.net/forum?id=HJxyZkBKDr', + ], + 'code': 'https://github.com/D-X-Y/AutoDL-Projects', + } + + @staticmethod + def get_search_spaces(xtype: str, name: str) -> List[Text]: + """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] """ + # pylint: disable=no-else-return + if xtype == 'cell': + NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] + SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} + assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) + return SearchSpaceNames[name] + else: + raise ValueError('invalid search-space type is {:}'.format(xtype)) + + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + self.rng = rng_helper.get_rng(rng) if isinstance(data_seed, (List, Tuple)): @@ -245,7 +283,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], logger.debug('There are some values more than once in the run_index. We remove the redundant entries.') data_seed = tuple(set(data_seed)) elif isinstance(data_seed, int): - data_seed = (data_seed, ) + data_seed = (data_seed,) elif data_seed is None: logger.debug('The data seed is explicitly set to None! A random seed will be selected.') data_seed = tuple(self.rng.choice((777, 888, 999), size=1)) @@ -254,7 +292,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], raise ValueError(f'data seed has unknown data type {type(data_seed)}, ' f'but should be tuple or int (777,888,999)') - assert len(set(data_seed) - {777, 888, 999}) == 0,\ + assert len(set(data_seed) - {777, 888, 999}) == 0, \ f'data seed can only contain the elements 777, 888, 999, but was {data_seed}' structure = self.config_to_structure(configuration) @@ -291,44 +329,112 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], return { 'function_value': { - 'misclassification_rate': float(100 - np.mean(valid_accuracies)), + # The original benchmark returned the accuracy with range [0, 100]. + # We cast it to a minimization problem with range [0-1] to have a more standardized return value. 
+ 'misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)), 'num_flops': float(np.mean(num_flops)), 'model_size': float(np.mean(model_size)), 'latency': float(np.mean(latency)), }, 'cost': float(np.sum(valid_times) + np.sum(train_times)), 'info': { - 'train_precision': float(100 - np.mean(train_accuracies)), + 'train_misclassification_rate': 0.01 * float(100 - np.mean(train_accuracies)), 'train_losses': float(np.mean(train_losses)), 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), + 'valid_misclassification_rate': 0.01 * float(100 - np.mean(valid_accuracies)), 'valid_losses': float(np.mean(valid_losses)), 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), + 'test_misclassification_rate': 0.01 * float(100 - np.mean(test_accuracies)), 'test_losses': float(np.mean(test_losses)), 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), 'fidelity': fidelity } } + def _mo_objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + # The result dict should contain already all necessary information -> Just swap the function value from valid + # to test and the corresponding time cost + assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' + + if 'data_seed' in kwargs: + all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) + if not all_seeds_available: + logger.warning('You have not specified all available seeds for the ' + '`objective_function_test`. However, we are going to ignore them, ' + ' because we report test values only as mean across all seeds.' 
+ f' Your given seeds: {kwargs["seed"]}') + del kwargs['data_seed'] + + result = self._mo_objective_function(configuration=configuration, fidelity=fidelity, + data_seed=(777, 888, 999), + rng=rng, **kwargs) + result['function_value']['misclassification_rate'] = result['info']['test_misclassification_rate'] + result['cost'] = result['info']['test_cost'] + return result + + class _Structure: + def __init__(self, genotype): + assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype)) + self.node_num = len(genotype) + 1 + self.nodes = [] + self.node_N = [] + for idx, node_info in enumerate(genotype): + assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info)) + assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) + for node_in in node_info: + assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in)) + assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) + self.node_N.append(len(node_info)) + self.nodes.append(tuple(deepcopy(node_info))) + + def tostr(self): + """ Helper function: Create a string representation of the configuration """ + strings = [] + for node_info in self.nodes: + string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) + string = '|{:}|'.format(string) + strings.append(string) + return '+'.join(strings) + + def __repr__(self): + return ( + '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), + **self.__dict__)) + + def __len__(self): + return len(self.nodes) + 1 + + def __getitem__(self, index): + return self.nodes[index] + + +class _NasBench201BaseMOBenchmark(_NasBench201BaseBenchmark, AbstractMultiObjectiveBenchmark): + # pylint: disable=arguments-differ @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: """ - Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). - The test function uses all data set seeds (777, 888, 999). + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. - See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. Parameters ---------- configuration fidelity: Dict, None - epoch: int - Values: [200] + epoch: int - Values: [1, 200] Number of epochs an architecture was trained. - Note: We only have test performance on the last epoch. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. 
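The surrounding hunks document the multi-objective NAS-Bench-201 `objective_function` (epoch fidelity, averaging over the stored seeds 777, 888, 999). As an illustration, here is a hedged sketch of one plausible call; it assumes the per-dataset NAS-Bench-201 data files are available locally and uses `Cifar10ValidNasBench201MOBenchmark`, which is defined later in this patch.

```python
# Minimal sketch: multi-objective NAS-Bench-201 query on the cifar10-valid split.
from hpobench.benchmarks.nas.nasbench_201 import Cifar10ValidNasBench201MOBenchmark

benchmark = Cifar10ValidNasBench201MOBenchmark(rng=1)
config = benchmark.get_configuration_space(seed=1).sample_configuration()

# Evaluate after 200 epochs, averaged over the three stored seeds (777, 888, 999).
result = benchmark.objective_function(configuration=config,
                                      fidelity={'epoch': 200},
                                      data_seed=(777, 888, 999))

# The four objectives returned by this benchmark, cf. get_objective_names().
for name in ('misclassification_rate', 'num_flops', 'model_size', 'latency'):
    print(name, result['function_value'][name])
```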
@@ -336,6 +442,16 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], To prevent overfitting on a single seed, it is possible to pass a parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. kwargs @@ -344,292 +460,110 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Dict - function_value : Dict misclassification_rate : float - 1 - test accuracy + 1 - validation accuracy num_flops : float Number of floating point operations in M model_size : float Model size in MB latency : float Time to evaluate a configuration in seconds - cost : time to the network + time to validate + cost : time to train the network info : Dict - train_precision - train_losses - train_cost - eval_precision - eval_losses - eval_cost - fidelity : used fidelities in this evaluation + train_misclassification_rate : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_misclassification_rate : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation """ + return self._mo_objective_function(configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, + **kwargs) - # The result dict should contain already all necessary information -> Just swap the function value from valid - # to test and the corresponding time cost - assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' - - if 'data_seed' in kwargs: - all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) - if not all_seeds_available: - logger.warning('You have not specified all available seeds for the ' - '`objective_function_test`. However, we are going to ignore them, ' - ' because we report test values only as mean across all seeds.' 
- f' Your given seeds: {kwargs["seed"]}') - del kwargs['data_seed'] - - result = self.objective_function(configuration=configuration, fidelity=fidelity, - data_seed=(777, 888, 999), - rng=rng, **kwargs) - result['function_value']['misclassification_rate'] = result['info']['test_precision'] - result['cost'] = result['info']['test_cost'] - return result - - @staticmethod - def config_to_structure_func(max_nodes: int): - """ - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - """ - def config_to_structure(config): - genotypes = [] - for i in range(1, max_nodes): - x_list = [] - for j in range(i): - node_str = f'{i}<-{j}' - op_name = config[node_str] - x_list.append((op_name, j)) - genotypes.append(tuple(x_list)) - return NasBench201BaseMOBenchmark._Structure(genotypes) - return config_to_structure - - @staticmethod - def get_search_spaces(xtype: str, name: str) -> List[Text]: - """ obtain the search space, i.e., a dict mapping the operation name into a python-function for this op - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: """ - # pylint: disable=no-else-return - if xtype == 'cell': - NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] - SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} - assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) - return SearchSpaceNames[name] - else: - raise ValueError('invalid search-space type is {:}'.format(xtype)) + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Return the CS representation of the search space. - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` Parameters ---------- - seed : int, None - Random seed for the configuration space. - - Returns - ------- - CS.ConfigurationSpace - - Containing the benchmark's hyperparameter - """ - seed = seed if seed is not None else np.random.randint(1, 100000) - cs = CS.ConfigurationSpace(seed=seed) - - search_space = NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') - hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] - cs.add_hyperparameters(hps) - return cs - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 201. + configuration + fidelity: Dict, None + epoch: int - Values: [200] + Number of epochs an architecture was trained. + Note: We only have test performance on the last epoch. + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. 
- Fidelities: - - epoch: int - The loss / accuracy at `epoch`. Can be from 0 to 199. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace + kwargs Returns ------- - ConfigSpace.ConfigurationSpace + Dict - + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds + cost : time to the network + time to validate + info : Dict + train_misclassification_rate + train_losses + train_cost + eval_misclassification_rate + eval_losses + eval_cost + fidelity : used fidelities in this evaluation """ - seed = seed if seed is not None else np.random.randint(1, 100000) - fidel_space = CS.ConfigurationSpace(seed=seed) - - fidel_space.add_hyperparameters([ - CS.UniformIntegerHyperparameter('epoch', lower=1, upper=200, default_value=200) - ]) - - return fidel_space + return self._mo_objective_function_test(configuration=configuration, fidelity=fidelity, rng=rng, **kwargs) @staticmethod def get_objective_names() -> List[str]: return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return {'name': 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', - 'references': ['@article{dong2020bench,' - 'title = {Nas-bench-201: Extending the scope of reproducible neural ' - ' architecture search},' - 'author = {Dong, Xuanyi and Yang, Yi},' - 'journal = {arXiv preprint arXiv:2001.00326},' - 'year = {2020}}', - 'https://openreview.net/forum?id=HJxyZkBKDr', - ], - 'code': 'https://github.com/D-X-Y/AutoDL-Projects', - } - class _Structure: - def __init__(self, genotype): - assert isinstance(genotype, (list, tuple)), 'invalid class of genotype : {:}'.format(type(genotype)) - self.node_num = len(genotype) + 1 - self.nodes = [] - self.node_N = [] - for idx, node_info in enumerate(genotype): - assert isinstance(node_info, (list, tuple)), 'invalid class of node_info : {:}'.format(type(node_info)) - assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) - for node_in in node_info: - assert isinstance(node_in, (list, tuple)), 'invalid class of in-node : {:}'.format(type(node_in)) - assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) - self.node_N.append(len(node_info)) - self.nodes.append(tuple(deepcopy(node_info))) - - def tostr(self): - """ Helper function: Create a string representation of the configuration """ - strings = [] - for node_info in self.nodes: - string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) - string = '|{:}|'.format(string) - strings.append(string) - return '+'.join(strings) - - def __repr__(self): - return ( - '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), - **self.__dict__)) - - def __len__(self): - return len(self.nodes) + 1 - - def __getitem__(self, index): - return self.nodes[index] - - -class Cifar10ValidNasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class Cifar10ValidNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, 
None] = None, **kwargs): super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class Cifar100NasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201MOBenchmark(NasBench201BaseMOBenchmark): +class ImageNetNasBench201MOBenchmark(_NasBench201BaseMOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class NasBench201SOBenchmark(AbstractBenchmark): - def __init__(self, dataset: str, - rng: Union[np.random.RandomState, int, None] = None, **kwargs): - """ - Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains - results for architectures on 4 different data sets. - - We have split the "api" file from NASBench201 in separate files per data set. - The original "api" file contains all data sets, but loading this single file took too much RAM. - - We recommend to not call this base class directly but using the correct subclass below. - - The parameter ``dataset`` indicates which data set was used for training. - - For each data set the metrics - 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. - However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). - - We summarize all information about the data sets in the following tables. - - Datastet Metric Avail.Epochs Explanation returned by HPOBENCH - ---------------------------------------------------------------------------------------- - cifar10-valid train [0-199] training set - cifar10-valid x-valid [0-199] validation set objective function - cifar10-valid x-test - cifar10-valid ori-test 199 test set objective function test - - cifar100 train [0-199] training set - cifar100 x-valid 199 validation set - cifar100 x-test 199 test set objective function test - cifar100 ori-test [0-199] validation + test set objective function - - ImageNet16-120 train [0-199] training set - ImageNet16-120 x-valid 199 validation set - ImageNet16-120 x-test 199 test set objective function test - ImageNet16-120 ori-test [0-199] validation + test set objective function - - - We have also extracted the incumbents per split. We report the incumbent accuracy and loss performance - i) by taking the maximum value across all seeds and configurations - ii) averaged across the three available seeds - - i) The best possible incumbents (NO AVG!) 
ii) The "average" incumbent - Datastet Metric (Index of Arch, Accuracy) (Index, Loss) (Index of Arch, Accuracy) (Index, Loss) - ---------------------------------------------------------------------------------------------------------------------------------------------------------- - cifar10-valid train (258, 100.0) (2778, 0.001179278278425336) (10154, 100) (2778, 0.0013082386429297428) - cifar10-valid x-valid (6111, 91.71999999023437) (14443, 0.3837750501537323) (6111, 91.60666665039064) (3888, 0.3894046771335602) - cifar10-valid x-test - cifar10-valid ori-test (14174, 91.65) (3385, 0.3850496160507202) (1459, 91.52333333333333) (3385, 0.3995230517864227) - - cifar100 train (9930, 99.948) (9930, 0.012630240231156348) (9930, 99.93733333333334) (9930, 0.012843489621082942) - cifar100 x-valid (13714, 73.71999998779297) (13934, 1.1490126512527465) (9930, 73.4933333577474) (7361, 1.1600867895126343) - cifar100 x-test (1459, 74.28000004882813) (15383, 1.1427113876342774) (9930, 73.51333332112631) (7337, 1.1747569534301758) - cifar100 ori-test (9930, 73.88) (13706, 1.1610547459602356) (9930, 73.50333333333333) (7361, 1.1696554500579834) - - ImageNet16-120 train (9930, 73.2524719841793) (9930, 0.9490517352046979) (9930, 73.22918040138735) (9930, 0.9524298415108582) - ImageNet16-120 x-valid (13778, 47.39999985758463) (10721, 2.0826991437276203) (10676, 46.73333327229818) (10721, 2.0915397168795264) - ImageNet16-120 x-test (857, 48.03333317057292) (12887, 2.0940088628133138) (857, 47.31111100599501) (11882, 2.106453532218933) - ImageNet16-120 ori-test (857, 47.083333353678384) (11882, 2.0950548852284747) (857, 46.8444444647895) (11882, 2.1028235816955565) - - - Note: - - The parameter epoch is 0 indexed! - - In the original data, the training splits are always marked with the key 'train' but they use different - identifiers to refer to the available evaluation splits. We report them also in the table below. - - We exclude the data set cifar10 from this benchmark. - - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched - missing values with the values from an available seed. - - Some further remarks: - - cifar10-valid is trained on the train split and tested on the validation split. - - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. - The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. - Also, each data set reports values for all 200 epochs for a metric on the specified split - and a single value on the 200th epoch for the other splits. - - Parameters - ---------- - dataset : str - One of cifar10-valid, cifar10, cifar100, ImageNet16-120. - rng : np.random.RandomState, int, None - Random seed for the benchmark's random state. 
- """ # noqa: E501 - - super(NasBench201SOBenchmark, self).__init__(rng=rng, **kwargs) - self.mo_benchmark = NasBench201BaseMOBenchmark(rng=rng, dataset=dataset, **kwargs) +class _NasBench201SOBenchmark(_NasBench201BaseBenchmark, AbstractSingleObjectiveBenchmark): # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -674,15 +608,15 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : training misclassification_rate cost : time to train the network info : Dict - train_precision : float + train_misclassification_rate : float train_losses : float train_cost : float Time needed to train the network for 'epoch' many epochs. If more than one seed is given, this field is the sum of the training time per network - eval_precision : float + eval_misclassification_rate : float eval_losses : float eval_cost : float Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the @@ -690,14 +624,13 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity : Dict used fidelities in this evaluation """ - results = self.mo_benchmark.objective_function( + results = self._mo_objective_function( configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs ) - results['function_value'] = results['function_value']['misclassification_rate'] return results - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -729,90 +662,44 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : evaluation misclassification_rate cost : time to the network + time to validate info : Dict - train_precision + train_misclassification_rate train_losses train_cost - eval_precision + eval_misclassification_rate eval_losses eval_cost fidelity : used fidelities in this evaluation """ - results = self.mo_benchmark.objective_function_test( + results = self._mo_objective_function_test( configuration=configuration, fidelity=fidelity, rng=rng, **kwargs ) - results['function_value'] = results['function_value']['misclassification_rate'] return results - @staticmethod - def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Return the CS representation of the search space. - From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py - Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] - - Parameters - ---------- - seed : int, None - Random seed for the configuration space. - - Returns - ------- - CS.ConfigurationSpace - - Containing the benchmark's hyperparameter - """ - return NasBench201BaseMOBenchmark.get_configuration_space(seed=seed) - - @staticmethod - def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: - """ - Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for - the NAS Benchmark 201. - - Fidelities: - - epoch: int - The loss / accuracy at `epoch`. 
Can be from 0 to 199. - - Parameters - ---------- - seed : int, None - Fixing the seed for the ConfigSpace.ConfigurationSpace - - Returns - ------- - ConfigSpace.ConfigurationSpace - """ - return NasBench201BaseMOBenchmark.get_fidelity_space(seed=seed) - - @staticmethod - def get_meta_information() -> Dict: - """ Returns the meta information for the benchmark """ - return NasBench201BaseMOBenchmark.get_meta_information() - -class Cifar10ValidNasBench201Benchmark(NasBench201SOBenchmark): +class Cifar10ValidNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201SOBenchmark): +class Cifar100NasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201SOBenchmark): +class ImageNetNasBench201Benchmark(_NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201SOBenchmark): +class _NasBench201SOBenchmarkOriginal(_NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -825,7 +712,8 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: experiments from DEHB [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples) - Fidelities: + Fidelities + ---------- epoch: int The loss / accuracy at `epoch`. @@ -851,26 +739,26 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201SOBenchmark.get_meta_information() + meta_information = _NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' 
\ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' return meta_information -class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar10ValidNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class Cifar100NasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201BenchmarkOriginal, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201BenchmarkOriginal(_NasBench201BaseBenchmarkOriginal): +class ImageNetNasBench201BenchmarkOriginal(_NasBench201SOBenchmarkOriginal): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) diff --git a/hpobench/benchmarks/nas/tabular_benchmarks.py b/hpobench/benchmarks/nas/tabular_benchmarks.py index fd7404a0..5db34f2f 100644 --- a/hpobench/benchmarks/nas/tabular_benchmarks.py +++ b/hpobench/benchmarks/nas/tabular_benchmarks.py @@ -50,7 +50,6 @@ * First implementation """ import logging - from pathlib import Path from typing import Union, Dict, Tuple, List diff --git a/hpobench/benchmarks/rl/cartpole.py b/hpobench/benchmarks/rl/cartpole.py index 3bcaeab4..ea9ef053 100644 --- a/hpobench/benchmarks/rl/cartpole.py +++ b/hpobench/benchmarks/rl/cartpole.py @@ -20,12 +20,13 @@ """ import logging +import os import time from typing import Union, Dict import ConfigSpace as CS import numpy as np -import os + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" import tensorflow as tf # noqa: E402 diff --git a/hpobench/benchmarks/surrogates/paramnet_benchmark.py b/hpobench/benchmarks/surrogates/paramnet_benchmark.py index 2e809b7b..35c7f80d 100644 --- a/hpobench/benchmarks/surrogates/paramnet_benchmark.py +++ b/hpobench/benchmarks/surrogates/paramnet_benchmark.py @@ -61,8 +61,8 @@ 0.0.1: * First implementation """ -import warnings import logging +import warnings from typing import Union, Dict import ConfigSpace as CS diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py index 19522700..ad552acd 100644 --- a/hpobench/benchmarks/surrogates/yahpo_gym.py +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -29,54 +29,104 @@ Changelog: ========== +0.0.2: + +* Add support for multi-objective benchmarks +* Add support for fairness benchmarks and interpretability benchmarks. +For these new benchmarks (fairness and interpretability), we recommend the following benchmarks and objectives: +For the entire list of available benchmarks, please take a look in the yahpo benchmark documentation. 
+ +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +fair_fgrrm | 7592 | mmce, feo + | 14965 | mmce, feo +--------------------|---------------|-------------- +fair_rpart | 317599 | mmce, ffomr + | 7592 | mmce, feo +--------------------|---------------|-------------- +fair_ranger | 317599 | mmce, fpredp + | 14965 | mmce, fpredp +--------------------|---------------|-------------- +fair_xgboost | 317599 | mmce, ffomr + | 7592 | mmce, ffnr +--------------------|---------------|-------------- +fair_super | 14965 | mmce, feo + | 317599 | mmce, ffnr +--------------------|---------------|-------------- + + +Benchmark Name | Scenario | Objectives +--------------------|---------------|-------------- +iaml_glmnet | 1489 | mmce, nf + | 40981 | mmce, nf +--------------------|---------------|-------------- +iaml_rpart | 1489 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_ranger | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_xgboost | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- +iaml_super | 40981 | mmce, nf + | 41146 | mmce, nf +--------------------|---------------|-------------- + 0.0.1: * First implementation """ -import os import logging +from pathlib import Path from typing import Union, Dict, List import ConfigSpace as CS import numpy as np - from yahpo_gym.benchmark_set import BenchmarkSet -from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractBenchmark -__version__ = '0.0.1' +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractSingleObjectiveBenchmark +from hpobench.util.data_manager import YAHPODataManager -logger = logging.getLogger('YAHPOGym') +__version__ = '0.0.2' +logger = logging.getLogger('YAHPOGym') -class YAHPOGymMOBenchmark(AbstractMultiObjectiveBenchmark): +class YAHPOGymBaseBenchmark: def __init__(self, scenario: str, instance: str, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, rng: Union[np.random.RandomState, int, None] = None): """ - For a list of available scenarios and instances see - 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Base Benchmark for all single and multi objective yahpo surrogate benchmarks. Parameters ---------- scenario : str - Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", - "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] instance : str A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True rng : np.random.RandomState, int, None """ - - # When in the containerized version, redirect to the data inside the container. 
- if 'YAHPO_CONTAINER' in os.environ: - from yahpo_gym.local_config import LocalConfiguration - local_config = LocalConfiguration() - local_config.init_config(data_path='/home/data/yahpo_data') + self.data_manager = YAHPODataManager(data_dir=data_dir) + self.data_manager.load() self.scenario = scenario self.instance = instance - self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset = BenchmarkSet(scenario, active_session=True, multithread=multi_thread) self.benchset.set_instance(instance) logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') - super(YAHPOGymMOBenchmark, self).__init__(rng=rng) + super(YAHPOGymBaseBenchmark, self).__init__(rng=rng) # pylint: disable=arguments-differ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -86,8 +136,7 @@ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.Configura def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: return self.benchset.get_fidelity_space(seed=seed) - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function(self, configuration: Union[CS.Configuration, Dict], + def _mo_objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[CS.Configuration, Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: @@ -103,17 +152,6 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], "cost": cost, 'info': {'fidelity': fidelity}} - @AbstractMultiObjectiveBenchmark.check_parameters - def objective_function_test(self, configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ - -> Dict: - return self.objective_function(configuration, fidelity=fidelity, rng=rng) - - # pylint: disable=arguments-differ - def get_objective_names(self) -> List[str]: - return self.benchset.config.y_names - @staticmethod def get_meta_information(): """ Returns the meta information for the benchmark """ @@ -130,9 +168,11 @@ def get_meta_information(): 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} -class YAHPOGymBenchmark(AbstractBenchmark): +class YAHPOGymMOBenchmark(YAHPOGymBaseBenchmark, AbstractMultiObjectiveBenchmark): def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, rng: Union[np.random.RandomState, int, None] = None): """ For a list of available scenarios and instances see @@ -140,33 +180,88 @@ def __init__(self, scenario: str, instance: str, objective: str = None, Parameters ---------- scenario : str - Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", - "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. 
Might be important on compute clusters. + Defaults to True + rng : np.random.RandomState, int, None + """ + self.objective = objective + super(YAHPOGymMOBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + return self._mo_objective_function(configuration, fidelity, rng, **kwargs) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + +class YAHPOGymBenchmark(YAHPOGymBaseBenchmark, AbstractSingleObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + data_dir: Union[Path, str, None] = None, + multi_thread: bool = True, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of + ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super", + "fair_ranger", "fair_rpart", "fair_fgrrm", "fair_xgboost", "fair_super", + "iaml_ranger", "iaml_rpart", "iaml_glmnet", "iaml_xgboost", "iaml_super"] instance : str A valid instance for the scenario. See `self.benchset.instances`. objective : str Name of the (single-crit) objective. See `self.benchset.config.y_names`. Initialized to None, picks the first element in y_names. + data_dir: Optional, str, Path + Directory, where the yahpo data is stored. + Download automatically from https://github.com/slds-lmu/yahpo_data/tree/fair + multi_thread: bool + Flag to run ONNX runtime with a single thread. Might be important on compute clusters. + Defaults to True rng : np.random.RandomState, int, None """ - - self.backbone = YAHPOGymMOBenchmark(scenario=scenario, instance=instance, rng=rng) self.objective = objective + super(YAHPOGymBenchmark, self).__init__(scenario=scenario, instance=instance, rng=rng, data_dir=data_dir, multi_thread=multi_thread) - super(YAHPOGymBenchmark, self).__init__(rng=rng) - - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: - mo_results = self.backbone.objective_function(configuration=configuration, - fidelity=fidelity, - **kwargs) + mo_results = self._mo_objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) # If not objective is set, we just grab the first returned entry. 
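To make the new YAHPO constructor arguments concrete, a hedged sketch follows (the `fair_fgrrm` benchmark on instance 7592 is taken from the fairness table in the changelog above; the call pattern mirrors the code in this patch but the snippet itself is illustrative only):

```python
# Hedged sketch, not part of the patch.
from hpobench.benchmarks.surrogates.yahpo_gym import YAHPOGymMOBenchmark

bench = YAHPOGymMOBenchmark(scenario='fair_fgrrm', instance='7592', multi_thread=False)
config = bench.get_configuration_space(seed=0).sample_configuration()
result = bench.objective_function(configuration=config)
print(bench.get_objective_names())   # e.g. ['mmce', 'feo', ...]
print(result['function_value'])      # dict mapping objective name -> value
```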
if self.objective is None: - self.objective = self.backbone.benchset.config.y_names[0] + self.objective = self.benchset.config.y_names[0] obj_value = mo_results['function_value'][self.objective] @@ -174,20 +269,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], "cost": mo_results['cost'], 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} - @AbstractBenchmark.check_parameters + @AbstractSingleObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: return self.objective_function(configuration, fidelity=fidelity, rng=rng) - - # pylint: disable=arguments-differ - def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return self.backbone.get_configuration_space(seed=seed) - - # pylint: disable=arguments-differ - def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: - return self.backbone.get_fidelity_space(seed=seed) - - @staticmethod - def get_meta_information() -> Dict: - return YAHPOGymMOBenchmark.get_meta_information() diff --git a/hpobench/config.py b/hpobench/config.py index 9d7964e0..cd46c6e5 100644 --- a/hpobench/config.py +++ b/hpobench/config.py @@ -64,7 +64,16 @@ def __init__(self): # Options for the singularity container self.socket_dir = Path(self.socket_dir).expanduser().absolute() - self.container_dir = self.cache_dir / f'hpobench-{os.getuid()}' + + # os.getuid is only for posix os. Make it compatible with windows + # https://stackoverflow.com/questions/842059/is-there-a-portable-way-to-get-the-current-username-in-python + if os.name == 'nt': + import getpass + user_name = getpass.getuser() + else: + user_name = os.getuid() + + self.container_dir = self.cache_dir / f'hpobench-{user_name}' self.container_source = 'oras://gitlab.tf.uni-freiburg.de:5050/muelleph/hpobench-registry' self.pyro_connect_max_wait = 400 diff --git a/hpobench/container/benchmarks/ml/__init__.py b/hpobench/container/benchmarks/ml/__init__.py index ed2ce40f..f342f5f8 100644 --- a/hpobench/container/benchmarks/ml/__init__.py +++ b/hpobench/container/benchmarks/ml/__init__.py @@ -6,7 +6,7 @@ from hpobench.container.benchmarks.ml.svm_benchmark import SVMBenchmark, SVMBenchmarkBB, SVMBenchmarkMF from hpobench.container.benchmarks.ml.tabular_benchmark import TabularBenchmark from hpobench.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark, XGBoostBenchmarkBB, XGBoostBenchmarkMF - +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymRawBenchmark, YAHPOGymMORawBenchmark __all__ = ['HistGBBenchmark', 'HistGBBenchmarkBB', 'HistGBBenchmarkMF', 'LRBenchmark', 'LRBenchmarkBB', 'LRBenchmarkMF', @@ -14,4 +14,5 @@ 'RandomForestBenchmark', 'RandomForestBenchmarkBB', 'RandomForestBenchmarkMF', 'SVMBenchmark', 'SVMBenchmarkBB', 'SVMBenchmarkMF', 'TabularBenchmark', - 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', + 'YAHPOGymRawBenchmark', 'YAHPOGymMORawBenchmark'] diff --git a/hpobench/container/benchmarks/ml/lr_benchmark.py b/hpobench/container/benchmarks/ml/lr_benchmark.py index 979cda3e..61b80a13 100644 --- a/hpobench/container/benchmarks/ml/lr_benchmark.py +++ b/hpobench/container/benchmarks/ml/lr_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import 
AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class LRBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmark, self).__init__(**kwargs) class LRBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkBB, self).__init__(**kwargs) class LRBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LRBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(LRBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/nn_benchmark.py b/hpobench/container/benchmarks/ml/nn_benchmark.py index 04955e82..d4b0f52a 100644 --- a/hpobench/container/benchmarks/ml/nn_benchmark.py +++ b/hpobench/container/benchmarks/ml/nn_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class NNBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmark, self).__init__(**kwargs) class NNBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkBB, self).__init__(**kwargs) class NNBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NNBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(NNBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/rf_benchmark.py b/hpobench/container/benchmarks/ml/rf_benchmark.py index a414349d..13e9bb47 100644 --- a/hpobench/container/benchmarks/ml/rf_benchmark.py +++ b/hpobench/container/benchmarks/ml/rf_benchmark.py @@ -6,27 +6,31 @@ from 
hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class RandomForestBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmark, self).__init__(**kwargs) class RandomForestBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkBB, self).__init__(**kwargs) class RandomForestBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'RandomForestBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(RandomForestBenchmarkMF, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/svm_benchmark.py b/hpobench/container/benchmarks/ml/svm_benchmark.py index 7547a81a..7a20f40b 100644 --- a/hpobench/container/benchmarks/ml/svm_benchmark.py +++ b/hpobench/container/benchmarks/ml/svm_benchmark.py @@ -6,27 +6,31 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class SVMBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmark, self).__init__(**kwargs) class SVMBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkMF, self).__init__(**kwargs) class SVMBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SVMBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(SVMBenchmarkBB, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/svm_benchmark_old.py b/hpobench/container/benchmarks/ml/svm_benchmark_old.py 
deleted file mode 100644 index 4955f057..00000000 --- a/hpobench/container/benchmarks/ml/svm_benchmark_old.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class SupportVectorMachine(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') - kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(SupportVectorMachine, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/tabular_benchmark.py b/hpobench/container/benchmarks/ml/tabular_benchmark.py index 6d19953b..5c8a22ef 100644 --- a/hpobench/container/benchmarks/ml/tabular_benchmark.py +++ b/hpobench/container/benchmarks/ml/tabular_benchmark.py @@ -6,11 +6,15 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_tabular_benchmarks" +container_version = "0.0.4" + + class TabularBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'TabularBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_tabular_benchmarks') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(TabularBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark.py b/hpobench/container/benchmarks/ml/xgboost_benchmark.py index c82ea606..726d6f45 100644 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark.py +++ b/hpobench/container/benchmarks/ml/xgboost_benchmark.py @@ -6,36 +6,42 @@ from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +container_name = "ml_mmfb" +container_version = "0.0.4" + + class XGBoostBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmark, self).__init__(**kwargs) class XGBoostBenchmarkBB(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkBB') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkBB, self).__init__(**kwargs) class XGBoostBenchmarkMF(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmarkMF') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostBenchmarkMF, 
self).__init__(**kwargs) class XGBoostSearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostSearchSpace3Benchmark') - kwargs['container_name'] = kwargs.get('container_name', 'ml_mmfb') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['container_name'] = kwargs.get('container_name', container_name) + kwargs['latest'] = kwargs.get('container_tag', container_version) super(XGBoostSearchSpace3Benchmark, self).__init__(**kwargs) -__all__ = ['XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF'] +__all__ = [ + 'XGBoostBenchmark', 'XGBoostBenchmarkBB', 'XGBoostBenchmarkMF', 'XGBoostSearchSpace3Benchmark' +] diff --git a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py b/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py deleted file mode 100644 index df475748..00000000 --- a/hpobench/container/benchmarks/ml/xgboost_benchmark_old.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -""" Benchmark for the XGBoost Benchmark from hpobench/benchmarks/ml/xgboost_benchmark """ - -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient - - -class XGBoostBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostBenchmark, self).__init__(**kwargs) - - -class XGBoostExtendedBenchmark(AbstractBenchmarkClient): - def __init__(self, task_id: int, **kwargs): - kwargs['task_id'] = task_id - kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'XGBoostExtendedBenchmark') - kwargs['container_name'] = kwargs.get('container_name', 'xgboost_benchmark') - kwargs['latest'] = kwargs.get('container_tag', '0.0.3') - super(XGBoostExtendedBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/ml/yahpo_benchmark.py b/hpobench/container/benchmarks/ml/yahpo_benchmark.py new file mode 100644 index 00000000..e4d9cf0c --- /dev/null +++ b/hpobench/container/benchmarks/ml/yahpo_benchmark.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient, \ + AbstractBenchmarkClient + + +class YAHPOGymMORawBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMORawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMORawBenchmark, self).__init__(**kwargs) + + +class YAHPOGymRawBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymRawBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_raw') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymRawBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py index dbdcaf4d..34baf1b9 100644 --- a/hpobench/container/benchmarks/mo/adult_benchmark.py +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -8,5 +8,5 @@ class AdultBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): 
kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(AdultBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py index c9a1d009..9e5cfe6f 100644 --- a/hpobench/container/benchmarks/mo/cnn_benchmark.py +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -8,7 +8,7 @@ class FlowerCNNBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') kwargs['gpu'] = kwargs.get('gpu', True) super(FlowerCNNBenchmark, self).__init__(**kwargs) @@ -17,6 +17,6 @@ class FashionCNNBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') kwargs['gpu'] = kwargs.get('gpu', True) super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_101.py b/hpobench/container/benchmarks/nas/nasbench_101.py index 7984d786..a47e96a2 100644 --- a/hpobench/container/benchmarks/nas/nasbench_101.py +++ b/hpobench/container/benchmarks/nas/nasbench_101.py @@ -3,14 +3,14 @@ """ Benchmark for the Tabular Benchmark from hpobench/benchmarks/nas/nasbench_101.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASCifar10ABenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10ABenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10ABenchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASCifar10BBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10BBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10BBenchmark, self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASCifar10CBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASCifar10CBenchmark, self).__init__(**kwargs) + + +class NASCifar10AMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10AMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = 
kwargs.get('container_tag', '0.0.5') + super(NASCifar10AMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10BMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10BMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10BMOBenchmark, self).__init__(**kwargs) + + +class NASCifar10CMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASCifar10CMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_101') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASCifar10CMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_1shot1.py b/hpobench/container/benchmarks/nas/nasbench_1shot1.py index a88dcf9a..bef0bf16 100644 --- a/hpobench/container/benchmarks/nas/nasbench_1shot1.py +++ b/hpobench/container/benchmarks/nas/nasbench_1shot1.py @@ -3,14 +3,14 @@ """ Benchmark for the nasbench 1shot1 benchmarks from hpobench/benchmarks/nas/nasbench_1shot1.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient class NASBench1shot1SearchSpace1Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace1Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class NASBench1shot1SearchSpace2Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace2Benchmark, self).__init__(**kwargs) @@ -26,5 +26,29 @@ class NASBench1shot1SearchSpace3Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace3Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') - kwargs['latest'] = kwargs.get('container_tag', '0.0.4') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') super(NASBench1shot1SearchSpace3Benchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace1MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace1MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace1MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace2MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace2MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', 
'0.0.5') + super(NASBench1shot1SearchSpace2MOBenchmark, self).__init__(**kwargs) + + +class NASBench1shot1SearchSpace3MOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'NASBench1shot1SearchSpace3MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_1shot1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + super(NASBench1shot1SearchSpace3MOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 2a948c6b..83b6f488 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -10,7 +10,7 @@ class Cifar10ValidNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201Benchmark, self).__init__(**kwargs) @@ -18,7 +18,7 @@ class Cifar100NasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201Benchmark, self).__init__(**kwargs) @@ -26,7 +26,7 @@ class ImageNetNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201Benchmark, self).__init__(**kwargs) @@ -34,7 +34,7 @@ class Cifar10ValidNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar10ValidNasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -42,7 +42,7 @@ class Cifar100NasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(Cifar100NasBench201BenchmarkOriginal, self).__init__(**kwargs) @@ -50,7 +50,7 @@ class ImageNetNasBench201BenchmarkOriginal(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201BenchmarkOriginal') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') - kwargs['latest'] = kwargs.get('container_tag', '0.0.5') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) diff --git 
a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py index 9774975d..64cee463 100644 --- a/hpobench/container/benchmarks/surrogates/yahpo_gym.py +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -8,7 +8,7 @@ class YAHPOGymBenchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(YAHPOGymBenchmark, self).__init__(**kwargs) @@ -16,5 +16,5 @@ class YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') - kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['latest'] = kwargs.get('container_tag', '0.0.2') super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/client_abstract_benchmark.py b/hpobench/container/client_abstract_benchmark.py index 6bbc3489..d2963c00 100644 --- a/hpobench/container/client_abstract_benchmark.py +++ b/hpobench/container/client_abstract_benchmark.py @@ -14,12 +14,12 @@ The name of the container (``container_name``) is defined either in its belonging container-benchmark definition. (hpobench/container// or via ``container_name``. """ -import os import abc -import sys import json import logging +import os import subprocess +import sys import time from pathlib import Path from typing import Optional, Union, Dict, List, Tuple @@ -27,8 +27,8 @@ import ConfigSpace as CS import Pyro4 -import Pyro4.util import Pyro4.errors +import Pyro4.util import numpy as np from ConfigSpace.read_and_write import json as csjson from oslo_concurrency import lockutils diff --git a/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark new file mode 100644 index 00000000..e79dab4b --- /dev/null +++ b/hpobench/container/recipes/ml/Singularity.YahpoRawBenchmark @@ -0,0 +1,82 @@ +Bootstrap: docker +From: rpy2/rpy2:latest + + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is the recipe for the Raw YAHPO Benchmarks. 
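For orientation, a hedged sketch of how the new raw YAHPO container clients introduced above might be used once an image has been built from the recipe below; whether `scenario` and `instance` keyword arguments are forwarded to the containerized benchmark in exactly this way is an assumption, and the concrete values are placeholders:

```python
# Hedged sketch, not part of the patch; argument forwarding and values are assumptions.
from hpobench.container.benchmarks.ml import YAHPOGymMORawBenchmark

bench = YAHPOGymMORawBenchmark(scenario='rbv2_svm', instance='40981', container_tag='0.0.1')
config = bench.get_configuration_space(seed=0).sample_configuration()
result = bench.objective_function(configuration=config)
```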
+ + +%post + cd /home + + ####################### INSTALL THE R + BASE DEPENDENCIES ################# + FILE="libssl1.1_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + FILE="libssl-dev_1.1.1f-1ubuntu2_amd64.deb" + wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/${FILE} + sudo dpkg -i ${FILE} + + sudo apt-get install openssl + sudo apt-get install libcurl4-openssl-dev git + + # Instal R-Packages + cd /home \ + && Rscript -e 'install.packages("remotes", repos = "http://cran.r-project.org")' + + # Install OpenML dependencies + Rscript -e 'install.packages("curl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("openssl", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("httr", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("farff", repos = "http://cran.r-project.org")' \ + && Rscript -e 'install.packages("OpenML", repos = "http://cran.r-project.org")' + + # Install rbv2 dependencies + Rscript -e 'remotes::install_version("BBmisc", version = "1.11", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("rpart", version = "4.1-13", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("e1071", version = "1.7-0.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("xgboost", version = "0.82.1", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("ranger", version = "0.11.2", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("RcppHNSW", version = "0.1.0", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlr", version = "2.14", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_github("mlr-org/mlr3misc", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("mlrCPO", version = "0.3.6", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("testthat", version = "3.1.4", upgrade = "never", repos = "http://cran.r-project.org")' \ + && Rscript -e 'remotes::install_version("glmnet", version = "4.1-3", upgrade = "never", repos = "http://cran.r-project.org")' + # ################################ BASE DEPENDENCIES ################################ + + Rscript -e 'remotes::install_github("pfistfl/rbv2", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/iaml", upgrade = "never", dependencies = True)' \ + && Rscript -e 'remotes::install_github("sumny/fair", upgrade = "never", dependencies = True)' + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git \ + + # Upgrade pip + python3 -m pip install --upgrade pip + + # Install HPOBench + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout development \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && pip uninstall -y rpy2 \ + && pip install .[yahpo_gym_raw] + # && git checkout development \ + + # Clean Up. 
+ echo "Please don't touch the following lines" \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python3 -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py ml.yahpo_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark b/hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks similarity index 100% rename from hpobench/container/recipes/ml/Singularity.ml_tabular_benchmark rename to hpobench/container/recipes/ml/Singularity.ml_tabular_benchmarks diff --git a/hpobench/container/recipes/ml/Singularity.rbv2Benchmark b/hpobench/container/recipes/ml/Singularity.rbv2Benchmark new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark index 66ee63b1..98914ed1 100644 --- a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -3,7 +3,7 @@ From: python:3.7-slim %labels MAINTAINER pfistererf@googlemail.com -VERSION v0.0.1 +VERSION v0.0.2 %help This is a template for a Singularity recipe @@ -20,10 +20,10 @@ VERSION v0.0.1 cd /home \ && mkdir data && cd data \ - && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git\ + && git clone --depth 1 -b fair https://github.com/slds-lmu/yahpo_data.git cd /home \ - && git clone https://github.com/pfistfl/HPOBench.git \ + && git clone https://github.com/automl/HPOBench.git \ && cd HPOBench \ && echo "Please never push a recipe that checks out any other branch than development or master" \ && git checkout master \ diff --git a/hpobench/dependencies/ml/data_manager.py b/hpobench/dependencies/ml/data_manager.py index 526c6756..ebc48c95 100644 --- a/hpobench/dependencies/ml/data_manager.py +++ b/hpobench/dependencies/ml/data_manager.py @@ -1,20 +1,20 @@ -import openml -import numpy as np -import pandas as pd -from typing import Union from pathlib import Path +from typing import Union +import numpy as np +import openml +import pandas as pd +from oslo_concurrency import lockutils +from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline -from sklearn.utils import check_random_state -from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from oslo_concurrency import lockutils +from sklearn.utils import check_random_state -from hpobench.util.data_manager import DataManager from hpobench import config_file +from hpobench.util.data_manager import DataManager class OpenMLDataManager(DataManager): diff --git a/hpobench/dependencies/ml/ml_benchmark_template.py b/hpobench/dependencies/ml/ml_benchmark_template.py index 7cef515f..1ffe7b9e 100644 --- a/hpobench/dependencies/ml/ml_benchmark_template.py +++ b/hpobench/dependencies/ml/ml_benchmark_template.py @@ -33,25 +33,34 @@ class MLBenchmark(AbstractBenchmark): def __init__( self, task_id: int, - rng: Union[np.random.RandomState, int, None] = None, valid_size: float = 0.33, + rng: Union[np.random.RandomState, int, None] = None, data_path: Union[str, Path, None] = None, global_seed: int = 1 ): + """ Base template for the ML multi-fidelity 
benchmarks. + + Parameters + ---------- + task_id : int + A valid OpenML Task ID. + valid_size : float + The fraction of training set to be used as validation split. + rng : np.random.RandomState, int (optional) + The random seed that will be passed to the ML model if not explicitly passed. + data_path : str, Path (optional) + The path from where the training-validation-testing splits may be loaded. + global_seed : int + The fixed global seed that is used for creating validation splits if not available. + """ super(MLBenchmark, self).__init__(rng=rng) - if isinstance(rng, int): - self.seed = rng - else: - self.seed = self.rng.randint(1, 10**6) - self.global_seed = global_seed # used for fixed training-validation splits self.task_id = task_id self.valid_size = valid_size - self.scorers = dict() - for k, v in metrics.items(): - self.scorers[k] = make_scorer(v, **metrics_kwargs[k]) + self.scorers = metrics + self.scorer_args = metrics_kwargs if data_path is None: from hpobench import config_file @@ -59,7 +68,7 @@ def __init__( self.data_path = Path(data_path) - dm = OpenMLDataManager(task_id, valid_size, data_path, global_seed) + dm = OpenMLDataManager(self.task_id, self.valid_size, self.data_path, self.global_seed) dm.load() # Data variables @@ -77,10 +86,6 @@ def __init__( self.lower_bound_train_size = dm.lower_bound_train_size self.n_classes = dm.n_classes - # Observation and fidelity spaces - self.fidelity_space = self.get_fidelity_space(self.seed) - self.configuration_space = self.get_configuration_space(self.seed) - @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters @@ -90,33 +95,33 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Fidelity space available --- specifies the fidelity dimensions - - If fidelity_choice is 0 - Fidelity space is the maximal fidelity, akin to a black-box function - If fidelity_choice is 1 - Fidelity space is a single fidelity, in this case the number of trees (n_estimators) - If fidelity_choice is 2 - Fidelity space is a single fidelity, in this case the fraction of dataset (subsample) - If fidelity_choice is >2 - Fidelity space is multi-multi fidelity, all possible fidelities """ raise NotImplementedError() # pylint: disable=arguments-differ def get_meta_information(self): - """ Returns the meta information for the benchmark """ + """ Returns the meta information for the benchmark + """ return { 'name': 'Support Vector Machine', 'shape of train data': self.train_X.shape, 'shape of test data': self.test_X.shape, 'shape of valid data': self.valid_X.shape, - 'initial random seed': self.seed, + 'initial random seed': self.rng, 'task_id': self.task_id } - def init_model(self, config: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - rng: Union[int, np.random.RandomState, None] = None): + def get_model_size(self, model): + """ Returns a custom model size specific to the ML model, if applicable + """ + raise NotImplementedError + + def init_model( + self, + config: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[int, np.random.RandomState, None] = None + ): """ Function that returns the model initialized based on the configuration and fidelity """ raise NotImplementedError() @@ -135,18 +140,74 @@ def get_fidelity(self, size: 
Union[int, None] = None): return self.fidelity_space.sample_configuration() return [self.fidelity_space.sample_configuration() for i in range(size)] - def shuffle_data_idx(self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None) -> Iterable: + def shuffle_data_idx( + self, train_idx: Iterable = None, rng: Union[np.random.RandomState, None] = None + ) -> Iterable: rng = self.rng if rng is None else rng train_idx = self.train_idx if train_idx is None else train_idx rng.shuffle(train_idx) return train_idx - def _train_objective(self, - config: Dict, - fidelity: Dict, - shuffle: bool, - rng: Union[np.random.RandomState, int, None] = None, - evaluation: Union[str, None] = "valid"): + def _get_lc_spacing(self, max_iter, k): + """ Creates an integer sequence to record Learning Curves for every k iteration. + + Designed to include the maximum iteration. A k-spaced iteration sequence may not include + the endpoint implicitly. + """ + assert k > 0, "Spacing needs to be at >=1" + assert k < max_iter, "Spacing should be in {1, 2, ..., max_iter-1}" + spacing = np.arange(0, max_iter + 1, step=k).tolist() + spacing = spacing[1:] # eliminating 0 + if spacing[-1] != max_iter: + spacing.append(max_iter) + return spacing + + def _train_objective( + self, + config: Dict, + fidelity: Dict, + shuffle: bool, + rng: Union[np.random.RandomState, int, None] = None, + evaluation: Union[str, None] = "valid", + record_stats: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ): + """Function that instantiates a 'config' on a 'fidelity' and trains it + + The ML model is instantiated and trained on the training split. Optionally, the model is + evaluated on the training set. + + Parameters + ---------- + config : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + evaluation : str (optional) + If "valid", the ML model is trained on the training set alone. + If "test", the ML model is trained on the training + validation sets. + record_stats : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + If True, records the learning curve after every k iterations. 
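Because the endpoint handling in `_get_lc_spacing` is easy to get subtly wrong, here is a standalone sketch of the same spacing logic with a worked example (it mirrors the helper added above; the free-standing function name is only for illustration):

```python
# Standalone sketch of the learning-curve spacing logic added above.
import numpy as np

def lc_spacing(max_iter: int, k: int) -> list:
    """Every k-th iteration, always including max_iter as the last entry."""
    assert 0 < k < max_iter, "k must be in {1, ..., max_iter - 1}"
    spacing = np.arange(0, max_iter + 1, step=k).tolist()[1:]  # drop the leading 0
    if spacing[-1] != max_iter:
        spacing.append(max_iter)
    return spacing

print(lc_spacing(10, 3))  # -> [3, 6, 9, 10]
```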
+ """ + if get_learning_curve: + raise NotImplementedError( + "Need to implement partial or intermediate training to record Learning curves" + ) + learning_curves = None + lc_time = None if rng is not None: rng = get_rng(rng, self.rng) @@ -155,26 +216,32 @@ def _train_objective(self, model = self.init_model(config, fidelity, rng) # preparing data - if eval == "valid": + if evaluation == "valid": train_X = self.train_X train_y = self.train_y - train_idx = self.train_idx - else: + elif evaluation == "test": train_X = np.vstack((self.train_X, self.valid_X)) train_y = pd.concat((self.train_y, self.valid_y)) - train_idx = np.arange(len(train_X)) + else: + raise ValueError("{} not in ['valid', 'test']".format(evaluation)) + train_idx = np.arange(len(train_X)) if self.train_idx is None else self.train_idx # shuffling data if shuffle: train_idx = self.shuffle_data_idx(train_idx, rng) - train_X = train_X.iloc[train_idx] + if isinstance(train_idx, np.ndarray): + train_X = train_X[train_idx] + else: + train_X = train_X.iloc[train_idx] train_y = train_y.iloc[train_idx] # subsample here: # application of the other fidelity to the dataset that the model interfaces + # carried over from previous HPOBench code that borrowed from FABOLAS' SVM + lower_bound_lim = 1.0 / 512.0 if self.lower_bound_train_size is None: self.lower_bound_train_size = (10 * self.n_classes) / self.train_X.shape[0] - self.lower_bound_train_size = np.max((1 / 512, self.lower_bound_train_size)) + self.lower_bound_train_size = np.max((lower_bound_lim, self.lower_bound_train_size)) subsample = np.max((fidelity['subsample'], self.lower_bound_train_size)) train_idx = self.rng.choice( np.arange(len(train_X)), size=int( @@ -185,102 +252,209 @@ def _train_objective(self, start = time.time() model.fit(train_X[train_idx], train_y.iloc[train_idx]) model_fit_time = time.time() - start + # model inference + inference_time = 0.0 + # can optionally not record evaluation metrics on training set to save compute + if record_stats: + start = time.time() + pred_train = model.predict(train_X) + inference_time = time.time() - start # computing statistics on training data scores = dict() score_cost = dict() for k, v in self.scorers.items(): scores[k] = 0.0 score_cost[k] = 0.0 - if evaluation == "test": - _start = time.time() - scores[k] = v(model, train_X, train_y) - score_cost[k] = time.time() - _start + _start = time.time() + if record_stats: + scores[k] = v(train_y, pred_train, **self.scorer_args[k]) + score_cost[k] = time.time() - _start + inference_time train_loss = 1 - scores["acc"] - return model, model_fit_time, train_loss, scores, score_cost + return model, model_fit_time, train_loss, scores, score_cost, learning_curves, lc_time # pylint: disable=arguments-differ @AbstractBenchmark.check_parameters - def objective_function(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the validation set + + The ML model is trained on the training split, and evaluated on the valid and test splits. 
+ + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + Records the learning curve after every k iterations; only used when get_learning_curve is True. """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="val" - ) + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, + evaluation="valid", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k + ) + model_size = self.get_model_size(model) + + # model inference on validation set + start = time.time() + pred_val = model.predict(self.valid_X) + val_inference_time = time.time() - start val_scores = dict() val_score_cost = dict() for k, v in self.scorers.items(): + val_scores[k] = 0.0 + val_score_cost[k] = 0.0 _start = time.time() - val_scores[k] = v(model, self.valid_X, self.valid_y) - val_score_cost[k] = time.time() - _start + val_scores[k] = v(self.valid_y, pred_val, **self.scorer_args[k]) + val_score_cost[k] = time.time() - _start + val_inference_time val_loss = 1 - val_scores["acc"] + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start test_scores = dict() test_score_cost = dict() for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = time.time() - _start + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': val_loss, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, 'val_scores': val_scores, 'val_costs': val_score_cost, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, } return { - 'function_value': info['val_loss'], - 'cost': model_fit_time + info['val_costs']['acc'], + 'function_value': float(info['val_loss']), + 'cost': float(model_fit_time + info['val_costs']['acc']), 'info': info } # pylint: disable=arguments-differ
@AbstractBenchmark.check_parameters - def objective_function_test(self, - configuration: Union[CS.Configuration, Dict], - fidelity: Union[CS.Configuration, Dict, None] = None, - shuffle: bool = False, - rng: Union[np.random.RandomState, int, None] = None, - **kwargs) -> Dict: + def objective_function_test( + self, + configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, + record_train: bool = False, + get_learning_curve: bool = False, + lc_every_k: int = 1, + **kwargs + ) -> Dict: """Function that evaluates a 'config' on a 'fidelity' on the test set + + The ML model is trained on the training+valid split, and evaluated on the test split. + + Parameters + ---------- + configuration : CS.Configuration, Dict + The hyperparameter configuration. + fidelity : CS.Configuration, Dict + The fidelity configuration. + shuffle : bool (optional) + If True, shuffles the training split before fitting the ML model. + rng : np.random.RandomState, int (optional) + The random seed passed to the ML model and if applicable, used for shuffling the data + and subsampling the dataset fraction. + record_train : bool (optional) + If True, records the evaluation metrics of the trained ML model on the training set. + This is set to False by default to reduce overall compute time. + get_learning_curve : bool (optional) + If True, records the learning curve using partial_fit or warm starting, if applicable. + This is set to False by default to reduce overall compute time. + lc_every_k : int (optional) + Records the learning curve after every k iterations; only used when get_learning_curve is True. """ - model, model_fit_time, train_loss, train_scores, train_score_cost = self._train_objective( - configuration, fidelity, shuffle, rng, evaluation="test" - ) + # obtaining model and training statistics + model, model_fit_time, train_loss, train_scores, train_score_cost, lcs, lc_time = \ + self._train_objective( + configuration, fidelity, shuffle, rng, + evaluation="test", record_stats=record_train, + get_learning_curve=get_learning_curve, lc_every_k=lc_every_k + ) + model_size = self.get_model_size(model) + + # model inference on test set + start = time.time() + pred_test = model.predict(self.test_X) + test_inference_time = time.time() - start test_scores = dict() test_score_cost = dict() for k, v in self.scorers.items(): + test_scores[k] = 0.0 + test_score_cost[k] = 0.0 _start = time.time() - test_scores[k] = v(model, self.test_X, self.test_y) - test_score_cost[k] = time.time() - _start + test_scores[k] = v(self.test_y, pred_test, **self.scorer_args[k]) + test_score_cost[k] = time.time() - _start + test_inference_time test_loss = 1 - test_scores["acc"] + fidelity = fidelity.get_dictionary() if isinstance(fidelity, CS.Configuration) else fidelity + configuration = configuration.get_dictionary() \ + if isinstance(configuration, CS.Configuration) else configuration + info = { 'train_loss': train_loss, 'val_loss': None, 'test_loss': test_loss, 'model_cost': model_fit_time, + 'model_size': model_size, 'train_scores': train_scores, 'train_costs': train_score_cost, - 'val_scores': dict(), - 'val_costs': dict(), + 'val_scores': None, + 'val_costs': None, 'test_scores': test_scores, 'test_costs': test_score_cost, + 'learning_curves': lcs, + 'learning_curves_cost': lc_time, + 'learning_curves_spacing': lc_every_k, # storing as dictionary and not ConfigSpace saves tremendous memory 'fidelity': fidelity, 'config': configuration, diff --git
a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py index 3f434fde..185c2730 100644 --- a/hpobench/dependencies/mo/scalar.py +++ b/hpobench/dependencies/mo/scalar.py @@ -1,6 +1,7 @@ -import numpy as np from typing import Union +import numpy as np + try: from sklearn.preprocessing import MinMaxScaler, StandardScaler except ImportError: diff --git a/hpobench/util/clean_up_script.py b/hpobench/util/clean_up_script.py index 5fe9fd0c..771ab80f 100644 --- a/hpobench/util/clean_up_script.py +++ b/hpobench/util/clean_up_script.py @@ -1,7 +1,8 @@ +import logging +import shutil + from hpobench import config_file -import shutil -import logging logger = logging.getLogger('Clean-up') logger.setLevel(logging.INFO) diff --git a/hpobench/util/container_utils.py b/hpobench/util/container_utils.py index 7fee19e9..bb7221c3 100644 --- a/hpobench/util/container_utils.py +++ b/hpobench/util/container_utils.py @@ -1,11 +1,11 @@ -import os +import enum import importlib import json -import numpy as np -import enum - +import os from typing import Any, Union +import numpy as np + from hpobench.util.rng_helper import serialize_random_state, deserialize_random_state diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index c72305e1..914d651c 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -15,6 +15,7 @@ import gzip import json import logging +import os import pickle import tarfile from io import BytesIO @@ -40,6 +41,15 @@ import hpobench + +tabular_multi_fidelity_urls = dict( + xgb="https://figshare.com/ndownloader/files/35414756", + svm="https://figshare.com/ndownloader/files/35414447", + lr="https://figshare.com/ndownloader/files/35412425", + rf="https://figshare.com/ndownloader/files/35414801", + nn="https://figshare.com/ndownloader/files/35414996" +) + class DataManager(abc.ABC, metaclass=abc.ABCMeta): """ Base Class for loading and managing the data. 
@@ -1174,21 +1184,14 @@ def get_workclass(x): class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() + + self.model = model + self.task_id = str(task_id) - url_dict = dict( - xgb="https://ndownloader.figshare.com/files/30469920", - svm="https://ndownloader.figshare.com/files/30379359", - lr="https://ndownloader.figshare.com/files/30379038", - rf="https://ndownloader.figshare.com/files/30469089", - nn="https://ndownloader.figshare.com/files/30379005" - ) - + url_dict = tabular_multi_fidelity_urls assert model in url_dict.keys(), \ f'Model has to be one of {list(url_dict.keys())} but was {model}' - self.model = model - self.task_id = str(task_id) - self.url_to_use = url_dict.get(model) if data_dir is None: @@ -1225,3 +1228,42 @@ def _load_json(path): with open(path, "r") as f: data = json.load(f) return data + + +class YAHPODataManager(DataManager): + def __init__(self, data_dir: Union[Path, str, None]): + super(YAHPODataManager, self).__init__() + + if data_dir is None: + data_dir = hpobench.config_file.data_dir / "yahpo_data" + self.data_dir = Path(data_dir) + self.logger.info(f'Read data from data directory: {data_dir}') + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/lock_yahpo_raw', delay=0.5) + def _try_download(self): + """Clone the data repository.""" + if not self.data_dir.exists(): + self.logger.info( + 'Try to download data from https://github.com/slds-lmu/yahpo_data/tree/fair' + ) + # Create the data directory if not existing + self.create_save_directory(self.data_dir.parent) + + import git + git.Repo.clone_from(url='https://github.com/slds-lmu/yahpo_data.git', + to_path=str(self.data_dir), + branch='fair', + multi_options=['--depth 1']) + self.logger.info(f'Successfully cloned data from repo to {self.data_dir}') + + def load(self): + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + + # When in the containerized version, redirect to the data inside the container. + if 'YAHPO_CONTAINER' in os.environ: + local_config.init_config(data_path='/home/data/yahpo_data') + else: + self._try_download() + local_config.init_config(data_path=str(self.data_dir)) diff --git a/hpobench/util/test_utils.py b/hpobench/util/test_utils.py new file mode 100644 index 00000000..b2683135 --- /dev/null +++ b/hpobench/util/test_utils.py @@ -0,0 +1,24 @@ +import os + +CONST_RUN_ALL_TESTS_ENV_VAR = 'HPOBENCH_RUN_EXPENSIVE_TESTS' +DEFAULT_SKIP_MSG = 'Skip this test due to time limitations' + + +def check_run_all_tests(): + """ Helper function: Check if all tests should run. """ + return os.environ.get(CONST_RUN_ALL_TESTS_ENV_VAR, 'false').lower() == 'true' + + +def enable_all_tests(): + """ + Some tests are quite expensive. We control if all runs should be executed by this + environment variable. + """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'true' + + +def disable_all_tests(): + """ + This function disables the evaluation of all test functions. 
+ """ + os.environ[CONST_RUN_ALL_TESTS_ENV_VAR] = 'false' diff --git a/requirements.txt b/requirements.txt index aad54f85..b5db0198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,8 @@ numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 -oslo.concurrency>=4.2.0 \ No newline at end of file +oslo.concurrency>=4.2.0 +pandas>=1.2.4 +scikit-learn>=0.24.1 +openml>=0.12.2 +tqdm>=4.64.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 4c53ecb0..ef1f292c 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def read_file(file_name): version=read_file('hpobench/__version__.py').split()[-1].strip('\''), packages=setuptools.find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests'],), - python_requires='>=3.6, <=3.10', + python_requires='>=3.6', install_requires=read_file('./requirements.txt').split('\n'), extras_require=get_extra_requirements(), test_suite='pytest', diff --git a/tests/test_adult.py b/tests/test_adult.py index d7a030b7..b52c37ed 100644 --- a/tests/test_adult.py +++ b/tests/test_adult.py @@ -28,10 +28,10 @@ def test_adult_benchmark(): result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) - assert result_1['info']['valid_accuracy'] == result_1['function_value']['accuracy'] + assert 1 - result_1['info']['valid_accuracy'] == result_1['function_value']['misclassification_rate'] assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) - assert result_1['function_value']['accuracy'] == pytest.approx(0.76377, rel=0.001) - assert result_1['function_value']['accuracy'] == result_1['info']['test_accuracy'] + assert 1 - result_1['function_value']['misclassification_rate'] == pytest.approx(0.76377, rel=0.001) + assert 1 - result_1['function_value']['misclassification_rate'] == result_1['info']['test_accuracy'] diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py index 7e32ce84..cee56ccc 100644 --- a/tests/test_data_manager.py +++ b/tests/test_data_manager.py @@ -1,14 +1,13 @@ import shutil -from multiprocessing import Pool - import pytest +from multiprocessing import Pool import hpobench from hpobench.util.data_manager import NASBench_201Data, YearPredictionMSDData, ProteinStructureData, BostonHousingData -skip_message = 'We currently skip this test because it takes too much time.' 
+from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load_thread_safe(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) function = lambda: NASBench_201Data(dataset='cifar100').load() @@ -16,7 +15,7 @@ def test_nasbench_201_load_thread_safe(): pool.map(function, []) -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_init(): data_manager = NASBench_201Data(dataset='cifar100') @@ -30,7 +29,7 @@ def test_nasbench_201_init(): assert data_manager._save_dir.exists() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench_201_load(): shutil.rmtree(hpobench.config_file.data_dir / "nasbench_201", ignore_errors=True) diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py index 308c59ad..cded9444 100644 --- a/tests/test_mo_cnn.py +++ b/tests/test_mo_cnn.py @@ -1,6 +1,8 @@ import pytest +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_mo_cnn_seeding(): from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark b1 = FlowerCNNBenchmark(rng=0) @@ -18,6 +20,7 @@ def test_mo_cnn_seeding(): assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_mo_cnn_benchmark(): from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark diff --git a/tests/test_nasbench_101.py b/tests/test_nasbench_101.py new file mode 100644 index 00000000..67ac7f65 --- /dev/null +++ b/tests/test_nasbench_101.py @@ -0,0 +1,82 @@ +import pytest +import numpy as np + +from hpobench.container.benchmarks.nas.nasbench_101 import ( + NASCifar10ABenchmark, NASCifar10BBenchmark, NASCifar10CBenchmark, + NASCifar10AMOBenchmark, NASCifar10BMOBenchmark, NASCifar10CMOBenchmark, +) + +from hpobench.util.container_utils import disable_container_debug, enable_container_debug +from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests + +# from hpobench.util.test_utils import enable_all_tests +# enable_all_tests() + + +@pytest.fixture(scope='module') +def enable_debug(): + enable_container_debug() + yield + disable_container_debug() + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_A_SO(enable_debug): + + b = NASCifar10ABenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0, 'edge_1': 0, 'edge_10': 0, 'edge_11': 1, 'edge_12': 1, 'edge_13': 0, 'edge_14': 1, 'edge_15': 0, + 'edge_16': 0, 'edge_17': 1, 'edge_18': 1, 'edge_19': 0, 'edge_2': 0, 'edge_20': 1, 'edge_3': 0, 'edge_4': 0, + 'edge_5': 1, 'edge_6': 1, 'edge_7': 0, 'edge_8': 0, 'edge_9': 0, 'op_node_0': 'maxpool3x3', + 'op_node_1': 'conv1x1-bn-relu', 'op_node_2': 'conv3x3-bn-relu', 'op_node_3': 'conv3x3-bn-relu', + 'op_node_4': 'conv3x3-bn-relu' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value'] == 
pytest.approx(0.1659655372301737, abs=0.1) + assert result['cost'] == pytest.approx(853.5010070800781, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) + + +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) +def test_nasbench101_C_MO(enable_debug): + b = NASCifar10CMOBenchmark(rng=0) + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + assert len(b.get_fidelity_space()) == 1 + + config = { + 'edge_0': 0.9446689170495839, 'edge_1': 0.1289262976548533, 'edge_10': 0.09710127579306127, + 'edge_11': 0.09394051075844168, 'edge_12': 0.5722519057908734, 'edge_13': 0.30157481667454933, + 'edge_14': 0.9194826137446735, 'edge_15': 0.3599780644783639, 'edge_16': 0.589909976354571, + 'edge_17': 0.4536968445560453, 'edge_18': 0.21550767711355845, 'edge_19': 0.18327983621407862, + 'edge_2': 0.5864101661863267, 'edge_20': 0.47837030703998806, 'edge_3': 0.05342718178682526, + 'edge_4': 0.6956254456388572, 'edge_5': 0.3068100995451961, 'edge_6': 0.399025321703102, + 'edge_7': 0.15941446344895593, 'edge_8': 0.23274412927905685, 'edge_9': 0.0653042071517802, 'num_edges': 9, + 'op_node_0': 'conv1x1-bn-relu', 'op_node_1': 'maxpool3x3', 'op_node_2': 'conv1x1-bn-relu', + 'op_node_3': 'maxpool3x3', 'op_node_4': 'maxpool3x3' + } + + result = b.objective_function(configuration=config, fidelity={'budget': 108}, run_index=(0, 1, 2)) + assert result['function_value']['misclassification_rate'] == pytest.approx(0.11985842386881507, abs=0.1) + assert result['function_value']['trainable_parameters'] == 1115277 + assert result['cost'] == pytest.approx(3175.9591064453125, abs=0.1) + assert 1 - np.mean(result['info']['valid_accuracies']) == result['function_value']['misclassification_rate'] + + with pytest.raises(AssertionError): + result = b.objective_function_test(configuration=config, fidelity={'epoch': 109}) diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 70e46de9..29ef18ec 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,5 +1,3 @@ -import logging -logging.basicConfig(level=logging.DEBUG) import pytest from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ @@ -7,8 +5,7 @@ from hpobench.benchmarks.nas.nasbench_201 import \ Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug - -skip_message = 'We currently skip this test because it takes too much time.' 
+from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests @pytest.fixture(scope='module') @@ -18,7 +15,7 @@ def enable_debug(): disable_container_debug() -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) @@ -38,22 +35,22 @@ def test_nasbench201_cifar10valid(enable_debug): '3<-2': 'nor_conv_3x3' } result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) - assert result['function_value'] == pytest.approx(9.78, abs=0.1) + assert result['function_value'] == pytest.approx(0.0978, abs=0.1) assert result['cost'] == pytest.approx(11973.20, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) - assert result['function_value'] == pytest.approx(9.70, abs=0.1) + assert result['function_value'] == pytest.approx(0.0970, abs=0.1) assert result['cost'] == pytest.approx(10426.33, abs=0.2) - assert result['info']['test_precision'] == result['function_value'] + assert result['info']['test_misclassification_rate'] == result['function_value'] assert result['info']['test_cost'] == result['cost'] - with pytest.raises(ValueError): + with pytest.raises(AssertionError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) @@ -67,13 +64,13 @@ def test_nasbench201_cifar100(enable_debug): result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) assert result is not None - assert result['function_value'] == pytest.approx(29.5233, abs=0.1) + assert result['function_value'] == pytest.approx(0.295233, abs=0.1) assert result['cost'] == pytest.approx(19681.70, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] -@pytest.mark.skip(reason=skip_message) +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) config = {'1<-0': 'nor_conv_1x1', @@ -86,9 +83,9 @@ def test_nasbench201_Image(enable_debug): result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) assert result is not None - assert result['function_value'] == pytest.approx(55.2167, abs=0.1) + assert result['function_value'] == pytest.approx(0.552167, abs=0.1) assert result['cost'] == pytest.approx(57119.22, abs=0.1) - assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_misclassification_rate'] == result['function_value'] assert result['info']['valid_cost'] == result['cost'] diff --git a/tests/test_paramnet.py b/tests/test_paramnet.py index 52d55f94..076f4b38 100644 --- a/tests/test_paramnet.py +++ b/tests/test_paramnet.py @@ -1,11 +1,13 @@ import pytest +import sys -# import logging -# logging.basicConfig(level=logging.DEBUG) -# from hpobench.util.container_utils import enable_container_debug -# 
enable_container_debug() +MSG = 'Skip this test for newer (>3.9) Python versions. ' \ + 'The paramnet benchmarks require a specific old scikit-learn version. This version, however, does not work under ' \ + 'Python 3.10. Therefore we skip this test. The containerized version still works under 3.10.' + + +@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_load_data(): from hpobench.util.data_manager import ParamNetDataManager diff --git a/tests/test_pybnn.py b/tests/test_pybnn.py index 0e749457..f1c6b5fc 100644 --- a/tests/test_pybnn.py +++ b/tests/test_pybnn.py @@ -1,14 +1,19 @@ +import sys import pytest from hpobench.container.benchmarks.ml.pybnn import BNNOnToyFunction, BNNOnBostonHousing, BNNOnProteinStructure, \ BNNOnYearPrediction -import logging -logging.basicConfig(level=logging.DEBUG) from hpobench.util.container_utils import enable_container_debug +from hpobench.util.test_utils import check_run_all_tests, DEFAULT_SKIP_MSG + enable_container_debug() +MSG = 'Skip this test for newer (>3.9) Python versions. ' \ + 'The paramnet benchmarks require a specific old scikit-learn version. This version, however, does not work under ' \ + 'Python 3.10. Therefore we skip this test. The containerized version still works under 3.10.' + +@pytest.mark.skipif(sys.version_info > (3, 9), reason=MSG) def test_bnn_init(): benchmark = BNNOnToyFunction(rng=1) @@ -58,6 +63,7 @@ def test_bnn_boston_housing(): assert test_result['info']['fidelity']['budget'] == 1000 +@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG) def test_bnn_protein(): benchmark = BNNOnProteinStructure(rng=1) test_result = simple_call(benchmark) @@ -66,6 +72,7 @@ def test_year_pred(): benchmark = BNNOnYearPrediction(rng=1) test_result = simple_call(benchmark) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9bc5ff3b..e570dbd7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -105,3 +105,15 @@ def test_debug_level(): disable_container_debug() assert os.environ['HPOBENCH_DEBUG'] == 'false' + + +def test_test_utils(): + from hpobench.util.test_utils import DEFAULT_SKIP_MSG, enable_all_tests, disable_all_tests, check_run_all_tests + + assert isinstance(DEFAULT_SKIP_MSG, str) + + enable_all_tests() + assert check_run_all_tests() + + disable_all_tests() + assert not check_run_all_tests() \ No newline at end of file diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 35a9a940..585f9867 100644 --- a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -63,6 +63,7 @@ def test_whitebox_with_container(): assert np.isclose(test_loss, 0.43636, atol=0.001) +@pytest.mark.skipif(skip_container_test, reason="Requires singularity and flask") def test_cartpole(): from hpobench.container.benchmarks.rl.cartpole import CartpoleReduced as Benchmark b = Benchmark(container_name='cartpole', diff --git a/tests/test_yahpo_raw.py b/tests/test_yahpo_raw.py new file mode 100644 index 00000000..65694603 --- /dev/null +++ b/tests/test_yahpo_raw.py @@ -0,0 +1,12 @@ +from hpobench.container.benchmarks.ml.yahpo_benchmark import YAHPOGymMORawBenchmark + + +def test_mo_benchmark(): + + b = YAHPOGymMORawBenchmark(scenario="iaml_xgboost", instance="40981",) + cfg = b.get_configuration_space().get_default_configuration() + b.objective_function(cfg) + + +if __name__ == '__main__': + test_mo_benchmark() \ No newline at end of file
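
A minimal usage sketch of the extended objective_function interface introduced in this patch (record_train, get_learning_curve, lc_every_k, and the model_size/learning_curves entries in the returned info dict). The local XGBoostBenchmark import path and the OpenML task id below are illustrative assumptions, not part of the patch itself.

# Sketch only: assumes the local (non-containerized) XGBoost benchmark from the ml suite
# and an arbitrary OpenML task id; adjust both to your setup.
from hpobench.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark

benchmark = XGBoostBenchmark(task_id=167119, rng=1)

config = benchmark.get_configuration_space(seed=1).sample_configuration()
fidelity = benchmark.get_fidelity_space(seed=1).sample_configuration()

# record_train=True additionally scores the fitted model on the training split;
# get_learning_curve stays False because this patch still raises NotImplementedError for it.
result = benchmark.objective_function(
    configuration=config,
    fidelity=fidelity,
    rng=1,
    record_train=True,
)

print(result['function_value'])      # 1 - validation accuracy
print(result['cost'])                # model fit time + validation scoring time
print(result['info']['model_size'])  # new field populated via get_model_size()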
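
The expensive-test switch added in hpobench/util/test_utils.py can be driven either from Python or through the HPOBENCH_RUN_EXPENSIVE_TESTS environment variable; the sketch below mirrors how the updated tests use it (the test body is a placeholder).

# Gating an expensive test with the helpers added in hpobench/util/test_utils.py.
import pytest

from hpobench.util.test_utils import DEFAULT_SKIP_MSG, check_run_all_tests, enable_all_tests


@pytest.mark.skipif(not check_run_all_tests(), reason=DEFAULT_SKIP_MSG)
def test_expensive_path():
    # placeholder body; runs only when the expensive tests are enabled
    assert 1 + 1 == 2


if __name__ == '__main__':
    # Equivalent to exporting HPOBENCH_RUN_EXPENSIVE_TESTS=true before invoking pytest.
    enable_all_tests()
    assert check_run_all_tests()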