From 2fe5e585bac96b1f85c550070a49364c33e92a4d Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 14:50:09 +0200 Subject: [PATCH 01/29] - added language model benchmark from MO-ASHA paper - added dependencies, benchmark - added token generation and model training code from moasha in dependencies --- extra_requirements/lm_benchmark.json | 5 + hpobench/benchmarks/mo/lm_benchmark.py | 363 ++++++++++++++++++++++ hpobench/dependencies/lm/__init__.py | 0 hpobench/dependencies/lm/model.py | 140 +++++++++ hpobench/dependencies/lm/tokenize_util.py | 56 ++++ hpobench/util/data_manager.py | 60 +++- 6 files changed, 623 insertions(+), 1 deletion(-) create mode 100644 extra_requirements/lm_benchmark.json create mode 100644 hpobench/benchmarks/mo/lm_benchmark.py create mode 100644 hpobench/dependencies/lm/__init__.py create mode 100644 hpobench/dependencies/lm/model.py create mode 100644 hpobench/dependencies/lm/tokenize_util.py diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json new file mode 100644 index 00000000..34d1249b --- /dev/null +++ b/extra_requirements/lm_benchmark.json @@ -0,0 +1,5 @@ +{ + "lm": [ + "torch==1.3.0" + ] +} \ No newline at end of file diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..270243f6 --- /dev/null +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,363 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective CNN Benchmark. +""" +from typing import Union, Tuple, Dict, List +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import logging +from ConfigSpace.hyperparameters import Hyperparameter +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import LanguageModelDataManager +from hpobench.dependencies.lm.tokenize_util import batchify +from hpobench.dependencies.lm.model import TransformerModel +import time +import math +import tqdm + +__version__ = '0.0.1' + +logger = logging.getLogger('MO_CNN') + + +class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(LanguageModelBenchmark, self).__init__(rng=rng) + + data_manager = LanguageModelDataManager() + self.X_train, self.X_valid, self.X_test = data_manager.load() + self.corpus = data_manager.corpus + + self.variable = {"eval_batch_size": 10, + "nlayers": 2, + "bptt": 35, + "tied": True, + "nhead": 2, + "ntoken": len(self.corpus.dictionary) + } + print("len of corpus dict", len(self.corpus.dictionary)) + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'batch_size', default_value=128, lower=8, upper=256 + ), + CS.UniformIntegerHyperparameter( + 'emsize', default_value=128, lower=32, upper=1024 + ), + CS.UniformIntegerHyperparameter( + 'lr_factor', default_value=50, lower=1, upper=100, log=True + ), + CS.UniformFloatHyperparameter( + 'lr', default_value=5, lower=1, upper=50, log=True + ), + CS.UniformFloatHyperparameter( + 'dropout', default_value=0.99, lower=0, upper=0.99 + ), + CS.UniformFloatHyperparameter( + 'clip', default_value=0.99, lower=0.1, upper=2 + ) + + ]) + return cs + + 
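+    # Example usage (illustrative sketch, assuming HPOBench is installed with the
+    # `lm` extra requirements; key names follow the methods defined in this class):
+    #     benchmark = LanguageModelBenchmark(rng=1)
+    #     config = benchmark.get_configuration_space(seed=1).sample_configuration()
+    #     result = benchmark.objective_function(config, fidelity={'budget': 3}, rng=1)
+    #     result['function_value']  # {'log_perplexity': ..., 'accuracy': ...}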
@staticmethod + def get_objective_names(self) -> List[str]: + return ['perplexity', 'error', 'time'] + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + LanguageModelBenchmark._get_fidelity_choices(iter_choice='variable') + ]) + return fidelity_space + + @staticmethod + def _get_fidelity_choices(iter_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + + fidelity1 = dict( + fixed=CS.Constant('budget', value=81), + variable=CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=81, default_value=81, log=False + ) + ) + + budget = fidelity1[iter_choice] + return budget + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas, David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}', + ], + } + + def init_model(self, config: Union[CS.Configuration, Dict]): + """ Function that returns the model initialized based on the configuration and fidelity + """ + + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + + model = TransformerModel( + self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], + self.variable['nlayers'], config['dropout']) + + return model + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 81] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ + + kwargs + + Returns + ------- + Dict - + function_value : Dict + validation_accuracy: float + log_perplexity: float + cost : time to train the network + info : Dict + validation_accuracy : float, + test_accuracy : float, + log_perplexity : float, + negative_log_perplexity : float, + training_cost : float, + valid_cost : float, + test_cost : float, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + ts_start = time.time() + + # batchify data + batch_size = configuration['batch_size'] + train_data = batchify(self.X_train, batch_size=batch_size).to(device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + + epochs = fidelity['budget'] + + model = self.init_model(configuration).to(device) + + criterion = nn.CrossEntropyLoss() + + learning_rate = configuration['lr'] + learning_rate_factor = configuration['lr_factor'] + clip = configuration['clip'] + best_val_loss = None + train_eval_time = 0 + + t = tqdm.tqdm(total=epochs) + for epoch in range(epochs): + epoch_start_time = time.time() + model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + + val_loss, val_acc = model.evaluate(model, self.corpus, criterion, val_data) + val_loss = np.clip(val_loss, 1e-10, 10) + + ts_now = time.time() + train_eval_time += ts_now - epoch_start_time + + t.set_postfix(val_accuracy=val_acc) + t.update() + + if not np.isfinite(val_loss): + val_loss = 7 + + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + best_val_loss = val_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. + learning_rate /= learning_rate_factor + + start_time = time.time() + _, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + eval_valid_runtime = time.time() - start_time + + start_time = time.time() + _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + eval_test_runtime = time.time() - start_time + + perplexity = math.exp(best_val_loss) + log_perplexity = best_val_loss + neg_log_perplexity = 10 - best_val_loss + elapsed_time = float(ts_start - time.time()) + + return {'function_value': {'log_perplexity': log_perplexity, + 'accuracy': val_acc, + }, + 'cost': elapsed_time, + 'info': {'validation_accuracy': val_acc, + 'test_accuracy': test_acc, + 'log_perplexity': log_perplexity, + 'perplexity': perplexity, + 'negative_log_perplexity': neg_log_perplexity, + 'training_cost': train_eval_time, + 'valid_cost': eval_valid_runtime, + 'test_cost': eval_test_runtime, + 'fidelity': fidelity + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Get the validated results. Runs a given configuration on the largest budget (here: 50). + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. 
+ rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + kwargs + Returns + ------- + Dict - + function_value : Dict + validation_accuracy: float + log_perplexity: float + cost : time to train the network + info : Dict + validation_accuracy : float, + test_accuracy : float, + log_perplexity : float, + negative_log_perplexity : float, + training_cost : float, + valid_cost : float, + test_cost : float, + fidelity : Dict + used fidelities in this evaluation + """ + + # The result dict should contain already all necessary information -> Just swap the function value from valid + # to test and the corresponding time cost + assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. ' + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + ts_start = time.time() + + # batchify data + batch_size = configuration['batch_size'] + train_data = batchify(self.X_train, batch_size=batch_size).to(device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + + train_data = np.vstack(train_data, val_data) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + + epochs = fidelity['budget'] + + model = self.init_model(configuration).to(device) + + criterion = nn.CrossEntropyLoss() + + learning_rate = configuration['lr'] + learning_rate_factor = configuration['lr_factor'] + clip = configuration['clip'] + best_test_loss = None + train_eval_time = 0 + t = tqdm.tqdm(total=epochs) + for epoch in range(1, epochs + 1): + epoch_start_time = time.time() + model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + + test_loss, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + test_loss = np.clip(test_loss, 1e-10, 10) + + ts_now = time.time() + train_eval_time += ts_now - epoch_start_time + + if not np.isfinite(test_loss): + test_loss = 7 + + # Save the model if the validation loss is the best we've seen so far. + if not best_test_loss or test_loss < best_test_loss: + best_test_loss = test_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. 
+ learning_rate /= learning_rate_factor + + start_time = time.time() + _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + eval_test_runtime = time.time() - start_time + + perplexity = math.exp(best_test_loss) + log_perplexity = best_test_loss + neg_log_perplexity = 10 - best_test_loss + elapsed_time = float(ts_start - time.time()) + + return {'function_value': {'log_perplexity': log_perplexity, + 'accuracy': test_acc, + }, + 'cost': elapsed_time, + 'info': {'test_accuracy': test_acc, + 'log_perplexity': log_perplexity, + 'perplexity': perplexity, + 'negative_log_perplexity': neg_log_perplexity, + 'training_cost': train_eval_time, + 'test_cost': eval_test_runtime, + 'fidelity': fidelity + } + } + + __all__ = ["LanguageModelBenchmark"] diff --git a/hpobench/dependencies/lm/__init__.py b/hpobench/dependencies/lm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py new file mode 100644 index 00000000..f4aed4cc --- /dev/null +++ b/hpobench/dependencies/lm/model.py @@ -0,0 +1,140 @@ +import torch +import torch.nn as nn +import math +import torch.nn.functional as F + + +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+ Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +class TransformerModel(nn.Module): + """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): + super(TransformerModel, self).__init__() + try: + from torch.nn import TransformerEncoder, TransformerEncoderLayer + except: + raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + self.model_type = 'Transformer' + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.decoder = nn.Linear(ninp, ntoken) + self.init_weights() + self.bptt = bptt + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def get_batch(self, source, i): + seq_len = min(self.bptt, len(source) - 1 - i) + data = source[i:i + seq_len] + target = source[i + 1:i + 1 + seq_len].view(-1) + return data, target + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, src, has_mask=True): + if has_mask: + device = src.device + if self.src_mask is None or self.src_mask.size(0) != len(src): + mask = self._generate_square_subsequent_mask(len(src)).to(device) + self.src_mask = mask + else: + self.src_mask = None + src = self.encoder(src) * math.sqrt(self.ninp) + src = self.pos_encoder(src) + output = self.transformer_encoder(src, self.src_mask) + output = self.decoder(output) + return F.log_softmax(output, dim=-1) + + def train_fun(self, model, corpus, criterion, train_data, lr, batch_size, clip): + # Turn on training mode which enables dropout. + self.train() + # total_loss = 0. + # start_time = time.time() + ntokens = len(corpus.dictionary) + + for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)): + data, targets = self.get_batch(train_data, i) + # Starting each batch, we detach the hidden state from how it was previously produced. + # If we didn't, the model would try backpropagating all the way to start of the dataset. + model.zero_grad() + output = model(data) + loss = criterion(output.view(-1, ntokens), targets) + loss.backward() + + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. + torch.nn.utils.clip_grad_norm_(model.parameters(), clip) + for p in model.parameters(): + p.data.add_(-lr, p.grad.data) + + def eval_fun(self, model, corpus, criterion, data_source): + # Turn on evaluation mode which disables dropout. + self.eval() + total_loss = 0. + total_acc = 0. 
+        ntokens = len(corpus.dictionary)
+        with torch.no_grad():
+            for i in range(0, data_source.size(0) - 1, self.bptt):
+                data, targets = self.get_batch(data_source, i)
+                output = model(data)
+                output_flat = output.view(-1, ntokens)
+                total_loss += len(data) * criterion(output_flat, targets).item()
+
+                # inserted accuracy
+                winners = output_flat.argmax(dim=1)
+                corrects = (winners == targets)
+                accuracy = corrects.sum().float() / float(targets.size(0))
+                total_acc += len(data) * accuracy
+
+        avg_acc = total_acc / (len(data_source) - 1)
+        return total_loss / (len(data_source) - 1), avg_acc
diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py
new file mode 100644
index 00000000..6e200b1e
--- /dev/null
+++ b/hpobench/dependencies/lm/tokenize_util.py
@@ -0,0 +1,56 @@
+import torch
+
+
+class Dictionary(object):
+    def __init__(self):
+        self.word2idx = {}
+        self.idx2word = []
+
+    def add_word(self, word):
+        if word not in self.word2idx:
+            self.idx2word.append(word)
+            self.word2idx[word] = len(self.idx2word) - 1
+        return self.word2idx[word]
+
+    def __len__(self):
+        return len(self.idx2word)
+
+
+class Corpus(object):
+    def __init__(self, logger):
+        self.dictionary = Dictionary()
+        self.logger = logger
+
+    def tokenize(self, path):
+        """Tokenizes a text file."""
+        # Add words to the dictionary
+        with open(path, 'r', encoding="utf8") as f:
+            for line in f:
+                words = line.split() + ['<eos>']
+                print("words", words)
+                for word in words:
+                    self.dictionary.add_word(word)
+        # Tokenize file content
+        with open(path, 'r', encoding="utf8") as f:
+            idss = []
+            for line in f:
+                words = line.split() + ['<eos>']
+                ids = []
+                try:
+                    for word in words:
+                        ids.append(self.dictionary.word2idx[word])
+                except:
+                    self.logger.debug("word2idx:{}", self.dictionary.word2idx)
+                idss.append(torch.tensor(ids).type(torch.int64))
+            ids = torch.cat(idss)
+        return ids
+
+
+def batchify(data, batch_size):
+    # Work out how cleanly we can divide the dataset into bsz parts.
+    nbatch = data.size(0) // batch_size
+    # Trim off any extra elements that wouldn't cleanly fit (remainders).
+    data = data.narrow(0, 0, nbatch * batch_size)
+    # Evenly divide the data across the bsz batches.
+    data = data.view(batch_size, -1).t().contiguous()
+    return data
diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py
index a2e33121..e8a3f2d1 100644
--- a/hpobench/util/data_manager.py
+++ b/hpobench/util/data_manager.py
@@ -37,7 +37,6 @@
 except ImportError:
     print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not needed for containers)")
-
 
 import hpobench
 
@@ -849,6 +848,65 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar
         return X_train, y_train, X_val, y_val, X_test, y_test
 
 
+class LanguageModelDataManager(HoldoutDataManager):
+    def __init__(self):
+        from hpobench.dependencies.lm.tokenize_util import Corpus
+        super(LanguageModelDataManager, self).__init__()
+        self.logger.debug('LanguageModelDataManager: Starting to load data')
+
+        self.urls = {
+            "train_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt",
+            "valid_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt",
+            "test_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt",
+        }
+
+        self.save_dir = hpobench.config_file.data_dir / "wikitext"
+        self.create_save_directory(self.save_dir)
+        self.corpus = Corpus(logger=self.logger)
+
+    def load(self):
+        """
+        Loads the WikiText-2 language modelling dataset from the data directory as defined in hpobenchrc.data_directory.
+        Downloads the data if necessary.
+        Returns
+        -------
+        X_train: np.ndarray
+        X_valid: np.ndarray
+        X_test: np.ndarray
+        """
+
+        t = time()
+        self._download()
+        self.X_train, self.X_valid, self.X_test = self._load()
+        self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}')
+        return self.X_train, self.X_valid, self.X_test
+
+    def _download(self):
+        self._download_file_with_progressbar(self.urls["train_data"], self.save_dir / "train.txt")
+        self._download_file_with_progressbar(self.urls["valid_data"], self.save_dir / "valid.txt")
+        self._download_file_with_progressbar(self.urls["test_data"], self.save_dir / "test.txt")
+
+    def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Load the data from file and split it into train, test and validation split.
+ Returns + ------- + X_train: np.ndarray + X_valid: np.ndarray + X_test: np.ndarray + """ + + + X_train = self.corpus.tokenize(self.save_dir / "train.txt") + X_valid = self.corpus.tokenize(self.save_dir / "valid.txt") + X_test = self.corpus.tokenize(self.save_dir / "test.txt") + + return X_train, X_valid, X_test + + class YearPredictionMSDData(HoldoutDataManager): def __init__(self): From 83957a7d2d1092978310e453b6e52cf3fd912bcb Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 15:10:44 +0200 Subject: [PATCH 02/29] - added evaluation time as one of the objectives - returning prediction time for evaluation time - changed perplexity --> log_perplexity for the objective (MO-ASHA uses log perplexity) changed error --> accuracy - added tqdm --- extra_requirements/lm_benchmark.json | 3 ++- hpobench/benchmarks/mo/lm_benchmark.py | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json index 34d1249b..a686fe43 100644 --- a/extra_requirements/lm_benchmark.json +++ b/extra_requirements/lm_benchmark.json @@ -1,5 +1,6 @@ { "lm": [ - "torch==1.3.0" + "torch==1.3.0", + "tqdm" ] } \ No newline at end of file diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 270243f6..1326418f 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -23,7 +23,7 @@ __version__ = '0.0.1' -logger = logging.getLogger('MO_CNN') +logger = logging.getLogger('LM_Bench') class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): @@ -75,7 +75,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_objective_names(self) -> List[str]: - return ['perplexity', 'error', 'time'] + return ['log_perplexity', 'accuracy', 'time'] @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -232,6 +232,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': val_acc, + 'time': train_eval_time }, 'cost': elapsed_time, 'info': {'validation_accuracy': val_acc, @@ -258,7 +259,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 50] + epoch: int - Values: [1, 81] Number of epochs an architecture was trained. Note: the number of epoch is 1 indexed. 
(Results after the first epoch: epoch = 1) @@ -327,6 +328,8 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ts_now = time.time() train_eval_time += ts_now - epoch_start_time + t.set_postfix(test_accuracy=test_acc) + t.update() if not np.isfinite(test_loss): test_loss = 7 @@ -348,6 +351,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': test_acc, + 'time': train_eval_time }, 'cost': elapsed_time, 'info': {'test_accuracy': test_acc, From 918b0f97ea4a42114e611a6a104ae7db2e8f5292 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 16:23:10 +0200 Subject: [PATCH 03/29] - func name correction --- hpobench/benchmarks/mo/lm_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 1326418f..e06c237d 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -198,7 +198,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], epoch_start_time = time.time() model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - val_loss, val_acc = model.evaluate(model, self.corpus, criterion, val_data) + val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) ts_now = time.time() From 9c5090bb034ae91397ba8351c8b60477a87a9ece Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 19:47:49 +0200 Subject: [PATCH 04/29] - load and save tokenized file --- hpobench/benchmarks/mo/lm_benchmark.py | 7 ++--- hpobench/dependencies/lm/tokenize_util.py | 1 - hpobench/util/data_manager.py | 33 ++++++++++++++++------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index e06c237d..4b301492 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -31,7 +31,8 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(LanguageModelBenchmark, self).__init__(rng=rng) - data_manager = LanguageModelDataManager() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + data_manager = LanguageModelDataManager(device) self.X_train, self.X_valid, self.X_test = data_manager.load() self.corpus = data_manager.corpus @@ -195,9 +196,10 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(epochs): + print("epoch training started",epoch) epoch_start_time = time.time() model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - + print("epoch traing done") val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) @@ -295,7 +297,6 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # to test and the corresponding time cost assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. 
' - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') ts_start = time.time() # batchify data diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py index 6e200b1e..7ae105d7 100644 --- a/hpobench/dependencies/lm/tokenize_util.py +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -27,7 +27,6 @@ def tokenize(self, path): with open(path, 'r', encoding="utf8") as f: for line in f: words = line.split() + [''] - print("words", words) for word in words: self.dictionary.add_word(word) # Tokenize file content diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index e8a3f2d1..30f71b43 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -849,20 +849,22 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar class LanguageModelDataManager(HoldoutDataManager): - def __init__(self): + def __init__(self, device): from hpobench.dependencies.lm.tokenize_util import Corpus super(LanguageModelDataManager, self).__init__() self.logger.debug('LanguageModelDataManager: Starting to load data') self.urls = { - "train_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", - "valid_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", - "test_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", + "train": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", + "valid": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", + "test": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", } self.save_dir = hpobench.config_file.data_dir / "wikitext" self.create_save_directory(self.save_dir) self.corpus = Corpus(logger=self.logger) + self.device = device + self.tokenize_path = self.save_dir / "tokenize" def load(self): """ @@ -885,9 +887,12 @@ def load(self): return self.X_train, self.X_valid, self.X_test def _download(self): - self._download_file_with_progressbar(self.urls["train_data"], self.save_dir / "train.txt") - self._download_file_with_progressbar(self.urls["valid_data"], self.save_dir / "valid.txt") - self._download_file_with_progressbar(self.urls["test_data"], self.save_dir / "test.txt") + + for data in self.urls: + if (self.save_dir / f'{data}.txt').exists(): + self.logger.debug(f'LanguageModelDataManager : tokenized {data}.txt already exist') + else: + self._download_file_with_progressbar(self.urls[data], self.save_dir / f"{data}.txt") def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ @@ -899,10 +904,18 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_test: np.ndarray """ + import torch + for data in self.urls: + if (self.tokenize_path / f'{data}.pt').exists(): + self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') + + else: + tokenized_data = self.corpus.tokenize(self.save_dir / "train.txt") + torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - X_train = self.corpus.tokenize(self.save_dir / "train.txt") - X_valid = self.corpus.tokenize(self.save_dir / "valid.txt") - X_test = self.corpus.tokenize(self.save_dir / "test.txt") + X_train = torch.load(self.tokenize_path / 'train.pt', 
map_location=self.device) + X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) + X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) return X_train, X_valid, X_test From f4d4413040366630e9083d07ff11d966491237b8 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Fri, 20 May 2022 01:03:27 +0200 Subject: [PATCH 05/29] -code formatting --- hpobench/benchmarks/mo/lm_benchmark.py | 69 ++++++++++------------- hpobench/dependencies/lm/model.py | 31 ++++++---- hpobench/dependencies/lm/tokenize_util.py | 2 +- hpobench/util/data_manager.py | 41 +++++++++----- 4 files changed, 79 insertions(+), 64 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 4b301492..f22db78f 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -3,15 +3,14 @@ ========== 0.0.1: -* First implementation of the Multi-Objective CNN Benchmark. +* First implementation of the Multi-Objective Language Model Benchmark. """ -from typing import Union, Tuple, Dict, List +from typing import Union, Dict, List import ConfigSpace as CS import numpy as np import torch import torch.nn as nn import logging -from ConfigSpace.hyperparameters import Hyperparameter import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import LanguageModelDataManager @@ -31,19 +30,20 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(LanguageModelBenchmark, self).__init__(rng=rng) - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - data_manager = LanguageModelDataManager(device) + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + data_manager = LanguageModelDataManager(self.device) self.X_train, self.X_valid, self.X_test = data_manager.load() - self.corpus = data_manager.corpus + self.ntokens = len(data_manager.corpus.dictionary) self.variable = {"eval_batch_size": 10, "nlayers": 2, "bptt": 35, "tied": True, + # number of attention head "nhead": 2, - "ntoken": len(self.corpus.dictionary) + "ntoken": self.ntokens } - print("len of corpus dict", len(self.corpus.dictionary)) + print("len of corpus dict", self.ntokens) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -83,23 +83,11 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters([ - # gray-box setting (multi-multi-fidelity) - iterations + data subsample - LanguageModelBenchmark._get_fidelity_choices(iter_choice='variable') - ]) - return fidelity_space - - @staticmethod - def _get_fidelity_choices(iter_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: - - fidelity1 = dict( - fixed=CS.Constant('budget', value=81), - variable=CS.UniformIntegerHyperparameter( + CS.UniformIntegerHyperparameter( 'budget', lower=1, upper=81, default_value=81, log=False ) - ) - - budget = fidelity1[iter_choice] - return budget + ]) + return fidelity_space @staticmethod def get_meta_information() -> Dict: @@ -108,7 +96,8 @@ def get_meta_information() -> Dict: 'name': 'Multi-objective Asynchronous Successive Halving', 'references': ['@article{schmucker2021multi,' 'title={Multi-objective Asynchronous Successive Halving},' - 'author={Schmucker, 
Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas, David and Archambeau, C{\'e}dric},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' 'journal={arXiv preprint arXiv:2106.12639},' 'year={2021}', ], @@ -121,6 +110,7 @@ def init_model(self, config: Union[CS.Configuration, Dict]): if isinstance(config, CS.Configuration): config = config.get_dictionary() + # all sublayers and embedding layers have same dim model = TransformerModel( self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -196,11 +186,9 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(epochs): - print("epoch training started",epoch) epoch_start_time = time.time() - model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - print("epoch traing done") - val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, clip) + val_loss, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) ts_now = time.time() @@ -220,11 +208,11 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + _, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) eval_valid_runtime = time.time() - start_time start_time = time.time() - _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_val_loss) @@ -237,7 +225,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'time': train_eval_time }, 'cost': elapsed_time, - 'info': {'validation_accuracy': val_acc, + 'info': {'train_accuracy': train_acc, + 'validation_accuracy': val_acc, 'test_accuracy': test_acc, 'log_perplexity': log_perplexity, 'perplexity': perplexity, @@ -301,15 +290,15 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # batchify data batch_size = configuration['batch_size'] - train_data = batchify(self.X_train, batch_size=batch_size).to(device) - val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + train_data = batchify(self.X_train, batch_size=batch_size).to(self.device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(self.device) train_data = np.vstack(train_data, val_data) - test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device) epochs = fidelity['budget'] - model = self.init_model(configuration).to(device) + model = self.init_model(configuration).to(self.device) criterion = nn.CrossEntropyLoss() @@ -321,9 +310,10 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(1, epochs + 1): epoch_start_time = time.time() - model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, 
learning_rate, + batch_size, clip) - test_loss, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + test_loss, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) test_loss = np.clip(test_loss, 1e-10, 10) ts_now = time.time() @@ -342,7 +332,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_test_loss) @@ -355,7 +345,8 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], 'time': train_eval_time }, 'cost': elapsed_time, - 'info': {'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc, + 'test_accuracy': test_acc, 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py index f4aed4cc..8361c61f 100644 --- a/hpobench/dependencies/lm/model.py +++ b/hpobench/dependencies/lm/model.py @@ -27,6 +27,9 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) + print("shape pe",pe[:,0::2].shape) + + print("shape after pe",pe[:,1::2].shape) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -46,13 +49,13 @@ def forward(self, x): class TransformerModel(nn.Module): - """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + """Container module with an encoder, a transformer module, and a decoder.""" def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer - except: + except Exception: raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') self.model_type = 'Transformer' self.src_mask = None @@ -96,33 +99,41 @@ def forward(self, src, has_mask=True): output = self.decoder(output) return F.log_softmax(output, dim=-1) - def train_fun(self, model, corpus, criterion, train_data, lr, batch_size, clip): + def train_fun(self, model, ntokens, criterion, train_data, lr, clip): # Turn on training mode which enables dropout. self.train() - # total_loss = 0. - # start_time = time.time() - ntokens = len(corpus.dictionary) - + total_loss = 0. + total_acc = 0. for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)): data, targets = self.get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
model.zero_grad() output = model(data) - loss = criterion(output.view(-1, ntokens), targets) + output_flat = output.view(-1, ntokens) + loss = criterion(output_flat, targets) loss.backward() + # calculate loss and accuracy + total_loss += len(data) * loss.item() + winners = output_flat.argmax(dim=1) + corrects = (winners == targets) + accuracy = corrects.sum().float() / float(targets.size(0)) + total_acc += len(data) * accuracy + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm_(model.parameters(), clip) for p in model.parameters(): p.data.add_(-lr, p.grad.data) - def eval_fun(self, model, corpus, criterion, data_source): + avg_acc = total_acc / (len(train_data) - 1) + return total_loss / (len(train_data) - 1), avg_acc + + def eval_fun(self, model, ntokens, criterion, data_source): # Turn on evaluation mode which disables dropout. self.eval() total_loss = 0. total_acc = 0. - ntokens = len(corpus.dictionary) with torch.no_grad(): for i in range(0, data_source.size(0) - 1, self.bptt): data, targets = self.get_batch(data_source, i) diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py index 7ae105d7..f68e850d 100644 --- a/hpobench/dependencies/lm/tokenize_util.py +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -38,7 +38,7 @@ def tokenize(self, path): try: for word in words: ids.append(self.dictionary.word2idx[word]) - except: + except Exception: self.logger.debug("word2idx:{}", self.dictionary.word2idx) idss.append(torch.tensor(ids).type(torch.int64)) ids = torch.cat(idss) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 30f71b43..ab3ac760 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -855,9 +855,12 @@ def __init__(self, device): self.logger.debug('LanguageModelDataManager: Starting to load data') self.urls = { - "train": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", - "valid": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", - "test": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", + "train": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/train.txt", + "valid": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/valid.txt", + "test": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/test.txt", } self.save_dir = hpobench.config_file.data_dir / "wikitext" @@ -884,6 +887,11 @@ def load(self): self._download() self.X_train, self.X_valid, self.X_test = self._load() self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}') + print(self.X_train.shape) + + print(self.X_valid.shape) + + print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test def _download(self): @@ -906,17 +914,22 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar import torch for data in self.urls: - if (self.tokenize_path / f'{data}.pt').exists(): - self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') - - else: - tokenized_data = self.corpus.tokenize(self.save_dir / "train.txt") - torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - - X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) - 
X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) - X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) - + # if (self.tokenize_path / f'{data}.pt').exists(): + # self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') + # else: + tokenized_data = self.corpus.tokenize(self.save_dir / f'{data}.txt') + torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') + + X_train = self.corpus.tokenize(self.save_dir / 'train.txt') + + X_valid = self.corpus.tokenize(self.save_dir / 'valid.txt') + + X_test = self.corpus.tokenize(self.save_dir / 'test.txt') + # + # X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) + # X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) + # X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) + print(len(self.corpus.dictionary)) return X_train, X_valid, X_test From a34d681231012f3429dd5b211c0cbbb8c4803a1b Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 10:13:28 +0200 Subject: [PATCH 06/29] -make deterministic -report train and eval time separately in objective func -code formatting -added test file --- hpobench/benchmarks/mo/lm_benchmark.py | 97 ++++++++++++++------------ hpobench/dependencies/lm/model.py | 19 +++-- tests/test_wikitext.py | 35 ++++++++++ 3 files changed, 96 insertions(+), 55 deletions(-) create mode 100644 tests/test_wikitext.py diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index f22db78f..22ace808 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -19,6 +19,7 @@ import time import math import tqdm +import random __version__ = '0.0.1' @@ -34,7 +35,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs data_manager = LanguageModelDataManager(self.device) self.X_train, self.X_valid, self.X_test = data_manager.load() self.ntokens = len(data_manager.corpus.dictionary) - + self.__seed_everything() self.variable = {"eval_batch_size": 10, "nlayers": 2, "bptt": 35, @@ -45,6 +46,16 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs } print("len of corpus dict", self.ntokens) + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + print("seed obtained", seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters @@ -109,8 +120,6 @@ def init_model(self, config: Union[CS.Configuration, Dict]): if isinstance(config, CS.Configuration): config = config.get_dictionary() - - # all sublayers and embedding layers have same dim model = TransformerModel( self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -162,7 +171,10 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity : Dict used fidelities in this evaluation """ - self.rng = rng_helper.get_rng() + + self.rng = rng_helper.get_rng(rng) + self.__seed_everything() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') ts_start = time.time() @@ -182,17 +194,19 @@ def objective_function(self, 
configuration: Union[CS.Configuration, Dict], learning_rate_factor = configuration['lr_factor'] clip = configuration['clip'] best_val_loss = None - train_eval_time = 0 + train_time = 0 + eval_time = 0 t = tqdm.tqdm(total=epochs) for epoch in range(epochs): epoch_start_time = time.time() - train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, clip) - val_loss, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) + train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate, clip) + train_time += time.time() - epoch_start_time + start = time.time() + val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) - - ts_now = time.time() - train_eval_time += ts_now - epoch_start_time + print("val acc for last epoch", val_acc) + eval_time += start - time.time() t.set_postfix(val_accuracy=val_acc) t.update() @@ -208,11 +222,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) - eval_valid_runtime = time.time() - start_time - - start_time = time.time() - _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) + _, test_acc = model.eval_fun(self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_val_loss) @@ -221,18 +231,18 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = float(ts_start - time.time()) return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': val_acc, - 'time': train_eval_time + 'accuracy': val_acc.item(), + 'time': train_time + eval_time }, 'cost': elapsed_time, - 'info': {'train_accuracy': train_acc, - 'validation_accuracy': val_acc, - 'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc.item(), + 'validation_accuracy': val_acc.item(), + 'test_accuracy': test_acc.item(), 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, - 'training_cost': train_eval_time, - 'valid_cost': eval_valid_runtime, + 'training_cost': train_time, + 'valid_cost': eval_time, 'test_cost': eval_test_runtime, 'fidelity': fidelity } @@ -285,15 +295,17 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # The result dict should contain already all necessary information -> Just swap the function value from valid # to test and the corresponding time cost assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. 
' - ts_start = time.time() + self.rng = rng_helper.get_rng(rng) + self.__seed_everything() + # batchify data batch_size = configuration['batch_size'] - train_data = batchify(self.X_train, batch_size=batch_size).to(self.device) - val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(self.device) - - train_data = np.vstack(train_data, val_data) + train_data = batchify(self.X_train, batch_size=batch_size) + val_data = batchify(self.X_valid, batch_size=batch_size) + train_data = np.vstack((train_data, val_data)) + train_data = torch.tensor(train_data).to(self.device) test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device) epochs = fidelity['budget'] @@ -306,18 +318,19 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], learning_rate_factor = configuration['lr_factor'] clip = configuration['clip'] best_test_loss = None - train_eval_time = 0 + train_time = 0 + eval_time = 0 t = tqdm.tqdm(total=epochs) for epoch in range(1, epochs + 1): epoch_start_time = time.time() - train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, - batch_size, clip) + train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate, + clip) + train_time += time.time() - epoch_start_time + start = time.time() - test_loss, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) + test_loss, test_acc = model.eval_fun(self.ntokens, criterion, test_data) test_loss = np.clip(test_loss, 1e-10, 10) - - ts_now = time.time() - train_eval_time += ts_now - epoch_start_time + eval_time += time.time() - start t.set_postfix(test_accuracy=test_acc) t.update() @@ -331,27 +344,23 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # Anneal the learning rate if no improvement has been seen in the validation dataset. 
learning_rate /= learning_rate_factor - start_time = time.time() - _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) - eval_test_runtime = time.time() - start_time - perplexity = math.exp(best_test_loss) log_perplexity = best_test_loss neg_log_perplexity = 10 - best_test_loss elapsed_time = float(ts_start - time.time()) return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': test_acc, - 'time': train_eval_time + 'accuracy': test_acc.item(), + 'time': train_time + eval_time }, 'cost': elapsed_time, - 'info': {'train_accuracy': train_acc, - 'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc.item(), + 'test_accuracy': test_acc.item(), 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, - 'training_cost': train_eval_time, - 'test_cost': eval_test_runtime, + 'training_cost': train_time, + 'test_cost': eval_time, 'fidelity': fidelity } } diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py index 8361c61f..4d9e8e97 100644 --- a/hpobench/dependencies/lm/model.py +++ b/hpobench/dependencies/lm/model.py @@ -27,9 +27,6 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) - print("shape pe",pe[:,0::2].shape) - - print("shape after pe",pe[:,1::2].shape) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -51,7 +48,7 @@ def forward(self, x): class TransformerModel(nn.Module): """Container module with an encoder, a transformer module, and a decoder.""" - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35, rng=None): super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer @@ -99,7 +96,7 @@ def forward(self, src, has_mask=True): output = self.decoder(output) return F.log_softmax(output, dim=-1) - def train_fun(self, model, ntokens, criterion, train_data, lr, clip): + def train_fun(self, ntokens, criterion, train_data, lr, clip): # Turn on training mode which enables dropout. self.train() total_loss = 0. @@ -108,8 +105,8 @@ def train_fun(self, model, ntokens, criterion, train_data, lr, clip): data, targets = self.get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. - model.zero_grad() - output = model(data) + self.zero_grad() + output = self(data) output_flat = output.view(-1, ntokens) loss = criterion(output_flat, targets) loss.backward() @@ -122,14 +119,14 @@ def train_fun(self, model, ntokens, criterion, train_data, lr, clip): total_acc += len(data) * accuracy # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 
- torch.nn.utils.clip_grad_norm_(model.parameters(), clip) - for p in model.parameters(): + torch.nn.utils.clip_grad_norm_(self.parameters(), clip) + for p in self.parameters(): p.data.add_(-lr, p.grad.data) avg_acc = total_acc / (len(train_data) - 1) return total_loss / (len(train_data) - 1), avg_acc - def eval_fun(self, model, ntokens, criterion, data_source): + def eval_fun(self, ntokens, criterion, data_source): # Turn on evaluation mode which disables dropout. self.eval() total_loss = 0. @@ -137,7 +134,7 @@ def eval_fun(self, model, ntokens, criterion, data_source): with torch.no_grad(): for i in range(0, data_source.size(0) - 1, self.bptt): data, targets = self.get_batch(data_source, i) - output = model(data) + output = self(data) output_flat = output.view(-1, ntokens) total_loss += len(data) * criterion(output_flat, targets).item() diff --git a/tests/test_wikitext.py b/tests/test_wikitext.py new file mode 100644 index 00000000..2f0d0866 --- /dev/null +++ b/tests/test_wikitext.py @@ -0,0 +1,35 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_wikitext_benchmark(): + from hpobench.benchmarks.mo.lm_benchmark import LanguageModelBenchmark + + # Check Seeding + benchmark = LanguageModelBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=1) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=1) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + print("cfg1", cfg_1) + print("cfg2", cfg_2) + + + test_config = { + 'batch_size': 144, 'clip': 1.458859796107597, 'dropout': 0.5967357423109274, + 'emsize': 575, 'lr': 5.245378070737081, 'lr_factor': 15 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + print("r1", result_1) + print("r2", result_2) + + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From 9cb6a3383403068a3c1d575e0e20f519678b9e24 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 11:18:27 +0200 Subject: [PATCH 07/29] -added lock for download data - added recipe and container file --- .../container/benchmarks/mo/lm_benchmark.py | 12 ++++++++ .../mo/Singularity.LanguageModelBenchmark | 30 +++++++++++++++++++ hpobench/util/data_manager.py | 18 ++--------- 3 files changed, 44 insertions(+), 16 deletions(-) create mode 100644 hpobench/container/benchmarks/mo/lm_benchmark.py create mode 100644 hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..f261b00d --- /dev/null +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Language Model Benchmark from hpobench/benchmarks/mo/lm_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class LanguageModelBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LanguageModelBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark 
b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark new file mode 100644 index 00000000..8f364323 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark @@ -0,0 +1,30 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && mkdir data && cd data \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt \ + && cd /home \ + && git clone https://github.com/ayushi-3536/HPOBench.git \ + && cd HPOBench \ + && git checkout fair_adult \ + && pip install .[adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index ab3ac760..72e817ce 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -894,8 +894,9 @@ def load(self): print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/language_model', delay=0.5) def _download(self): - for data in self.urls: if (self.save_dir / f'{data}.txt').exists(): self.logger.debug(f'LanguageModelDataManager : tokenized {data}.txt already exist') @@ -912,24 +913,9 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_test: np.ndarray """ - import torch - for data in self.urls: - # if (self.tokenize_path / f'{data}.pt').exists(): - # self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') - # else: - tokenized_data = self.corpus.tokenize(self.save_dir / f'{data}.txt') - torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - X_train = self.corpus.tokenize(self.save_dir / 'train.txt') - X_valid = self.corpus.tokenize(self.save_dir / 'valid.txt') - X_test = self.corpus.tokenize(self.save_dir / 'test.txt') - # - # X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) - # X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) - # X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) - print(len(self.corpus.dictionary)) return X_train, X_valid, X_test From 68cd9178f0c3348a6494e4295796ae48ff71b2b5 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 18:03:07 +0200 Subject: [PATCH 08/29] -make emsize sampling log based: To be discussed with team - positional encoding doesn't work for odd number, therefor log seems like perfect solution -removed logs --- hpobench/benchmarks/mo/lm_benchmark.py | 2 +- tests/test_wikitext.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 22ace808..cee33c34 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -67,7 +67,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp 'batch_size', 
default_value=128, lower=8, upper=256 ), CS.UniformIntegerHyperparameter( - 'emsize', default_value=128, lower=32, upper=1024 + 'emsize', default_value=128, lower=32, upper=1024, log=True ), CS.UniformIntegerHyperparameter( 'lr_factor', default_value=50, lower=1, upper=100, log=True diff --git a/tests/test_wikitext.py b/tests/test_wikitext.py index 2f0d0866..727a8ea4 100644 --- a/tests/test_wikitext.py +++ b/tests/test_wikitext.py @@ -17,10 +17,6 @@ def test_wikitext_benchmark(): assert cfg_1 == cfg_2 - print("cfg1", cfg_1) - print("cfg2", cfg_2) - - test_config = { 'batch_size': 144, 'clip': 1.458859796107597, 'dropout': 0.5967357423109274, 'emsize': 575, 'lr': 5.245378070737081, 'lr_factor': 15 @@ -28,8 +24,5 @@ def test_wikitext_benchmark(): result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) - print("r1", result_1) - print("r2", result_2) - assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From 812bdd03b4972cfd323634b3465f0ee1ce85c056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Mon, 23 May 2022 18:22:30 +0200 Subject: [PATCH 09/29] Update Github Actions (#151) * Update Github Actions Workflow and drop support for singularity < 3.7 --- .../workflows/run_singularity_versions.yml | 28 ++++++++++------ .github/workflows/run_tests.yml | 15 +++++++++ README.md | 11 +++---- ci_scripts/install.sh | 32 ++----------------- ci_scripts/install_singularity.sh | 32 +++++++++++++------ requirements.txt | 1 - 6 files changed, 64 insertions(+), 55 deletions(-) diff --git a/.github/workflows/run_singularity_versions.yml b/.github/workflows/run_singularity_versions.yml index fe576a30..c7862636 100644 --- a/.github/workflows/run_singularity_versions.yml +++ b/.github/workflows/run_singularity_versions.yml @@ -1,6 +1,16 @@ name: Test Support for different Singularity Versions -on: [push] +on: + pull_request: + types: [ready_for_review] + + pull_request_review: + types: [submitted] + + push: + branches: + - 'main' + - 'development' jobs: Tests: @@ -10,25 +20,25 @@ jobs: matrix: include: - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.5" + DISPLAY_NAME: "Singularity Container Examples with S3.7" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.5" + SINGULARITY_VERSION: "3.7" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.6" + DISPLAY_NAME: "Singularity Container Examples with S3.8" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.6" + SINGULARITY_VERSION: "3.8" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.7" + DISPLAY_NAME: "Singularity Container Examples with S3.9" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.7" + SINGULARITY_VERSION: "3.9" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.8" + DISPLAY_NAME: "Singularity Container Examples with S3.10" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.8" + SINGULARITY_VERSION: "3.10" fail-fast: false diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 3c22a210..4fecec7d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -15,26 +15,36 @@ jobs: DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true 
USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" RUN_CODECOV: true + - python-version: 3.7 DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true + USE_SINGULARITY: false + - python-version: 3.7 DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.7 DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false + - python-version: 3.8 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.9 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" fail-fast: false name: Tests ${{ matrix.python-version }} ${{ matrix.DISPLAY_NAME }} @@ -42,6 +52,7 @@ jobs: env: RUN_TESTS: ${{ matrix.RUN_TESTS }} USE_SINGULARITY: ${{ matrix.USE_SINGULARITY }} + SINGULARITY_VERSION: ${{ matrix.SINGULARITY_VERSION }} RUN_CODECOV: ${{ matrix.RUN_CODECOV }} RUN_CODESTYLE: ${{ matrix.RUN_CODESTYLE }} RUN_CONTAINER_EXAMPLES: ${{ matrix.RUN_CONTAINER_EXAMPLES }} @@ -58,6 +69,10 @@ jobs: uses: actions/setup-go@v2 with: go-version: '1.14.15' # The Go version to download (if necessary) and use. + - name: Set up Singularity + if: matrix.USE_SINGULARITY == true + run: | + chmod +x ci_scripts/install_singularity.sh && source ./ci_scripts/install_singularity.sh - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/README.md b/README.md index b74b1a00..ec0a442e 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ cd HPOBench pip install . ``` -**Note:** This does not install *singularity (version 3.6)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.6/user-guide/quick_start.html#quick-installation-steps). +**Note:** This does not install *singularity (version 3.8)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.8/user-guide/quick_start.html#quick-installation-steps). If you run into problems, using the most recent singularity version might help: [here](https://singularity.hpcng.org/admin-docs/master/installation.html) ## Containerized Benchmarks -We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.6)](https://sylabs.io/guides/3.6/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) +We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.8)](https://sylabs.io/guides/3.8/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) -The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *scipy* and *numpy* +The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *numpy*, *oslo* and *Pyro4* ### Run a Benchmark Locally @@ -139,10 +139,9 @@ If you use a benchmark in your experiments, please specify the version number of the used container to ensure reproducibility. 
When starting an experiment, HPOBench writes automatically these two version numbers to the log. ### Troubleshooting and Further Notes - - **Singularity throws an 'Invalid Image format' exception** - Use a singularity version > 3. For users of the Meta-Cluster in Freiburg, you have to set the following path: - ```export PATH=/usr/local/kislurm/singularity-3.5/bin/:$PATH``` + Use a singularity version >= 3.8. If you have multiple singularity installations, you have to add the correct singularity version to your $PATH, e.g. + ```export PATH=/usr/local/kislurm/singularity-3.8/bin/:$PATH``` - **A Benchmark fails with `SystemError: Could not start an instance of the benchmark. Retried 5 times` but the container can be started locally with `singularity instance start test`** diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index b68a1b88..2d229f74 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -40,35 +40,9 @@ else echo "Skip installing packages for local examples" fi -if [[ "$USE_SINGULARITY" == "true" ]]; then - echo "Install Singularity" - - sudo apt-get update && sudo apt-get install -y \ - build-essential \ - libssl-dev \ - uuid-dev \ - libgpgme11-dev \ - squashfs-tools \ - libseccomp-dev \ - wget \ - pkg-config \ - git \ - cryptsetup - - export VERSION=3.5.3 && # adjust this as necessary \ - wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ - tar -xzf v${VERSION}.tar.gz && \ - cd singularity-${VERSION} - - ./mconfig && \ - make -C builddir && \ - sudo make -C builddir install - - cd .. - install_packages="${install_packages}placeholder," -else - echo "Skip installing Singularity" -fi +# We add a placeholder / No-OP operator. When running the container examples, we don't install any +# additional packages. That causes an error, since `pip install .[]` does not work. 
+install_packages="${install_packages}NOP," # remove the trailing comma install_packages="$(echo ${install_packages} | sed 's/,*\r*$//')" diff --git a/ci_scripts/install_singularity.sh b/ci_scripts/install_singularity.sh index 292df85b..9a89e4a3 100644 --- a/ci_scripts/install_singularity.sh +++ b/ci_scripts/install_singularity.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -echo "Install Singularity" +echo "Inside Singularity Installation Script" sudo apt-get update && sudo apt-get install -y \ build-essential \ @@ -14,21 +14,33 @@ sudo apt-get update && sudo apt-get install -y \ git \ cryptsetup -if [[ "$SINGULARITY_VERSION" == "3.5" ]]; then - export VERSION=3.5.3 -elif [[ "$SINGULARITY_VERSION" == "3.6" ]]; then - export VERSION=3.6.4 -elif [[ "$SINGULARITY_VERSION" == "3.7" ]]; then +if [[ "$SINGULARITY_VERSION" == "3.7" ]]; then export VERSION=3.7.3 + export FILENAME=singularity-"${VERSION}" + export EXTRACTED_FILENAME=singularity + elif [[ "$SINGULARITY_VERSION" == "3.8" ]]; then - export VERSION=3.8.0 + export VERSION=3.8.4 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.9" ]]; then + export VERSION=3.9.3 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.10" ]]; then + export VERSION=3.10.0 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + else echo "Skip installing Singularity" fi -wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ -tar -xzf v${VERSION}.tar.gz && \ -cd singularity-${VERSION} && \ +wget https://github.com/sylabs/singularity/releases/download/v"${VERSION}"/"${FILENAME}".tar.gz && \ +tar -xzf "${FILENAME}".tar.gz && \ +cd "${EXTRACTED_FILENAME}" && \ ./mconfig && \ make -C builddir && \ sudo make -C builddir install diff --git a/requirements.txt b/requirements.txt index 73ae9818..aad54f85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -scipy>=1.4.1 numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 From 5f67bb24911a95d527cf1db6a328a982735f80b2 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 24 May 2022 14:57:33 +0200 Subject: [PATCH 10/29] -minor cleanup --- hpobench/benchmarks/mo/lm_benchmark.py | 29 +++++++++++-------- .../mo/Singularity.LanguageModelBenchmark | 6 ++-- hpobench/util/data_manager.py | 5 ---- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index cee33c34..a1ae7fbf 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -27,8 +27,17 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): - def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Tranformer based multi-objective language model benchmark + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmarks + + Transformer Model is based on : "https://arxiv.org/pdf/1706.03762.pdf" + """ super(LanguageModelBenchmark, self).__init__(rng=rng) self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -40,16 +49,14 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "nlayers": 2, "bptt": 35, "tied": True, - # number of attention head + # Number of attention head "nhead": 2, "ntoken": self.ntokens } - print("len of corpus dict", 
self.ntokens) def __seed_everything(self): """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" seed = self.rng.randint(0, 100000) - print("seed obtained", seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -86,7 +93,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp return cs @staticmethod - def get_objective_names(self) -> List[str]: + def get_objective_names() -> List[str]: return ['log_perplexity', 'accuracy', 'time'] @staticmethod @@ -205,12 +212,12 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], start = time.time() val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) - print("val acc for last epoch", val_acc) eval_time += start - time.time() t.set_postfix(val_accuracy=val_acc) t.update() + # Taken from original experimental setup if not np.isfinite(val_loss): val_loss = 7 @@ -228,7 +235,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], perplexity = math.exp(best_val_loss) log_perplexity = best_val_loss neg_log_perplexity = 10 - best_val_loss - elapsed_time = float(ts_start - time.time()) + elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': val_acc.item(), @@ -292,9 +299,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], used fidelities in this evaluation """ - # The result dict should contain already all necessary information -> Just swap the function value from valid - # to test and the corresponding time cost - assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. ' + assert fidelity['epoch'] == 81, 'Only test data for the 81 epoch is available. 
' ts_start = time.time() self.rng = rng_helper.get_rng(rng) @@ -347,7 +352,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], perplexity = math.exp(best_test_loss) log_perplexity = best_test_loss neg_log_perplexity = 10 - best_test_loss - elapsed_time = float(ts_start - time.time()) + elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': test_acc.item(), @@ -365,4 +370,4 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], } } - __all__ = ["LanguageModelBenchmark"] + __all__ = ["LanguageModelBenchmark"] \ No newline at end of file diff --git a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark index 8f364323..770da7f9 100644 --- a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark +++ b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark @@ -17,8 +17,8 @@ VERSION v0.0.1 && cd /home \ && git clone https://github.com/ayushi-3536/HPOBench.git \ && cd HPOBench \ - && git checkout fair_adult \ - && pip install .[adult] \ + && git checkout wikitext \ + && pip install .[lm_benchmark] \ && cd / \ && mkdir /var/lib/hpobench/ \ && chmod -R 777 /var/lib/hpobench/ \ @@ -27,4 +27,4 @@ VERSION v0.0.1 %runscript - python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.lm_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 72e817ce..c8c7fbf6 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -887,11 +887,6 @@ def load(self): self._download() self.X_train, self.X_valid, self.X_test = self._load() self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}') - print(self.X_train.shape) - - print(self.X_valid.shape) - - print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test @lockutils.synchronized('not_thread_process_safe', external=True, From 49663ceb4eec6d62817588d57a67f0fa5e94ed3b Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 24 May 2022 15:34:51 +0200 Subject: [PATCH 11/29] -minor cleanup --- hpobench/benchmarks/mo/lm_benchmark.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index a1ae7fbf..93c8b145 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -98,6 +98,21 @@ def get_objective_names() -> List[str]: @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities: + - epoch: int + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters([ @@ -124,9 +139,6 @@ def get_meta_information() -> Dict: def init_model(self, config: Union[CS.Configuration, Dict]): """ Function that returns the model initialized based on the configuration and fidelity """ - - if isinstance(config, CS.Configuration): - config = config.get_dictionary() model = TransformerModel( self.variable['ntoken'], 
config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -137,7 +149,6 @@ def init_model(self, config: Union[CS.Configuration, Dict]): def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, - shuffle: bool = False, **kwargs) -> Dict: """ @@ -179,7 +190,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], used fidelities in this evaluation """ - self.rng = rng_helper.get_rng(rng) + self.rng = rng_helper.get_rng(self.rng, rng) self.__seed_everything() device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -259,7 +270,6 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, - shuffle: bool = False, **kwargs) -> Dict: """ Get the validated results. Runs a given configuration on the largest budget (here: 50). @@ -302,7 +312,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], assert fidelity['epoch'] == 81, 'Only test data for the 81 epoch is available. ' ts_start = time.time() - self.rng = rng_helper.get_rng(rng) + self.rng = rng_helper.get_rng(self.rng, rng) self.__seed_everything() # batchify data @@ -370,4 +380,4 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], } } - __all__ = ["LanguageModelBenchmark"] \ No newline at end of file + __all__ = ["LanguageModelBenchmark"] From 5d3d75ec1da14bb31dc0caffc0f47df1efab3e02 Mon Sep 17 00:00:00 2001 From: Florian Date: Mon, 30 May 2022 13:20:13 +0100 Subject: [PATCH 12/29] Add YAHPO Benchmark (#142) * Add yahpo_gym w help from phmueller Co-authored-by: PhMueller --- extra_requirements/yahpo_gym.json | 3 + hpobench/benchmarks/surrogates/yahpo_gym.py | 193 ++++++++++++++++++ .../benchmarks/surrogates/yahpo_gym.py | 20 ++ .../surrogates/Singularity.YAHPOGymBenchmark | 39 ++++ tests/test_yahpo.py | 77 +++++++ 5 files changed, 332 insertions(+) create mode 100644 extra_requirements/yahpo_gym.json create mode 100644 hpobench/benchmarks/surrogates/yahpo_gym.py create mode 100644 hpobench/container/benchmarks/surrogates/yahpo_gym.py create mode 100644 hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark create mode 100644 tests/test_yahpo.py diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json new file mode 100644 index 00000000..77bea14d --- /dev/null +++ b/extra_requirements/yahpo_gym.json @@ -0,0 +1,3 @@ +{ + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"] +} diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..19522700 --- /dev/null +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,193 @@ +""" +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: +============== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +1. 
Clone from github: +===================== +``` +git clone HPOBench +``` + +2. Clone and install +==================== +``` +cd /path/to/HPOBench +pip install .[yahpo_gym] + +``` + +Changelog: +========== +0.0.1: +* First implementation +""" +import os +import logging +from typing import Union, Dict, List + +import ConfigSpace as CS +import numpy as np + +from yahpo_gym.benchmark_set import BenchmarkSet +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPOGym') + + +class YAHPOGymMOBenchmark(AbstractMultiObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + rng : np.random.RandomState, int, None + """ + + # When in the containerized version, redirect to the data inside the container. + if 'YAHPO_CONTAINER' in os.environ: + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + local_config.init_config(data_path='/home/data/yahpo_data') + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMOBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # No batch predicts, so we can grab the first item + out = self.benchset.objective_function({**configuration, **fidelity})[0] + # Convert to float for serialization + out = {k: float(v) for k, v in out.items()} + + # Get runtime name + cost = out[self.benchset.config.runtime_name] + + return {'function_value': out, + "cost": cost, + 'info': {'fidelity': fidelity}} + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity ' + ' Benchmark for Hyperparameter 
Optimization},', + 'author = {Florian Pfisterer and Lennart Schneider and' + ' Julia Moosbauer and Martin Binder' + ' and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year = {2021}}'], + 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} + + +class YAHPOGymBenchmark(AbstractBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. + rng : np.random.RandomState, int, None + """ + + self.backbone = YAHPOGymMOBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + + super(YAHPOGymBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMOBenchmark.get_meta_information() diff --git a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..9774975d --- /dev/null +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient + + +class YAHPOGymBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymBenchmark, self).__init__(**kwargs) + + +class 
YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark new file mode 100644 index 00000000..66ee63b1 --- /dev/null +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -0,0 +1,39 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is a template for a Singularity recipe + +%environment + YAHPO_CONTAINER=1 + export YAHPO_CONTAINER + +%post + apt update -y + apt install build-essential git wget -y + + /usr/local/bin/python -m pip install --upgrade pip + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git\ + + cd /home \ + && git clone https://github.com/pfistfl/HPOBench.git \ + && cd HPOBench \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && git checkout master \ + && pip install .[yahpo_gym] \ + && echo "Please don't touch the following lines" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py surrogates.yahpo_gym $@ diff --git a/tests/test_yahpo.py b/tests/test_yahpo.py new file mode 100644 index 00000000..97a7d06d --- /dev/null +++ b/tests/test_yahpo.py @@ -0,0 +1,77 @@ +import sys +from typing import Dict, List + +import pytest + +from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymBenchmark, YAHPOGymMOBenchmark + + +def test_yahpo_init(): + b = YAHPOGymBenchmark(scenario="lcbench", instance="167152", objective="val_accuracy") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. 
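# Illustrative aside (not part of this test): the hard-coded expectations below rely on
# ConfigSpace drawing identical samples from spaces constructed with the same seed. A minimal,
# self-contained sketch of that property with a toy hyperparameter:
import ConfigSpace as CS

def _sample_toy_space(seed: int):
    toy_space = CS.ConfigurationSpace(seed=seed)
    toy_space.add_hyperparameter(CS.UniformFloatHyperparameter('x', lower=0.0, upper=1.0))
    return toy_space.sample_configuration()

assert _sample_toy_space(seed=1) == _sample_toy_space(seed=1)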
+ if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert result['function_value'] == pytest.approx(61.297, abs=0.1) + assert result['cost'] == pytest.approx(119.4965, abs=0.1) + assert isinstance(result['info'], Dict) + + +def test_yahpo_mo(): + b = YAHPOGymMOBenchmark(scenario="lcbench", instance="167152") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. + if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert isinstance(result['function_value'], Dict) + assert result['function_value']['val_accuracy'] == pytest.approx(61.2971, abs=0.0001) + assert result['cost'] == pytest.approx(119.4965, abs=0.0001) + + names = b.get_objective_names() + assert isinstance(names, List) + assert len(names) == 6 + assert names[2] == 'val_cross_entropy' From ac9547a8dc53f006c79147ef56b363d0ffb9cde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Mon, 30 May 2022 16:40:27 +0200 Subject: [PATCH 13/29] ADD Multi-Objective Nasbench201 (v0.0.6) (#152) Update the Nasbench201 benchmark to support Multi-Objective queries. If you want to use the *single objective* Nasbench201 benchmark, you can query the SO version of this benchmark. Although, we have not changed the benchmark logic, you can also use the container v0.0.5 in your experiments to reproduce results from the old version of this benchmark. 
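For illustration, a minimal usage sketch of the new multi-objective interface next to the
existing single-objective wrapper (local, non-containerized classes). This assumes the
NASBench-201 data files are available via the data manager; the printed values depend on the
sampled architecture.

```python
from hpobench.benchmarks.nas.nasbench_201 import (
    Cifar10ValidNasBench201MOBenchmark,  # new multi-objective interface
    Cifar10ValidNasBench201Benchmark,    # single-objective wrapper (unchanged behaviour)
)

mo_benchmark = Cifar10ValidNasBench201MOBenchmark(rng=1)
config = mo_benchmark.get_configuration_space(seed=1).sample_configuration()

# MO version: 'function_value' is a dict with one entry per objective.
mo_result = mo_benchmark.objective_function(configuration=config)
print(mo_benchmark.get_objective_names())  # misclassification_rate, num_flops, model_size, latency
print(mo_result['function_value'])

# SO version: 'function_value' is only the misclassification rate on the validation split.
so_benchmark = Cifar10ValidNasBench201Benchmark(rng=1)
so_result = so_benchmark.objective_function(configuration=config)
print(so_result['function_value'])
```

The single-objective wrapper simply forwards the query to the multi-objective benchmark and
extracts 'misclassification_rate', so both variants report consistent results.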
--- hpobench/benchmarks/nas/nasbench_201.py | 398 ++++++++++++++++-- .../container/benchmarks/nas/nasbench_201.py | 29 +- tests/test_nasbench_201.py | 92 ++-- 3 files changed, 446 insertions(+), 73 deletions(-) diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py index 17bac321..0c2324c2 100644 --- a/hpobench/benchmarks/nas/nasbench_201.py +++ b/hpobench/benchmarks/nas/nasbench_201.py @@ -27,6 +27,10 @@ Changelog: ========== +0.0.6 +* Add the multiobjective version of this benchmark by returning flops, model size, latency and missclassification rate +* Integrate #138: Improve the docstrings about the seeds. + 0.0.5 * Add for each benchmark a new one with a different fidelity space. The new fidelity space corresponds to the fidelity space in the DEHB paper. @@ -54,16 +58,18 @@ import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + from hpobench.util.data_manager import NASBench_201Data -__version__ = '0.0.5' + +__version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseBenchmark(AbstractBenchmark): +class NasBench201BaseMOBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -129,6 +135,8 @@ def __init__(self, dataset: str, - In the original data, the training splits are always marked with the key 'train' but they use different identifiers to refer to the available evaluation splits. We report them also in the table below. - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. Some further remarks: - cifar10-valid is trained on the train split and tested on the validation split. @@ -145,13 +153,13 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. 
""" # noqa: E501 - super(NasBench201BaseBenchmark, self).__init__(rng=rng) + super(NasBench201BaseMOBenchmark, self).__init__(rng=rng) data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -160,7 +168,7 @@ def dataset_mapping(self, dataset): return mapping[dataset] # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -205,7 +213,15 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : Dict + misclassification_rate : float + 1 - validation accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to train the network info : Dict train_precision : float @@ -264,22 +280,38 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{test_key}@{199}']) for e in range(1, epoch + 1)) for seed in data_seed] - return {'function_value': float(100 - np.mean(valid_accuracies)), - 'cost': float(np.sum(valid_times) + np.sum(train_times)), - 'info': {'train_precision': float(100 - np.mean(train_accuracies)), - 'train_losses': float(np.mean(train_losses)), - 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), - 'valid_losses': float(np.mean(valid_losses)), - 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), - 'test_losses': float(np.mean(test_losses)), - 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), - 'fidelity': fidelity - } - } - - @AbstractBenchmark.check_parameters + # Number of floating point operations in million + num_flops = [self.data[seed][structure_str]['flop'] for seed in data_seed] + + # Number of trainable model parameters in MB + model_size = [self.data[seed][structure_str]['params'] for seed in data_seed] + + # Time to evaluate in seconds + latency = [self.data[seed][structure_str]['latency'] for seed in data_seed] + + return { + 'function_value': { + 'misclassification_rate': float(100 - np.mean(valid_accuracies)), + 'num_flops': float(np.mean(num_flops)), + 'model_size': float(np.mean(model_size)), + 'latency': float(np.mean(latency)), + }, + 'cost': float(np.sum(valid_times) + np.sum(train_times)), + 'info': { + 'train_precision': float(100 - np.mean(train_accuracies)), + 'train_losses': float(np.mean(train_losses)), + 'train_cost': float(np.sum(train_times)), + 'valid_precision': float(100 - np.mean(valid_accuracies)), + 'valid_losses': float(np.mean(valid_losses)), + 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), + 'test_precision': float(100 - np.mean(test_accuracies)), + 'test_losses': float(np.mean(test_losses)), + 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), + 'fidelity': fidelity + } + } + 
+ @AbstractMultiObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -294,10 +326,9 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 200] + epoch: int - Values: [200] Number of epochs an architecture was trained. - Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) - + Note: We only have test performance on the last epoch. Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. @@ -311,7 +342,15 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to the network + time to validate info : Dict train_precision @@ -327,10 +366,19 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # to test and the corresponding time cost assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' + if 'data_seed' in kwargs: + all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) + if not all_seeds_available: + logger.warning('You have not specified all available seeds for the ' + '`objective_function_test`. However, we are going to ignore them, ' + ' because we report test values only as mean across all seeds.' 
+ f' Your given seeds: {kwargs["seed"]}') + del kwargs['data_seed'] + result = self.objective_function(configuration=configuration, fidelity=fidelity, data_seed=(777, 888, 999), rng=rng, **kwargs) - result['function_value'] = result['info']['test_precision'] + result['function_value']['misclassification_rate'] = result['info']['test_precision'] result['cost'] = result['info']['test_cost'] return result @@ -349,7 +397,7 @@ def config_to_structure(config): op_name = config[node_str] x_list.append((op_name, j)) genotypes.append(tuple(x_list)) - return NasBench201BaseBenchmark._Structure(genotypes) + return NasBench201BaseMOBenchmark._Structure(genotypes) return config_to_structure @staticmethod @@ -387,7 +435,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) - search_space = NasBench201BaseBenchmark.get_search_spaces('cell', 'nas-bench-201') + search_space = NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] cs.add_hyperparameters(hps) return cs @@ -420,6 +468,10 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidel_space + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] + @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ @@ -471,25 +523,296 @@ def __getitem__(self, index): return self.nodes[index] -class Cifar10ValidNasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar10ValidNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) + + +class Cifar100NasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) + + +class ImageNetNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) + + +class NasBench201SOBenchmark(AbstractBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains + results for architectures on 4 different data sets. + + We have split the "api" file from NASBench201 in separate files per data set. + The original "api" file contains all data sets, but loading this single file took too much RAM. + + We recommend to not call this base class directly but using the correct subclass below. + + The parameter ``dataset`` indicates which data set was used for training. + + For each data set the metrics + 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. + However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). + + We summarize all information about the data sets in the following tables. 
+ + Datastet Metric Avail.Epochs Explanation returned by HPOBENCH + ---------------------------------------------------------------------------------------- + cifar10-valid train [0-199] training set + cifar10-valid x-valid [0-199] validation set objective function + cifar10-valid x-test + cifar10-valid ori-test 199 test set objective function test + + cifar100 train [0-199] training set + cifar100 x-valid 199 validation set + cifar100 x-test 199 test set objective function test + cifar100 ori-test [0-199] validation + test set objective function + + ImageNet16-120 train [0-199] training set + ImageNet16-120 x-valid 199 validation set + ImageNet16-120 x-test 199 test set objective function test + ImageNet16-120 ori-test [0-199] validation + test set objective function + + + We have also extracted the incumbents per split. We report the incumbent accuracy and loss performance + i) by taking the maximum value across all seeds and configurations + ii) averaged across the three available seeds + + i) The best possible incumbents (NO AVG!) ii) The "average" incumbent + Datastet Metric (Index of Arch, Accuracy) (Index, Loss) (Index of Arch, Accuracy) (Index, Loss) + ---------------------------------------------------------------------------------------------------------------------------------------------------------- + cifar10-valid train (258, 100.0) (2778, 0.001179278278425336) (10154, 100) (2778, 0.0013082386429297428) + cifar10-valid x-valid (6111, 91.71999999023437) (14443, 0.3837750501537323) (6111, 91.60666665039064) (3888, 0.3894046771335602) + cifar10-valid x-test + cifar10-valid ori-test (14174, 91.65) (3385, 0.3850496160507202) (1459, 91.52333333333333) (3385, 0.3995230517864227) + + cifar100 train (9930, 99.948) (9930, 0.012630240231156348) (9930, 99.93733333333334) (9930, 0.012843489621082942) + cifar100 x-valid (13714, 73.71999998779297) (13934, 1.1490126512527465) (9930, 73.4933333577474) (7361, 1.1600867895126343) + cifar100 x-test (1459, 74.28000004882813) (15383, 1.1427113876342774) (9930, 73.51333332112631) (7337, 1.1747569534301758) + cifar100 ori-test (9930, 73.88) (13706, 1.1610547459602356) (9930, 73.50333333333333) (7361, 1.1696554500579834) + + ImageNet16-120 train (9930, 73.2524719841793) (9930, 0.9490517352046979) (9930, 73.22918040138735) (9930, 0.9524298415108582) + ImageNet16-120 x-valid (13778, 47.39999985758463) (10721, 2.0826991437276203) (10676, 46.73333327229818) (10721, 2.0915397168795264) + ImageNet16-120 x-test (857, 48.03333317057292) (12887, 2.0940088628133138) (857, 47.31111100599501) (11882, 2.106453532218933) + ImageNet16-120 ori-test (857, 47.083333353678384) (11882, 2.0950548852284747) (857, 46.8444444647895) (11882, 2.1028235816955565) + + + Note: + - The parameter epoch is 0 indexed! + - In the original data, the training splits are always marked with the key 'train' but they use different + identifiers to refer to the available evaluation splits. We report them also in the table below. + - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. + + Some further remarks: + - cifar10-valid is trained on the train split and tested on the validation split. + - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. + The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. 
+ Also, each data set reports values for all 200 epochs for a metric on the specified split + and a single value on the 200th epoch for the other splits. + + Parameters + ---------- + dataset : str + One of cifar10-valid, cifar10, cifar100, ImageNet16-120. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ # noqa: E501 + + super(NasBench201SOBenchmark, self).__init__(rng=rng, **kwargs) + self.mo_benchmark = NasBench201BaseMOBenchmark(rng=rng, dataset=dataset, **kwargs) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + """ + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. + + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. + + kwargs + + Returns + ------- + Dict - + function_value : training precision + cost : time to train the network + info : Dict + train_precision : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_precision : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation + """ + results = self.mo_benchmark.objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). + + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + kwargs + + Returns + ------- + Dict - + function_value : evaluation precision + cost : time to the network + time to validate + info : Dict + train_precision + train_losses + train_cost + eval_precision + eval_losses + eval_cost + fidelity : used fidelities in this evaluation + """ + + results = self.mo_benchmark.objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + + Parameters + ---------- + seed : int, None + Random seed for the configuration space. + + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + return NasBench201BaseMOBenchmark.get_configuration_space(seed=seed) + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. + + Fidelities: + - epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. 
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + return NasBench201BaseMOBenchmark.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return NasBench201BaseMOBenchmark.get_meta_information() + + +class Cifar10ValidNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar100NasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201BaseBenchmark): +class ImageNetNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201BaseBenchmark): +class _NasBench201BaseBenchmarkOriginal(NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -528,7 +851,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201BaseBenchmark.get_meta_information() + meta_information = NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' 
\ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' @@ -558,4 +881,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 5eb9c68f..2a948c6b 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -54,9 +54,36 @@ def __init__(self, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) +class Cifar10ValidNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(**kwargs) + + +class Cifar100NasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar100NasBench201MOBenchmark, self).__init__(**kwargs) + + +class ImageNetNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(ImageNetNasBench201MOBenchmark, self).__init__(**kwargs) + + __all__ = ["Cifar10ValidNasBench201Benchmark", "Cifar100NasBench201Benchmark", "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 22c24b34..70e46de9 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,11 +1,11 @@ import logging logging.basicConfig(level=logging.DEBUG) - import pytest -from hpobench.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ +from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ Cifar10ValidNasBench201Benchmark - +from hpobench.benchmarks.nas.nasbench_201 import \ + Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug skip_message = 'We currently skip this test because it takes too much time.' 
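The containerized MO variants registered above follow the standard HPOBench client pattern. A minimal usage sketch, assuming a working container setup and locally available NAS-Bench-201 data (the fidelity value mirrors the tests below):

    from hpobench.container.benchmarks.nas.nasbench_201 import Cifar10ValidNasBench201MOBenchmark

    benchmark = Cifar10ValidNasBench201MOBenchmark(rng=0)
    config = benchmark.get_configuration_space(seed=0).sample_configuration()
    # Query at the maximum training fidelity used in the tests (epochs are 1-indexed).
    result = benchmark.objective_function(configuration=config, fidelity={'epoch': 199})
    print(result['function_value'])  # dict of objectives, e.g. the misclassification rate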
@@ -23,67 +23,87 @@ def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() - fidelity = {'epoch': 199} - - result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - assert result['function_value'] == pytest.approx(0.411, abs=0.1) - assert result['cost'] == pytest.approx(6650.88, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] - - result = b.objective_function_test(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - with pytest.raises(AssertionError): + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + config = { + '1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3' + } + result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) + assert result['function_value'] == pytest.approx(9.78, abs=0.1) + assert result['cost'] == pytest.approx(11973.20, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] + + result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) + assert result['function_value'] == pytest.approx(9.70, abs=0.1) + assert result['cost'] == pytest.approx(10426.33, abs=0.2) + assert result['info']['test_precision'] == result['function_value'] + assert result['info']['test_cost'] == result['cost'] + + with pytest.raises(ValueError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) + @pytest.mark.skip(reason=skip_message) def test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(7.8259, abs=0.1) - assert result['cost'] == pytest.approx(13301.76, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(29.5233, abs=0.1) + assert result['cost'] == pytest.approx(19681.70, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] @pytest.mark.skip(reason=skip_message) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) - - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(62.858, abs=0.1) - assert result['cost'] == 
pytest.approx(40357.56, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(55.2167, abs=0.1) + assert result['cost'] == pytest.approx(57119.22, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] def test_nasbench201_fidelity_space(): - fs = Cifar10ValidNasBench201Benchmark.get_fidelity_space() + fs = LocalCifar10ValidNasBench201MOBenchmark.get_fidelity_space() assert len(fs.get_hyperparameters()) == 1 def test_nasbench201_config(): - cs = Cifar10ValidNasBench201Benchmark.get_configuration_space(seed=0) + + cs = LocalCifar10ValidNasBench201MOBenchmark.get_configuration_space(seed=0) c = cs.sample_configuration() - func = Cifar10ValidNasBench201Benchmark.config_to_structure_func(4) - struct = func(c) + func = LocalCifar10ValidNasBench201MOBenchmark.config_to_structure_func(4) + struct = func(c) assert struct.__repr__() == '_Structure(4 nodes with |nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|)' assert len(struct) == 4 From 3f08eb2dec42ed44917f21f106748718a4ee70a2 Mon Sep 17 00:00:00 2001 From: ayushi-3536 <77584036+ayushi-3536@users.noreply.github.com> Date: Tue, 31 May 2022 12:42:31 +0200 Subject: [PATCH 14/29] Benchmark: Fair Adult from MO-ASHA (#148) We add the benchmark from the MO-ASHA paper by Schmucker et al. It is a MO benchmark, training an MLP on the Adult data set. --- extra_requirements/multi_objective.json | 3 + hpobench/benchmarks/mo/__init__.py | 0 hpobench/benchmarks/mo/adult_benchmark.py | 445 ++++++++++++++++++ hpobench/container/benchmarks/mo/__init__.py | 0 .../benchmarks/mo/adult_benchmark.py | 12 + .../recipes/mo/Singularity.AdultBenchmark | 25 + hpobench/dependencies/mo/__init__.py | 0 hpobench/dependencies/mo/fairness_metrics.py | 110 +++++ hpobench/dependencies/mo/scalar.py | 36 ++ hpobench/util/data_manager.py | 159 +++++++ tests/test_adult.py | 37 ++ 11 files changed, 827 insertions(+) create mode 100644 extra_requirements/multi_objective.json create mode 100644 hpobench/benchmarks/mo/__init__.py create mode 100644 hpobench/benchmarks/mo/adult_benchmark.py create mode 100644 hpobench/container/benchmarks/mo/__init__.py create mode 100644 hpobench/container/benchmarks/mo/adult_benchmark.py create mode 100644 hpobench/container/recipes/mo/Singularity.AdultBenchmark create mode 100644 hpobench/dependencies/mo/__init__.py create mode 100644 hpobench/dependencies/mo/fairness_metrics.py create mode 100644 hpobench/dependencies/mo/scalar.py create mode 100644 tests/test_adult.py diff --git a/extra_requirements/multi_objective.json b/extra_requirements/multi_objective.json new file mode 100644 index 00000000..146c06a7 --- /dev/null +++ b/extra_requirements/multi_objective.json @@ -0,0 +1,3 @@ +{ + "mo_adult": ["pandas==1.2.4","scikit-learn==0.24.2","tqdm>=3.1.4"] +} \ No newline at end of file diff --git a/hpobench/benchmarks/mo/__init__.py b/hpobench/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..a12e8a70 --- /dev/null +++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,445 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective Fair Adult Benchmark. 
+""" +import logging +import time +from typing import Union, Dict, List, Any, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.conditions import GreaterThanCondition +from sklearn.metrics import accuracy_score +from sklearn.neural_network import MLPClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.dependencies.mo.fairness_metrics import fairness_risk, STATISTICAL_DISPARITY, UNEQUALIZED_ODDS, \ + UNEQUAL_OPPORTUNITY +from hpobench.dependencies.mo.scalar import get_fitted_scaler +from hpobench.util.data_manager import AdultDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('ADULT_FAIR') + + +class AdultBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Multi-objective fairness HPO task. Optimize the HP of a NN on the adult data set. + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + super(AdultBenchmark, self).__init__(rng=rng, **kwargs) + + data_manager = AdultDataManager() + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + self.output_class = np.unique(self.y_train) + self.feature_names = data_manager.feature_names + self.sensitive_feature = data_manager.sensitive_names + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for the MLP. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=4, log=False), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_3', default_value=16, lower=2, upper=32, log=True), + CS.UniformFloatHyperparameter('alpha', lower=10**-5, upper=10**-1, default_value=10**-2, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_1', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_2', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('tol', lower=10**-5, upper=10**-2, default_value=10**-3, log=True), + ]) + + cs.add_conditions([ + # Add the fc_layer_1 (2nd layer) if we allow more than 1 `n_fc_layers`, and so on... + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_3'), cs.get_hyperparameter('n_fc_layers'), 3), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters. 
+ + Fidelities + ---------- + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=200, default_value=200, log=False + ) + ) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': + ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}']} + + @staticmethod + def get_objective_names() -> List[str]: + """Get a list of objectives evaluated in the objective_function. """ + return ['accuracy', 'DSP', 'DEO', 'DFP'] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the validation set. + However, we report also train and test performance. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + fidelity: Dict, None + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict - validation metrics after training on train + accuracy: float + DSO: float + DEO: float + DFP: float + cost : time to train the network + info : Dict + train_accuracy : float + valid_accuracy : float + test_accuracy : float + training_cost : float - time to train the network. 
see `training_cost` + total_cost : float - elapsed time for the entire obj_func call, + eval_train_cost : float - time to compute metrics on training split + eval_valid_cost : float - time to compute metrics on validation split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + valid_DSO : float + valid_DEO : float + valid_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + ts_start = time.time() + + budget = fidelity['budget'] + logger.debug(f"budget for evaluation of config:{budget}") + logger.debug(f"config for evaluation:{configuration}") + + sensitive_rows_train = self.X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_val = self.X_valid[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = self.X_test[:, self.feature_names.index(self.sensitive_feature)] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_valid = scaler(X_valid) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
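        # For comparison, a rough sketch of the epoch-wise scheme referred to above (illustrative only;
        # `X_train`, `budget`, `hidden` and `self.output_class` are the variables from the surrounding code):
        #
        #     mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle,
        #                         random_state=self.rng)
        #     for _ in range(budget):
        #         mlp.partial_fit(X_train, self.y_train, classes=self.output_class)
        #
        # The single `fit` with `max_iter=budget` below caps training at `budget` iterations in one call,
        # which is faster but does not expose intermediate (per-epoch) results.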
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=self.rng, max_iter=budget) + + mlp.fit(X_train, self.y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, self.y_train, sensitive_rows_train, mlp) + + val_accuracy, val_statistical_disparity, val_unequal_opportunity, val_unequalized_odds, eval_valid_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_valid, self.y_valid, sensitive_rows_val, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + logger.debug(f"config: {configuration}, val_acc: {val_accuracy}, test_score: {test_accuracy}, " + f"train score: {train_accuracy}, dsp: {val_statistical_disparity}, " + f"deo :{val_unequal_opportunity}, dfp :{val_unequalized_odds}") + + elapsed_time = time.time() - ts_start + + return {'function_value': {'accuracy': float(val_accuracy), + 'DSO': float(val_statistical_disparity), + 'DEO': float(val_unequal_opportunity), + 'DFP': float(val_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'valid_accuracy': float(val_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_valid_cost': eval_valid_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'valid_DSO': float(val_statistical_disparity), + 'valid_DEO': float(val_unequal_opportunity), + 'valid_DFP': float(val_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: Union[bool, None] = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the test set. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + Use default configuration if None. + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict - test metrics reported after training on (train+valid) + accuracy: float + DSO: float + DEO: float + DFP: float + cost : float - time to train the network. see `training_cost` + info : Dict + train_accuracy : float + test_accuracy : float + training_cost : float + total_cost : float - elapsed time for the entire obj_func_test call, + eval_train_cost : float - time to compute metrics on training split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self._shuffle_data(self.rng, shuffle_valid=True) + + ts_start = time.time() + + budget = fidelity['budget'] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + X_train = np.vstack((X_train, X_valid)) + y_train = np.vstack((self.y_train[:, np.newaxis], self.y_valid[:, np.newaxis])).ravel() + + sensitive_rows_train = X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = X_test[:, self.feature_names.index(self.sensitive_feature)] + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=rng, max_iter=budget) + mlp.fit(X_train, y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, y_train, sensitive_rows_train, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + elapsed_time = time.time() - ts_start + + logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," + f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") + + return {'function_value': {'accuracy': float(test_accuracy), + 'DSO': float(test_statistical_disparity), + 'DEO': float(test_unequal_opportunity), + 'DFP': float(test_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @staticmethod + def _compute_metrics_on_split( + x_split: np.ndarray, y_split: np.ndarray, sensitive_rows: Any, mlp: Any + ) -> Tuple: + + start = time.time() + _y_pred = mlp.predict(x_split) + accuracy = accuracy_score(y_split, _y_pred) + statistical_disparity = fairness_risk(x_split, y_split, sensitive_rows, mlp, STATISTICAL_DISPARITY) + unequal_opportunity = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUAL_OPPORTUNITY) + unequalized_odds = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUALIZED_ODDS) + runtime = time.time() - start + return accuracy, statistical_disparity, unequal_opportunity, unequalized_odds, runtime + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. 
+ """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + +__all__ = ['AdultBenchmark'] diff --git a/hpobench/container/benchmarks/mo/__init__.py b/hpobench/container/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..dbdcaf4d --- /dev/null +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Adult Benchmark from hpobench/benchmarks/mo/adult_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class AdultBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(AdultBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/mo/Singularity.AdultBenchmark b/hpobench/container/recipes/mo/Singularity.AdultBenchmark new file mode 100644 index 00000000..d373caa2 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.AdultBenchmark @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/dependencies/mo/__init__.py b/hpobench/dependencies/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/mo/fairness_metrics.py b/hpobench/dependencies/mo/fairness_metrics.py new file mode 100644 index 00000000..7776fbd9 --- /dev/null +++ b/hpobench/dependencies/mo/fairness_metrics.py @@ -0,0 +1,110 @@ +""" +This file contains functionality to compute various fairness related risk scores. 
+"""
+
+import numpy as np
+
+STATISTICAL_DISPARITY = 'statistical_disparity'  # P(1 | group A) - P(1 | group B)
+UNEQUAL_OPPORTUNITY = 'unequal_opportunity'  # P(1 | group A, 0) - P(1 | group B, 0)
+UNEQUALIZED_ODDS = 'unequalized_odds'  # P(1 | group A, 1) - P(1 | group B, 1)
+
+TPR0 = 'tpr0'
+TPR1 = 'tpr1'
+TPR_DIF = 'tpr_dif'
+TPR_MIN = 'tpr_min'
+
+FAIRNESS_METRICS = [STATISTICAL_DISPARITY, UNEQUAL_OPPORTUNITY, UNEQUALIZED_ODDS, TPR0, TPR1, TPR_DIF, TPR_MIN]
+
+PRED_THRESHOLD = 0.5
+
+
+def fairness_risk(x, y, sensitive_rows, model, unfairness_metric):
+    """
+    Returns the fairness_risk based on the definition of the unfairness_metric, currently supporting:
+    statistical_disparity: P(positive prediction | group A) = P(positive prediction | group B)
+
+    Parameters
+    ----------
+    x: np.ndarray
+        inputs
+    y: np.ndarray
+        labels in {0, 1} such that 0 is a "positive" label, 1 "negative"
+    sensitive_rows: np.ndarray
+        binary array indicating which rows correspond to the protected group
+    model:
+        trained sklearn model
+    unfairness_metric: str
+        string with unfairness condition
+
+    Returns
+    -------
+    float
+    """
+    predicted_probs = model.predict_proba(x)
+    if unfairness_metric == STATISTICAL_DISPARITY:
+        prob_not_recidive_a = np.mean(
+            predicted_probs[sensitive_rows == 1][:, 0])
+        prob_not_recidive_b = np.mean(
+            predicted_probs[sensitive_rows == 0][:, 0])
+        return abs(prob_not_recidive_a - prob_not_recidive_b)
+    elif unfairness_metric == UNEQUAL_OPPORTUNITY:
+        prob_not_recidive_a = np.mean(
+            predicted_probs[(sensitive_rows == 1) & (y == 0)][:, 0])
+        prob_not_recidive_b = np.mean(
+            predicted_probs[(sensitive_rows == 0) & (y == 0)][:, 0])
+        return abs(prob_not_recidive_a - prob_not_recidive_b)
+    elif unfairness_metric == UNEQUALIZED_ODDS:
+        prob_not_recidive_a = np.mean(
+            predicted_probs[(sensitive_rows == 1) & (y == 1)][:, 0])
+        prob_not_recidive_b = np.mean(
+            predicted_probs[(sensitive_rows == 0) & (y == 1)][:, 0])
+        return abs(prob_not_recidive_a - prob_not_recidive_b)
+    elif unfairness_metric == TPR0:
+        y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0)
+        y0 = y[sensitive_rows == 0]
+        y0_pred = y_pred[sensitive_rows == 0]
+        tp0 = np.sum((y0_pred == 1) & (y0 == 1))
+        fn0 = np.sum((y0_pred == 0) & (y0 == 1))
+        tpr0 = tp0 / (tp0 + fn0)
+        return tpr0
+    elif unfairness_metric == TPR1:
+        y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0)
+        y1 = y[sensitive_rows == 1]
+        y1_pred = y_pred[sensitive_rows == 1]
+        tp1 = np.sum((y1_pred == 1) & (y1 == 1))
+        fn1 = np.sum((y1_pred == 0) & (y1 == 1))
+        tpr1 = tp1 / (tp1 + fn1)
+        return tpr1
+    elif unfairness_metric == TPR_DIF:
+        y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0)
+        y0 = y[sensitive_rows == 0]
+        y0_pred = y_pred[sensitive_rows == 0]
+        tp0 = np.sum((y0_pred == 1) & (y0 == 1))
+        fn0 = np.sum((y0_pred == 0) & (y0 == 1))
+        tpr0 = tp0 / (tp0 + fn0)
+
+        y1 = y[sensitive_rows == 1]
+        y1_pred = y_pred[sensitive_rows == 1]
+        tp1 = np.sum((y1_pred == 1) & (y1 == 1))
+        fn1 = np.sum((y1_pred == 0) & (y1 == 1))
+        tpr1 = tp1 / (tp1 + fn1)
+        return abs(tpr0 - tpr1)
+    elif unfairness_metric == TPR_MIN:
+        y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0)
+        y0 = y[sensitive_rows == 0]
+        y0_pred = y_pred[sensitive_rows == 0]
+        tp0 = np.sum((y0_pred == 1) & (y0 == 1))
+        fn0 = np.sum((y0_pred == 0) & (y0 == 1))
+        tpr0 = tp0 / (tp0 + fn0)
+
+        y1 = y[sensitive_rows == 1]
+        y1_pred = y_pred[sensitive_rows == 1]
+        tp1 = np.sum((y1_pred == 1) & (y1 == 1))
+        fn1 = np.sum((y1_pred == 0) & (y1 == 1))
+        tpr1 = tp1 /
(tp1 + fn1) + return min(tpr0, tpr1) + else: + raise ValueError( + f'{unfairness_metric} is not a valid unfairness condition. ' + f'Please specify one among ({STATISTICAL_DISPARITY}, {UNEQUAL_OPPORTUNITY}, {UNEQUALIZED_ODDS})' + ) diff --git a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py new file mode 100644 index 00000000..3f434fde --- /dev/null +++ b/hpobench/dependencies/mo/scalar.py @@ -0,0 +1,36 @@ +import numpy as np +from typing import Union + +try: + from sklearn.preprocessing import MinMaxScaler, StandardScaler +except ImportError: + print("scikit-learn not installed") + + +def get_fitted_scaler(x_train: np.ndarray, name: Union[None, str] = None): + """ + Instantiates a scaler by a given name and fits the scaler with x_train. + Parameters + ---------- + x_train: np.ndarray + Train data + + name: str, None + Name of the scaling method. Defaults to no scaling. + + Returns + ------- + + """ + + if name == "MinMax": + scaler = MinMaxScaler(feature_range=(0, 1), copy=True) + elif name == "Standard": + scaler = StandardScaler(copy=True) + elif name is None or name == "None": + return None + else: + raise NotImplementedError() + + scaler.fit(x_train) + return lambda x: scaler.transform(x) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index a2e33121..00d9568d 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -926,6 +926,165 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar return X_trn, y_trn, X_val, y_val, X_tst, y_tst +class AdultDataManager(HoldoutDataManager): + + def __init__(self): + super(AdultDataManager, self).__init__() + self.logger.debug('AdultDataManager: Starting to load data') + self.urls = {"data": "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", + "test_data": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"} + + self.feature_names = ['age', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'race', + 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country', + 'employment_type'] + self.sensitive_names = 'sex' + + self._save_dir = hpobench.config_file.data_dir / "adult" + + self._data_extract_path = self._save_dir / "processed_data" + + self.create_save_directory(self._data_extract_path) + + def load(self): + """ + Loads Adult Fair Datasets from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'AdultDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + if not (self._save_dir / "adult.data").exists(): + self._download_file_with_progressbar(self.urls["data"], self._save_dir / "adult.data") + + if not (self._save_dir / "adult.test").exists(): + self._download_file_with_progressbar(self.urls["test_data"], self._save_dir / "adult.test") + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + processed_files = ['x_train', 'x_valid', 'x_test', 'y_train', 'y_valid', 'y_test'] + file_is_missing = not all([(self._data_extract_path / f'{file}.npy').exists() for file in processed_files]) + + if file_is_missing: + columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "country", "salary"] + train_data = pd.read_csv(self._save_dir / 'adult.data', names=columns, sep=',', na_values='?') + test_data = pd.read_csv(self._save_dir / 'adult.test', names=columns, sep=',', skiprows=1, na_values='?') + + X, y = self._process_adult_data(train_data) + X_test, y_test = self._process_adult_data(test_data) + + n_trn = int(X.shape[0] * 0.7) + # Creation of Train and Test dataset + X_train, y_train = X[:n_trn], y[:n_trn] + X_valid, y_valid = X[n_trn:], y[n_trn:] + + np.save(self._data_extract_path / 'x_train.npy', X_train) + np.save(self._data_extract_path / 'x_valid.npy', X_valid) + np.save(self._data_extract_path / 'x_test.npy', X_test) + + np.save(self._data_extract_path / 'y_train.npy', y_train) + np.save(self._data_extract_path / 'y_valid.npy', y_valid) + np.save(self._data_extract_path / 'y_test.npy', y_test) + + else: + X_train = np.load(self._data_extract_path / 'x_train.npy') + X_valid = np.load(self._data_extract_path / 'x_valid.npy') + X_test = np.load(self._data_extract_path / 'x_test.npy') + + y_train = np.load(self._data_extract_path / 'y_train.npy') + y_valid = np.load(self._data_extract_path / 'y_valid.npy') + y_test = np.load(self._data_extract_path / 'y_test.npy') + + return X_train, y_train, X_valid, y_valid, X_test, y_test + + def _process_adult_data(self, df) -> Tuple[np.ndarray, np.ndarray]: + # mapping all categories of marital status to Single(1) or Couple(0) + df['marital-status'] = df['marital-status'].replace( + [' Divorced', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'], 'Single') + df['marital-status'] = df['marital-status'].replace([' Married-AF-spouse', ' Married-civ-spouse'], 'Couple') + df['marital-status'] = df['marital-status'].map({'Couple': 0, 'Single': 1}) + + # mapping race + race_map = {' White': 0, ' Amer-Indian-Eskimo': 1, ' Asian-Pac-Islander': 2, ' Black': 3, ' Other': 4} + df['race'] = df['race'].map(race_map) + + # categorizing all work classes into 4 major categories + def get_workclass(x): + if x['workclass'] == ' Federal-gov' or x['workclass'] == ' Local-gov' or x['workclass'] == ' State-gov': + return 'govt' + elif x['workclass'] == ' Private': + return 'private' + elif x['workclass'] == ' Self-emp-inc' or x['workclass'] == ' Self-emp-not-inc': + return 'self_employed' + else: + return 'without_pay' + + df['employment_type'] = df.apply(get_workclass, axis=1) + employment_map = {'govt': 0, 'private': 1, 'self_employed': 2, 'without_pay': 3} + df['employment_type'] = df['employment_type'].map(employment_map) + + # mapping relationship map + rel_map = {' Unmarried': 0, ' Wife': 1, ' Husband': 2, ' Not-in-family': 3, ' Own-child': 4, + ' Other-relative': 5} + df['relationship'] = df['relationship'].map(rel_map) + + # maping capital gain/loss to binary values + df.loc[(df['capital-gain'] > 0), 'capital-gain'] = 1 + df.loc[(df['capital-gain'] == 0, 'capital-gain')] = 0 + df.loc[(df['capital-loss'] > 0), 'capital-loss'] = 1 + df.loc[(df['capital-loss'] 
== 0, 'capital-loss')] = 0 + + # defining salary map + salary_map = {' <=50K': 1, ' >50K': 0, ' <=50K.': 1, ' >50K.': 0, } + df['salary'] = df['salary'].map(salary_map).astype(int) + + df['sex'] = df['sex'].map({' Male': 1, ' Female': 0}).astype(int) + + # replacing all missing values with np.nan + df['country'] = df['country'].replace(' ?', np.nan) + df['workclass'] = df['workclass'].replace(' ?', np.nan) + df['occupation'] = df['occupation'].replace(' ?', np.nan) + + # categorizing countries into "Non-US" and "US" + df.loc[df['country'] != ' United-States', 'country'] = 'Non-US' + df.loc[df['country'] == ' United-States', 'country'] = 'US' + df['country'] = df['country'].map({'US': 1, 'Non-US': 0}).astype(int) + + df.drop(labels=['workclass', 'education', 'occupation'], axis=1, inplace=True) + X = df.drop(['salary'], axis=1) + y = df['salary'] + + return X.to_numpy(), y.to_numpy() + + class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() diff --git a/tests/test_adult.py b/tests/test_adult.py new file mode 100644 index 00000000..d7a030b7 --- /dev/null +++ b/tests/test_adult.py @@ -0,0 +1,37 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_adult_benchmark(): + from hpobench.container.benchmarks.mo.adult_benchmark import AdultBenchmark + + # Check Seeding + benchmark = AdultBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'alpha': 0.00046568046379195655, 'beta_1': 0.14382335124614148, 'beta_2': 0.0010007892350251595, + 'fc_layer_0': 4, 'fc_layer_1': 2, 'fc_layer_2': 2, 'fc_layer_3': 3,'n_fc_layers': 4, + 'learning_rate_init': 0.0005343227125594117, + 'tol': 0.0004134759007834719 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + + assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) + assert result_1['info']['valid_accuracy'] == result_1['function_value']['accuracy'] + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] + + result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) + assert result_1['function_value']['accuracy'] == pytest.approx(0.76377, rel=0.001) + assert result_1['function_value']['accuracy'] == result_1['info']['test_accuracy'] From 4c4f1d93b446b96dc01c16d1448d05c6132e6440 Mon Sep 17 00:00:00 2001 From: ayushi-3536 <77584036+ayushi-3536@users.noreply.github.com> Date: Wed, 1 Jun 2022 17:04:26 +0200 Subject: [PATCH 15/29] Multi Objective CNN benchmark: Flowers and Fashion (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added mo cnn benchmarks from bag of baseline paper We deviate from the original benchmark in two points: * we return as cost only the training time instead of the total elapsed time * we return as objective for minimization instead of `-100 * accuracy` now `1 - accuracy` to achieve better output scalings. 
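A sketch of how the returned values relate to the paper's original conventions (illustrative only; `FlowerCNNBenchmark` and the result keys used here are the ones introduced in this patch):

    from hpobench.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark

    benchmark = FlowerCNNBenchmark(rng=0)
    config = benchmark.get_configuration_space(seed=0).sample_configuration()
    result = benchmark.objective_function(configuration=config, fidelity={'budget': 25})

    # objective reported here vs. the original -100 * accuracy scaling
    neg_acc = result['function_value']['negative_accuracy']   # 1 - validation accuracy
    paper_objective = -100 * (1 - neg_acc)

    # cost reported here is the pure training time; the total elapsed time is still available
    training_time = result['cost']                            # == result['info']['training_cost']
    total_elapsed = result['info']['total_time']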
Co-authored-by: ayushi-3536 Co-authored-by: Philipp Müller --- extra_requirements/mo_cnn.json | 7 + hpobench/benchmarks/mo/cnn_benchmark.py | 575 ++++++++++++++++++ .../container/benchmarks/mo/cnn_benchmark.py | 22 + hpobench/container/benchmarks/od/__init__.py | 0 .../recipes/mo/Singularity.CNNBenchmark | 26 + hpobench/util/data_manager.py | 88 ++- tests/test_mo_cnn.py | 48 ++ 7 files changed, 765 insertions(+), 1 deletion(-) create mode 100644 extra_requirements/mo_cnn.json create mode 100644 hpobench/benchmarks/mo/cnn_benchmark.py create mode 100644 hpobench/container/benchmarks/mo/cnn_benchmark.py create mode 100644 hpobench/container/benchmarks/od/__init__.py create mode 100644 hpobench/container/recipes/mo/Singularity.CNNBenchmark create mode 100644 tests/test_mo_cnn.py diff --git a/extra_requirements/mo_cnn.json b/extra_requirements/mo_cnn.json new file mode 100644 index 00000000..35914e3e --- /dev/null +++ b/extra_requirements/mo_cnn.json @@ -0,0 +1,7 @@ +{ + "mo_cnn": [ + "tqdm>=3.0.0", + "torch==1.9.0", + "pandas==1.2.4" + ] +} diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..d8bfd939 --- /dev/null +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,575 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective CNN Benchmark. +""" +import logging +import random +import time +from typing import Union, Dict, List, Tuple, Any + +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import tqdm +from ConfigSpace.conditions import GreaterThanCondition +from torch.utils.data import TensorDataset, DataLoader + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import CNNDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('MO_CNN') + + +class AccuracyTop1: + + def __init__(self): + self.reset() + + self.sum = 0 + self.cnt = 0 + + def reset(self): + self.sum = 0 + self.cnt = 0 + + def __call__(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> float: + self.sum += y_pred.topk(1)[1].eq(y_true.argmax(-1).reshape(-1, 1).expand(-1, 1)).float().sum().to('cpu').numpy() + self.cnt += y_pred.size(0) + return self.sum / self.cnt + + +class Net(nn.Module): + """ + The model to optimize + """ + + def __init__(self, config: Dict, input_shape: Tuple = (3, 28, 28), + num_classes: Union[int, None] = 10): + super(Net, self).__init__() + inp_ch = input_shape[0] + layers = [] + for i in range(config['n_conv_layers']): + out_ch = config['conv_layer_{}'.format(i)] + ks = config['kernel_size'] + layers.append(nn.Conv2d(inp_ch, out_ch, kernel_size=ks, padding=(ks - 1) // 2)) + layers.append(nn.ReLU()) + if config['batch_norm']: + layers.append(nn.BatchNorm2d(out_ch)) + layers.append(nn.MaxPool2d(kernel_size=2, stride=2)) + inp_ch = out_ch + + self.conv_layers = nn.Sequential(*layers) + self.pooling = nn.AdaptiveAvgPool2d(1) if config['global_avg_pooling'] else nn.Identity() + self.output_size = num_classes + + self.fc_layers = nn.ModuleList() + + inp_n = self._get_conv_output(input_shape) + + layers = [nn.Flatten()] + for i in range(config['n_fc_layers']): + out_n = config['fc_layer_{}'.format(i)] + + layers.append(nn.Linear(inp_n, out_n)) + layers.append(nn.ReLU()) + + inp_n = out_n + + layers.append(nn.Linear(inp_n, num_classes)) + self.fc_layers = nn.Sequential(*layers) + + # generate input sample and forward to get shape + def 
_get_conv_output(self, shape: Tuple) -> int: + bs = 1 + input = torch.autograd.Variable(torch.rand(bs, *shape)) + output_feat = self.conv_layers(input) + output_feat = self.pooling(output_feat) + n_size = output_feat.data.view(bs, -1).size(1) + return n_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv_layers(x) + x = self.pooling(x) + x = self.fc_layers(x) + return x + + def train_fn(self, optimizer: torch.optim.Optimizer, criterion: Any, loader: DataLoader, device: torch.device): + """ + Training method + + Parameters + ---------- + optimizer + optimization algorithm + criterion + loss function + loader + data loader for either training or testing set + device + Either CPU or GPU + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.train() + + acc = 0 + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + optimizer.zero_grad() + logits = self(images) + + loss = criterion(logits, labels.argmax(-1)) + loss.backward() + optimizer.step() + + acc = accuracy(labels, logits) + + return acc + + def eval_fn(self, loader: DataLoader, device: torch.device): + """ + Evaluation method + + Parameters + ---------- + loader: + data loader for either training or testing set + device: + torch device + + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.eval() + + acc = 0 + with torch.no_grad(): # no gradient needed + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + outputs = self(images) + acc = accuracy(labels, outputs) + + return acc + + +class CNNBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Parameters + ---------- + dataset : str + One of fashion, flower. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + + super(CNNBenchmark, self).__init__(rng=rng) + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + logger.info(f'Start Benchmark on dataset {dataset}') + + self.dataset = dataset + self.__seed_everything() + + # Dataset loading + data_manager = CNNDataManager(dataset=self.dataset) + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + + self.output_classes = self.y_train.shape[1] + self.input_shape = self.X_train.shape[1:4] + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the CNN model. 
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_conv_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('conv_layer_0', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_1', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_2', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=32, lower=2, upper=512, log=True), + + CS.UniformIntegerHyperparameter('batch_size', lower=1, upper=512, default_value=128, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.CategoricalHyperparameter('batch_norm', default_value=False, choices=[False, True]), + CS.CategoricalHyperparameter('global_avg_pooling', default_value=True, choices=[False, True]), + CS.CategoricalHyperparameter('kernel_size', default_value=5, choices=[7, 5, 3]) + ]) + + cs.add_conditions([ + # Add the conv_layer_1 (2nd layer) if we allow more than 1 (>1) `n_conv_layers`, and so on... + GreaterThanCondition(cs.get_hyperparameter('conv_layer_1'), cs.get_hyperparameter('n_conv_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('conv_layer_2'), cs.get_hyperparameter('n_conv_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities + ---------- + budget: int - [1, 25] + Number of epochs to train + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('budget', lower=1, upper=25, default_value=25, log=False) + ]) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Bag of baselines for multi-objective joint neural architecture search and ' + 'hyperparameter optimization', + 'references': ['@article{guerrero2021bag,' + 'title = {Bag of baselines for multi - objective joint neural architecture search and ' + 'hyperparameter optimization},' + 'author = {Guerrero-Viu, Julia and Hauns, Sven and Izquierdo, Sergio and Miotto, ' + 'Guilherme and Schrodi, Simon and Biedenkapp, Andre and Elsken, Thomas and Deng, ' + 'Difan and Lindauer, Marius and Hutter, Frank},},' + 'journal = {arXiv preprint arXiv:2105.01015},' + 'year = {2021}}', + ], + 'code': 'https://github.com/automl/multi-obj-baselines', + } + + @staticmethod + def get_objective_names() -> List[str]: + """Get the names 
of the objectives reported in the objective function.""" + return ['accuracy', 'model_size'] + + def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: + """ + Function that returns the model initialized based on the configuration and fidelity + """ + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + return Net(config, self.input_shape, self.output_classes) + + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + logger.debug(f'Generate seed: {seed}') + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. + """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on either the flower or the fashion data set and return the performance on the validation + data split. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - validation accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + valid_accuracy : float, + valid_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + time_in = time.time() + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + logger.info(f'We use the device: {device}') + + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(self.X_train, self.y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_val = TensorDataset(self.X_valid, self.y_valid) + ds_val = DataLoader(ds_val, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]).item() + start = time.time() + val_accuracy = model.eval_fn(ds_val, device).item() + eval_valid_runtime = time.time() - start + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + val_acc=val_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + train_runtime=training_runtime, + eval_valid_runtime=eval_valid_runtime, + eval_test_runtime=eval_test_runtime, + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - val_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': float(training_runtime), + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'valid_accuracy': val_accuracy, + 'valid_cost': eval_valid_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on both the train adn validation split of either the flower or the fashion data set and + get the test results. + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. 
+ To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - test accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + + time_in = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + train_X = torch.vstack((self.X_train, self.X_valid)) + y_train = torch.cat((self.y_train, self.y_valid)) + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(train_X, y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]) + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + eval_train_runtime=training_runtime, + eval_test_runtime=eval_test_runtime, + + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - test_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': training_runtime, + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + +class FashionCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FashionCNNBenchmark, self).__init__(dataset='fashion', rng=rng, **kwargs) + + +class FlowerCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FlowerCNNBenchmark, self).__init__(dataset='flower', rng=rng, **kwargs) + + +__all__ = ["FashionCNNBenchmark", + "FlowerCNNBenchmark"] diff --git a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..c9a1d009 --- /dev/null +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,22 @@ +""" Benchmark for the Multi-Objective CNN Benchmark from 
hpobench/benchmarks/mo/cnn_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient + + +class FlowerCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FlowerCNNBenchmark, self).__init__(**kwargs) + + +class FashionCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/od/__init__.py b/hpobench/container/benchmarks/od/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/recipes/mo/Singularity.CNNBenchmark b/hpobench/container/recipes/mo/Singularity.CNNBenchmark new file mode 100644 index 00000000..c9870968 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.CNNBenchmark @@ -0,0 +1,26 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_cnn] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.cnn_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 00d9568d..c72305e1 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -37,7 +37,6 @@ except ImportError: print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not needed for containers)") - import hpobench @@ -845,6 +844,93 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_train, y_train = data[:n_train, 1:], data[:n_train, 0] X_val, y_val = data[n_train:n_train + n_val, 1:], data[n_train:n_train + n_val, 0] X_test, y_test = data[n_train + n_val:, 1:], data[n_train + n_val:, 0] + return X_train, y_train, X_val, y_val, X_test, y_test + + +class CNNDataManager(HoldoutDataManager): + + def __init__(self, dataset: str): + + super(CNNDataManager, self).__init__() + self.logger.debug('CNNDataManager: Starting to load data') + + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + + self.url_source = f'https://github.com/ayushi-3536/DatasetHost/blob/main/{dataset}.tar.gz?raw=true' + self.dataset = dataset + self.save_dir = hpobench.config_file.data_dir / "CNN" / f'{dataset}' + self.compressed_data = self.save_dir / f'{dataset}.tar.gz' + self.create_save_directory(self.save_dir) + + def load(self): + """ + Loads CNN Benchmark from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. 
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load() + self.logger.info(f'CNNDataManager: Data successfully loaded after {time() - t:.2f}') + + return X_trn, y_trn, X_val, y_val, X_tst, y_tst + + def _download(self): + + # Check if data is already downloaded. + # Use a file lock to ensure that no two processes try to download the same files at the same time. + if self.compressed_data.exists(): + self.logger.debug('CNNDataManager: Data already downloaded') + else: + + self.logger.info(f'CNNDataManager: Start downloading data from {self.url_source} ' + f'to {self.save_dir}') + self._download_file_with_progressbar(data_url=self.url_source, data_file=self.compressed_data) + self._untar_data(compressed_file=self.compressed_data, save_dir=self.save_dir) + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. + + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + data_extract_path = self.save_dir / "data" + X_train = np.load(data_extract_path / 'x_train.npy') + y_train = np.load(data_extract_path / 'y_train.npy') + + X_val = np.load(data_extract_path / 'x_val.npy') + y_val = np.load(data_extract_path / 'y_val.npy') + + # Read Test datasets + X_test = np.load(data_extract_path / 'x_test.npy') + y_test = np.load(data_extract_path / 'y_test.npy') + + def __cast_x_y(x, y) -> Tuple: + import torch + return torch.tensor(x).float().permute(0, 3, 1, 2), torch.tensor(y).long() + + X_train, y_train = __cast_x_y(X_train, y_train) + X_val, y_val = __cast_x_y(X_val, y_val) + X_test, y_test = __cast_x_y(X_test, y_test) return X_train, y_train, X_val, y_val, X_test, y_test diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py new file mode 100644 index 00000000..308c59ad --- /dev/null +++ b/tests/test_mo_cnn.py @@ -0,0 +1,48 @@ +import pytest + + +def test_mo_cnn_seeding(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + b1 = FlowerCNNBenchmark(rng=0) + b2 = FlowerCNNBenchmark(rng=0) + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = b1.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = b2.objective_function(test_config, rng=1, fidelity={'budget': 3}) + for metric in result_1['function_value'].keys(): + assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) + + +def test_mo_cnn_benchmark(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + + # Check Seeding + benchmark = FlowerCNNBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 
'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + print(f'MO CNN: Valid Accuracy = {result_1["info"]["valid_accuracy"]}') + print(f'MO CNN: Train Accuracy = {result_1["info"]["train_accuracy"]}') + # assert result_1['info']['train_accuracy'] == pytest.approx(0.1044, rel=0.001) + # assert result_1['info']['valid_accuracy'] == pytest.approx(0.1029, rel=0.001) + assert result_1['info']['valid_accuracy'] == pytest.approx(1 - result_1['function_value']['negative_accuracy'], abs=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From 9e471d9959a9f703efa4d4f9b3d0b50ecd64e81d Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 4 Jun 2022 19:36:31 +0200 Subject: [PATCH 16/29] -add gpu support -add dependency version --- extra_requirements/lm_benchmark.json | 2 +- hpobench/container/benchmarks/mo/lm_benchmark.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json index a686fe43..8f263bba 100644 --- a/extra_requirements/lm_benchmark.json +++ b/extra_requirements/lm_benchmark.json @@ -1,6 +1,6 @@ { "lm": [ "torch==1.3.0", - "tqdm" + "tqdm>=3.0.0" ] } \ No newline at end of file diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py index f261b00d..8e08fcae 100644 --- a/hpobench/container/benchmarks/mo/lm_benchmark.py +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -9,4 +9,5 @@ def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) super(LanguageModelBenchmark, self).__init__(**kwargs) From 1dee0c301629326260bd6a12a2491191d1174d34 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 14:50:09 +0200 Subject: [PATCH 17/29] - added language model benchmark from MO-ASHA paper - added dependencies, benchmark - added token generation and model training code from moasha in dependencies --- extra_requirements/lm_benchmark.json | 5 + hpobench/benchmarks/mo/lm_benchmark.py | 363 ++++++++++++++++++++++ hpobench/dependencies/lm/__init__.py | 0 hpobench/dependencies/lm/model.py | 140 +++++++++ hpobench/dependencies/lm/tokenize_util.py | 56 ++++ hpobench/util/data_manager.py | 59 ++++ 6 files changed, 623 insertions(+) create mode 100644 extra_requirements/lm_benchmark.json create mode 100644 hpobench/benchmarks/mo/lm_benchmark.py create mode 100644 hpobench/dependencies/lm/__init__.py create mode 100644 hpobench/dependencies/lm/model.py create mode 100644 hpobench/dependencies/lm/tokenize_util.py diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json new file mode 100644 index 00000000..34d1249b --- /dev/null +++ b/extra_requirements/lm_benchmark.json @@ -0,0 +1,5 @@ +{ + "lm": [ + "torch==1.3.0" + ] +} \ No newline at end of file diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..270243f6 --- /dev/null +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,363 @@ +""" +Changelog: +========== + +0.0.1: +* First 
implementation of the Multi-Objective CNN Benchmark. +""" +from typing import Union, Tuple, Dict, List +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import logging +from ConfigSpace.hyperparameters import Hyperparameter +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import LanguageModelDataManager +from hpobench.dependencies.lm.tokenize_util import batchify +from hpobench.dependencies.lm.model import TransformerModel +import time +import math +import tqdm + +__version__ = '0.0.1' + +logger = logging.getLogger('MO_CNN') + + +class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(LanguageModelBenchmark, self).__init__(rng=rng) + + data_manager = LanguageModelDataManager() + self.X_train, self.X_valid, self.X_test = data_manager.load() + self.corpus = data_manager.corpus + + self.variable = {"eval_batch_size": 10, + "nlayers": 2, + "bptt": 35, + "tied": True, + "nhead": 2, + "ntoken": len(self.corpus.dictionary) + } + print("len of corpus dict", len(self.corpus.dictionary)) + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'batch_size', default_value=128, lower=8, upper=256 + ), + CS.UniformIntegerHyperparameter( + 'emsize', default_value=128, lower=32, upper=1024 + ), + CS.UniformIntegerHyperparameter( + 'lr_factor', default_value=50, lower=1, upper=100, log=True + ), + CS.UniformFloatHyperparameter( + 'lr', default_value=5, lower=1, upper=50, log=True + ), + CS.UniformFloatHyperparameter( + 'dropout', default_value=0.99, lower=0, upper=0.99 + ), + CS.UniformFloatHyperparameter( + 'clip', default_value=0.99, lower=0.1, upper=2 + ) + + ]) + return cs + + @staticmethod + def get_objective_names(self) -> List[str]: + return ['perplexity', 'error', 'time'] + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + # gray-box setting (multi-multi-fidelity) - iterations + data subsample + LanguageModelBenchmark._get_fidelity_choices(iter_choice='variable') + ]) + return fidelity_space + + @staticmethod + def _get_fidelity_choices(iter_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: + + fidelity1 = dict( + fixed=CS.Constant('budget', value=81), + variable=CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=81, default_value=81, log=False + ) + ) + + budget = fidelity1[iter_choice] + return budget + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas, David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}', + ], + } + + def init_model(self, config: Union[CS.Configuration, Dict]): + """ Function that returns the model initialized based on the configuration and fidelity + """ + + if isinstance(config, 
CS.Configuration): + config = config.get_dictionary() + + model = TransformerModel( + self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], + self.variable['nlayers'], config['dropout']) + + return model + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 81] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + + kwargs + + Returns + ------- + Dict - + function_value : Dict + validation_accuracy: float + log_perplexity: float + cost : time to train the network + info : Dict + validation_accuracy : float, + test_accuracy : float, + log_perplexity : float, + negative_log_perplexity : float, + training_cost : float, + valid_cost : float, + test_cost : float, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + ts_start = time.time() + + # batchify data + batch_size = configuration['batch_size'] + train_data = batchify(self.X_train, batch_size=batch_size).to(device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + + epochs = fidelity['budget'] + + model = self.init_model(configuration).to(device) + + criterion = nn.CrossEntropyLoss() + + learning_rate = configuration['lr'] + learning_rate_factor = configuration['lr_factor'] + clip = configuration['clip'] + best_val_loss = None + train_eval_time = 0 + + t = tqdm.tqdm(total=epochs) + for epoch in range(epochs): + epoch_start_time = time.time() + model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + + val_loss, val_acc = model.evaluate(model, self.corpus, criterion, val_data) + val_loss = np.clip(val_loss, 1e-10, 10) + + ts_now = time.time() + train_eval_time += ts_now - epoch_start_time + + t.set_postfix(val_accuracy=val_acc) + t.update() + + if not np.isfinite(val_loss): + val_loss = 7 + + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + best_val_loss = val_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. 
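+                # Illustrative sketch of the resulting schedule (numbers are only the
+                # defaults of this configuration space, lr=5 and lr_factor=50): the
+                # learning rate stays at 5 while the validation loss keeps improving
+                # and drops to 5 / 50 = 0.1 after the first epoch without improvement.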
+ learning_rate /= learning_rate_factor + + start_time = time.time() + _, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + eval_valid_runtime = time.time() - start_time + + start_time = time.time() + _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + eval_test_runtime = time.time() - start_time + + perplexity = math.exp(best_val_loss) + log_perplexity = best_val_loss + neg_log_perplexity = 10 - best_val_loss + elapsed_time = float(ts_start - time.time()) + + return {'function_value': {'log_perplexity': log_perplexity, + 'accuracy': val_acc, + }, + 'cost': elapsed_time, + 'info': {'validation_accuracy': val_acc, + 'test_accuracy': test_acc, + 'log_perplexity': log_perplexity, + 'perplexity': perplexity, + 'negative_log_perplexity': neg_log_perplexity, + 'training_cost': train_eval_time, + 'valid_cost': eval_valid_runtime, + 'test_cost': eval_test_runtime, + 'fidelity': fidelity + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Get the validated results. Runs a given configuration on the largest budget (here: 50). + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + kwargs + Returns + ------- + Dict - + function_value : Dict + validation_accuracy: float + log_perplexity: float + cost : time to train the network + info : Dict + validation_accuracy : float, + test_accuracy : float, + log_perplexity : float, + negative_log_perplexity : float, + training_cost : float, + valid_cost : float, + test_cost : float, + fidelity : Dict + used fidelities in this evaluation + """ + + # The result dict should contain already all necessary information -> Just swap the function value from valid + # to test and the corresponding time cost + assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. 
' + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + ts_start = time.time() + + # batchify data + batch_size = configuration['batch_size'] + train_data = batchify(self.X_train, batch_size=batch_size).to(device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + + train_data = np.vstack(train_data, val_data) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + + epochs = fidelity['budget'] + + model = self.init_model(configuration).to(device) + + criterion = nn.CrossEntropyLoss() + + learning_rate = configuration['lr'] + learning_rate_factor = configuration['lr_factor'] + clip = configuration['clip'] + best_test_loss = None + train_eval_time = 0 + t = tqdm.tqdm(total=epochs) + for epoch in range(1, epochs + 1): + epoch_start_time = time.time() + model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + + test_loss, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + test_loss = np.clip(test_loss, 1e-10, 10) + + ts_now = time.time() + train_eval_time += ts_now - epoch_start_time + + if not np.isfinite(test_loss): + test_loss = 7 + + # Save the model if the validation loss is the best we've seen so far. + if not best_test_loss or test_loss < best_test_loss: + best_test_loss = test_loss + else: + # Anneal the learning rate if no improvement has been seen in the validation dataset. + learning_rate /= learning_rate_factor + + start_time = time.time() + _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + eval_test_runtime = time.time() - start_time + + perplexity = math.exp(best_test_loss) + log_perplexity = best_test_loss + neg_log_perplexity = 10 - best_test_loss + elapsed_time = float(ts_start - time.time()) + + return {'function_value': {'log_perplexity': log_perplexity, + 'accuracy': test_acc, + }, + 'cost': elapsed_time, + 'info': {'test_accuracy': test_acc, + 'log_perplexity': log_perplexity, + 'perplexity': perplexity, + 'negative_log_perplexity': neg_log_perplexity, + 'training_cost': train_eval_time, + 'test_cost': eval_test_runtime, + 'fidelity': fidelity + } + } + + __all__ = ["LanguageModelBenchmark"] diff --git a/hpobench/dependencies/lm/__init__.py b/hpobench/dependencies/lm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py new file mode 100644 index 00000000..f4aed4cc --- /dev/null +++ b/hpobench/dependencies/lm/model.py @@ -0,0 +1,140 @@ +import torch +import torch.nn as nn +import math +import torch.nn.functional as F + + +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
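+        The encoding table registered below has shape ``(max_len, 1, d_model)``, so in
+        ``forward`` it broadcasts over the batch dimension of a ``(seq_len, batch, d_model)`` input, e.g.: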
+ >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +class TransformerModel(nn.Module): + """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): + super(TransformerModel, self).__init__() + try: + from torch.nn import TransformerEncoder, TransformerEncoderLayer + except: + raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + self.model_type = 'Transformer' + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.decoder = nn.Linear(ninp, ntoken) + self.init_weights() + self.bptt = bptt + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def get_batch(self, source, i): + seq_len = min(self.bptt, len(source) - 1 - i) + data = source[i:i + seq_len] + target = source[i + 1:i + 1 + seq_len].view(-1) + return data, target + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, src, has_mask=True): + if has_mask: + device = src.device + if self.src_mask is None or self.src_mask.size(0) != len(src): + mask = self._generate_square_subsequent_mask(len(src)).to(device) + self.src_mask = mask + else: + self.src_mask = None + src = self.encoder(src) * math.sqrt(self.ninp) + src = self.pos_encoder(src) + output = self.transformer_encoder(src, self.src_mask) + output = self.decoder(output) + return F.log_softmax(output, dim=-1) + + def train_fun(self, model, corpus, criterion, train_data, lr, batch_size, clip): + # Turn on training mode which enables dropout. + self.train() + # total_loss = 0. + # start_time = time.time() + ntokens = len(corpus.dictionary) + + for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)): + data, targets = self.get_batch(train_data, i) + # Starting each batch, we detach the hidden state from how it was previously produced. + # If we didn't, the model would try backpropagating all the way to start of the dataset. 
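+            # (The detaching note above applies to the RNN loop this code appears to be
+            # adapted from; the Transformer keeps no hidden state between batches, so the
+            # per-batch reset below only clears the accumulated gradients.)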
+ model.zero_grad() + output = model(data) + loss = criterion(output.view(-1, ntokens), targets) + loss.backward() + + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. + torch.nn.utils.clip_grad_norm_(model.parameters(), clip) + for p in model.parameters(): + p.data.add_(-lr, p.grad.data) + + def eval_fun(self, model, corpus, criterion, data_source): + # Turn on evaluation mode which disables dropout. + self.eval() + total_loss = 0. + total_acc = 0. + ntokens = len(corpus.dictionary) + with torch.no_grad(): + for i in range(0, data_source.size(0) - 1, self.bptt): + data, targets = self.get_batch(data_source, i) + output = model(data) + output_flat = output.view(-1, ntokens) + total_loss += len(data) * criterion(output_flat, targets).item() + + # inserted accuracy + winners = output_flat.argmax(dim=1) + corrects = (winners == targets) + accuracy = corrects.sum().float() / float(targets.size(0)) + total_acc += len(data) * accuracy + + avg_acc = total_acc / (len(data_source) - 1) + return total_loss / (len(data_source) - 1), avg_acc diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py new file mode 100644 index 00000000..6e200b1e --- /dev/null +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -0,0 +1,56 @@ +import torch + + +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + + +class Corpus(object): + def __init__(self, logger): + self.dictionary = Dictionary() + self.logger = logger + + def tokenize(self, path): + """Tokenizes a text file.""" + # Add words to the dictionary + with open(path, 'r', encoding="utf8") as f: + for line in f: + words = line.split() + [''] + print("words", words) + for word in words: + self.dictionary.add_word(word) + # Tokenize file content + with open(path, 'r', encoding="utf8") as f: + idss = [] + for line in f: + words = line.split() + [''] + ids = [] + try: + for word in words: + ids.append(self.dictionary.word2idx[word]) + except: + self.logger.debug("word2idx:{}", self.dictionary.word2idx) + idss.append(torch.tensor(ids).type(torch.int64)) + ids = torch.cat(idss) + return ids + + +def batchify(data, batch_size): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = data.size(0) // batch_size + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * batch_size) + # Evenly divide the data across the bsz batches. 
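+    # Shape sketch with made-up numbers: 26 token ids and batch_size=4 give
+    # nbatch=6; narrow() above keeps the first 24 ids, and view(4, -1).t() below
+    # yields a (6, 4) tensor in which column j is one contiguous token stream.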
+ data = data.view(batch_size, -1).t().contiguous() + return data diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index c72305e1..31486c1f 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -935,6 +935,65 @@ def __cast_x_y(x, y) -> Tuple: return X_train, y_train, X_val, y_val, X_test, y_test +class LanguageModelDataManager(HoldoutDataManager): + def __init__(self): + from hpobench.dependencies.lm.tokenize_util import Corpus + super(LanguageModelDataManager, self).__init__() + self.logger.debug('LanguageModelDataManager: Starting to load data') + + self.urls = { + "train_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", + "valid_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", + "test_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", + } + + self.save_dir = hpobench.config_file.data_dir / "wikitext" + self.create_save_directory(self.save_dir) + self.corpus = Corpus(logger=self.logger) + + def load(self): + """ + Loads Adult Fair Datasets from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + + t = time() + self._download() + self.X_train, self.X_valid, self.X_test = self._load() + self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}') + return self.X_train, self.X_valid, self.X_test + + def _download(self): + self._download_file_with_progressbar(self.urls["train_data"], self.save_dir / "train.txt") + self._download_file_with_progressbar(self.urls["valid_data"], self.save_dir / "valid.txt") + self._download_file_with_progressbar(self.urls["test_data"], self.save_dir / "test.txt") + + def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Load the data from file and split it into train, test and validation split. 
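+        All three splits are tokenized with the same ``Corpus`` instance, so word
+        indices are consistent between the train, validation and test tensors.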
+ Returns + ------- + X_train: np.ndarray + X_valid: np.ndarray + X_test: np.ndarray + """ + + + X_train = self.corpus.tokenize(self.save_dir / "train.txt") + X_valid = self.corpus.tokenize(self.save_dir / "valid.txt") + X_test = self.corpus.tokenize(self.save_dir / "test.txt") + + return X_train, X_valid, X_test + + class YearPredictionMSDData(HoldoutDataManager): def __init__(self): From c862cd6ed8b0d5c975af56d1b7f2838070dc8eff Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 15:10:44 +0200 Subject: [PATCH 18/29] - added evaluation time as one of the objectives - returning prediction time for evaluation time - changed perplexity --> log_perplexity for the objective (MO-ASHA uses log perplexity) changed error --> accuracy - added tqdm --- extra_requirements/lm_benchmark.json | 3 ++- hpobench/benchmarks/mo/lm_benchmark.py | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json index 34d1249b..a686fe43 100644 --- a/extra_requirements/lm_benchmark.json +++ b/extra_requirements/lm_benchmark.json @@ -1,5 +1,6 @@ { "lm": [ - "torch==1.3.0" + "torch==1.3.0", + "tqdm" ] } \ No newline at end of file diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 270243f6..1326418f 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -23,7 +23,7 @@ __version__ = '0.0.1' -logger = logging.getLogger('MO_CNN') +logger = logging.getLogger('LM_Bench') class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): @@ -75,7 +75,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp @staticmethod def get_objective_names(self) -> List[str]: - return ['perplexity', 'error', 'time'] + return ['log_perplexity', 'accuracy', 'time'] @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -232,6 +232,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': val_acc, + 'time': train_eval_time }, 'cost': elapsed_time, 'info': {'validation_accuracy': val_acc, @@ -258,7 +259,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 50] + epoch: int - Values: [1, 81] Number of epochs an architecture was trained. Note: the number of epoch is 1 indexed. 
(Results after the first epoch: epoch = 1) @@ -327,6 +328,8 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ts_now = time.time() train_eval_time += ts_now - epoch_start_time + t.set_postfix(test_accuracy=test_acc) + t.update() if not np.isfinite(test_loss): test_loss = 7 @@ -348,6 +351,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': test_acc, + 'time': train_eval_time }, 'cost': elapsed_time, 'info': {'test_accuracy': test_acc, From 2a5d35dc5514d89a579dff5253c718fc7bfc5c93 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 16:23:10 +0200 Subject: [PATCH 19/29] - func name correction --- hpobench/benchmarks/mo/lm_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 1326418f..e06c237d 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -198,7 +198,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], epoch_start_time = time.time() model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - val_loss, val_acc = model.evaluate(model, self.corpus, criterion, val_data) + val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) ts_now = time.time() From 29c43779ef9aa425aa36aead48b2645675505479 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 17 May 2022 19:47:49 +0200 Subject: [PATCH 20/29] - load and save tokenized file --- hpobench/benchmarks/mo/lm_benchmark.py | 7 ++--- hpobench/dependencies/lm/tokenize_util.py | 1 - hpobench/util/data_manager.py | 33 ++++++++++++++++------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index e06c237d..4b301492 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -31,7 +31,8 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(LanguageModelBenchmark, self).__init__(rng=rng) - data_manager = LanguageModelDataManager() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + data_manager = LanguageModelDataManager(device) self.X_train, self.X_valid, self.X_test = data_manager.load() self.corpus = data_manager.corpus @@ -195,9 +196,10 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(epochs): + print("epoch training started",epoch) epoch_start_time = time.time() model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - + print("epoch traing done") val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) @@ -295,7 +297,6 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # to test and the corresponding time cost assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. 
' - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') ts_start = time.time() # batchify data diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py index 6e200b1e..7ae105d7 100644 --- a/hpobench/dependencies/lm/tokenize_util.py +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -27,7 +27,6 @@ def tokenize(self, path): with open(path, 'r', encoding="utf8") as f: for line in f: words = line.split() + [''] - print("words", words) for word in words: self.dictionary.add_word(word) # Tokenize file content diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 31486c1f..66639c2b 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -936,20 +936,22 @@ def __cast_x_y(x, y) -> Tuple: class LanguageModelDataManager(HoldoutDataManager): - def __init__(self): + def __init__(self, device): from hpobench.dependencies.lm.tokenize_util import Corpus super(LanguageModelDataManager, self).__init__() self.logger.debug('LanguageModelDataManager: Starting to load data') self.urls = { - "train_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", - "valid_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", - "test_data": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", + "train": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", + "valid": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", + "test": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", } self.save_dir = hpobench.config_file.data_dir / "wikitext" self.create_save_directory(self.save_dir) self.corpus = Corpus(logger=self.logger) + self.device = device + self.tokenize_path = self.save_dir / "tokenize" def load(self): """ @@ -972,9 +974,12 @@ def load(self): return self.X_train, self.X_valid, self.X_test def _download(self): - self._download_file_with_progressbar(self.urls["train_data"], self.save_dir / "train.txt") - self._download_file_with_progressbar(self.urls["valid_data"], self.save_dir / "valid.txt") - self._download_file_with_progressbar(self.urls["test_data"], self.save_dir / "test.txt") + + for data in self.urls: + if (self.save_dir / f'{data}.txt').exists(): + self.logger.debug(f'LanguageModelDataManager : tokenized {data}.txt already exist') + else: + self._download_file_with_progressbar(self.urls[data], self.save_dir / f"{data}.txt") def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ @@ -986,10 +991,18 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_test: np.ndarray """ + import torch + for data in self.urls: + if (self.tokenize_path / f'{data}.pt').exists(): + self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') + + else: + tokenized_data = self.corpus.tokenize(self.save_dir / "train.txt") + torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - X_train = self.corpus.tokenize(self.save_dir / "train.txt") - X_valid = self.corpus.tokenize(self.save_dir / "valid.txt") - X_test = self.corpus.tokenize(self.save_dir / "test.txt") + X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) + X_valid = torch.load(self.tokenize_path / 
'valid.pt', map_location=self.device) + X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) return X_train, X_valid, X_test From ab5f484f0a69a0a0381f6b3d85ffc560af9aaa57 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Fri, 20 May 2022 01:03:27 +0200 Subject: [PATCH 21/29] -code formatting --- hpobench/benchmarks/mo/lm_benchmark.py | 69 ++++++++++------------- hpobench/dependencies/lm/model.py | 31 ++++++---- hpobench/dependencies/lm/tokenize_util.py | 2 +- hpobench/util/data_manager.py | 41 +++++++++----- 4 files changed, 79 insertions(+), 64 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 4b301492..f22db78f 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -3,15 +3,14 @@ ========== 0.0.1: -* First implementation of the Multi-Objective CNN Benchmark. +* First implementation of the Multi-Objective Language Model Benchmark. """ -from typing import Union, Tuple, Dict, List +from typing import Union, Dict, List import ConfigSpace as CS import numpy as np import torch import torch.nn as nn import logging -from ConfigSpace.hyperparameters import Hyperparameter import hpobench.util.rng_helper as rng_helper from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark from hpobench.util.data_manager import LanguageModelDataManager @@ -31,19 +30,20 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(LanguageModelBenchmark, self).__init__(rng=rng) - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - data_manager = LanguageModelDataManager(device) + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + data_manager = LanguageModelDataManager(self.device) self.X_train, self.X_valid, self.X_test = data_manager.load() - self.corpus = data_manager.corpus + self.ntokens = len(data_manager.corpus.dictionary) self.variable = {"eval_batch_size": 10, "nlayers": 2, "bptt": 35, "tied": True, + # number of attention head "nhead": 2, - "ntoken": len(self.corpus.dictionary) + "ntoken": self.ntokens } - print("len of corpus dict", len(self.corpus.dictionary)) + print("len of corpus dict", self.ntokens) @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -83,23 +83,11 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters([ - # gray-box setting (multi-multi-fidelity) - iterations + data subsample - LanguageModelBenchmark._get_fidelity_choices(iter_choice='variable') - ]) - return fidelity_space - - @staticmethod - def _get_fidelity_choices(iter_choice: str) -> Tuple[Hyperparameter, Hyperparameter]: - - fidelity1 = dict( - fixed=CS.Constant('budget', value=81), - variable=CS.UniformIntegerHyperparameter( + CS.UniformIntegerHyperparameter( 'budget', lower=1, upper=81, default_value=81, log=False ) - ) - - budget = fidelity1[iter_choice] - return budget + ]) + return fidelity_space @staticmethod def get_meta_information() -> Dict: @@ -108,7 +96,8 @@ def get_meta_information() -> Dict: 'name': 'Multi-objective Asynchronous Successive Halving', 'references': ['@article{schmucker2021multi,' 'title={Multi-objective Asynchronous Successive Halving},' - 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas, David 
and Archambeau, C{\'e}dric},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' 'journal={arXiv preprint arXiv:2106.12639},' 'year={2021}', ], @@ -121,6 +110,7 @@ def init_model(self, config: Union[CS.Configuration, Dict]): if isinstance(config, CS.Configuration): config = config.get_dictionary() + # all sublayers and embedding layers have same dim model = TransformerModel( self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -196,11 +186,9 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(epochs): - print("epoch training started",epoch) epoch_start_time = time.time() - model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) - print("epoch traing done") - val_loss, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, clip) + val_loss, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) ts_now = time.time() @@ -220,11 +208,11 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, val_acc = model.eval_fun(model, self.corpus, criterion, val_data) + _, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) eval_valid_runtime = time.time() - start_time start_time = time.time() - _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_val_loss) @@ -237,7 +225,8 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], 'time': train_eval_time }, 'cost': elapsed_time, - 'info': {'validation_accuracy': val_acc, + 'info': {'train_accuracy': train_acc, + 'validation_accuracy': val_acc, 'test_accuracy': test_acc, 'log_perplexity': log_perplexity, 'perplexity': perplexity, @@ -301,15 +290,15 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # batchify data batch_size = configuration['batch_size'] - train_data = batchify(self.X_train, batch_size=batch_size).to(device) - val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device) + train_data = batchify(self.X_train, batch_size=batch_size).to(self.device) + val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(self.device) train_data = np.vstack(train_data, val_data) - test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device) + test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device) epochs = fidelity['budget'] - model = self.init_model(configuration).to(device) + model = self.init_model(configuration).to(self.device) criterion = nn.CrossEntropyLoss() @@ -321,9 +310,10 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], t = tqdm.tqdm(total=epochs) for epoch in range(1, epochs + 1): epoch_start_time = time.time() - model.train_fun(model, self.corpus, criterion, train_data, learning_rate, batch_size, clip) + train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, + batch_size, clip) - test_loss, test_acc = 
model.eval_fun(model, self.corpus, criterion, test_data) + test_loss, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) test_loss = np.clip(test_loss, 1e-10, 10) ts_now = time.time() @@ -342,7 +332,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, test_acc = model.eval_fun(model, self.corpus, criterion, test_data) + _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_test_loss) @@ -355,7 +345,8 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], 'time': train_eval_time }, 'cost': elapsed_time, - 'info': {'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc, + 'test_accuracy': test_acc, 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py index f4aed4cc..8361c61f 100644 --- a/hpobench/dependencies/lm/model.py +++ b/hpobench/dependencies/lm/model.py @@ -27,6 +27,9 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) + print("shape pe",pe[:,0::2].shape) + + print("shape after pe",pe[:,1::2].shape) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -46,13 +49,13 @@ def forward(self, x): class TransformerModel(nn.Module): - """Container module with an encoder, a recurrent or transformer module, and a decoder.""" + """Container module with an encoder, a transformer module, and a decoder.""" def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer - except: + except Exception: raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') self.model_type = 'Transformer' self.src_mask = None @@ -96,33 +99,41 @@ def forward(self, src, has_mask=True): output = self.decoder(output) return F.log_softmax(output, dim=-1) - def train_fun(self, model, corpus, criterion, train_data, lr, batch_size, clip): + def train_fun(self, model, ntokens, criterion, train_data, lr, clip): # Turn on training mode which enables dropout. self.train() - # total_loss = 0. - # start_time = time.time() - ntokens = len(corpus.dictionary) - + total_loss = 0. + total_acc = 0. for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)): data, targets = self.get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. model.zero_grad() output = model(data) - loss = criterion(output.view(-1, ntokens), targets) + output_flat = output.view(-1, ntokens) + loss = criterion(output_flat, targets) loss.backward() + # calculate loss and accuracy + total_loss += len(data) * loss.item() + winners = output_flat.argmax(dim=1) + corrects = (winners == targets) + accuracy = corrects.sum().float() / float(targets.size(0)) + total_acc += len(data) * accuracy + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 
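             # Sketch of what follows (the clip value is whatever the 'clip'
             # hyperparameter was sampled as, 0.1-2 in this benchmark): gradients are
             # rescaled so their global L2 norm is at most `clip`, and the loop below
             # applies a plain SGD step p <- p - lr * grad; `p.data.add_(-lr, p.grad.data)`
             # is the older two-argument form still accepted by the pinned torch==1.3.0.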
torch.nn.utils.clip_grad_norm_(model.parameters(), clip) for p in model.parameters(): p.data.add_(-lr, p.grad.data) - def eval_fun(self, model, corpus, criterion, data_source): + avg_acc = total_acc / (len(train_data) - 1) + return total_loss / (len(train_data) - 1), avg_acc + + def eval_fun(self, model, ntokens, criterion, data_source): # Turn on evaluation mode which disables dropout. self.eval() total_loss = 0. total_acc = 0. - ntokens = len(corpus.dictionary) with torch.no_grad(): for i in range(0, data_source.size(0) - 1, self.bptt): data, targets = self.get_batch(data_source, i) diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py index 7ae105d7..f68e850d 100644 --- a/hpobench/dependencies/lm/tokenize_util.py +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -38,7 +38,7 @@ def tokenize(self, path): try: for word in words: ids.append(self.dictionary.word2idx[word]) - except: + except Exception: self.logger.debug("word2idx:{}", self.dictionary.word2idx) idss.append(torch.tensor(ids).type(torch.int64)) ids = torch.cat(idss) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 66639c2b..3edc05c1 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -942,9 +942,12 @@ def __init__(self, device): self.logger.debug('LanguageModelDataManager: Starting to load data') self.urls = { - "train": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt", - "valid": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt", - "test": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt", + "train": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/train.txt", + "valid": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/valid.txt", + "test": "https://raw.githubusercontent.com/pytorch/examples/master/" + "word_language_model/data/wikitext-2/test.txt", } self.save_dir = hpobench.config_file.data_dir / "wikitext" @@ -971,6 +974,11 @@ def load(self): self._download() self.X_train, self.X_valid, self.X_test = self._load() self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}') + print(self.X_train.shape) + + print(self.X_valid.shape) + + print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test def _download(self): @@ -993,17 +1001,22 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar import torch for data in self.urls: - if (self.tokenize_path / f'{data}.pt').exists(): - self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') - - else: - tokenized_data = self.corpus.tokenize(self.save_dir / "train.txt") - torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - - X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) - X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) - X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) - + # if (self.tokenize_path / f'{data}.pt').exists(): + # self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') + # else: + tokenized_data = self.corpus.tokenize(self.save_dir / f'{data}.txt') + torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') + + X_train = self.corpus.tokenize(self.save_dir / 'train.txt') + + X_valid = 
self.corpus.tokenize(self.save_dir / 'valid.txt') + + X_test = self.corpus.tokenize(self.save_dir / 'test.txt') + # + # X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) + # X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) + # X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) + print(len(self.corpus.dictionary)) return X_train, X_valid, X_test From 6e6af739857e8745423fbe3532b0848060336058 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 10:13:28 +0200 Subject: [PATCH 22/29] -make deterministic -report train and eval time separately in objective func -code formatting -added test file --- hpobench/benchmarks/mo/lm_benchmark.py | 97 ++++++++++++++------------ hpobench/dependencies/lm/model.py | 19 +++-- tests/test_wikitext.py | 35 ++++++++++ 3 files changed, 96 insertions(+), 55 deletions(-) create mode 100644 tests/test_wikitext.py diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index f22db78f..22ace808 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -19,6 +19,7 @@ import time import math import tqdm +import random __version__ = '0.0.1' @@ -34,7 +35,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs data_manager = LanguageModelDataManager(self.device) self.X_train, self.X_valid, self.X_test = data_manager.load() self.ntokens = len(data_manager.corpus.dictionary) - + self.__seed_everything() self.variable = {"eval_batch_size": 10, "nlayers": 2, "bptt": 35, @@ -45,6 +46,16 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs } print("len of corpus dict", self.ntokens) + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + print("seed obtained", seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: """Parameter space to be optimized --- contains the hyperparameters @@ -109,8 +120,6 @@ def init_model(self, config: Union[CS.Configuration, Dict]): if isinstance(config, CS.Configuration): config = config.get_dictionary() - - # all sublayers and embedding layers have same dim model = TransformerModel( self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -162,7 +171,10 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity : Dict used fidelities in this evaluation """ - self.rng = rng_helper.get_rng() + + self.rng = rng_helper.get_rng(rng) + self.__seed_everything() + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') ts_start = time.time() @@ -182,17 +194,19 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], learning_rate_factor = configuration['lr_factor'] clip = configuration['clip'] best_val_loss = None - train_eval_time = 0 + train_time = 0 + eval_time = 0 t = tqdm.tqdm(total=epochs) for epoch in range(epochs): epoch_start_time = time.time() - train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, clip) - val_loss, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) + train_loss, train_acc = 
model.train_fun(self.ntokens, criterion, train_data, learning_rate, clip) + train_time += time.time() - epoch_start_time + start = time.time() + val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) - - ts_now = time.time() - train_eval_time += ts_now - epoch_start_time + print("val acc for last epoch", val_acc) + eval_time += start - time.time() t.set_postfix(val_accuracy=val_acc) t.update() @@ -208,11 +222,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], learning_rate /= learning_rate_factor start_time = time.time() - _, val_acc = model.eval_fun(model, self.ntokens, criterion, val_data) - eval_valid_runtime = time.time() - start_time - - start_time = time.time() - _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) + _, test_acc = model.eval_fun(self.ntokens, criterion, test_data) eval_test_runtime = time.time() - start_time perplexity = math.exp(best_val_loss) @@ -221,18 +231,18 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = float(ts_start - time.time()) return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': val_acc, - 'time': train_eval_time + 'accuracy': val_acc.item(), + 'time': train_time + eval_time }, 'cost': elapsed_time, - 'info': {'train_accuracy': train_acc, - 'validation_accuracy': val_acc, - 'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc.item(), + 'validation_accuracy': val_acc.item(), + 'test_accuracy': test_acc.item(), 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, - 'training_cost': train_eval_time, - 'valid_cost': eval_valid_runtime, + 'training_cost': train_time, + 'valid_cost': eval_time, 'test_cost': eval_test_runtime, 'fidelity': fidelity } @@ -285,15 +295,17 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # The result dict should contain already all necessary information -> Just swap the function value from valid # to test and the corresponding time cost assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. 
' - ts_start = time.time() + self.rng = rng_helper.get_rng(rng) + self.__seed_everything() + # batchify data batch_size = configuration['batch_size'] - train_data = batchify(self.X_train, batch_size=batch_size).to(self.device) - val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(self.device) - - train_data = np.vstack(train_data, val_data) + train_data = batchify(self.X_train, batch_size=batch_size) + val_data = batchify(self.X_valid, batch_size=batch_size) + train_data = np.vstack((train_data, val_data)) + train_data = torch.tensor(train_data).to(self.device) test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device) epochs = fidelity['budget'] @@ -306,18 +318,19 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], learning_rate_factor = configuration['lr_factor'] clip = configuration['clip'] best_test_loss = None - train_eval_time = 0 + train_time = 0 + eval_time = 0 t = tqdm.tqdm(total=epochs) for epoch in range(1, epochs + 1): epoch_start_time = time.time() - train_loss, train_acc = model.train_fun(model, self.ntokens, criterion, train_data, learning_rate, - batch_size, clip) + train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate, + clip) + train_time += time.time() - epoch_start_time + start = time.time() - test_loss, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) + test_loss, test_acc = model.eval_fun(self.ntokens, criterion, test_data) test_loss = np.clip(test_loss, 1e-10, 10) - - ts_now = time.time() - train_eval_time += ts_now - epoch_start_time + eval_time += time.time() - start t.set_postfix(test_accuracy=test_acc) t.update() @@ -331,27 +344,23 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # Anneal the learning rate if no improvement has been seen in the validation dataset. 
learning_rate /= learning_rate_factor - start_time = time.time() - _, test_acc = model.eval_fun(model, self.ntokens, criterion, test_data) - eval_test_runtime = time.time() - start_time - perplexity = math.exp(best_test_loss) log_perplexity = best_test_loss neg_log_perplexity = 10 - best_test_loss elapsed_time = float(ts_start - time.time()) return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': test_acc, - 'time': train_eval_time + 'accuracy': test_acc.item(), + 'time': train_time + eval_time }, 'cost': elapsed_time, - 'info': {'train_accuracy': train_acc, - 'test_accuracy': test_acc, + 'info': {'train_accuracy': train_acc.item(), + 'test_accuracy': test_acc.item(), 'log_perplexity': log_perplexity, 'perplexity': perplexity, 'negative_log_perplexity': neg_log_perplexity, - 'training_cost': train_eval_time, - 'test_cost': eval_test_runtime, + 'training_cost': train_time, + 'test_cost': eval_time, 'fidelity': fidelity } } diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py index 8361c61f..4d9e8e97 100644 --- a/hpobench/dependencies/lm/model.py +++ b/hpobench/dependencies/lm/model.py @@ -27,9 +27,6 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) - print("shape pe",pe[:,0::2].shape) - - print("shape after pe",pe[:,1::2].shape) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0).transpose(0, 1) self.register_buffer('pe', pe) @@ -51,7 +48,7 @@ def forward(self, x): class TransformerModel(nn.Module): """Container module with an encoder, a transformer module, and a decoder.""" - def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35): + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35, rng=None): super(TransformerModel, self).__init__() try: from torch.nn import TransformerEncoder, TransformerEncoderLayer @@ -99,7 +96,7 @@ def forward(self, src, has_mask=True): output = self.decoder(output) return F.log_softmax(output, dim=-1) - def train_fun(self, model, ntokens, criterion, train_data, lr, clip): + def train_fun(self, ntokens, criterion, train_data, lr, clip): # Turn on training mode which enables dropout. self.train() total_loss = 0. @@ -108,8 +105,8 @@ def train_fun(self, model, ntokens, criterion, train_data, lr, clip): data, targets = self.get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. - model.zero_grad() - output = model(data) + self.zero_grad() + output = self(data) output_flat = output.view(-1, ntokens) loss = criterion(output_flat, targets) loss.backward() @@ -122,14 +119,14 @@ def train_fun(self, model, ntokens, criterion, train_data, lr, clip): total_acc += len(data) * accuracy # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 
- torch.nn.utils.clip_grad_norm_(model.parameters(), clip) - for p in model.parameters(): + torch.nn.utils.clip_grad_norm_(self.parameters(), clip) + for p in self.parameters(): p.data.add_(-lr, p.grad.data) avg_acc = total_acc / (len(train_data) - 1) return total_loss / (len(train_data) - 1), avg_acc - def eval_fun(self, model, ntokens, criterion, data_source): + def eval_fun(self, ntokens, criterion, data_source): # Turn on evaluation mode which disables dropout. self.eval() total_loss = 0. @@ -137,7 +134,7 @@ def eval_fun(self, model, ntokens, criterion, data_source): with torch.no_grad(): for i in range(0, data_source.size(0) - 1, self.bptt): data, targets = self.get_batch(data_source, i) - output = model(data) + output = self(data) output_flat = output.view(-1, ntokens) total_loss += len(data) * criterion(output_flat, targets).item() diff --git a/tests/test_wikitext.py b/tests/test_wikitext.py new file mode 100644 index 00000000..2f0d0866 --- /dev/null +++ b/tests/test_wikitext.py @@ -0,0 +1,35 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_wikitext_benchmark(): + from hpobench.benchmarks.mo.lm_benchmark import LanguageModelBenchmark + + # Check Seeding + benchmark = LanguageModelBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=1) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=1) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + print("cfg1", cfg_1) + print("cfg2", cfg_2) + + + test_config = { + 'batch_size': 144, 'clip': 1.458859796107597, 'dropout': 0.5967357423109274, + 'emsize': 575, 'lr': 5.245378070737081, 'lr_factor': 15 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + print("r1", result_1) + print("r2", result_2) + + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From 9767d5c1a6a514a002a40e971a112856c43fa007 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 11:18:27 +0200 Subject: [PATCH 23/29] -added lock for download data - added recipe and container file --- .../container/benchmarks/mo/lm_benchmark.py | 12 ++++++++ .../mo/Singularity.LanguageModelBenchmark | 30 +++++++++++++++++++ hpobench/util/data_manager.py | 18 ++--------- 3 files changed, 44 insertions(+), 16 deletions(-) create mode 100644 hpobench/container/benchmarks/mo/lm_benchmark.py create mode 100644 hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..f261b00d --- /dev/null +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Language Model Benchmark from hpobench/benchmarks/mo/lm_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class LanguageModelBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(LanguageModelBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark 
b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark new file mode 100644 index 00000000..8f364323 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark @@ -0,0 +1,30 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && mkdir data && cd data \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt \ + && cd /home \ + && git clone https://github.com/ayushi-3536/HPOBench.git \ + && cd HPOBench \ + && git checkout fair_adult \ + && pip install .[adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index 3edc05c1..c58ac575 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -981,8 +981,9 @@ def load(self): print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpobench.config_file.cache_dir}/language_model', delay=0.5) def _download(self): - for data in self.urls: if (self.save_dir / f'{data}.txt').exists(): self.logger.debug(f'LanguageModelDataManager : tokenized {data}.txt already exist') @@ -999,24 +1000,9 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_test: np.ndarray """ - import torch - for data in self.urls: - # if (self.tokenize_path / f'{data}.pt').exists(): - # self.logger.debug(f'LanguageModelDataManager : {data}.txt already exist') - # else: - tokenized_data = self.corpus.tokenize(self.save_dir / f'{data}.txt') - torch.save(tokenized_data, self.tokenize_path / f'{data}.pt') - X_train = self.corpus.tokenize(self.save_dir / 'train.txt') - X_valid = self.corpus.tokenize(self.save_dir / 'valid.txt') - X_test = self.corpus.tokenize(self.save_dir / 'test.txt') - # - # X_train = torch.load(self.tokenize_path / 'train.pt', map_location=self.device) - # X_valid = torch.load(self.tokenize_path / 'valid.pt', map_location=self.device) - # X_test = torch.load(self.tokenize_path / 'test.pt', map_location=self.device) - print(len(self.corpus.dictionary)) return X_train, X_valid, X_test From f28e7839ffe7fef73f098e89b3739b99884436a6 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 21 May 2022 18:03:07 +0200 Subject: [PATCH 24/29] -make emsize sampling log based: To be discussed with team - positional encoding doesn't work for odd number, therefor log seems like perfect solution -removed logs --- hpobench/benchmarks/mo/lm_benchmark.py | 2 +- tests/test_wikitext.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 22ace808..cee33c34 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -67,7 +67,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp 'batch_size', 
default_value=128, lower=8, upper=256 ), CS.UniformIntegerHyperparameter( - 'emsize', default_value=128, lower=32, upper=1024 + 'emsize', default_value=128, lower=32, upper=1024, log=True ), CS.UniformIntegerHyperparameter( 'lr_factor', default_value=50, lower=1, upper=100, log=True diff --git a/tests/test_wikitext.py b/tests/test_wikitext.py index 2f0d0866..727a8ea4 100644 --- a/tests/test_wikitext.py +++ b/tests/test_wikitext.py @@ -17,10 +17,6 @@ def test_wikitext_benchmark(): assert cfg_1 == cfg_2 - print("cfg1", cfg_1) - print("cfg2", cfg_2) - - test_config = { 'batch_size': 144, 'clip': 1.458859796107597, 'dropout': 0.5967357423109274, 'emsize': 575, 'lr': 5.245378070737081, 'lr_factor': 15 @@ -28,8 +24,5 @@ def test_wikitext_benchmark(): result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) - print("r1", result_1) - print("r2", result_2) - assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] From 8ad4eaf466f2cf48b9ca80e4ff49426fdb345a63 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 24 May 2022 14:57:33 +0200 Subject: [PATCH 25/29] -minor cleanup --- hpobench/benchmarks/mo/lm_benchmark.py | 29 +++++++++++-------- .../mo/Singularity.LanguageModelBenchmark | 6 ++-- hpobench/util/data_manager.py | 5 ---- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index cee33c34..a1ae7fbf 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -27,8 +27,17 @@ class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): - def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Tranformer based multi-objective language model benchmark + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmarks + + Transformer Model is based on : "https://arxiv.org/pdf/1706.03762.pdf" + """ super(LanguageModelBenchmark, self).__init__(rng=rng) self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -40,16 +49,14 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "nlayers": 2, "bptt": 35, "tied": True, - # number of attention head + # Number of attention head "nhead": 2, "ntoken": self.ntokens } - print("len of corpus dict", self.ntokens) def __seed_everything(self): """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" seed = self.rng.randint(0, 100000) - print("seed obtained", seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -86,7 +93,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp return cs @staticmethod - def get_objective_names(self) -> List[str]: + def get_objective_names() -> List[str]: return ['log_perplexity', 'accuracy', 'time'] @staticmethod @@ -205,12 +212,12 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], start = time.time() val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data) val_loss = np.clip(val_loss, 1e-10, 10) - print("val acc for last epoch", val_acc) eval_time += start - time.time() t.set_postfix(val_accuracy=val_acc) t.update() + # Taken from original experimental setup if not np.isfinite(val_loss): val_loss = 7 @@ -228,7 +235,7 @@ def 
objective_function(self, configuration: Union[CS.Configuration, Dict], perplexity = math.exp(best_val_loss) log_perplexity = best_val_loss neg_log_perplexity = 10 - best_val_loss - elapsed_time = float(ts_start - time.time()) + elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': val_acc.item(), @@ -292,9 +299,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], used fidelities in this evaluation """ - # The result dict should contain already all necessary information -> Just swap the function value from valid - # to test and the corresponding time cost - assert fidelity['epoch'] == 81, 'Only test data for the 50. epoch is available. ' + assert fidelity['epoch'] == 81, 'Only test data for the 81 epoch is available. ' ts_start = time.time() self.rng = rng_helper.get_rng(rng) @@ -347,7 +352,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], perplexity = math.exp(best_test_loss) log_perplexity = best_test_loss neg_log_perplexity = 10 - best_test_loss - elapsed_time = float(ts_start - time.time()) + elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, 'accuracy': test_acc.item(), @@ -365,4 +370,4 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], } } - __all__ = ["LanguageModelBenchmark"] + __all__ = ["LanguageModelBenchmark"] \ No newline at end of file diff --git a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark index 8f364323..770da7f9 100644 --- a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark +++ b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark @@ -17,8 +17,8 @@ VERSION v0.0.1 && cd /home \ && git clone https://github.com/ayushi-3536/HPOBench.git \ && cd HPOBench \ - && git checkout fair_adult \ - && pip install .[adult] \ + && git checkout wikitext \ + && pip install .[lm_benchmark] \ && cd / \ && mkdir /var/lib/hpobench/ \ && chmod -R 777 /var/lib/hpobench/ \ @@ -27,4 +27,4 @@ VERSION v0.0.1 %runscript - python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.lm_benchmark $@ \ No newline at end of file diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index c58ac575..cce04868 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -974,11 +974,6 @@ def load(self): self._download() self.X_train, self.X_valid, self.X_test = self._load() self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}') - print(self.X_train.shape) - - print(self.X_valid.shape) - - print(self.X_test.shape) return self.X_train, self.X_valid, self.X_test @lockutils.synchronized('not_thread_process_safe', external=True, From 4412b7000529ed272f2f4660938e429994b6cd50 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Tue, 24 May 2022 15:34:51 +0200 Subject: [PATCH 26/29] -minor cleanup --- hpobench/benchmarks/mo/lm_benchmark.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index a1ae7fbf..93c8b145 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -98,6 +98,21 @@ def get_objective_names() -> List[str]: 
@staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities: + - epoch: int + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ fidelity_space = CS.ConfigurationSpace(seed=seed) fidelity_space.add_hyperparameters([ @@ -124,9 +139,6 @@ def get_meta_information() -> Dict: def init_model(self, config: Union[CS.Configuration, Dict]): """ Function that returns the model initialized based on the configuration and fidelity """ - - if isinstance(config, CS.Configuration): - config = config.get_dictionary() model = TransformerModel( self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'], self.variable['nlayers'], config['dropout']) @@ -137,7 +149,6 @@ def init_model(self, config: Union[CS.Configuration, Dict]): def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, - shuffle: bool = False, **kwargs) -> Dict: """ @@ -179,7 +190,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], used fidelities in this evaluation """ - self.rng = rng_helper.get_rng(rng) + self.rng = rng_helper.get_rng(self.rng, rng) self.__seed_everything() device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -259,7 +270,6 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, - shuffle: bool = False, **kwargs) -> Dict: """ Get the validated results. Runs a given configuration on the largest budget (here: 50). @@ -302,7 +312,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], assert fidelity['epoch'] == 81, 'Only test data for the 81 epoch is available. 
' ts_start = time.time() - self.rng = rng_helper.get_rng(rng) + self.rng = rng_helper.get_rng(self.rng, rng) self.__seed_everything() # batchify data @@ -370,4 +380,4 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], } } - __all__ = ["LanguageModelBenchmark"] \ No newline at end of file + __all__ = ["LanguageModelBenchmark"] From c44bdfcaf16331670f18e3415b92898624f670ff Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 4 Jun 2022 19:36:31 +0200 Subject: [PATCH 27/29] -add gpu support -add dependency version --- extra_requirements/lm_benchmark.json | 2 +- hpobench/container/benchmarks/mo/lm_benchmark.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json index a686fe43..8f263bba 100644 --- a/extra_requirements/lm_benchmark.json +++ b/extra_requirements/lm_benchmark.json @@ -1,6 +1,6 @@ { "lm": [ "torch==1.3.0", - "tqdm" + "tqdm>=3.0.0" ] } \ No newline at end of file diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py index f261b00d..8e08fcae 100644 --- a/hpobench/container/benchmarks/mo/lm_benchmark.py +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -9,4 +9,5 @@ def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) super(LanguageModelBenchmark, self).__init__(**kwargs) From 130282ef66c918e1865e07d571511c11a75279d1 Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 4 Jun 2022 19:43:43 +0200 Subject: [PATCH 28/29] -add MO abstract client --- hpobench/container/benchmarks/mo/lm_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py index 8e08fcae..0f506c43 100644 --- a/hpobench/container/benchmarks/mo/lm_benchmark.py +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -1,10 +1,10 @@ """ Benchmark for the Multi-Objective Language Model Benchmark from hpobench/benchmarks/mo/lm_benchmark.py """ -from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient -class LanguageModelBenchmark(AbstractBenchmarkClient): +class LanguageModelBenchmark(AbstractMOBenchmarkClient): def __init__(self, **kwargs): kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') From f259291c7f07b3ea0acd83d111bc5c66fb33067e Mon Sep 17 00:00:00 2001 From: ayushi-3536 Date: Sat, 4 Jun 2022 19:49:41 +0200 Subject: [PATCH 29/29] - minimize objective values --- hpobench/benchmarks/mo/lm_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py index 93c8b145..fd016d01 100644 --- a/hpobench/benchmarks/mo/lm_benchmark.py +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -249,7 +249,7 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': val_acc.item(), + 'accuracy': 1 - val_acc.item(), 'time': train_time + eval_time }, 'cost': elapsed_time, @@ 
-365,7 +365,7 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], elapsed_time = ts_start - time.time() return {'function_value': {'log_perplexity': log_perplexity, - 'accuracy': test_acc.item(), + 'accuracy': 1 - test_acc.item(), 'time': train_time + eval_time }, 'cost': elapsed_time,
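
For reference, a minimal usage sketch of the benchmark introduced in this patch series, mirroring tests/test_wikitext.py above. It assumes the branch with these patches is installed together with the lm extra requirements (torch==1.3.0, tqdm>=3.0.0) and that the WikiText-2 files can be downloaded on first use; the fidelity value of 1 is illustrative only.

    from hpobench.benchmarks.mo.lm_benchmark import LanguageModelBenchmark

    # Instantiating the benchmark downloads and tokenizes WikiText-2 on first use.
    benchmark = LanguageModelBenchmark(rng=0)

    # Sample a configuration from the space defined in get_configuration_space().
    config = benchmark.get_configuration_space(seed=1).sample_configuration()

    # Evaluate on the smallest fidelity (1 epoch); 'budget' ranges from 1 to 81.
    result = benchmark.objective_function(config, rng=1, fidelity={'budget': 1})

    # Multi-objective values as listed by get_objective_names():
    # log_perplexity, accuracy, time.
    print(result['function_value'])
    print(result['info']['train_accuracy'], result['info']['validation_accuracy'])

Note that after the final patch in the series the 'accuracy' entry in 'function_value' is reported as 1 - accuracy so that all returned objectives are minimized; the raw train/validation/test accuracies remain available under 'info'.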