Remove the output directory argument (#1159)

franchuterivera · web-flow · commit ccb419ceb9dd · 2021-06-25T16:22:54.000+02:00
* Remove file output

* output folder

* don't remove everything

* Move to ubuntu latest

* fix binary

* revert latest
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -145,7 +145,6 @@ def __init__(self,
         self.configuration_space = None
         self._backend = backend
         # self._tmp_dir = tmp_dir
-        # self._output_dir = output_dir
         self._time_for_task = time_left_for_this_task
         self._per_run_time_limit = per_run_time_limit
         self._initial_configurations_via_metalearning = \
@@ -254,9 +253,6 @@ def __init__(self,
         # By default try to use the TCP logging port or get a new port
         self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
 
-        # After assigning and checking variables...
-        # self._backend = Backend(self._output_dir, self._tmp_dir)
-
         # Num_run tells us how many runs have been launched
         # It can be seen as an identifier for each configuration
         # saved to disk
@@ -573,7 +569,6 @@ def fit(
                 raise ValueError('Unable to read requirement: %s' % requirement)
         self._logger.debug('Done printing environment information')
         self._logger.debug('Starting to print arguments to auto-sklearn')
-        self._logger.debug('  output_folder: %s', self._backend.context._output_directory)
         self._logger.debug('  tmp_folder: %s', self._backend.context._temporary_directory)
         self._logger.debug('  time_left_for_this_task: %f', self._time_for_task)
         self._logger.debug('  per_run_time_limit: %f', self._per_run_time_limit)
diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py
@@ -1329,14 +1329,6 @@ def predict(self, set_: str,
             y = ensemble.predict(predictions)
             if self.task_type == BINARY_CLASSIFICATION:
                 y = y[:, 1]
-            if self.SAVE2DISC:
-                self.backend.save_predictions_as_txt(
-                    predictions=y,
-                    subset=set_,
-                    idx=index_run,
-                    prefix=self.dataset_name,
-                    precision=8,
-                )
             return y
         else:
             self.logger.info(
diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
@@ -39,9 +39,7 @@ def __init__(
         resampling_strategy='holdout',
         resampling_strategy_arguments=None,
         tmp_folder=None,
-        output_folder=None,
         delete_tmp_folder_after_terminate=True,
-        delete_output_folder_after_terminate=True,
         n_jobs: Optional[int] = None,
         dask_client: Optional[dask.distributed.Client] = None,
         disable_evaluator_output=False,
@@ -161,18 +159,10 @@ def __init__(
             folder to store configuration output and log files, if ``None``
             automatically use ``/tmp/autosklearn_tmp_$pid_$random_number``
 
-        output_folder : string, optional (None)
-            folder to store predictions for optional test set, if ``None``
-            no output will be generated
-
         delete_tmp_folder_after_terminate: bool, optional (True)
             remove tmp_folder, when finished. If tmp_folder is None
             tmp_dir will always be deleted
 
-        delete_output_folder_after_terminate: bool, optional (True)
-            remove output_folder, when finished. If output_folder is None
-            output_dir will always be deleted
-
         n_jobs : int, optional, experimental
             The number of jobs to run in parallel for ``fit()``. ``-1`` means
             using all processors. By default, Auto-sklearn uses a single core
@@ -263,9 +253,7 @@ def __init__(
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_arguments = resampling_strategy_arguments
         self.tmp_folder = tmp_folder
-        self.output_folder = output_folder
         self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
-        self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
         self.n_jobs = n_jobs
         self.dask_client = dask_client
         self.disable_evaluator_output = disable_evaluator_output
@@ -299,9 +287,7 @@ def build_automl(self):
 
         backend = create(
             temporary_directory=self.tmp_folder,
-            output_directory=self.output_folder,
             delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
-            delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
             )
 
         automl = self._get_automl_class()(
diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py
@@ -170,9 +170,7 @@ def __init__(
         seed: int = 1,
         memory_limit: int = 3072,
         tmp_folder: Optional[str] = None,
-        output_folder: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
-        delete_output_folder_after_terminate: bool = True,
         n_jobs: Optional[int] = None,
         dask_client: Optional[dask.distributed.Client] = None,
         disable_evaluator_output: bool = False,
@@ -230,18 +228,10 @@ def __init__(
             folder to store configuration output and log files, if ``None``
             automatically use ``/tmp/autosklearn_tmp_$pid_$random_number``
 
-        output_folder : string, optional (None)
-            folder to store predictions for optional test set, if ``None``
-            no output will be generated
-
         delete_tmp_folder_after_terminate: bool, optional (True)
             remove tmp_folder, when finished. If tmp_folder is None
             tmp_dir will always be deleted
 
-        delete_output_folder_after_terminate: bool, optional (True)
-            remove output_folder, when finished. If output_folder is None
-            output_dir will always be deleted
-
         n_jobs : int, optional, experimental
             The number of jobs to run in parallel for ``fit()``. ``-1`` means
             using all processors. By default, Auto-sklearn uses a single core
@@ -324,9 +314,7 @@ def __init__(
             resampling_strategy=None,
             resampling_strategy_arguments=None,
             tmp_folder=tmp_folder,
-            output_folder=output_folder,
             delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
-            delete_output_folder_after_terminate=delete_output_folder_after_terminate,
             n_jobs=n_jobs,
             dask_client=dask_client,
             disable_evaluator_output=disable_evaluator_output,
diff --git a/autosklearn/util/backend.py b/autosklearn/util/backend.py
@@ -24,13 +24,10 @@
 
 def create(
     temporary_directory: str,
-    output_directory: Optional[str],
     delete_tmp_folder_after_terminate: bool = True,
-    delete_output_folder_after_terminate: bool = True,
 ) -> 'Backend':
-    context = BackendContext(temporary_directory, output_directory,
+    context = BackendContext(temporary_directory,
                              delete_tmp_folder_after_terminate,
-                             delete_output_folder_after_terminate,
                              )
     backend = Backend(context)
 
@@ -58,28 +55,18 @@ class BackendContext(object):
 
     def __init__(self,
                  temporary_directory: str,
-                 output_directory: Optional[str],
                  delete_tmp_folder_after_terminate: bool,
-                 delete_output_folder_after_terminate: bool,
                  ):
 
-        # Check that the names of tmp_dir and output_dir is not the same.
-        if temporary_directory == output_directory and temporary_directory is not None:
-            raise ValueError("The temporary and the output directory "
-                             "must be different.")
-
         self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
-        self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
         # attributes to check that directories were created by autosklearn.
         self._tmp_dir_created = False
-        self._output_dir_created = False
 
         self._temporary_directory = (
             get_randomized_directory_name(
                 temporary_directory=temporary_directory,
             )
         )
-        self._output_directory = output_directory
         # Auto-Sklearn logs through the use of a PicklableClientLogger
         # For this reason we need a port to communicate with the server
         # When the backend is created, this port is not available
@@ -94,14 +81,6 @@ def setup_logger(self, port: int) -> None:
             port=port,
         )
 
-    @property
-    def output_directory(self) -> Optional[str]:
-        if self._output_directory is not None:
-            # make sure that tilde does not appear on the path.
-            return os.path.expanduser(os.path.expandvars(self._output_directory))
-        else:
-            return None
-
     @property
     def temporary_directory(self) -> str:
         # make sure that tilde does not appear on the path.
@@ -112,29 +91,7 @@ def create_directories(self) -> None:
         os.makedirs(self.temporary_directory)
         self._tmp_dir_created = True
 
-        # Exception is raised if self.output_directory already exists.
-        if self.output_directory is not None:
-            os.makedirs(self.output_directory)
-            self._output_dir_created = True
-
     def delete_directories(self, force: bool = True) -> None:
-        if self.output_directory and (self.delete_output_folder_after_terminate or force):
-            if self._output_dir_created is False:
-                raise ValueError("Failed to delete output dir: %s because auto-sklearn did not "
-                                 "create it. Please make sure that the specified output dir does "
-                                 "not exist when instantiating auto-sklearn."
-                                 % self.output_directory)
-            try:
-                shutil.rmtree(self.output_directory)
-            except Exception:
-                try:
-                    if self._logger is not None:
-                        self._logger.warning("Could not delete output dir: %s" %
-                                             self.output_directory)
-                    else:
-                        print("Could not delete output dir: %s" % self.output_directory)
-                except Exception:
-                    print("Could not delete output dir: %s" % self.output_directory)
 
         if self.delete_tmp_folder_after_terminate or force:
             if self._tmp_dir_created is False:
@@ -175,10 +132,6 @@ def __init__(self, context: BackendContext):
             os.makedirs(self.temporary_directory)
         except Exception:
             pass
-        # This does not have to exist or be specified
-        if self.output_directory is not None:
-            if not os.path.exists(self.output_directory):
-                raise ValueError("Output directory %s does not exist." % self.output_directory)
 
         self.internals_directory = os.path.join(self.temporary_directory, ".auto-sklearn")
         self._make_internals_directory()
@@ -190,10 +143,6 @@ def setup_logger(self, port: int) -> None:
         )
         self.context.setup_logger(port)
 
-    @property
-    def output_directory(self) -> Optional[str]:
-        return self.context.output_directory
-
     @property
     def temporary_directory(self) -> str:
         return self.context.temporary_directory
@@ -466,31 +415,6 @@ def get_prediction_filename(self, subset: str,
                                 ) -> str:
         return 'predictions_%s_%s_%s_%s.npy' % (subset, automl_seed, idx, budget)
 
-    def save_predictions_as_txt(self,
-                                predictions: np.ndarray,
-                                subset: str,
-                                idx: int, precision: int,
-                                prefix: Optional[str] = None) -> None:
-        if not self.output_directory:
-            return
-        # Write prediction scores in prescribed format
-        filepath = os.path.join(
-            self.output_directory,
-            ('%s_' % prefix if prefix else '') + '%s_%s.predict' % (subset, str(idx)),
-        )
-
-        format_string = '{:.%dg} ' % precision
-        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(
-                filepath), delete=False) as output_file:
-            for row in predictions:
-                if not isinstance(row, np.ndarray) and not isinstance(row, list):
-                    row = [row]
-                for val in row:
-                    output_file.write(format_string.format(float(val)))
-                output_file.write('\n')
-            tempname = output_file.name
-        os.rename(tempname, filepath)
-
     def write_txt_file(self, filepath: str, data: str, name: str) -> None:
         with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(
                 filepath), delete=False) as fh:
diff --git a/examples/20_basic/example_classification.py b/examples/20_basic/example_classification.py
@@ -29,7 +29,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_classification_example_tmp',
-    output_folder='/tmp/autosklearn_classification_example_out',
 )
 automl.fit(X_train, y_train, dataset_name='breast_cancer')
 
diff --git a/examples/20_basic/example_multioutput_regression.py b/examples/20_basic/example_multioutput_regression.py
@@ -32,7 +32,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp',
-    output_folder='/tmp/autosklearn_multioutput_regression_example_out',
 )
 automl.fit(X_train, y_train, dataset_name='synthetic')
 
diff --git a/examples/20_basic/example_regression.py b/examples/20_basic/example_regression.py
@@ -30,7 +30,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_regression_example_tmp',
-    output_folder='/tmp/autosklearn_regression_example_out',
 )
 automl.fit(X_train, y_train, dataset_name='diabetes')
 
diff --git a/examples/40_advanced/example_resampling.py b/examples/40_advanced/example_resampling.py
@@ -33,7 +33,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_resampling_example_tmp',
-    output_folder='/tmp/autosklearn_resampling_example_out',
     disable_evaluator_output=False,
     # 'holdout' with 'train_size'=0.67 is the default argument setting
     # for AutoSklearnClassifier. It is explicitly specified in this example
@@ -59,7 +58,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_resampling_example_tmp',
-    output_folder='/tmp/autosklearn_resampling_example_out',
     disable_evaluator_output=False,
     resampling_strategy='cv',
     resampling_strategy_arguments={'folds': 5},
@@ -107,7 +105,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_resampling_example_tmp',
-    output_folder='/tmp/autosklearn_resampling_example_out',
     disable_evaluator_output=False,
     resampling_strategy=resampling_strategy,
     resampling_strategy_arguments=resampling_strategy_arguments,
diff --git a/examples/60_search/example_parallel_manual_spawning_cli.py b/examples/60_search/example_parallel_manual_spawning_cli.py
@@ -63,7 +63,6 @@
 from autosklearn.constants import MULTICLASS_CLASSIFICATION
 
 tmp_folder = '/tmp/autosklearn_parallel_3_example_tmp'
-output_folder = '/tmp/autosklearn_parallel_3_example_out'
 
 worker_processes = []
 
@@ -180,7 +179,6 @@ def cli_start_worker(scheduler_file_name):
         per_run_time_limit=10,
         memory_limit=1024,
         tmp_folder=tmp_folder,
-        output_folder=output_folder,
         seed=777,
         # n_jobs is ignored internally as we pass a dask client.
         n_jobs=1,
diff --git a/examples/60_search/example_parallel_manual_spawning_python.py b/examples/60_search/example_parallel_manual_spawning_python.py
@@ -57,7 +57,6 @@
 from autosklearn.constants import MULTICLASS_CLASSIFICATION
 
 tmp_folder = '/tmp/autosklearn_parallel_2_example_tmp'
-output_folder = '/tmp/autosklearn_parallel_2_example_out'
 
 
 ############################################################################
@@ -128,7 +127,6 @@ async def do_work():
                 per_run_time_limit=10,
                 memory_limit=1024,
                 tmp_folder=tmp_folder,
-                output_folder=output_folder,
                 seed=777,
                 # n_jobs is ignored internally as we pass a dask client.
                 n_jobs=1,
diff --git a/examples/60_search/example_parallel_n_jobs.py b/examples/60_search/example_parallel_n_jobs.py
@@ -41,7 +41,6 @@
         time_left_for_this_task=120,
         per_run_time_limit=30,
         tmp_folder='/tmp/autosklearn_parallel_1_example_tmp',
-        output_folder='/tmp/autosklearn_parallel_1_example_out',
         n_jobs=4,
         # Each one of the 4 jobs is allocated 3GB
         memory_limit=3072,
diff --git a/examples/60_search/example_random_search.py b/examples/60_search/example_random_search.py
@@ -4,8 +4,8 @@
 =============
 
 A crucial feature of *auto-sklearn* is automatically optimizing the hyperparameters through SMAC,
-introduced `here <https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf>`_. 
-Additionally, it is possible to use 
+introduced `here <https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf>`_.
+Additionally, it is possible to use
 `random search <https://www.jmlr.org/papers/v13/bergstra12a.html>`_ instead of
 SMAC, as demonstrated in the example below. Furthermore, the example also demonstrates how to use
 `Random Online Aggressive Racing (ROAR) <https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf>`_
@@ -65,7 +65,6 @@ def get_roar_object_callback(
 automl = autosklearn.classification.AutoSklearnClassifier(
     time_left_for_this_task=60, per_run_time_limit=15,
     tmp_folder='/tmp/autosklearn_random_search_example_tmp',
-    output_folder='/tmp/autosklearn_random_search_example_out',
     get_smac_object_callback=get_roar_object_callback,
     initial_configurations_via_metalearning=0,
 )
@@ -118,7 +117,6 @@ def get_random_search_object_callback(
     time_left_for_this_task=60,
     per_run_time_limit=15,
     tmp_folder='/tmp/autosklearn_random_search_example_tmp',
-    output_folder='/tmp/autosklearn_random_search_example_out',
     get_smac_object_callback=get_random_search_object_callback,
     initial_configurations_via_metalearning=0,
 )
diff --git a/examples/60_search/example_sequential.py b/examples/60_search/example_sequential.py
@@ -32,7 +32,6 @@
     time_left_for_this_task=120,
     per_run_time_limit=30,
     tmp_folder='/tmp/autosklearn_sequential_example_tmp',
-    output_folder='/tmp/autosklearn_sequential_example_out',
     # Do not construct ensembles in parallel to avoid using more than one
     # core at a time. The ensemble will be constructed after auto-sklearn
     # finished fitting all machine learning models.
diff --git a/examples/60_search/example_successive_halving.py b/examples/60_search/example_successive_halving.py
diff --git a/scripts/2015_nips_paper/run/score_ensemble.py b/scripts/2015_nips_paper/run/score_ensemble.py
diff --git a/test/conftest.py b/test/conftest.py
diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py
diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,6 @@`
`29`	`29`	`time_left_for_this_task=120,`
`30`	`30`	`per_run_time_limit=30,`
`31`	`31`	`tmp_folder='/tmp/autosklearn_classification_example_tmp',`
`32`		`- output_folder='/tmp/autosklearn_classification_example_out',`
`33`	`32`	`)`
`34`	`33`	`automl.fit(X_train, y_train, dataset_name='breast_cancer')`
`35`	`34`
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,6 @@`
`32`	`32`	`time_left_for_this_task=120,`
`33`	`33`	`per_run_time_limit=30,`
`34`	`34`	`tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp',`
`35`		`- output_folder='/tmp/autosklearn_multioutput_regression_example_out',`
`36`	`35`	`)`
`37`	`36`	`automl.fit(X_train, y_train, dataset_name='synthetic')`
`38`	`37`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,6 @@`
`30`	`30`	`time_left_for_this_task=120,`
`31`	`31`	`per_run_time_limit=30,`
`32`	`32`	`tmp_folder='/tmp/autosklearn_regression_example_tmp',`
`33`		`- output_folder='/tmp/autosklearn_regression_example_out',`
`34`	`33`	`)`
`35`	`34`	`automl.fit(X_train, y_train, dataset_name='diabetes')`
`36`	`35`