
Commit f94672e

Measuring runtimes (#1031)
* [skip ci] addressing #248
* Unit test to test existence of refit time
* Refactoring unit test
* Fixing unit test failures
* Unit test fixing + removing redundant parameter
* Debugging stochastic failure of test_joblib_backends unit test
* Unit test fix with decorators
* Flaky for failing unit test
* Adding flaky reruns for unit tests
* Fixing setup bug
* pytest rerun debug
* Fixing coverage failure
* Debugging coverage failure
* Debugging coverage failure
* Adding __init__ files in test/ for pytest-cov
* Debugging coverage failure
* Debugging lean unit test
* Debugging loky failure in unit tests
* Clean up of debugging stuff
1 parent 4aec00a commit f94672e

File tree: 10 files changed, +49 −46 lines


.github/workflows/ubuntu-test.yml

Lines changed: 5 additions & 2 deletions
@@ -29,6 +29,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
+      with:
+        fetch-depth: 2
     - name: Setup Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
@@ -51,7 +53,7 @@ jobs:
     - name: Run tests
       run: |
         if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
-        pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov
+        pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov --reruns 5 --reruns-delay 1
     - name: Check for files left behind by test
       if: ${{ always() }}
       run: |
@@ -67,5 +69,6 @@ jobs:
       if: matrix.code-cov && always()
       uses: codecov/codecov-action@v1
       with:
+        files: coverage.xml
         fail_ci_if_error: true
-        verbose: true
+        verbose: true
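The `--reruns 5 --reruns-delay 1` flags come from the pytest-rerunfailures plugin that this commit adds to setup.py: a failing test is retried up to five times, with a one-second pause between attempts, before being reported as a failure. A minimal sketch of the same behaviour expressed as a per-test marker; the test itself is hypothetical and not part of this commit:

# Hypothetical test illustrating pytest-rerunfailures; the marker arguments
# mirror the CLI flags added to the workflow above.
import random

import pytest


@pytest.mark.flaky(reruns=5, reruns_delay=1)  # per-test override of the CLI defaults
def test_occasionally_fails():
    # An intermittently failing assertion is retried up to 5 times with a
    # 1-second delay before pytest reports it as a failure.
    assert random.random() < 0.9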

openml/config.py

Lines changed: 9 additions & 9 deletions
@@ -211,15 +211,6 @@ def _setup(config=None):
     else:
         cache_exists = True
 
-    if cache_exists:
-        _create_log_handlers()
-    else:
-        _create_log_handlers(create_file_handler=False)
-        openml_logger.warning(
-            "No permission to create OpenML directory at %s! This can result in OpenML-Python "
-            "not working properly." % config_dir
-        )
-
     if config is None:
         config = _parse_config(config_file)
 
@@ -240,6 +231,15 @@ def _get(config, key):
     connection_n_retries = int(_get(config, "connection_n_retries"))
     max_retries = int(_get(config, "max_retries"))
 
+    if cache_exists:
+        _create_log_handlers()
+    else:
+        _create_log_handlers(create_file_handler=False)
+        openml_logger.warning(
+            "No permission to create OpenML directory at %s! This can result in OpenML-Python "
+            "not working properly." % config_dir
+        )
+
     cache_directory = os.path.expanduser(short_cache_dir)
     # create the cache subdirectory
     if not os.path.exists(cache_directory):
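This change defers the log-handler setup (and the "no permission" warning) until after the configuration file has been parsed instead of issuing it beforehand. For reference, a generic sketch of the conditional file-handler pattern behind `_create_log_handlers(create_file_handler=False)`; the function below is illustrative and not OpenML's actual implementation:

# Illustrative only: a console handler is always attached, a file handler only
# when the target directory is writable. Names here are hypothetical.
import logging
import os


def create_log_handlers(logger: logging.Logger, log_dir: str, create_file_handler: bool = True) -> None:
    logger.addHandler(logging.StreamHandler())  # console logging always works
    if create_file_handler and os.access(log_dir, os.W_OK):
        # Write a log file only when the directory can actually be written to.
        logger.addHandler(logging.FileHandler(os.path.join(log_dir, "openml_python.log")))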

openml/extensions/sklearn/extension.py

Lines changed: 2 additions & 0 deletions
@@ -1744,6 +1744,8 @@ def _prediction_to_probabilities(
             user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime
 
         modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000
+        if hasattr(model_copy, "refit_time_"):
+            modelfit_dur_walltime += model_copy.refit_time_
         if can_measure_wallclocktime:
             user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime
 
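The new branch folds scikit-learn's refit_time_ into the measured training wall-clock time. Search estimators such as GridSearchCV and RandomizedSearchCV (scikit-learn 0.20 and later, with refit=True) record, in seconds, how long refitting the best model on the full training set took, and the attribute only exists after fit. A standalone sketch of the attribute being read here; the dataset and search space are arbitrary and not part of this commit:

# Arbitrary example showing where refit_time_ comes from.
import sklearn.datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = sklearn.datasets.load_iris(return_X_y=True)
search = GridSearchCV(RandomForestClassifier(), {"n_estimators": [5, 10]}, cv=3)
search.fit(X, y)

# refit_time_ is reported in seconds and is only present on fitted search
# objects (scikit-learn >= 0.20) created with refit=True (the default).
print(hasattr(search, "refit_time_"), search.refit_time_)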

openml/runs/functions.py

Lines changed: 7 additions & 10 deletions
@@ -271,7 +271,6 @@ def run_flow_on_task(
 
     # execute the run
     res = _run_task_get_arffcontent(
-        flow=flow,
         model=flow.model,
         task=task,
         extension=flow.extension,
@@ -432,7 +431,6 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
 
 
 def _run_task_get_arffcontent(
-    flow: OpenMLFlow,
     model: Any,
     task: OpenMLTask,
     extension: "Extension",
@@ -476,7 +474,6 @@ def _run_task_get_arffcontent(
     job_rvals = Parallel(verbose=0, n_jobs=n_jobs)(
         delayed(_run_task_get_arffcontent_parallel_helper)(
            extension=extension,
-            flow=flow,
            fold_no=fold_no,
            model=model,
            rep_no=rep_no,
@@ -613,7 +610,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
 
 def _run_task_get_arffcontent_parallel_helper(
     extension: "Extension",
-    flow: OpenMLFlow,
     fold_no: int,
     model: Any,
     rep_no: int,
@@ -661,12 +657,13 @@ def _run_task_get_arffcontent_parallel_helper(
     else:
         raise NotImplementedError(task.task_type)
     config.logger.info(
-        "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
-        flow.name,
-        task.task_id,
-        rep_no,
-        fold_no,
-        sample_no,
+        "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format(
+            str(model),
+            openml.datasets.get_dataset(task.dataset_id).name,
+            rep_no,
+            fold_no,
+            sample_no,
+        )
     )
     pred_y, proba_y, user_defined_measures_fold, trace, = extension._run_model_on_fold(
         model=model,
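The log message now names the model and the dataset instead of the flow, which is what allows the flow argument to be dropped from these helpers; resolving a human-readable dataset name goes through openml.datasets.get_dataset(task.dataset_id). A small usage sketch of that lookup; the task id is an arbitrary example, not taken from this commit:

# Arbitrary example of how the new log line resolves a dataset name.
import openml

task = openml.tasks.get_task(31)  # any existing task id works here
dataset_name = openml.datasets.get_dataset(task.dataset_id).name
print("Going to run model {} on dataset {}".format("SGDClassifier()", dataset_name))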

setup.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@
             "flaky",
             "pre-commit",
             "pytest-cov",
+            "pytest-rerunfailures",
             "mypy",
         ],
         "examples": [

tests/test_evaluations/__init__.py

Whitespace-only changes.

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 10 additions & 3 deletions
@@ -1254,7 +1254,7 @@ def test_paralizable_check(self):
         # using this param distribution should raise an exception
         illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
         # using this param distribution should not raise an exception
-        legal_param_dist = {"base__max_depth": [2, 3, 4]}
+        legal_param_dist = {"n_estimators": [2, 3, 4]}
 
         legal_models = [
             sklearn.ensemble.RandomForestClassifier(),
@@ -1282,12 +1282,19 @@ def test_paralizable_check(self):
 
         can_measure_cputime_answers = [True, False, False, True, False, False, True, False, False]
         can_measure_walltime_answers = [True, True, False, True, True, False, True, True, False]
+        if LooseVersion(sklearn.__version__) < "0.20":
+            has_refit_time = [False, False, False, False, False, False, False, False, False]
+        else:
+            has_refit_time = [False, False, False, False, False, False, True, True, False]
 
-        for model, allowed_cputime, allowed_walltime in zip(
-            legal_models, can_measure_cputime_answers, can_measure_walltime_answers
+        X, y = sklearn.datasets.load_iris(return_X_y=True)
+        for model, allowed_cputime, allowed_walltime, refit_time in zip(
+            legal_models, can_measure_cputime_answers, can_measure_walltime_answers, has_refit_time
         ):
             self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime)
             self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime)
+            model.fit(X, y)
+            self.assertEqual(refit_time, hasattr(model, "refit_time_"))
 
         for model in illegal_models:
             with self.assertRaises(PyOpenMLError):

tests/test_runs/test_run_functions.py

Lines changed: 11 additions & 20 deletions
@@ -10,6 +10,7 @@
 import unittest.mock
 
 import numpy as np
+import joblib
 from joblib import parallel_backend
 
 import openml
@@ -1187,13 +1188,10 @@ def test__run_task_get_arffcontent(self):
         num_folds = 10
         num_repeats = 1
 
-        flow = unittest.mock.Mock()
-        flow.name = "dummy"
         clf = make_pipeline(
             OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1)
         )
         res = openml.runs.functions._run_task_get_arffcontent(
-            flow=flow,
             extension=self.extension,
             model=clf,
             task=task,
@@ -1404,8 +1402,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
         # Check that _run_task_get_arffcontent works when one of the class
         # labels only declared in the arff file, but is not present in the
         # actual data
-        flow = unittest.mock.Mock()
-        flow.name = "dummy"
         task = openml.tasks.get_task(2)  # anneal; crossvalidation
 
         from sklearn.compose import ColumnTransformer
@@ -1420,7 +1416,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
         )  # build a sklearn classifier
 
         data_content, _, _, _ = _run_task_get_arffcontent(
-            flow=flow,
             model=model,
             task=task,
             extension=self.extension,
@@ -1442,8 +1437,6 @@ def test_run_on_dataset_with_missing_labels_array(self):
         # Check that _run_task_get_arffcontent works when one of the class
         # labels only declared in the arff file, but is not present in the
         # actual data
-        flow = unittest.mock.Mock()
-        flow.name = "dummy"
         task = openml.tasks.get_task(2)  # anneal; crossvalidation
         # task_id=2 on test server has 38 columns with 6 numeric columns
         cont_idx = [3, 4, 8, 32, 33, 34]
@@ -1465,7 +1458,6 @@ def test_run_on_dataset_with_missing_labels_array(self):
         )  # build a sklearn classifier
 
         data_content, _, _, _ = _run_task_get_arffcontent(
-            flow=flow,
             model=model,
             task=task,
             extension=self.extension,
@@ -1581,20 +1573,18 @@ def test_format_prediction_task_regression(self):
         LooseVersion(sklearn.__version__) < "0.21",
         reason="couldn't perform local tests successfully w/o bloating RAM",
     )
-    @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._run_model_on_fold")
+    @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs")
     def test__run_task_get_arffcontent_2(self, parallel_mock):
         """ Tests if a run executed in parallel is collated correctly. """
         task = openml.tasks.get_task(7)  # Supervised Classification on kr-vs-kp
         x, y = task.get_X_and_y(dataset_format="dataframe")
         num_instances = x.shape[0]
         line_length = 6 + len(task.class_labels)
-        flow = unittest.mock.Mock()
-        flow.name = "dummy"
         clf = SGDClassifier(loss="log", random_state=1)
         n_jobs = 2
-        with parallel_backend("loky", n_jobs=n_jobs):
+        backend = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing"
+        with parallel_backend(backend, n_jobs=n_jobs):
             res = openml.runs.functions._run_task_get_arffcontent(
-                flow=flow,
                 extension=self.extension,
                 model=clf,
                 task=task,
@@ -1606,6 +1596,9 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
         # function _run_model_on_fold is being mocked out. However, for a new spawned worker, it
         # is not and the mock call_count should remain 0 while the subsequent check of actual
         # results should also hold, only on successful distribution of tasks to workers.
+        # The _prevent_optimize_n_jobs() is a function executed within the _run_model_on_fold()
+        # block and mocking this function doesn't affect rest of the pipeline, but is adequately
+        # indicative if _run_model_on_fold() is being called or not.
         self.assertEqual(parallel_mock.call_count, 0)
         self.assertIsInstance(res[0], list)
         self.assertEqual(len(res[0]), num_instances)
@@ -1638,13 +1631,12 @@ def test_joblib_backends(self, parallel_mock):
         x, y = task.get_X_and_y(dataset_format="dataframe")
         num_instances = x.shape[0]
         line_length = 6 + len(task.class_labels)
-        flow = unittest.mock.Mock()
-        flow.name = "dummy"
 
+        backend_choice = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing"
         for n_jobs, backend, len_time_stats, call_count in [
-            (1, "loky", 7, 10),
-            (2, "loky", 4, 10),
-            (-1, "loky", 1, 10),
+            (1, backend_choice, 7, 10),
+            (2, backend_choice, 4, 10),
+            (-1, backend_choice, 1, 10),
             (1, "threading", 7, 20),
             (-1, "threading", 1, 30),
             (1, "sequential", 7, 40),
@@ -1668,7 +1660,6 @@ def test_joblib_backends(self, parallel_mock):
             )
             with parallel_backend(backend, n_jobs=n_jobs):
                 res = openml.runs.functions._run_task_get_arffcontent(
-                    flow=flow,
                     extension=self.extension,
                     model=clf,
                     task=task,
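The tests now pick the joblib backend at runtime because the loky backend is only available in joblib releases newer than 0.11; older versions fall back to multiprocessing. A standalone sketch of the same selection outside the test suite; the parallel workload (squaring integers) is a placeholder:

# Placeholder workload demonstrating the version-dependent backend choice.
from distutils.version import LooseVersion

import joblib
from joblib import Parallel, delayed, parallel_backend


def square(x):
    return x * x


if __name__ == "__main__":
    # loky ships with joblib > 0.11; earlier releases fall back to multiprocessing.
    backend = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing"
    with parallel_backend(backend, n_jobs=2):
        results = Parallel()(delayed(square)(i) for i in range(10))
    print(backend, results)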

tests/test_study/__init__.py

Whitespace-only changes.

tests/test_study/test_study_functions.py

Lines changed: 4 additions & 2 deletions
@@ -4,6 +4,7 @@
 import openml.study
 from openml.testing import TestBase
 import pandas as pd
+import pytest
 
 
 class TestStudyFunctions(TestBase):
@@ -113,6 +114,7 @@ def test_publish_benchmark_suite(self):
         self.assertEqual(study_downloaded.status, "deactivated")
         # can't delete study, now it's not longer in preparation
 
+    @pytest.mark.flaky()
     def test_publish_study(self):
         # get some random runs to attach
         run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10)
@@ -133,8 +135,8 @@ def test_publish_study(self):
             run_ids=list(run_list.keys()),
         )
         study.publish()
-        # not tracking upload for delete since _delete_entity called end of function
-        # asserting return status from openml.study.delete_study()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id))
         self.assertGreater(study.id, 0)
         study_downloaded = openml.study.get_study(study.id)
         self.assertEqual(study_downloaded.alias, fixt_alias)
