Reinstantiate model if needed. Better errors if that is not possible. (#722)

PGijsbers · mfeurer · commit 692af9728a3a · 2019-07-09T15:27:22.000+02:00
* Clearer error messages when a model cannot be reinstantiated. Automatically reinstantiate the flow's model, if possible, when `run_flow_on_task` is called.

* Updated changelog.

* Fix unit test mistakes.

* Check error message with regex.
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -8,6 +8,7 @@ Changelog
 
 0.10.0
 ~~~~~~
+* ADD #722: Automatic reinstantiation of flow in `run_flow_on_task`. Clearer errors if that's not possible.
 * FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
 * ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
 * FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset.
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -132,7 +132,15 @@ def __init__(self, name, description, model, components, parameters,
         self.dependencies = dependencies
         self.flow_id = flow_id
 
-        self.extension = get_extension_by_flow(self)
+        self._extension = get_extension_by_flow(self)
+
+    @property
+    def extension(self):
+        if self._extension is not None:
+            return self._extension
+        else:
+            raise RuntimeError("No extension could be found for flow {}: {}"
+                               .format(self.flow_id, self.name))
 
     def __str__(self):
         header = "OpenML Flow"
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -92,7 +92,6 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow:
 
     if reinstantiate:
         flow.model = flow.extension.flow_to_model(flow)
-
     return flow
 
 
@@ -360,7 +359,7 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                 assert_flows_equal(attr1[name], attr2[name],
                                    ignore_parameter_values_on_older_children,
                                    ignore_parameter_values)
-        elif key == 'extension':
+        elif key == '_extension':
             continue
         else:
             if key == 'parameters':
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -171,6 +171,8 @@ def run_flow_on_task(
     if task.task_id is None:
         raise ValueError("The task should be published at OpenML")
 
+    if flow.model is None:
+        flow.model = flow.extension.flow_to_model(flow)
     flow.model = flow.extension.seed_model(flow.model, seed=seed)
 
     # We only need to sync with the server right now if we want to upload the flow,
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
@@ -256,3 +256,27 @@ def test_sklearn_to_flow_list_of_lists(self):
         server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
         self.assertEqual(server_flow.parameters['categories'], '[[0, 1], [0, 1]]')
         self.assertEqual(server_flow.model.categories, flow.model.categories)
+
+    def test_get_flow_reinstantiate_model(self):
+        model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
+        extension = openml.extensions.get_extension_by_model(model)
+        flow = extension.model_to_flow(model)
+        flow.publish(raise_error_if_exists=False)
+
+        downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
+        self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
+
+    def test_get_flow_reinstantiate_model_no_extension(self):
+        # Flow 10 is a WEKA flow
+        self.assertRaisesRegex(RuntimeError,
+                               "No extension could be found for flow 10: weka.SMO",
+                               openml.flows.get_flow,
+                               flow_id=10,
+                               reinstantiate=True)
+
+    @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.20.0",
+                     reason="No non-0.20 scikit-learn flow known.")
+    def test_get_flow_reinstantiate_model_wrong_version(self):
+        # 20 is scikit-learn ==0.20.0
+        # I can't find a != 0.20 permanent flow on the test server.
+        self.assertRaises(ValueError, openml.flows.get_flow, flow_id=20, reinstantiate=True)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -1259,12 +1259,12 @@ def test_get_uncached_run(self):
         with self.assertRaises(openml.exceptions.OpenMLCacheException):
             openml.runs.functions._get_cached_run(10)
 
-    def test_run_model_on_task_downloaded_flow(self):
+    def test_run_flow_on_task_downloaded_flow(self):
         model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
         flow = self.extension.model_to_flow(model)
         flow.publish(raise_error_if_exists=False)
 
-        downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
+        downloaded_flow = openml.flows.get_flow(flow.flow_id)
         task = openml.tasks.get_task(119)  # diabetes
         run = openml.runs.run_flow_on_task(
             flow=downloaded_flow,