Adds flow.get_structure and flow.get_subflow (which are complements of each other). Also fixes #564 (#567)

janvanrijn · mfeurer · commit 7c0a77d232a6 · 2018-12-11T11:13:30.000+01:00
* fixes minor indentation problems

* initial commit

* adds a function to deduce the flow structure

* removes sklearn converter from this PR

* added main functionality

* fix code quality

* adds flow name to setup test file

* adds functionality to return sklearn parameter name into openml flow name

* PEP8 fixes

* changed structure of PR, such that get_structure is not part of flow class.
updated unit tests accordingly

* pep8 fix

* fixes last typo

* flow name doc string

* also added additional filter for task list

* renamed id argument of parameter object (for code quality)

* fix reference to input id

* updated reinitialize model fn

* removed imputer (deprecated)

* fixes PEP8 problems

* pep8

* PEP8

* incorporated changes by Matthias

* fix 604

* bugfix

* flake fix

* import error

* removed sentence

* updated comment
diff --git a/examples/run_setup_tutorial.py b/examples/run_setup_tutorial.py
@@ -0,0 +1,102 @@
+"""
+=========
+Run Setup
+=========
+
+By: Jan N. van Rijn
+
+One of the key features of the openml-python library is that it allows you to
+reinstantiate flows with hyperparameter settings that were uploaded before.
+This tutorial uses the concept of setups. Although setups are not extensively
+described in the OpenML documentation (because most users will not directly
+use them), they form an important concept within OpenML distinguishing between
+hyperparameter configurations.
+A setup is the combination of a flow with all its hyperparameters set.
+
+A key requirement for reinstantiating a flow is to have the same scikit-learn
+version as the flow that was uploaded. However, this tutorial will upload the
+flow (that will later be reinstantiated) itself, so it can be run with any
+scikit-learn version that is supported by this library. In this case, the
+requirement of the corresponding scikit-learn versions is automatically met.
+
+In this tutorial we will
+    1) Create a flow and use it to solve a task;
+    2) Download the flow, reinstantiate the model with same hyperparameters,
+       and solve the same task again;
+    3) We will verify that the obtained results are exactly the same.
+"""
+import logging
+import numpy as np
+import openml
+import sklearn.ensemble
+import sklearn.impute
+# sklearn submodules are not loaded by ``import sklearn``; make_pipeline below
+# needs sklearn.pipeline to be imported explicitly.
+import sklearn.pipeline
+import sklearn.preprocessing
+
+
+root = logging.getLogger()
+root.setLevel(logging.INFO)
+
+###############################################################################
+# 1) Create a flow and use it to solve a task
+###############################################################################
+
+# first, let's download the task that we are interested in
+task = openml.tasks.get_task(6)
+
+
+# we will create a fairly complex model, with many preprocessing components and
+# many potential hyperparameters. Of course, the model can be as complex and as
+# easy as you want it to be
+model_original = sklearn.pipeline.make_pipeline(
+    sklearn.impute.SimpleImputer(),
+    sklearn.ensemble.RandomForestClassifier()
+)
+
+
+# Let's change some hyperparameters. Of course, in any good application we
+# would tune them using, e.g., Random Search or Bayesian Optimization, but for
+# the purpose of this tutorial we set them to some specific values that might
+# or might not be optimal
+hyperparameters_original = {
+    'simpleimputer__strategy': 'median',
+    'randomforestclassifier__criterion': 'entropy',
+    'randomforestclassifier__max_features': 0.2,
+    'randomforestclassifier__min_samples_leaf': 1,
+    'randomforestclassifier__n_estimators': 16,
+    'randomforestclassifier__random_state': 42,
+}
+model_original.set_params(**hyperparameters_original)
+
+# solve the task and upload the result (this implicitly creates the flow)
+run = openml.runs.run_model_on_task(
+    model_original,
+    task,
+    avoid_duplicate_runs=False)
+run_original = run.publish()  # this implicitly uploads the flow
+
+###############################################################################
+# 2) Download the flow, reinstantiate the model with same hyperparameters,
+#    and solve the same task again.
+###############################################################################
+
+# obtain setup id (note that the setup id is assigned by the OpenML server -
+# therefore it was not yet available in our local copy of the run)
+run_downloaded = openml.runs.get_run(run_original.run_id)
+setup_id = run_downloaded.setup_id
+
+# after this, we can easily reinstantiate the model
+model_duplicate = openml.setups.initialize_model(setup_id)
+# it will automatically have all the hyperparameters set
+
+# and run the task again
+run_duplicate = openml.runs.run_model_on_task(
+    model_duplicate, task, avoid_duplicate_runs=False)
+
+
+###############################################################################
+# 3) We will verify that the obtained results are exactly the same.
+###############################################################################
+
+# the run has stored all predictions in the field data content
+np.testing.assert_array_equal(run_original.data_content,
+                              run_duplicate.data_content)
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
@@ -1,7 +1,8 @@
-from .flow import OpenMLFlow, _copy_server_fields
+from .flow import OpenMLFlow
 
-from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
+from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, \
+    openml_param_name_to_sklearn
 from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
 
-__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
-           'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
+__all__ = ['OpenMLFlow', 'get_flow', 'list_flows', 'sklearn_to_flow',
+           'flow_to_sklearn', 'flow_exists', 'openml_param_name_to_sklearn']
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -346,6 +346,60 @@ def publish(self):
                              (flow_id, message))
         return self
 
+    def get_structure(self, key_item):
+        """
+        Maps this flow and every flow nested in its components to the list of
+        component identifiers that has to be traversed, starting at this
+        flow, to reach it. This flow itself is mapped to the empty list.
+
+        Flows are keyed by the value of their ``key_item`` attribute; when
+        several flows share that value only one entry is kept.
+
+        Parameters
+        ----------
+        key_item: str
+            The flow attribute that will be used to identify flows in the
+            structure. Allowed values {flow_id, name}
+
+        Returns
+        -------
+        dict[str, List[str]]
+            The flow structure
+
+        Raises
+        ------
+        ValueError
+            If ``key_item`` is not one of the allowed values.
+        """
+        if key_item not in ('flow_id', 'name'):
+            raise ValueError('key_item should be in {flow_id, name}')
+        paths = {}
+        for identifier, component in self.components.items():
+            # prepend the identifier of the component to every path that was
+            # found below it
+            paths.update({
+                sub_key: [identifier] + sub_path
+                for sub_key, sub_path
+                in component.get_structure(key_item).items()
+            })
+        paths[getattr(self, key_item)] = []
+        return paths
+
+    def get_subflow(self, structure):
+        """
+        Returns a subflow from the tree of dependencies.
+
+        The path is walked without modifying ``structure``, so the caller's
+        sequence can be reused afterwards (tuples are accepted as well).
+
+        Parameters
+        ----------
+        structure: list[str]
+            Component identifiers leading from this flow to the requested
+            subflow, outermost component first
+
+        Returns
+        -------
+        OpenMLFlow
+            The OpenMLFlow that corresponds to the structure
+
+        Raises
+        ------
+        ValueError
+            If ``structure`` is empty, or if one of its identifiers is not a
+            component of the flow reached so far.
+        """
+        if len(structure) < 1:
+            raise ValueError('Please provide a structure list of size >= 1')
+        flow = self
+        # descend one component per identifier; iterating (rather than popping
+        # from the list) leaves the argument untouched
+        for sub_identifier in structure:
+            if sub_identifier not in flow.components:
+                raise ValueError('Flow %s does not contain component with '
+                                 'identifier %s' % (flow.name, sub_identifier))
+            flow = flow.components[sub_identifier]
+        return flow
+
     def push_tag(self, tag):
         """Annotates this flow with a tag on the server.
 
diff --git a/openml/flows/sklearn_converter.py b/openml/flows/sklearn_converter.py
@@ -11,7 +11,6 @@
 import six
 import warnings
 import sys
-import inspect
 
 import numpy as np
 import scipy.stats.distributions
@@ -177,6 +176,37 @@ def flow_to_sklearn(o, components=None, initialize_with_defaults=False):
     return rval
 
 
+def openml_param_name_to_sklearn(openml_parameter, flow):
+    """
+    Converts the name of an OpenMLParameter into the sklearn name, given a
+    flow.
+
+    The result is the list of component identifiers leading from ``flow`` to
+    the flow named ``openml_parameter.flow_name``, followed by the parameter
+    name, all joined with a double underscore.
+
+    Parameters
+    ----------
+    openml_parameter: OpenMLParameter
+        The parameter under consideration
+
+    flow: OpenMLFlow
+        The flow that provides context.
+
+    Returns
+    -------
+    sklearn_parameter_name: str
+        The name the parameter will have once used in scikit-learn
+
+    Raises
+    ------
+    ValueError
+        If one of the arguments has the wrong type, or if the flow name of
+        the parameter does not occur in the structure of ``flow``.
+    """
+    if not isinstance(openml_parameter, openml.setups.OpenMLParameter):
+        raise ValueError('openml_parameter should be an instance of '
+                         'OpenMLParameter')
+    if not isinstance(flow, OpenMLFlow):
+        raise ValueError('flow should be an instance of OpenMLFlow')
+
+    component_paths = flow.get_structure('name')
+    owner = openml_parameter.flow_name
+    if owner not in component_paths:
+        raise ValueError('Obtained OpenMLParameter and OpenMLFlow do not '
+                         'correspond. ')
+    # copy the path, so the structure dict is not altered by the append
+    path = list(component_paths[owner])
+    path.append(openml_parameter.parameter_name)
+    return '__'.join(path)
+
+
 def _serialize_model(model):
     """Create an OpenMLFlow.
 
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -17,8 +17,9 @@
 import openml._api_calls
 from ..exceptions import PyOpenMLError
 from .. import config
-from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs, \
-    _copy_server_fields, OpenMLFlow
+from openml.flows.sklearn_converter import _check_n_jobs
+from openml.flows.flow import _copy_server_fields
+from ..flows import sklearn_to_flow, get_flow, flow_exists, OpenMLFlow
 from ..setups import setup_exists, initialize_model
 from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..tasks import OpenMLTask
diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py
@@ -1,4 +1,5 @@
-from .setup import OpenMLSetup
+from .setup import OpenMLSetup, OpenMLParameter
 from .functions import get_setup, list_setups, setup_exists, initialize_model
 
-__all__ = ['get_setup', 'list_setups', 'setup_exists', 'initialize_model']
+__all__ = ['OpenMLSetup', 'OpenMLParameter', 'get_setup', 'list_setups',
+           'setup_exists', 'initialize_model']
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -211,44 +211,16 @@ def initialize_model(setup_id):
     # transform an openml setup object into
     # a dict of dicts, structured: flow_id maps to dict of
     # parameter_names mapping to parameter_value
-
     setup = get_setup(setup_id)
-    parameters = {}
-    for _param in setup.parameters:
-        _flow_id = setup.parameters[_param].flow_id
-        _param_name = setup.parameters[_param].parameter_name
-        _param_value = setup.parameters[_param].value
-        if _flow_id not in parameters:
-            parameters[_flow_id] = {}
-        parameters[_flow_id][_param_name] = _param_value
-
-    def _reconstruct_flow(_flow, _params):
-        # recursively set the values of flow parameters (and subflows) to
-        # the specific values from a setup. _params is a dict of
-        # dicts, mapping from flow id to param name to param value
-        # (obtained by using the subfunction _to_dict_of_dicts)
-        for _param in _flow.parameters:
-            # It can happen that no parameters of a flow are in a setup,
-            # then the flow_id is not in _params; usually happens for a
-            # sklearn.pipeline.Pipeline object, where the steps parameter is
-            # not in the setup
-            if _flow.flow_id not in _params:
-                continue
-            # It is not guaranteed that a setup on OpenML has all parameter
-            # settings of a flow, thus a param must not be in _params!
-            if _param not in _params[_flow.flow_id]:
-                continue
-            _flow.parameters[_param] = _params[_flow.flow_id][_param]
-        for _identifier in _flow.components:
-            _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params)
-        return _flow
-
-    # now we 'abuse' the parameter object by passing in the
-    # parameters obtained from the setup
     flow = openml.flows.get_flow(setup.flow_id)
-    flow = _reconstruct_flow(flow, parameters)
-
-    return openml.flows.flow_to_sklearn(flow)
+    model = openml.flows.flow_to_sklearn(flow)
+    hyperparameters = {
+        openml.flows.openml_param_name_to_sklearn(hp, flow):
+            openml.flows.flow_to_sklearn(hp.value)
+        for hp in setup.parameters.values()
+    }
+    model.set_params(**hyperparameters)
+    return model
 
 
 def _to_dict(flow_id, openml_parameter_settings):
@@ -288,10 +260,11 @@ def _create_setup_from_xml(result_dict):
 
 
 def _create_setup_parameter_from_xml(result_dict):
-    return OpenMLParameter(int(result_dict['oml:id']),
-                           int(result_dict['oml:flow_id']),
-                           result_dict['oml:full_name'],
-                           result_dict['oml:parameter_name'],
-                           result_dict['oml:data_type'],
-                           result_dict['oml:default_value'],
-                           result_dict['oml:value'])
+    # Builds an OpenMLParameter from the 'oml:'-prefixed entries of
+    # result_dict. The two ids are cast to int; all other entries are passed
+    # on unchanged. Keyword arguments are used so that the mapping does not
+    # depend on the argument order of the constructor.
+    return OpenMLParameter(input_id=int(result_dict['oml:id']),
+                           flow_id=int(result_dict['oml:flow_id']),
+                           flow_name=result_dict['oml:flow_name'],
+                           full_name=result_dict['oml:full_name'],
+                           parameter_name=result_dict['oml:parameter_name'],
+                           data_type=result_dict['oml:data_type'],
+                           default_value=result_dict['oml:default_value'],
+                           value=result_dict['oml:value'])
diff --git a/openml/setups/setup.py b/openml/setups/setup.py
@@ -29,27 +29,32 @@ def __init__(self, setup_id, flow_id, parameters):
 class OpenMLParameter(object):
     """Parameter object (used in setup).
 
-       Parameters
-       ----------
-       id : int
-            The input id from the openml database
-        flow id : int
-            The flow to which this parameter is associated
-        full_name : str
-            The name of the flow and parameter combined
-        parameter_name : str
-            The name of the parameter 
-        data_type : str
-            The datatype of the parameter. generally unused for sklearn flows
-        default_value : str
-            The default value. For sklearn parameters, this is unknown and a
-            default value is selected arbitrarily
-        value : str
-            If the parameter was set, the value that it was set to. 
+    Parameters
+    ----------
+    input_id : int
+        The input id from the openml database
+    flow_id : int
+        The flow to which this parameter is associated
+    flow_name : str
+        The name of the flow (no version number) to which this parameter
+        is associated
+    full_name : str
+        The name of the flow and parameter combined
+    parameter_name : str
+        The name of the parameter
+    data_type : str
+        The datatype of the parameter. generally unused for sklearn flows
+    default_value : str
+        The default value. For sklearn parameters, this is unknown and a
+        default value is selected arbitrarily
+    value : str
+        If the parameter was set, the value that it was set to.
     """
-    def __init__(self, id, flow_id, full_name, parameter_name, data_type, default_value, value):
-        self.id = id
+    def __init__(self, input_id, flow_id, flow_name, full_name, parameter_name,
+                 data_type, default_value, value):
+        self.id = input_id
         self.flow_id = flow_id
+        self.flow_name = flow_name
         self.full_name = full_name
         self.parameter_name = parameter_name
         self.data_type = data_type
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -172,7 +172,7 @@ def _list_tasks(task_type_id=None, **kwargs):
         - Survival Analysis: 7
         - Subgroup Discovery: 8
     kwargs: dict, optional
-        Legal filter operators: tag, data_tag, status, limit,
+        Legal filter operators: tag, task_id (list), data_tag, status, limit,
         offset, data_id, data_name, number_instances, number_features,
         number_classes, number_missing_values.
     Returns
@@ -184,6 +184,8 @@ def _list_tasks(task_type_id=None, **kwargs):
         api_call += "/type/%d" % int(task_type_id)
     if kwargs is not None:
         for operator, value in kwargs.items():
+            if operator == 'task_id':
+                value = ','.join([str(int(i)) for i in value])
             api_call += "/%s/%s" % (operator, value)
     return __list_tasks(api_call)
 
diff --git a/tests/files/org/openml/test/setups/1/description.xml b/tests/files/org/openml/test/setups/1/description.xml
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
diff --git a/tests/test_flows/test_sklearn.py b/tests/test_flows/test_sklearn.py
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py