Skip to content

Commit 2375940

Browse files
janvanrijn authored and mfeurer committed
Fix602 (#615)
* extended check to include missing values * added more tests * modularized tests * extended unit tests * small fixes * removed flow check on scikit-learn representation -- bad idea * exposed sentinel, incorporated test case according to #602 * work on fixing column transformer bug * logging output to flow_to_sklearn * overrides default values in openml flow in case a setup needs to be initialized * fix unit test * PEP8 * fix unit tests Python 3.x * solved unicode issues * fix 3.5 issue
1 parent ecdf9b1 commit 2375940

File tree

10 files changed

+527
-267
lines changed

10 files changed

+527
-267
lines changed

openml/flows/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .flow import OpenMLFlow
22

33
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, \
4-
openml_param_name_to_sklearn
4+
openml_param_name_to_sklearn, obtain_parameter_values
55
from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
66

77
__all__ = ['OpenMLFlow', 'get_flow', 'list_flows', 'sklearn_to_flow',

openml/flows/flow.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,9 @@ def publish(self):
337337
flow = openml.flows.functions.get_flow(flow_id)
338338
_copy_server_fields(flow, self)
339339
try:
340-
openml.flows.functions.assert_flows_equal(self, flow, flow.upload_date)
340+
openml.flows.functions.assert_flows_equal(
341+
self, flow, flow.upload_date, ignore_parameter_values=True
342+
)
341343
except ValueError as e:
342344
message = e.args[0]
343345
raise ValueError("Flow was not stored correctly on the server. "
@@ -388,6 +390,9 @@ def get_subflow(self, structure):
388390
OpenMLFlow
389391
The OpenMLFlow that corresponds to the structure
390392
"""
393+
# make a copy of structure, as we don't want to change it in the
394+
# outer scope
395+
structure = list(structure)
391396
if len(structure) < 1:
392397
raise ValueError('Please provide a structure list of size >= 1')
393398
sub_identifier = structure[0]

openml/flows/sklearn_converter.py

Lines changed: 190 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import inspect
88
import json
99
import json.decoder
10+
import logging
1011
import re
1112
import six
1213
import warnings
@@ -92,7 +93,8 @@ def _is_cross_validator(o):
9293
return isinstance(o, sklearn.model_selection.BaseCrossValidator)
9394

9495

95-
def flow_to_sklearn(o, components=None, initialize_with_defaults=False):
96+
def flow_to_sklearn(o, components=None, initialize_with_defaults=False,
97+
recursion_depth=0):
9698
"""Initializes a sklearn model based on a flow.
9799
98100
Parameters
@@ -108,11 +110,19 @@ def flow_to_sklearn(o, components=None, initialize_with_defaults=False):
108110
If this flag is set, the hyperparameter values of flows will be
109111
ignored and a flow with its defaults is returned.
110112
113+
recursion_depth : int
114+
The depth at which this flow is called, mostly for debugging
115+
purposes
116+
111117
Returns
112118
-------
113119
mixed
114120
115121
"""
122+
logging.info('-%s flow_to_sklearn START o=%s, components=%s, '
123+
'init_defaults=%s' % ('-' * recursion_depth, o, components,
124+
initialize_with_defaults))
125+
depth_pp = recursion_depth + 1 # shortcut var, depth plus plus
116126

117127
# First, we need to check whether the presented object is a json string.
118128
# JSON strings are used to encode parameter values. By passing around
@@ -139,10 +149,14 @@ def flow_to_sklearn(o, components=None, initialize_with_defaults=False):
139149
elif serialized_type == 'function':
140150
rval = deserialize_function(value)
141151
elif serialized_type == 'component_reference':
142-
value = flow_to_sklearn(value)
152+
value = flow_to_sklearn(value, recursion_depth=depth_pp)
143153
step_name = value['step_name']
144154
key = value['key']
145-
component = flow_to_sklearn(components[key], initialize_with_defaults=initialize_with_defaults)
155+
component = flow_to_sklearn(
156+
components[key],
157+
initialize_with_defaults=initialize_with_defaults,
158+
recursion_depth=depth_pp
159+
)
146160
# The component is now added to where it should be used
147161
# later. It should not be passed to the constructor of the
148162
# main flow object.
@@ -154,25 +168,39 @@ def flow_to_sklearn(o, components=None, initialize_with_defaults=False):
154168
else:
155169
rval = (step_name, component, value['argument_1'])
156170
elif serialized_type == 'cv_object':
157-
rval = _deserialize_cross_validator(value)
171+
rval = _deserialize_cross_validator(
172+
value, recursion_depth=recursion_depth
173+
)
158174
else:
159175
raise ValueError('Cannot flow_to_sklearn %s' % serialized_type)
160176

161177
else:
162-
rval = OrderedDict((flow_to_sklearn(key, components, initialize_with_defaults),
163-
flow_to_sklearn(value, components, initialize_with_defaults))
178+
rval = OrderedDict((flow_to_sklearn(key,
179+
components,
180+
initialize_with_defaults,
181+
recursion_depth=depth_pp),
182+
flow_to_sklearn(value,
183+
components,
184+
initialize_with_defaults,
185+
recursion_depth=depth_pp))
164186
for key, value in sorted(o.items()))
165187
elif isinstance(o, (list, tuple)):
166-
rval = [flow_to_sklearn(element, components, initialize_with_defaults) for element in o]
188+
rval = [flow_to_sklearn(element,
189+
components,
190+
initialize_with_defaults,
191+
depth_pp) for element in o]
167192
if isinstance(o, tuple):
168193
rval = tuple(rval)
169194
elif isinstance(o, (bool, int, float, six.string_types)) or o is None:
170195
rval = o
171196
elif isinstance(o, OpenMLFlow):
172-
rval = _deserialize_model(o, initialize_with_defaults)
197+
rval = _deserialize_model(o,
198+
initialize_with_defaults,
199+
recursion_depth=recursion_depth)
173200
else:
174201
raise TypeError(o)
175-
202+
logging.info('-%s flow_to_sklearn END o=%s, rval=%s'
203+
% ('-' * recursion_depth, o, rval))
176204
return rval
177205

178206

@@ -207,6 +235,143 @@ def openml_param_name_to_sklearn(openml_parameter, flow):
207235
return '__'.join(flow_structure[name] + [openml_parameter.parameter_name])
208236

209237

238+
def obtain_parameter_values(flow):
    """
    Extracts all parameter settings from the model inside a flow in OpenML
    format.

    Parameters
    ----------
    flow : OpenMLFlow
        openml flow object (containing flow ids, i.e., it has to be downloaded
        from the server). The flow is first passed to
        ``openml.flows.functions._check_flow_for_server_id``; its ``model``
        attribute is the estimator whose parameter values are extracted.

    Returns
    -------
    list
        A list of dicts, where each dict has the following names:
         - oml:name (str): The OpenML parameter name
         - oml:value (str): A JSON-encoded representation of the parameter
           value
         - oml:component (int): flow id to which the parameter belongs

    Raises
    ------
    ValueError
        If the parameter names of a (sub)model do not match the parameter
        and component names of the corresponding (sub)flow, or if a
        component reference does not consist of 2 or 3 elements.
    TypeError
        If a component reference has a non-string identifier, a
        non-OpenMLFlow subcomponent or a non-list third argument.
    """

    openml.flows.functions._check_flow_for_server_id(flow)

    def get_flow_dict(_flow):
        # Recursively build a mapping from flow name to flow id covering
        # _flow itself and all of its (nested) components.
        flow_map = {_flow.name: _flow.flow_id}
        for subflow in _flow.components:
            flow_map.update(get_flow_dict(_flow.components[subflow]))
        return flow_map

    def is_subcomponent_specification(values):
        # checks whether the current value can be a specification of
        # subcomponents, as for example the value for steps parameter
        # (in Pipeline) or transformers parameter (in
        # ColumnTransformer). These are always lists/tuples of lists/
        # tuples, size bigger than 2 and an OpenMLFlow item involved.
        if not isinstance(values, (tuple, list)):
            return False
        for item in values:
            if not isinstance(item, (tuple, list)):
                return False
            if len(item) < 2:
                return False
            if not isinstance(item[1], openml.flows.OpenMLFlow):
                return False
        return True

    def serialize_component_references(subcomponents):
        # Turn a list of (name, OpenMLFlow[, argument]) entries into a JSON
        # string holding one component reference per entry.
        references = []
        for subcomponent in subcomponents:
            # scikit-learn stores usually tuples in the form
            # (name (str), subcomponent (mixed), argument
            # (mixed)). OpenML replaces the subcomponent by an
            # OpenMLFlow object.
            if len(subcomponent) < 2 or len(subcomponent) > 3:
                raise ValueError('Component reference should be '
                                 'size {2,3}. ')

            subcomponent_identifier = subcomponent[0]
            subcomponent_flow = subcomponent[1]
            if not isinstance(subcomponent_identifier, six.string_types):
                raise TypeError('Subcomponent identifier should be '
                                'string')
            if not isinstance(subcomponent_flow,
                              openml.flows.OpenMLFlow):
                raise TypeError('Subcomponent flow should be OpenMLFlow')

            reference = {
                "oml-python:serialized_object": "component_reference",
                "value": {
                    "key": subcomponent_identifier,
                    "step_name": subcomponent_identifier
                }
            }
            if len(subcomponent) == 3:
                if not isinstance(subcomponent[2], list):
                    raise TypeError('Subcomponent argument should be '
                                    'list')
                reference['value']['argument_1'] = subcomponent[2]
            references.append(reference)
        return json.dumps(references)

    def extract_parameters(_flow, _flow_dict, component_model,
                           _main_call=False, main_id=None):
        # _flow is the openml flow object, _flow_dict maps from flow name to
        # flow id. For the main call the component id can be overridden
        # through main_id (useful for unit tests / sentinels); this way, for
        # flows without subflows we do not have to rely on _flow_dict.

        # get_params() is queried once per (sub)model instead of once per
        # parameter; the values are only read below.
        model_param_values = component_model.get_params()

        exp_parameters = set(_flow.parameters)
        exp_components = set(_flow.components)
        # Names containing '__' address parameters of nested estimators and
        # are handled by the recursive calls further down.
        model_parameters = set(mp for mp in model_param_values
                               if '__' not in mp)
        if (exp_parameters | exp_components) != model_parameters:
            flow_params = sorted(exp_parameters | exp_components)
            model_params = sorted(model_parameters)
            raise ValueError('Parameters of the model do not match the '
                             'parameters expected by the '
                             'flow:\nexpected flow parameters: '
                             '%s\nmodel parameters: %s' % (flow_params,
                                                           model_params))

        if _main_call:
            component_id = main_id
        else:
            component_id = _flow_dict[_flow.name]

        _params = []
        for _param_name in _flow.parameters:
            current_param_values = openml.flows.sklearn_to_flow(
                model_param_values[_param_name])

            # Try to filter out components (a.k.a. subflows) which are
            # handled further down in the code (by recursively calling
            # this function)!
            if isinstance(current_param_values, openml.flows.OpenMLFlow):
                continue

            if is_subcomponent_specification(current_param_values):
                # complex parameter value, with subcomponents
                parsed_values = serialize_component_references(
                    current_param_values)
            else:
                # vanilla parameter value
                parsed_values = json.dumps(current_param_values)

            _current = OrderedDict()
            _current['oml:name'] = _param_name
            _current['oml:value'] = parsed_values
            _current['oml:component'] = component_id
            _params.append(_current)

        for _identifier in _flow.components:
            subcomponent_model = model_param_values[_identifier]
            _params.extend(extract_parameters(_flow.components[_identifier],
                                              _flow_dict, subcomponent_model))
        return _params

    flow_dict = get_flow_dict(flow)
    parameters = extract_parameters(flow, flow_dict, flow.model,
                                    True, flow.flow_id)

    return parameters
373+
374+
210375
def _serialize_model(model):
211376
"""Create an OpenMLFlow.
212377
@@ -466,8 +631,8 @@ def _get_fn_arguments_with_defaults(fn_name):
466631
return params_with_defaults, params_without_defaults
467632

468633

469-
def _deserialize_model(flow, keep_defaults):
470-
634+
def _deserialize_model(flow, keep_defaults, recursion_depth):
635+
logging.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
471636
model_name = flow.class_name
472637
_check_dependencies(flow.dependencies)
473638

@@ -484,7 +649,12 @@ def _deserialize_model(flow, keep_defaults):
484649

485650
for name in parameters:
486651
value = parameters.get(name)
487-
rval = flow_to_sklearn(value, components=components_, initialize_with_defaults=keep_defaults)
652+
logging.info('--%s flow_parameter=%s, value=%s' %
653+
('-' * recursion_depth, name, value))
654+
rval = flow_to_sklearn(value,
655+
components=components_,
656+
initialize_with_defaults=keep_defaults,
657+
recursion_depth=recursion_depth + 1)
488658
parameter_dict[name] = rval
489659

490660
for name in components:
@@ -493,7 +663,10 @@ def _deserialize_model(flow, keep_defaults):
493663
if name not in components_:
494664
continue
495665
value = components[name]
496-
rval = flow_to_sklearn(value, **kwargs)
666+
logging.info('--%s flow_component=%s, value=%s'
667+
% ('-' * recursion_depth, name, value))
668+
rval = flow_to_sklearn(value,
669+
recursion_depth=recursion_depth + 1)
497670
parameter_dict[name] = rval
498671

499672
module_name = model_name.rsplit('.', 1)
@@ -723,15 +896,17 @@ def check(param_grid, restricted_parameter_name, legal_values):
723896
return check(model.get_params(), 'n_jobs', [1, None])
724897

725898

726-
def _deserialize_cross_validator(value):
899+
def _deserialize_cross_validator(value, recursion_depth):
727900
model_name = value['name']
728901
parameters = value['parameters']
729902

730903
module_name = model_name.rsplit('.', 1)
731904
model_class = getattr(importlib.import_module(module_name[0]),
732905
module_name[1])
733906
for parameter in parameters:
734-
parameters[parameter] = flow_to_sklearn(parameters[parameter])
907+
parameters[parameter] = flow_to_sklearn(
908+
parameters[parameter], recursion_depth=recursion_depth + 1
909+
)
735910
return model_class(**parameters)
736911

737912

openml/runs/functions.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
7474
flow_tags : list(str)
7575
A list of tags that the flow should have at creation.
7676
seed: int
77-
Models that are not seeded will get this seed.
77+
Models that are not seeded will be automatically seeded by a RNG. The
78+
RNG will be seeded with this seed.
7879
add_local_measures : bool
7980
Determines whether to calculate a set of evaluation measures locally,
8081
to later verify server behaviour. Defaults to True
@@ -101,7 +102,8 @@ def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
101102
flow_id = flow_exists(flow.name, flow.external_version)
102103
if avoid_duplicate_runs and flow_id:
103104
flow_from_server = get_flow(flow_id)
104-
setup_id = setup_exists(flow_from_server, flow.model)
105+
flow_from_server.model = flow.model
106+
setup_id = setup_exists(flow_from_server)
105107
ids = _run_exists(task.task_id, setup_id)
106108
if ids:
107109
raise PyOpenMLError("Run already exists in server. Run id(s): %s" % str(ids))
@@ -162,7 +164,8 @@ def run_flow_on_task(flow, task, avoid_duplicate_runs=True, flow_tags=None,
162164
trace=trace,
163165
data_content=data_content,
164166
)
165-
run.parameter_settings = OpenMLRun._parse_parameters(flow)
167+
# TODO: currently hard-coded sklearn assumption.
168+
run.parameter_settings = openml.flows.obtain_parameter_values(flow)
166169

167170
# now we need to attach the detailed evaluations
168171
if task.task_type_id == 3:

0 commit comments

Comments
 (0)