Skip to content

Commit 4715796

Browse files
PGijsbers and mfeurer
authored and committed
Function to trim flownames for scikit-learn flows. (#723)
* Function to trim flownames for scikit-learn flows.
* max_length -> extra trim length rename
* Flake.
* Fix typo in test which is no longer allowed with Pytest 5.0.0
* Allow long names from other modules.
* Update test to reflect we allow non-sklearn pipelines now.
* [skip-CI] Flake8.
* Allow to ignore custom name when checking if flows are equal. Allow difference on upload.
* Propagate ignore_custom_name_if_none in assert_flows_equal
* Allow model_selection in pipeline or pipeline in model_selection
* Flake8
* reinstantiate wrong version tests against live and has 0.20 support
* [skip-ci] Remove commented out code.
* Disable test_get_flow_reinstantiate_model_wrong_version for sklearn 0.19
* Process feedback.
1 parent 56fcc00 commit 4715796

File tree

7 files changed

+224
-14
lines changed

7 files changed

+224
-14
lines changed

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Changelog
1717
* DOC #639: More descriptive documentation for function to convert array format.
1818
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
1919
* ADD #695: A function to retrieve all the data quality measures available.
20+
* ADD #412: Add a function to trim flow names for scikit-learn flows.
2021
* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
2122
* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
2223
* MAINT #726: Update examples to remove deprecation warnings from scikit-learn

openml/extensions/sklearn/extension.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool:
8787
"""
8888
return isinstance(model, sklearn.base.BaseEstimator)
8989

90+
@classmethod
def trim_flow_name(
        cls,
        long_name: str,
        extra_trim_length: int = 100,
        _outer: bool = True
) -> str:
    """ Shorten a generated sklearn flow name by dropping module paths and hyperparameters.

    Flows are assumed to have the following naming structure:
        (model_selection)? (pipeline)? (steps)+
    and will be shortened to:
        sklearn.(selection.)?(pipeline.)?(steps)+
    e.g. (white spaces and newlines added for readability)
        sklearn.pipeline.Pipeline(
            columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
                numeric=sklearn.pipeline.Pipeline(
                    imputer=sklearn.preprocessing.imputation.Imputer,
                    standardscaler=sklearn.preprocessing.data.StandardScaler),
                nominal=sklearn.pipeline.Pipeline(
                    simpleimputer=sklearn.impute.SimpleImputer,
                    onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
            variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
            svc=sklearn.svm.classes.SVC)
        ->
        sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)

    Parameters
    ----------
    long_name : str
        The full flow name generated by the scikit-learn extension.
    extra_trim_length: int (default=100)
        If the trimmed name of a pipeline would exceed `extra_trim_length` characters,
        additional trimming of the short name is performed: all steps but the last are
        replaced by an ellipsis, e.g. ``sklearn.Pipeline(...,SVC)``. The same limit is
        applied to a pipeline nested as the ``estimator`` of a model selection class.
        There is no guarantee the end result will not exceed `extra_trim_length`.
    _outer : bool (default=True)
        For internal use only. Specifies if the function is called recursively.

    Returns
    -------
    str
        The shortened flow name, prefixed with the top-level module of `long_name`.

    Raises
    ------
    ValueError
        If `long_name` contains ``sklearn.model_selection`` without a subsequent
        ``estimator=`` hyperparameter.
    """
    def remove_all_in_parentheses(string: str) -> str:
        # Repeatedly strip the innermost parenthesised groups until none are left.
        string, removals = re.subn(r"\([^()]*\)", "", string)
        while removals > 0:
            string, removals = re.subn(r"\([^()]*\)", "", string)
        return string

    def last_dotted_segment(dotted: str) -> str:
        # Split on *single* dots only. A nested pipeline that received extra trimming
        # contains an ellipsis ("..."), whose dots must not be treated as module
        # separators; otherwise everything up to the ellipsis would be discarded.
        return re.split(r"(?<!\.)\.(?!\.)", dotted)[-1]

    # Generally, we want to trim all hyperparameters, the exception to that is for model
    # selection, as the `estimator` hyperparameter is very indicative of what is in the flow.
    # So we first trim name of the `estimator` specified in model selection. For reference, in
    # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and
    # keep it in the final trimmed flow name:
    # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer,
    # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
    # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator=
    # sklearn.tree.tree.DecisionTreeClassifier))
    if 'sklearn.model_selection' in long_name:
        start_index = long_name.index('sklearn.model_selection')
        estimator_start = (start_index
                           + long_name[start_index:].index('estimator=')
                           + len('estimator='))

        model_select_boilerplate = long_name[start_index:estimator_start]
        # above is e.g. "sklearn.model_selection._search.RandomizedSearchCV(estimator="
        model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1]

        # Now we want to also find and parse the `estimator`, for this we find the closing
        # parenthesis to the model selection technique:
        closing_parenthesis_expected = 1
        for i, char in enumerate(long_name[estimator_start:], start=estimator_start):
            if char == '(':
                closing_parenthesis_expected += 1
            if char == ')':
                closing_parenthesis_expected -= 1
            if closing_parenthesis_expected == 0:
                break

        model_select_pipeline = long_name[estimator_start:i]
        # Pass the caller's limit on, so the nested estimator is trimmed consistently
        # with the outer name instead of always using the default limit.
        trimmed_pipeline = cls.trim_flow_name(
            model_select_pipeline,
            extra_trim_length=extra_trim_length,
            _outer=False,
        )
        _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1)  # trim module prefix
        model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline)
        name = long_name[:start_index] + model_select_short + long_name[i + 1:]
    else:
        name = long_name

    module_name = long_name.split('.')[0]
    short_name = module_name + '.{}'

    if name.startswith('sklearn.pipeline'):
        full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1)
        pipeline_class = full_pipeline_class.split('.')[-1]
        # We don't want nested pipelines in the short name, so we trim all complicated
        # subcomponents, i.e. those with parentheses:
        pipeline = remove_all_in_parentheses(pipeline)

        # then the pipeline steps are formatted e.g.:
        # step1name=sklearn.submodule.ClassName,step2name...
        components = [last_dotted_segment(component) for component in pipeline.split(',')]
        pipeline = "{}({})".format(pipeline_class, ','.join(components))
        if len(short_name.format(pipeline)) > extra_trim_length:
            pipeline = "{}(...,{})".format(pipeline_class, components[-1])
    else:
        # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier
        pipeline = last_dotted_segment(remove_all_in_parentheses(name))

    if not _outer:
        # Anything from parenthesis in inner calls should not be culled, so we use brackets
        pipeline = pipeline.replace('(', '[').replace(')', ']')
    else:
        # Square brackets may be introduced with nested model_selection
        pipeline = pipeline.replace('[', '(').replace(']', ')')

    return short_name.format(pipeline)
205+
90206
################################################################################################
91207
# Methods for flow serialization and de-serialization
92208

@@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
402518
name = '%s(%s)' % (class_name, sub_components_names[1:])
403519
else:
404520
name = class_name
521+
short_name = SklearnExtension.trim_flow_name(name)
405522

406523
# Get the external versions of all sub-components
407524
external_version = self._get_external_version_string(model, subcomponents)
@@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
419536
sklearn_version_formatted = sklearn_version.replace('==', '_')
420537
flow = OpenMLFlow(name=name,
421538
class_name=class_name,
539+
custom_name=short_name,
422540
description='Automatically created scikit-learn flow.',
423541
model=model,
424542
components=subcomponents,

openml/flows/flow.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -417,14 +417,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
417417
_copy_server_fields(flow, self)
418418
try:
419419
openml.flows.functions.assert_flows_equal(
420-
self, flow, flow.upload_date, ignore_parameter_values=True
420+
self, flow, flow.upload_date,
421+
ignore_parameter_values=True,
422+
ignore_custom_name_if_none=True
421423
)
422424
except ValueError as e:
423425
message = e.args[0]
424-
raise ValueError("Flow was not stored correctly on the server. "
425-
"New flow ID is %d. Please check manually and "
426-
"remove the flow if necessary! Error is:\n'%s'" %
427-
(flow_id, message))
426+
raise ValueError("The flow on the server is inconsistent with the local flow. "
427+
"The server flow ID is {}. Please check manually and remove "
428+
"the flow if necessary! Error is:\n'{}'".format(flow_id, message))
428429
return self
429430

430431
def get_structure(self, key_item: str) -> Dict[str, List[str]]:

openml/flows/functions.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
307307

308308
def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
309309
ignore_parameter_values_on_older_children: str = None,
310-
ignore_parameter_values: bool = False) -> None:
310+
ignore_parameter_values: bool = False,
311+
ignore_custom_name_if_none: bool = False) -> None:
311312
"""Check equality of two flows.
312313
313314
Two flows are equal if all their keys which are not set by the server
@@ -325,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
325326
326327
ignore_parameter_values : bool
327328
Whether to ignore parameter values when comparing flows.
329+
330+
ignore_custom_name_if_none : bool
331+
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
328332
"""
329333
if not isinstance(flow1, OpenMLFlow):
330334
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
@@ -358,7 +362,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
358362
'argument2, but not in argument1.' % name)
359363
assert_flows_equal(attr1[name], attr2[name],
360364
ignore_parameter_values_on_older_children,
361-
ignore_parameter_values)
365+
ignore_parameter_values,
366+
ignore_custom_name_if_none)
362367
elif key == '_extension':
363368
continue
364369
else:
@@ -385,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
385390
# Continue needs to be done here as the first if
386391
# statement triggers in both special cases
387392
continue
393+
elif (key == 'custom_name'
394+
and ignore_custom_name_if_none
395+
and (attr1 is None or attr2 is None)):
396+
# If specified, we allow `custom_name` inequality if one flow's name is None.
397+
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
398+
# before it used to be `None`.
399+
continue
388400

389401
if attr1 != attr2:
390402
raise ValueError("Flow %s: values for attribute '%s' differ: "

0 commit comments

Comments
 (0)