Skip to content

Commit e105390

Browse files
authored
Merge branch 'develop' into fix_589
2 parents 964e732 + ae49090 commit e105390

File tree

18 files changed

+360
-28
lines changed

18 files changed

+360
-28
lines changed

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Modules
7272
get_dataset
7373
get_datasets
7474
list_datasets
75+
list_qualities
7576
status_update
7677

7778
:mod:`openml.evaluations`: Evaluation Functions
@@ -83,6 +84,7 @@ Modules
8384
:template: function.rst
8485

8586
list_evaluations
87+
list_evaluation_measures
8688

8789
:mod:`openml.flows`: Flow Functions
8890
-----------------------------------

doc/progress.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Changelog
1010
~~~~~~
1111
* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset.
1212
* DOC #639: More descriptive documentation for function to convert array format.
13+
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
14+
* ADD #695: A function to retrieve all the data quality measures available.
1315

1416
0.9.0
1517
~~~~~

openml/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
get_datasets,
77
list_datasets,
88
status_update,
9+
list_qualities
910
)
1011
from .dataset import OpenMLDataset
1112
from .data_feature import OpenMLDataFeature
@@ -20,4 +21,5 @@
2021
'OpenMLDataset',
2122
'OpenMLDataFeature',
2223
'status_update',
24+
'list_qualities'
2325
]

openml/datasets/dataset.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,36 @@ def __init__(self, name, description, format=None,
173173
else:
174174
self.data_pickle_file = None
175175

176+
def __str__(self):
    """Return a human-readable, multi-line summary of the dataset.

    The summary consists of an underlined "OpenML Dataset" header followed
    by one ``<field name>: <value>`` line per available field. Field names
    are padded with dots so that all values line up in one column.

    Returns
    -------
    str
    """
    header = "OpenML Dataset"
    header = '{}\n{}\n'.format(header, '=' * len(header))

    # Drops the trailing len('api/v1/xml') characters from the configured
    # server address to build website URLs.
    # NOTE(review): presumably the address always ends with 'api/v1/xml' - confirm.
    base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
    fields = {"Name": self.name,
              "Version": self.version,
              "Format": self.format,
              "Licence": self.licence,
              "Download URL": self.url,
              "Data file": self.data_file,
              "Pickle file": self.data_pickle_file}
    # NOTE(review): features/qualities are assumed to be possibly None (e.g.
    # for a dataset that was not downloaded from the server) - confirm
    # against __init__. Guarding keeps str(dataset) from raising.
    if self.features is not None:
        fields["# of features"] = len(self.features)
    if self.upload_date is not None:
        fields["Upload Date"] = self.upload_date.replace('T', ' ')
    if self.dataset_id is not None:
        fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id)
    qualities = self.qualities or {}
    number_of_instances = qualities.get('NumberOfInstances')
    if number_of_instances is not None:
        fields["# of instances"] = int(number_of_instances)

    # Determines the order in which the information will be printed. The
    # names must match the keys of `fields` exactly (case-sensitive),
    # otherwise the entry is silently skipped.
    order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL",
             "OpenML URL", "Data file", "Pickle file", "# of features", "# of instances"]
    fields = [(key, fields[key]) for key in order if key in fields]

    longest_field_name_length = max(len(name) for name, value in fields)
    # Produces e.g. "{:.<13}: {}": left-aligned name, dot-filled to a common width.
    field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
    body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
    return header + body
205+
176206
def _data_arff_to_pickle(self, data_file):
177207
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
178208
if os.path.exists(data_pickle_file):
@@ -368,9 +398,25 @@ def decode_arff(fh):
368398
def _convert_array_format(data, array_format, attribute_names):
369399
"""Convert a dataset to a given array format.
370400
371-
By default, the data are stored as a sparse matrix or a pandas
372-
dataframe. One might be interested to get a pandas SparseDataFrame or a
373-
NumPy array instead, respectively.
401+
Converts to numpy array if data is non-sparse.
402+
Converts to a sparse dataframe if data is sparse.
403+
404+
Parameters
405+
----------
406+
array_format : str {'array', 'dataframe'}
407+
Desired data type of the output
408+
- If array_format='array'
409+
If data is non-sparse
410+
Converts to numpy-array
411+
Enforces numeric encoding of categorical columns
412+
Missing values are represented as NaN in the numpy-array
413+
else returns data as is
414+
- If array_format='dataframe'
415+
If data is sparse
416+
Works only on sparse data
417+
Converts sparse data to sparse dataframe
418+
else returns data as is
419+
374420
"""
375421
if array_format == "array" and not scipy.sparse.issparse(data):
376422
# We encode the categories such that they are integer to be able
@@ -396,8 +442,11 @@ def _encode_if_category(column):
396442
'PyOpenML cannot handle string when returning numpy'
397443
' arrays. Use dataset_format="dataframe".'
398444
)
399-
if array_format == "dataframe" and scipy.sparse.issparse(data):
445+
elif array_format == "dataframe" and scipy.sparse.issparse(data):
400446
return pd.SparseDataFrame(data, columns=attribute_names)
447+
else:
448+
data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
449+
warn("Cannot convert {} to '{}'. Returning input data.".format(data_type, array_format))
401450
return data
402451

403452
@staticmethod

openml/datasets/functions.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,30 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str:
165165
return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
166166

167167

168+
def list_qualities() -> List[str]:
    """ Return list of data qualities available.

    The function performs an API call to retrieve the entire list of
    data qualities that are computed on the datasets uploaded.

    Returns
    -------
    list
        The ``oml:quality`` entries found under ``oml:data_qualities_list``
        in the parsed server response.

    Raises
    ------
    ValueError
        If the response does not contain ``oml:data_qualities_list``.
    TypeError
        If ``oml:quality`` was not parsed as a list.
    """
    api_call = "data/qualities/list"
    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
    # force_list needs a real tuple: ('oml:quality') without the trailing
    # comma is just a string, which makes xmltodict match tag names by
    # substring instead of by equality.
    parsed = xmltodict.parse(xml_string, force_list=('oml:quality',))
    # Minimalistic check if the XML is useful
    if 'oml:data_qualities_list' not in parsed:
        raise ValueError('Error in return XML, does not contain '
                         '"oml:data_qualities_list"')
    qualities = parsed['oml:data_qualities_list']['oml:quality']
    if not isinstance(qualities, list):
        raise TypeError('Error in return XML, does not contain '
                        '"oml:quality" as a list')
    return qualities
190+
191+
168192
def list_datasets(
169193
offset: Optional[int] = None,
170194
size: Optional[int] = None,

openml/evaluations/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .evaluation import OpenMLEvaluation
2-
from .functions import list_evaluations
2+
from .functions import list_evaluations, list_evaluation_measures
33

4-
__all__ = ['OpenMLEvaluation', 'list_evaluations']
4+
__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures']

openml/evaluations/evaluation.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import openml.config
2+
13

24
class OpenMLEvaluation(object):
35
"""
@@ -47,3 +49,32 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
4749
self.value = value
4850
self.values = values
4951
self.array_data = array_data
52+
53+
def __str__(self):
    """Return a human-readable, multi-line summary of the evaluation.

    The summary consists of an underlined "OpenML Evaluation" header
    followed by one ``<field name>: <value>`` line per field. Field names
    are padded with dots so that all values line up in one column.

    Returns
    -------
    str
    """
    header = "OpenML Evaluation"
    header = '{}\n{}\n'.format(header, '=' * len(header))

    # Drops the trailing len('api/v1/xml') characters from the configured
    # server address to build website URLs.
    # NOTE(review): presumably the address always ends with 'api/v1/xml' - confirm.
    base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
    fields = {"Upload Date": self.upload_time,
              "Run ID": self.run_id,
              "OpenML Run URL": "{}r/{}".format(base_url, self.run_id),
              "Task ID": self.task_id,
              "OpenML Task URL": "{}t/{}".format(base_url, self.task_id),
              "Flow ID": self.flow_id,
              "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id),
              "Setup ID": self.setup_id,
              "Data ID": self.data_id,
              "Data Name": self.data_name,
              "OpenML Data URL": "{}d/{}".format(base_url, self.data_id),
              "Metric Used": self.function,
              "Result": self.value}

    # Determines the order in which the information will be printed. The
    # names must match the keys of `fields` exactly, otherwise the entry is
    # silently skipped. Every element needs its own trailing comma: adjacent
    # string literals are concatenated into a single (non-matching) name.
    order = ["Upload Date", "Run ID", "OpenML Run URL", "Task ID", "OpenML Task URL",
             "Flow ID", "OpenML Flow URL", "Setup ID", "Data ID", "Data Name",
             "OpenML Data URL", "Metric Used", "Result"]
    fields = [(key, fields[key]) for key in order if key in fields]

    longest_field_name_length = max(len(name) for name, value in fields)
    # Produces e.g. "{:.<15}: {}": left-aligned name, dot-filled to a common width.
    field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
    body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
    return header + body

openml/evaluations/functions.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,29 @@ def __list_evaluations(api_call, output_format='object'):
200200
evals = pd.DataFrame.from_dict(evals, orient='index')
201201

202202
return evals
203+
204+
205+
def list_evaluation_measures() -> List[str]:
    """ Return list of evaluation measures available.

    The function performs an API call to retrieve the entire list of
    evaluation measures that are available.

    Returns
    -------
    list
        The ``oml:measure`` entries of the first ``oml:measures`` element
        found under ``oml:evaluation_measures`` in the parsed server response.

    Raises
    ------
    ValueError
        If the response does not contain ``oml:evaluation_measures``.
    TypeError
        If ``oml:measure`` was not parsed as a list.
    """
    api_call = "evaluationmeasure/list"
    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
    # force_list needs a real tuple. The former ('oml:measures') was a plain
    # string, so xmltodict matched tag names by substring; 'oml:measure' was
    # only forced into a list because it is a substring of 'oml:measures'.
    # Listing both tags keeps that behaviour while making it explicit.
    parsed = xmltodict.parse(xml_string, force_list=('oml:measures', 'oml:measure'))
    # Minimalistic check if the XML is useful
    if 'oml:evaluation_measures' not in parsed:
        raise ValueError('Error in return XML, does not contain '
                         '"oml:evaluation_measures"')
    measures = parsed['oml:evaluation_measures']['oml:measures'][0]['oml:measure']
    if not isinstance(measures, list):
        raise TypeError('Error in return XML, does not contain '
                        '"oml:measure" as a list')
    return measures

openml/flows/flow.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from ..extensions import get_extension_by_flow
88
from ..utils import extract_xml_tags, _tag_entity
99

10+
import openml.config
11+
1012

1113
class OpenMLFlow(object):
1214
"""OpenML Flow. Stores machine learning models.
@@ -132,6 +134,35 @@ def __init__(self, name, description, model, components, parameters,
132134

133135
self.extension = get_extension_by_flow(self)
134136

137+
def __str__(self):
    """Return a human-readable, multi-line summary of the flow.

    An underlined "OpenML Flow" title is followed by one
    ``<label>: <value>`` line per available field, with labels dot-padded
    to a common width. Optional fields (ID, URL, binary URL, upload date)
    are listed only when the corresponding attribute is not None.

    Returns
    -------
    str
    """
    title = "OpenML Flow"
    title = '{}\n{}\n'.format(title, '=' * len(title))

    # Drops the trailing len('api/v1/xml') characters from the configured
    # server address to build the flow's website URL.
    site_url = "{}".format(openml.config.server[:-len('api/v1/xml')])

    # Rows are collected directly in their display order.
    rows = []
    if self.flow_id is not None:
        if self.version is None:
            rows.append(("Flow ID", self.flow_id))
        else:
            rows.append(("Flow ID", "{} (version {})".format(self.flow_id, self.version)))
        rows.append(("Flow URL", "{}f/{}".format(site_url, self.flow_id)))
    rows.append(("Flow Name", self.name))
    rows.append(("Flow Description", self.description))
    if self.binary_url is not None:
        rows.append(("Binary URL", self.binary_url))
    if self.upload_date is not None:
        rows.append(("Upload Date", self.upload_date.replace('T', ' ')))
    rows.append(("Dependencies", self.dependencies))

    label_width = max(len(label) for label, _ in rows)
    # Produces e.g. "{:.<16}: {}": left-aligned label, dot-filled to a common width.
    row_template = "{{:.<{}}}: {{}}".format(label_width)
    lines = [row_template.format(label, content) for label, content in rows]
    return title + '\n'.join(lines)
165+
135166
def _to_xml(self) -> str:
136167
"""Generate xml representation of self for upload to server.
137168

openml/runs/functions.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -78,22 +78,22 @@ def run_model_on_task(
7878
Flow generated from the model.
7979
"""
8080

81-
extension = get_extension_by_model(model, raise_if_no_extension=True)
82-
if extension is None:
83-
# This should never happen and is only here to please mypy will be gone soon once the
84-
# whole function is removed
85-
raise TypeError(extension)
86-
8781
# TODO: At some point in the future do not allow for arguments in old order (6-2018).
8882
# Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
8983
# When removing this please also remove the method `is_estimator` from the extension
9084
# interface as it is only used here (MF, 3-2019)
91-
if isinstance(model, OpenMLTask) and extension.is_estimator(model):
85+
if isinstance(model, OpenMLTask):
9286
warnings.warn("The old argument order (task, model) is deprecated and "
9387
"will not be supported in the future. Please use the "
9488
"order (model, task).", DeprecationWarning)
9589
task, model = model, task
9690

91+
extension = get_extension_by_model(model, raise_if_no_extension=True)
92+
if extension is None:
93+
# This should never happen and is only here to please mypy will be gone soon once the
94+
# whole function is removed
95+
raise TypeError(extension)
96+
9797
flow = extension.model_to_flow(model)
9898

9999
run = run_flow_on_task(
@@ -159,9 +159,6 @@ def run_flow_on_task(
159159
if flow_tags is not None and not isinstance(flow_tags, list):
160160
raise ValueError("flow_tags should be a list")
161161

162-
if task.task_id is None:
163-
raise ValueError("The task should be published at OpenML")
164-
165162
# TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
166163
# Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
167164
if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
@@ -171,6 +168,9 @@ def run_flow_on_task(
171168
"order (model, Flow).", DeprecationWarning)
172169
task, flow = flow, task
173170

171+
if task.task_id is None:
172+
raise ValueError("The task should be published at OpenML")
173+
174174
flow.model = flow.extension.seed_model(flow.model, seed=seed)
175175

176176
# We only need to sync with the server right now if we want to upload the flow,

0 commit comments

Comments
 (0)