Skip to content

Commit 40521b5

Browse files
authored
Merge pull request #296 from openml/develop
Develop
2 parents c5c1dc3 + 3b37dfc commit 40521b5

31 files changed

+1092
-217
lines changed

openml/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
from . import runs
2323
from . import flows
2424
from . import setups
25+
from . import study
26+
from . import evaluations
2527
from .runs import OpenMLRun
2628
from .tasks import OpenMLTask, OpenMLSplit
2729
from .flows import OpenMLFlow
2830

29-
__version__ = "0.4.0dev"
31+
from .__version__ import __version__
3032

3133

3234
def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
@@ -66,5 +68,6 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
6668

6769

6870
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
69-
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
71+
'OpenMLSplit', 'OpenMLEvaluation', 'OpenMLSetup',
72+
'OpenMLTask', 'OpenMLFlow', 'datasets', 'evaluations',
7073
'config', 'runs', 'flows', 'tasks', 'setups']

openml/__version__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""Version information."""
2+
3+
# The following line *must* be the last in the module, exactly as formatted:
4+
__version__ = "0.5.0dev"

openml/_api_calls.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,16 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
104104
def _read_url(url, data=None):
105105

106106
data = {} if data is None else data
107-
data['api_key'] = config.apikey
107+
if config.apikey is not None:
108+
data['api_key'] = config.apikey
108109

109-
# Using requests.post sets header 'Accept-encoding' automatically to
110-
# 'gzip,deflate'
111-
response = requests.post(url, data=data)
110+
if len(data) == 0 or (len(data) == 1 and 'api_key' in data):
111+
# do a GET
112+
response = requests.get(url, params=data)
113+
else: # an actual post request
114+
# Using requests.post sets header 'Accept-encoding' automatically to
115+
# 'gzip,deflate'
116+
response = requests.post(url, data=data)
112117

113118
if response.status_code != 200:
114119
raise _parse_server_exception(response)
@@ -117,6 +122,7 @@ def _read_url(url, data=None):
117122
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
118123
return response.text
119124

125+
120126
def _parse_server_exception(response):
121127
# OpenML has a sophisticated error system
122128
# where information about failures is provided. try to parse this

openml/datasets/dataset.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
3939
row_id_attribute=None, ignore_attribute=None,
4040
version_label=None, citation=None, tag=None, visibility=None,
4141
original_data_url=None, paper_url=None, update_comment=None,
42-
md5_checksum=None, data_file=None, features=None):
42+
md5_checksum=None, data_file=None, features=None, qualities=None):
4343
# Attributes received by querying the RESTful API
4444
self.dataset_id = int(dataset_id) if dataset_id is not None else None
4545
self.name = name
@@ -74,6 +74,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7474
self.md5_cheksum = md5_checksum
7575
self.data_file = data_file
7676
self.features = None
77+
self.qualities = None
7778

7879
if features is not None:
7980
self.features = {}
@@ -87,6 +88,12 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
8788
raise ValueError('Data features not provided in right order')
8889
self.features[feature.index] = feature
8990

91+
if qualities is not None:
92+
self.qualities = {}
93+
for idx, xmlquality in enumerate(qualities['oml:quality']):
94+
name = xmlquality['oml:name']
95+
value = xmlquality['oml:value']
96+
self.qualities[name] = value
9097

9198
if data_file is not None:
9299
if self._data_features_supported():

openml/datasets/functions.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def _get_cached_dataset(dataset_id):
7575
description = _get_cached_dataset_description(dataset_id)
7676
arff_file = _get_cached_dataset_arff(dataset_id)
7777
features = _get_cached_dataset_features(dataset_id)
78-
dataset = _create_dataset_from_description(description, features, arff_file)
78+
qualities = _get_cached_dataset_qualities(dataset_id)
79+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
7980

8081
return dataset
8182

@@ -107,6 +108,19 @@ def _get_cached_dataset_features(dataset_id):
107108
"cached" % dataset_id)
108109

109110

111+
def _get_cached_dataset_qualities(dataset_id):
    """Read the cached ``qualities.xml`` of a dataset from disk.

    Parameters
    ----------
    dataset_id : int
        Id of the dataset whose qualities should be loaded.

    Returns
    -------
    dict
        The parsed ``oml:data_qualities`` element of the cached XML.

    Raises
    ------
    OpenMLCacheException
        If no qualities file for this dataset exists in the cache.
    """
    qualities_file = os.path.join(config.get_cache_directory(), "datasets",
                                  str(dataset_id), "qualities.xml")
    try:
        with io.open(qualities_file, encoding='utf8') as fh:
            qualities_xml = fh.read()
    except (IOError, OSError):
        raise OpenMLCacheException("Dataset qualities for dataset id %d not "
                                   "cached" % dataset_id)
    # Parsing happens outside the try block: a malformed file should surface
    # as a parse error, not be misreported as "not cached".
    return xmltodict.parse(qualities_xml)["oml:data_qualities"]
122+
123+
110124
def _get_cached_dataset_arff(dataset_id):
111125
cache_dir = config.get_cache_directory()
112126
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
@@ -272,7 +286,7 @@ def get_dataset(dataset_id):
272286
_remove_dataset_cache_dir(did_cache_dir)
273287
raise e
274288

275-
dataset = _create_dataset_from_description(description, features, arff_file)
289+
dataset = _create_dataset_from_description(description, features, qualities, arff_file)
276290
return dataset
277291

278292

@@ -470,7 +484,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
470484
'Please do this manually!' % did_cache_dir)
471485

472486

473-
def _create_dataset_from_description(description, features, arff_file):
487+
def _create_dataset_from_description(description, features, qualities, arff_file):
474488
"""Create a dataset object from a description dict.
475489
476490
Parameters
@@ -510,5 +524,6 @@ def _create_dataset_from_description(description, features, arff_file):
510524
description.get("oml:update_comment"),
511525
description.get("oml:md5_checksum"),
512526
data_file=arff_file,
513-
features=features)
527+
features=features,
528+
qualities=qualities)
514529
return dataset

openml/evaluations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .evaluation import OpenMLEvaluation
2+
from .functions import list_evaluations

openml/evaluations/evaluation.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
2+
class OpenMLEvaluation(object):
    """Meta-information for one run / evaluation-measure combination.

    Mirrors a single entry as returned by the ``evaluation/list`` endpoint.

    Parameters
    ----------
    run_id : int
        Id of the run that was evaluated.
    task_id : int
        Id of the task the run was executed on.
    setup_id : int
        Id of the setup (flow plus hyperparameter settings) that was used.
    flow_id : int
        Id of the flow that was used.
    flow_name : str
        Name of the flow that was used.
    data_id : int
        Id of the dataset underlying the task.
    data_name : str
        The name of the dataset.
    function : str
        The evaluation function of this item (e.g., accuracy).
    upload_time : str
        The time of evaluation.
    value : float
        The value of this evaluation.
    array_data : str, optional
        List of information per class (e.g., in case of precision, auroc,
        recall).
    """

    def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
                 data_id, data_name, function, upload_time, value,
                 array_data=None):
        # Store every constructor argument verbatim under an attribute of
        # the same name.
        fields = (('run_id', run_id), ('task_id', task_id),
                  ('setup_id', setup_id), ('flow_id', flow_id),
                  ('flow_name', flow_name), ('data_id', data_id),
                  ('data_name', data_name), ('function', function),
                  ('upload_time', upload_time), ('value', value),
                  ('array_data', array_data))
        for attr_name, attr_value in fields:
            setattr(self, attr_name, attr_value)
40+

openml/evaluations/functions.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import xmltodict
2+
3+
from .._api_calls import _perform_api_call
4+
from ..evaluations import OpenMLEvaluation
5+
6+
def list_evaluations(function, offset=None, size=None, id=None, task=None,
                     setup=None, flow=None, uploader=None, tag=None):
    """List all run-evaluation pairs matching all of the given filters.

    Performs the API call ``/evaluation/list/function/{function}/{filters}``.

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show
    id : list, optional
        run ids to filter on
    task : list, optional
        task ids to filter on
    setup : list, optional
        setup ids to filter on
    flow : list, optional
        flow ids to filter on
    uploader : list, optional
        uploader ids to filter on
    tag : str, optional
        tag to filter on

    Returns
    -------
    dict
        The evaluations found, keyed by run id.
    """
    uri = "evaluation/list/function/%s" % function
    if offset is not None:
        uri += "/offset/%d" % int(offset)
    if size is not None:
        uri += "/limit/%d" % int(size)
    # Each id-list filter becomes a path segment with comma-separated ids;
    # segment order matches the server's documented filter order.
    id_filters = (("run", id), ("task", task), ("setup", setup),
                  ("flow", flow), ("uploader", uploader))
    for segment, ids in id_filters:
        if ids is not None:
            uri += "/%s/%s" % (segment, ",".join(str(int(i)) for i in ids))
    if tag is not None:
        uri += "/tag/%s" % tag

    return _list_evaluations(uri)
59+
60+
def _list_evaluations(api_call):
    """Perform an ``evaluation/list`` API call and parse the response.

    Parameters
    ----------
    api_call : str
        The (relative) API call to perform.

    Returns
    -------
    dict
        Mapping from run id to :class:`OpenMLEvaluation`.

    Raises
    ------
    ValueError
        If the server response does not contain an ``oml:evaluations``
        element.
    TypeError
        If ``oml:evaluation`` is neither a list nor a dict.
    """

    xml_string = _perform_api_call(api_call)

    evals_dict = xmltodict.parse(xml_string)
    # Minimalistic check if the XML is useful
    if 'oml:evaluations' not in evals_dict:
        raise ValueError('Error in return XML, does not contain "oml:evaluations": %s'
                         % str(evals_dict))

    # xmltodict parses a single child element as a dict instead of a
    # one-element list; normalize to a list either way.
    evaluations = evals_dict['oml:evaluations']['oml:evaluation']
    if isinstance(evaluations, list):
        evals_list = evaluations
    elif isinstance(evaluations, dict):
        evals_list = [evaluations]
    else:
        raise TypeError('Expected list or dict for "oml:evaluation", '
                        'received %s' % type(evaluations))

    evals = dict()
    for eval_ in evals_list:
        run_id = int(eval_['oml:run_id'])
        # Bug fix: the looked-up value was previously discarded instead of
        # being assigned, so array_data always ended up None.
        array_data = eval_.get('oml:array_data')

        evaluation = OpenMLEvaluation(run_id,
                                      int(eval_['oml:task_id']),
                                      int(eval_['oml:setup_id']),
                                      int(eval_['oml:flow_id']),
                                      eval_['oml:flow_name'],
                                      # cast to int for consistency with the
                                      # other ids (documented as int)
                                      int(eval_['oml:data_id']),
                                      eval_['oml:data_name'],
                                      eval_['oml:function'],
                                      eval_['oml:upload_time'],
                                      float(eval_['oml:value']),
                                      array_data)
        evals[run_id] = evaluation
    return evals
93+

openml/flows/sklearn_converter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,10 +586,13 @@ def check(param_dict, disallow_parameter=False):
586586
elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
587587
param_distributions = model.param_distributions
588588
else:
589+
if hasattr(model, 'param_distributions'):
590+
param_distributions = model.param_distributions
591+
else:
592+
raise AttributeError('Using subclass BaseSearchCV other than {GridSearchCV, RandomizedSearchCV}. Could not find attribute param_distributions. ')
589593
print('Warning! Using subclass BaseSearchCV other than ' \
590594
'{GridSearchCV, RandomizedSearchCV}. Should implement param check. ')
591-
pass
592-
595+
593596
if not check(param_distributions, True):
594597
raise PyOpenMLError('openml-python should not be used to '
595598
'optimize the n_jobs parameter.')

0 commit comments

Comments
 (0)