
Commit 688af3f

Merge pull request #279 from openml/get_local_evaluations
Get local evaluations
2 parents 18d93cd + 8f3f999 commit 688af3f

File tree: 4 files changed, +183 -11 lines changed

openml/_api_calls.py
Lines changed: 1 addition & 0 deletions

@@ -117,6 +117,7 @@ def _read_url(url, data=None):
         warnings.warn('Received uncompressed content from OpenML for %s.' % url)
     return response.text
 
+
 def _parse_server_exception(response):
     # OpenML has a sophisticated error system
     # where information about failures is provided. Try to parse this

openml/runs/functions.py
Lines changed: 12 additions & 1 deletion

@@ -10,6 +10,7 @@
 import sklearn.pipeline
 import six
 import xmltodict
+import sklearn.metrics
 
 import openml
 import openml.utils
@@ -113,7 +114,6 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
     else:
         run.fold_evaluations = fold_evaluations
 
-
     config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
 
     return run
@@ -427,6 +427,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
 
             ProbaY = model_fold.predict_proba(testX)
             PredY = model_fold.predict(testX)
+
+            # add client-side calculated metrics; these might be used on the server as a consistency check
+            def _calculate_local_measure(sklearn_fn, openml_name):
+                user_defined_measures_fold[openml_name][rep_no][fold_no] = \
+                    sklearn_fn(testY, PredY)
+                user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = \
+                    sklearn_fn(testY, PredY)
+
+            _calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
+
             if can_measure_runtime:
                 modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
                 user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
@@ -457,6 +467,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
            user_defined_measures_sample
 
 
+
 def _extract_arfftrace(model, rep_no, fold_no):
     if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
         raise ValueError('model should be instance of'\
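For context on the hunk above: _calculate_local_measure writes into user_defined_measures_fold and user_defined_measures_sample, which are nested mappings keyed by measure name, repeat, fold (and sample). A minimal, hypothetical sketch of that structure and of the accuracy bookkeeping, using nested defaultdicts (the container shapes and example data are assumptions for illustration, not taken from this diff):

from collections import defaultdict

import sklearn.metrics

# Hypothetical sketch: nested mappings measure -> repeat -> fold (-> sample) -> value,
# matching the indexing used by _calculate_local_measure above.
user_defined_measures_fold = defaultdict(lambda: defaultdict(dict))
user_defined_measures_sample = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

rep_no, fold_no, sample_no = 0, 0, 0
testY = [0, 1, 1, 0]   # illustrative ground truth for one test fold
PredY = [0, 1, 0, 0]   # illustrative predictions for the same fold

def _calculate_local_measure(sklearn_fn, openml_name):
    # store the client-side score per fold and per sample
    user_defined_measures_fold[openml_name][rep_no][fold_no] = sklearn_fn(testY, PredY)
    user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = sklearn_fn(testY, PredY)

_calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
print(user_defined_measures_fold['predictive_accuracy'][0][0])  # 0.75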

openml/runs/run.py
Lines changed: 91 additions & 2 deletions

@@ -1,14 +1,15 @@
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import json
 import sys
 import time
+import numpy as np
 
 import arff
 import xmltodict
 
 import openml
 from ..tasks import get_task
-from .._api_calls import _perform_api_call
+from .._api_calls import _perform_api_call, _file_id_to_url, _read_url_files
 from ..exceptions import PyOpenMLError
 
 class OpenMLRun(object):
@@ -106,6 +107,94 @@ def _generate_trace_arff_dict(self):
 
         return arff_dict
 
+    def get_metric_fn(self, sklearn_fn, kwargs={}):
+        '''Calculates metric scores based on the predicted values. Assumes the
+        run has been executed locally (and contains run data), or that the
+        predictions file is available among the run's output files.
+        Furthermore, it assumes that the 'correct' attribute is specified in
+        the arff (which is an optional field, but always the case for
+        openml-python runs).
+
+        Parameters
+        ----------
+        sklearn_fn : function
+            a function pointer to a sklearn function that
+            accepts y_true, y_pred and **kwargs
+
+        Returns
+        -------
+        scores : list
+            a list of floats, of length num_folds * num_repeats
+        '''
+        if self.data_content is not None:
+            predictions_arff = self._generate_arff_dict()
+        elif 'predictions' in self.output_files:
+            predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
+            predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
+            # TODO: make this a stream reader
+        else:
+            raise ValueError('Run should have been executed locally or contain a server-side predictions file.')
+
+        attribute_names = [att[0] for att in predictions_arff['attributes']]
+        if 'correct' not in attribute_names:
+            raise ValueError('Attribute "correct" should be set')
+        if 'prediction' not in attribute_names:
+            raise ValueError('Attribute "prediction" should be set')
+
+        def _attribute_list_to_dict(attribute_list):
+            # convenience function: creates a mapping from the name of each attribute
+            # in the arff prediction file to its index. This is necessary
+            # because the number of classes can differ between tasks.
+            res = dict()
+            for idx in range(len(attribute_list)):
+                res[attribute_list[idx][0]] = idx
+            return res
+        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])
+
+        # might throw KeyError!
+        predicted_idx = attribute_dict['prediction']
+        correct_idx = attribute_dict['correct']
+        repeat_idx = attribute_dict['repeat']
+        fold_idx = attribute_dict['fold']
+        sample_idx = attribute_dict['sample']  # TODO: this one might be zero
+
+        if predictions_arff['attributes'][predicted_idx][1] != predictions_arff['attributes'][correct_idx][1]:
+            pred = predictions_arff['attributes'][predicted_idx][1]
+            corr = predictions_arff['attributes'][correct_idx][1]
+            raise ValueError('Predicted and Correct do not have equal values: %s vs. %s' % (str(pred), str(corr)))
+
+        # TODO: these could be cached
+        values_predict = {}
+        values_correct = {}
+        for line_idx, line in enumerate(predictions_arff['data']):
+            rep = line[repeat_idx]
+            fold = line[fold_idx]
+            samp = line[sample_idx]
+
+            # TODO: can be sped up by preprocessing the index, but OK for now.
+            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
+            correct = predictions_arff['attributes'][predicted_idx][1].index(line[correct_idx])
+            if rep not in values_predict:
+                values_predict[rep] = dict()
+                values_correct[rep] = dict()
+            if fold not in values_predict[rep]:
+                values_predict[rep][fold] = dict()
+                values_correct[rep][fold] = dict()
+            if samp not in values_predict[rep][fold]:
+                values_predict[rep][fold][samp] = []
+                values_correct[rep][fold][samp] = []
+
+            values_predict[rep][fold][samp].append(prediction)
+            values_correct[rep][fold][samp].append(correct)
+
+        scores = []
+        for rep in values_predict.keys():
+            for fold in values_predict[rep].keys():
+                last_sample = len(values_predict[rep][fold]) - 1
+                y_pred = values_predict[rep][fold][last_sample]
+                y_true = values_correct[rep][fold][last_sample]
+                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
+        return np.array(scores)
+
     def publish(self):
         """Publish a run to the OpenML server.
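As a usage sketch of the new OpenMLRun.get_metric_fn (mirroring the tests added below): a run executed locally via run_model_on_task keeps its predictions, and any sklearn metric with a (y_true, y_pred, **kwargs) signature can then be evaluated per repeat/fold on the client. The pipeline and task id here are illustrative only (task 7 is the one used in the new tests; Imputer follows the sklearn version these tests target):

import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

import openml

# illustrative pipeline and task id (task 7 is also used by the new tests)
clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
                      ('estimator', RandomForestClassifier())])
task = openml.tasks.get_task(7)
run = openml.runs.run_model_on_task(task, clf)

# one score per repeat/fold, computed client-side from the stored predictions
accuracies = run.get_metric_fn(sklearn.metrics.accuracy_score)
kappas = run.get_metric_fn(sklearn.metrics.cohen_kappa_score, {'weights': None})
print(accuracies.mean(), kappas.mean())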

tests/test_runs/test_run_functions.py
Lines changed: 79 additions & 8 deletions

@@ -166,22 +166,32 @@ def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_time_allowed=60000):
        condition outside of this function. )
        default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
        '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from openml measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
        self.assertIsInstance(fold_evaluations, dict)
        if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(fold_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some are missing)
+            self.assertEquals(set(fold_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in fold_evaluations:
                num_rep_entrees = len(fold_evaluations[measure])
                self.assertEquals(num_rep_entrees, num_repeats)
+                min_val = check_measures[measure][0]
+                max_val = check_measures[measure][1]
                for rep in range(num_rep_entrees):
                    num_fold_entrees = len(fold_evaluations[measure][rep])
                    self.assertEquals(num_fold_entrees, num_folds)
                    for fold in range(num_fold_entrees):
                        evaluation = fold_evaluations[measure][rep][fold]
                        self.assertIsInstance(evaluation, float)
-                        self.assertGreater(evaluation, 0)  # should take at least one millisecond (?)
-                        self.assertLess(evaluation, max_time_allowed)
+                        self.assertGreaterEqual(evaluation, min_val)
+                        self.assertLessEqual(evaluation, max_val)
 
 
    def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
@@ -193,12 +203,20 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
        condition outside of this function. )
        default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
        '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from openml measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
        self.assertIsInstance(sample_evaluations, dict)
        if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(sample_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some are missing)
+            self.assertEquals(set(sample_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in sample_evaluations:
                num_rep_entrees = len(sample_evaluations[measure])
                self.assertEquals(num_rep_entrees, num_repeats)
                for rep in range(num_rep_entrees):
@@ -309,6 +327,16 @@ def test_run_and_upload(self):
        for clf, rsv in zip(clfs, random_state_fixtures):
            run = self._perform_run(task_id, num_test_instances, clf,
                                    random_state_value=rsv)
+
+            # obtain accuracy scores using get_metric_fn:
+            accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+            # compare with the scores in user defined measures
+            accuracy_scores_provided = []
+            for rep in run.fold_evaluations['predictive_accuracy'].keys():
+                for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                    accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+            self.assertEquals(sum(accuracy_scores_provided), sum(accuracy_scores))
+
            if isinstance(clf, BaseSearchCV):
                if isinstance(clf, GridSearchCV):
                    grid_iterations = 1
@@ -385,6 +413,49 @@ def test_initialize_cv_from_run(self):
        self.assertEquals(modelS.cv.random_state, 62501)
        self.assertEqual(modelR.cv.random_state, 62501)
 
+    def _test_local_evaluations(self, run):
+
+        # compare with the scores in user defined measures
+        accuracy_scores_provided = []
+        for rep in run.fold_evaluations['predictive_accuracy'].keys():
+            for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+        accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+        np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores)
+
+        # also check if we can obtain some other scores  # TODO: how to do AUC?
+        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
+                 (sklearn.metrics.auc, {'reorder': True}),
+                 (sklearn.metrics.average_precision_score, {}),
+                 (sklearn.metrics.jaccard_similarity_score, {}),
+                 (sklearn.metrics.precision_score, {'average': 'macro'}),
+                 (sklearn.metrics.brier_score_loss, {})]
+        for test_idx, test in enumerate(tests):
+            alt_scores = run.get_metric_fn(test[0], test[1])
+            self.assertEquals(len(alt_scores), 10)
+            for idx in range(len(alt_scores)):
+                self.assertGreaterEqual(alt_scores[idx], 0)
+                self.assertLessEqual(alt_scores[idx], 1)
+
+    def test_local_run_metric_score(self):
+
+        # construct a scikit-learn classifier
+        clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), ('estimator', RandomForestClassifier())])
+
+        # download task
+        task = openml.tasks.get_task(7)
+
+        # invoke OpenML run
+        run = openml.runs.run_model_on_task(task, clf)
+
+        self._test_local_evaluations(run)
+
+    def test_online_run_metric_score(self):
+        openml.config.server = self.production_server
+        run = openml.runs.get_run(5965513)  # important to use a binary classification task, due to assertions
+        self._test_local_evaluations(run)
+
+
    def test_initialize_model_from_run(self):
        clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                               ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
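The kwargs argument of get_metric_fn is forwarded to the metric, so metrics that need extra parameters work the same way as plain accuracy. A small sketch, assuming the client is already configured for the production server as in test_online_run_metric_score (the run id comes from that test; f1_score is an extra illustration, not part of this diff):

import sklearn.metrics
import openml

# assumes openml.config.server already points at the production server,
# as done in test_online_run_metric_score above
run = openml.runs.get_run(5965513)  # binary classification run with an uploaded predictions file

# the dict passed as the second argument is forwarded to the metric as **kwargs
f1_per_fold = run.get_metric_fn(sklearn.metrics.f1_score, {'average': 'macro'})
print(f1_per_fold.mean())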
