
Commit 688af3f

Merge pull request #279 from openml/get_local_evaluations
Get local evaluations
2 parents 18d93cd + 8f3f999 commit 688af3f

File tree: 4 files changed, +183 -11 lines changed

openml/_api_calls.py
Lines changed: 1 addition & 0 deletions

@@ -117,6 +117,7 @@ def _read_url(url, data=None):
         warnings.warn('Received uncompressed content from OpenML for %s.' % url)
     return response.text
 
+
 def _parse_server_exception(response):
     # OpenML has a sophisticated error system
     # where information about failures is provided. Try to parse this

openml/runs/functions.py
Lines changed: 12 additions & 1 deletion

@@ -10,6 +10,7 @@
 import sklearn.pipeline
 import six
 import xmltodict
+import sklearn.metrics
 
 import openml
 import openml.utils
@@ -113,7 +114,6 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
     else:
         run.fold_evaluations = fold_evaluations
 
-
     config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
 
     return run
@@ -427,6 +427,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
 
             ProbaY = model_fold.predict_proba(testX)
             PredY = model_fold.predict(testX)
+
+            # add client-side calculated metrics; these might be used on the server as a consistency check
+            def _calculate_local_measure(sklearn_fn, openml_name):
+                user_defined_measures_fold[openml_name][rep_no][fold_no] = \
+                    sklearn_fn(testY, PredY)
+                user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = \
+                    sklearn_fn(testY, PredY)
+
+            _calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
+
             if can_measure_runtime:
                 modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
                 user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
@@ -457,6 +467,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
            user_defined_measures_sample
 
 
+
 def _extract_arfftrace(model, rep_no, fold_no):
     if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
         raise ValueError('model should be instance of'\
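For context on the hunk above: _calculate_local_measure writes into user_defined_measures_fold and user_defined_measures_sample, which are nested mappings keyed by measure name, repeat, fold (and sample). A minimal, hypothetical sketch of that structure and of the accuracy bookkeeping, using nested defaultdicts (the container shapes and example data are assumptions for illustration, not taken from this diff):

from collections import defaultdict

import sklearn.metrics

# Hypothetical sketch: nested mappings measure -> repeat -> fold (-> sample) -> value,
# matching the indexing used by _calculate_local_measure above.
user_defined_measures_fold = defaultdict(lambda: defaultdict(dict))
user_defined_measures_sample = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

rep_no, fold_no, sample_no = 0, 0, 0
testY = [0, 1, 1, 0]   # illustrative ground truth for one test fold
PredY = [0, 1, 0, 0]   # illustrative predictions for the same fold

def _calculate_local_measure(sklearn_fn, openml_name):
    # store the client-side score per fold and per sample
    user_defined_measures_fold[openml_name][rep_no][fold_no] = sklearn_fn(testY, PredY)
    user_defined_measures_sample[openml_name][rep_no][fold_no][sample_no] = sklearn_fn(testY, PredY)

_calculate_local_measure(sklearn.metrics.accuracy_score, 'predictive_accuracy')
print(user_defined_measures_fold['predictive_accuracy'][0][0])  # 0.75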

openml/runs/run.py
Lines changed: 91 additions & 2 deletions

@@ -1,14 +1,15 @@
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 import json
 import sys
 import time
+import numpy as np
 
 import arff
 import xmltodict
 
 import openml
 from ..tasks import get_task
-from .._api_calls import _perform_api_call
+from .._api_calls import _perform_api_call, _file_id_to_url, _read_url_files
 from ..exceptions import PyOpenMLError
 
 class OpenMLRun(object):
@@ -106,6 +107,94 @@ def _generate_trace_arff_dict(self):
 
         return arff_dict
 
+    def get_metric_fn(self, sklearn_fn, kwargs={}):
+        '''Calculates metric scores based on the predicted values. Assumes the
+        run has been executed locally (and contains run data), or that the
+        predictions file is available among the run's output files.
+        Furthermore, it assumes that the 'correct' attribute is specified in
+        the arff (which is an optional field, but always the case for
+        openml-python runs).
+
+        Parameters
+        ----------
+        sklearn_fn : function
+            a function pointer to a sklearn function that
+            accepts y_true, y_pred and **kwargs
+
+        Returns
+        -------
+        scores : list
+            a list of floats, of length num_folds * num_repeats
+        '''
+        if self.data_content is not None:
+            predictions_arff = self._generate_arff_dict()
+        elif 'predictions' in self.output_files:
+            predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
+            predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
+            # TODO: make this a stream reader
+        else:
+            raise ValueError('Run should have been executed locally or contain a server-side predictions file.')
+
+        attribute_names = [att[0] for att in predictions_arff['attributes']]
+        if 'correct' not in attribute_names:
+            raise ValueError('Attribute "correct" should be set')
+        if 'prediction' not in attribute_names:
+            raise ValueError('Attribute "prediction" should be set')
+
+        def _attribute_list_to_dict(attribute_list):
+            # convenience function: creates a mapping from the name of each attribute
+            # in the arff prediction file to its index. This is necessary
+            # because the number of classes can differ between tasks.
+            res = dict()
+            for idx in range(len(attribute_list)):
+                res[attribute_list[idx][0]] = idx
+            return res
+        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])
+
+        # might throw KeyError!
+        predicted_idx = attribute_dict['prediction']
+        correct_idx = attribute_dict['correct']
+        repeat_idx = attribute_dict['repeat']
+        fold_idx = attribute_dict['fold']
+        sample_idx = attribute_dict['sample']  # TODO: this one might be zero
+
+        if predictions_arff['attributes'][predicted_idx][1] != predictions_arff['attributes'][correct_idx][1]:
+            pred = predictions_arff['attributes'][predicted_idx][1]
+            corr = predictions_arff['attributes'][correct_idx][1]
+            raise ValueError('Predicted and Correct do not have equal values: %s vs. %s' % (str(pred), str(corr)))
+
+        # TODO: these could be cached
+        values_predict = {}
+        values_correct = {}
+        for line_idx, line in enumerate(predictions_arff['data']):
+            rep = line[repeat_idx]
+            fold = line[fold_idx]
+            samp = line[sample_idx]
+
+            # TODO: can be sped up by preprocessing the index, but OK for now.
+            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
+            correct = predictions_arff['attributes'][predicted_idx][1].index(line[correct_idx])
+            if rep not in values_predict:
+                values_predict[rep] = dict()
+                values_correct[rep] = dict()
+            if fold not in values_predict[rep]:
+                values_predict[rep][fold] = dict()
+                values_correct[rep][fold] = dict()
+            if samp not in values_predict[rep][fold]:
+                values_predict[rep][fold][samp] = []
+                values_correct[rep][fold][samp] = []
+
+            values_predict[rep][fold][samp].append(prediction)
+            values_correct[rep][fold][samp].append(correct)
+
+        scores = []
+        for rep in values_predict.keys():
+            for fold in values_predict[rep].keys():
+                last_sample = len(values_predict[rep][fold]) - 1
+                y_pred = values_predict[rep][fold][last_sample]
+                y_true = values_correct[rep][fold][last_sample]
+                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
+        return np.array(scores)
+
     def publish(self):
         """Publish a run to the OpenML server.
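As a usage sketch of the new OpenMLRun.get_metric_fn (mirroring the tests added below): a run executed locally via run_model_on_task keeps its predictions, and any sklearn metric with a (y_true, y_pred, **kwargs) signature can then be evaluated per repeat/fold on the client. The pipeline and task id here are illustrative only (task 7 is the one used in the new tests; Imputer follows the sklearn version these tests target):

import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

import openml

# illustrative pipeline and task id (task 7 is also used by the new tests)
clf = Pipeline(steps=[('imputer', Imputer(strategy='median')),
                      ('estimator', RandomForestClassifier())])
task = openml.tasks.get_task(7)
run = openml.runs.run_model_on_task(task, clf)

# one score per repeat/fold, computed client-side from the stored predictions
accuracies = run.get_metric_fn(sklearn.metrics.accuracy_score)
kappas = run.get_metric_fn(sklearn.metrics.cohen_kappa_score, {'weights': None})
print(accuracies.mean(), kappas.mean())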

tests/test_runs/test_run_functions.py
Lines changed: 79 additions & 8 deletions

@@ -166,22 +166,32 @@ def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_time_allowed=60000):
        condition outside of this function. )
        default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
        '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from openml measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
        self.assertIsInstance(fold_evaluations, dict)
        if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(fold_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some are missing)
+            self.assertEquals(set(fold_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in fold_evaluations:
                num_rep_entrees = len(fold_evaluations[measure])
                self.assertEquals(num_rep_entrees, num_repeats)
+                min_val = check_measures[measure][0]
+                max_val = check_measures[measure][1]
                for rep in range(num_rep_entrees):
                    num_fold_entrees = len(fold_evaluations[measure][rep])
                    self.assertEquals(num_fold_entrees, num_folds)
                    for fold in range(num_fold_entrees):
                        evaluation = fold_evaluations[measure][rep][fold]
                        self.assertIsInstance(evaluation, float)
-                        self.assertGreater(evaluation, 0)  # should take at least one millisecond (?)
-                        self.assertLess(evaluation, max_time_allowed)
+                        self.assertGreaterEqual(evaluation, min_val)
+                        self.assertLessEqual(evaluation, max_val)
 
 
    def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
@@ -193,12 +203,20 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
        condition outside of this function. )
        default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
        '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from openml measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
        self.assertIsInstance(sample_evaluations, dict)
        if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(sample_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some are missing)
+            self.assertEquals(set(sample_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in sample_evaluations:
                num_rep_entrees = len(sample_evaluations[measure])
                self.assertEquals(num_rep_entrees, num_repeats)
                for rep in range(num_rep_entrees):
@@ -309,6 +327,16 @@ def test_run_and_upload(self):
        for clf, rsv in zip(clfs, random_state_fixtures):
            run = self._perform_run(task_id, num_test_instances, clf,
                                    random_state_value=rsv)
+
+            # obtain accuracy scores using get_metric_fn:
+            accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+            # compare with the scores in user defined measures
+            accuracy_scores_provided = []
+            for rep in run.fold_evaluations['predictive_accuracy'].keys():
+                for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                    accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+            self.assertEquals(sum(accuracy_scores_provided), sum(accuracy_scores))
+
            if isinstance(clf, BaseSearchCV):
                if isinstance(clf, GridSearchCV):
                    grid_iterations = 1
@@ -385,6 +413,49 @@ def test_initialize_cv_from_run(self):
        self.assertEquals(modelS.cv.random_state, 62501)
        self.assertEqual(modelR.cv.random_state, 62501)
 
+    def _test_local_evaluations(self, run):
+
+        # compare with the scores in user defined measures
+        accuracy_scores_provided = []
+        for rep in run.fold_evaluations['predictive_accuracy'].keys():
+            for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+        accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+        np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores)
+
+        # also check if we can obtain some other scores  # TODO: how to do AUC?
+        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
+                 (sklearn.metrics.auc, {'reorder': True}),
+                 (sklearn.metrics.average_precision_score, {}),
+                 (sklearn.metrics.jaccard_similarity_score, {}),
+                 (sklearn.metrics.precision_score, {'average': 'macro'}),
+                 (sklearn.metrics.brier_score_loss, {})]
+        for test_idx, test in enumerate(tests):
+            alt_scores = run.get_metric_fn(test[0], test[1])
+            self.assertEquals(len(alt_scores), 10)
+            for idx in range(len(alt_scores)):
+                self.assertGreaterEqual(alt_scores[idx], 0)
+                self.assertLessEqual(alt_scores[idx], 1)
+
+    def test_local_run_metric_score(self):
+
+        # construct a scikit-learn classifier
+        clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), ('estimator', RandomForestClassifier())])
+
+        # download task
+        task = openml.tasks.get_task(7)
+
+        # invoke OpenML run
+        run = openml.runs.run_model_on_task(task, clf)
+
+        self._test_local_evaluations(run)
+
+    def test_online_run_metric_score(self):
+        openml.config.server = self.production_server
+        run = openml.runs.get_run(5965513)  # important to use a binary classification task, due to assertions
+        self._test_local_evaluations(run)
+
+
    def test_initialize_model_from_run(self):
        clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                               ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
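The kwargs argument of get_metric_fn is forwarded to the metric, so metrics that need extra parameters work the same way as plain accuracy. A small sketch, assuming the client is already configured for the production server as in test_online_run_metric_score (the run id comes from that test; f1_score is an extra illustration, not part of this diff):

import sklearn.metrics
import openml

# assumes openml.config.server already points at the production server,
# as done in test_online_run_metric_score above
run = openml.runs.get_run(5965513)  # binary classification run with an uploaded predictions file

# the dict passed as the second argument is forwarded to the metric as **kwargs
f1_per_fold = run.get_metric_fn(sklearn.metrics.f1_score, {'average': 'macro'})
print(f1_per_fold.mean())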
