Skip to content

Commit 3ed5c7a

Browse files
committed
incorporated requests of @mfeurer
1 parent 59433d8 commit 3ed5c7a

File tree

3 files changed

+36
-19
lines changed

3 files changed

+36
-19
lines changed

openml/runs/run.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,12 @@ def _generate_trace_arff_dict(self):
107107

108108
return arff_dict
109109

110-
def get_metric_score(self, sklearn_fn, kwargs={}):
110+
def get_metric_fn(self, sklearn_fn, kwargs={}):
111111
'''Calculates metric scores based on predicted values. Assumes the
112-
run has been executed locally (and contans run_data). Furthermore,
113-
it assumes that the 'correct' field has been set (which is
114-
automatically the case for local runs)
112+
run has been executed locally (and contains run_data). Furthermore,
113+
it assumes that the 'correct' attribute is specified in the arff
114+
(which is an optional field, but always the case for openml-python
115+
runs)
115116
116117
Parameters
117118
-------
@@ -133,8 +134,15 @@ def get_metric_score(self, sklearn_fn, kwargs={}):
133134
else:
134135
raise ValueError('Run should have been locally executed.')
135136

137+
if 'correct' not in predictions_arff['attributes']:
138+
raise ValueError('Attribute "correct" should be set')
139+
if 'predict' not in predictions_arff['attributes']:
140+
raise ValueError('Attribute "predict" should be set')
141+
136142
def _attribute_list_to_dict(attribute_list):
137-
# convenience function
143+
# convenience function: Creates a mapping to map from the name of attributes
144+
# present in the arff prediction file to their index. This is necessary
145+
# because the number of classes can be different for different tasks.
138146
res = dict()
139147
for idx in range(len(attribute_list)):
140148
res[attribute_list[idx][0]] = idx

tests/test_flows/test_flow_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_list_flows(self):
2525
# data from the internet...
2626
flows = openml.flows.list_flows()
2727
# 3000 as the number of flows on openml.org
28-
self.assertGreaterEqual(len(flows), 3000)
28+
self.assertGreaterEqual(len(flows), 1500)
2929
for fid in flows:
3030
self._check_flow(flows[fid])
3131

tests/test_runs/test_run_functions.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -412,26 +412,16 @@ def test_initialize_cv_from_run(self):
412412

413413
self.assertEquals(modelS.cv.random_state, 62501)
414414
self.assertEqual(modelR.cv.random_state, 62501)
415-
416-
def test_get_run_metric_score(self):
417-
418-
# construct sci-kit learn classifier
419-
clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), ('estimator', RandomForestClassifier())])
420-
421-
422-
# download task
423-
task = openml.tasks.get_task(7)
424-
425-
# invoke OpenML run
426-
run = openml.runs.run_model_on_task(task, clf)
415+
416+
def _test_local_evaluations(self, run):
427417

428418
# compare with the scores in user defined measures
429419
accuracy_scores_provided = []
430420
for rep in run.fold_evaluations['predictive_accuracy'].keys():
431421
for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
432422
accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
433423
accuracy_scores = run.get_metric_score(sklearn.metrics.accuracy_score)
434-
self.assertEquals(sum(accuracy_scores_provided), sum(accuracy_scores))
424+
np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores)
435425

436426
# also check if we can obtain some other scores: # TODO: how to do AUC?
437427
tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
@@ -447,6 +437,25 @@ def test_get_run_metric_score(self):
447437
self.assertGreaterEqual(alt_scores[idx], 0)
448438
self.assertLessEqual(alt_scores[idx], 1)
449439

440+
def test_local_run_metric_score(self):
441+
442+
# construct sci-kit learn classifier
443+
clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), ('estimator', RandomForestClassifier())])
444+
445+
# download task
446+
task = openml.tasks.get_task(7)
447+
448+
# invoke OpenML run
449+
run = openml.runs.run_model_on_task(task, clf)
450+
451+
self._test_local_evaluations(run)
452+
453+
def test_online_run_metric_score(self):
454+
openml.config.server = self.production_server
455+
run = openml.runs.get_run(5572567)
456+
self._test_local_evaluations(run)
457+
458+
450459
def test_initialize_model_from_run(self):
451460
clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
452461
('VarianceThreshold', VarianceThreshold(threshold=0.05)),

0 commit comments

Comments (0)