
Commit 2e69fe0

janvanrijn authored and mfeurer committed
Per fold evals (#613)
* added ability to obtain per fold evaluation measures
* added json loads
* updated unit test
1 parent 4a7db0e commit 2e69fe0


5 files changed: 82 additions, 20 deletions

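In short, list_evaluations gains a per_fold flag and OpenMLEvaluation gains a values attribute holding the per repeat/fold measurements. A minimal usage sketch, assuming the production server used by the new unit test; the task ID and size below are illustrative:

    import openml

    # Production server, as set in the unit tests (assumption: your default config may differ).
    openml.config.server = 'https://www.openml.org/api/v1/xml'

    # Default behaviour: aggregated results, .value is set and .values is None.
    aggregated = openml.evaluations.list_evaluations(
        'predictive_accuracy', task=[6], size=100)

    # Per-fold behaviour: .values holds one float per repeat/fold, .value is None.
    per_fold = openml.evaluations.list_evaluations(
        'predictive_accuracy', task=[6], size=100, per_fold=True)
    for run_id, evaluation in per_fold.items():
        print(run_id, evaluation.values)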

openml/evaluations/evaluation.py

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,6 @@
 
 class OpenMLEvaluation(object):
-    '''
+    """
     Contains all meta-information about a run / evaluation combination,
     according to the evaluation/list function
 
@@ -26,11 +26,13 @@ class OpenMLEvaluation(object):
         the time of evaluation
     value : float
         the value of this evaluation
+    values : List[float]
+        the values per repeat and fold (if requested)
     array_data : str
         list of information per class (e.g., in case of precision, auroc, recall)
-    '''
+    """
     def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
-                 data_id, data_name, function, upload_time, value,
+                 data_id, data_name, function, upload_time, value, values,
                  array_data=None):
         self.run_id = run_id
         self.task_id = task_id
@@ -42,4 +44,5 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
         self.function = function
         self.upload_time = upload_time
         self.value = value
+        self.values = values
         self.array_data = array_data
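For illustration, the widened constructor can be exercised directly; a minimal sketch with made-up IDs and scores (real objects are normally built from the server response in openml/evaluations/functions.py):

    from openml.evaluations import OpenMLEvaluation

    # All identifiers and scores below are invented for illustration only.
    evaluation = OpenMLEvaluation(
        run_id=1, task_id=6, setup_id=2, flow_id=100, flow_name='example.flow',
        data_id=6, data_name='letter', function='predictive_accuracy',
        upload_time='2018-01-01 12:00:00', value=None,
        values=[0.93, 0.95, 0.91], array_data=None)

    print(evaluation.value)   # None when only per-fold results were requested
    print(evaluation.values)  # [0.93, 0.95, 0.91]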

openml/evaluations/functions.py

Lines changed: 29 additions & 12 deletions
@@ -1,13 +1,14 @@
+import json
 import xmltodict
 
-from openml.exceptions import OpenMLServerNoResult
 import openml.utils
 import openml._api_calls
 from ..evaluations import OpenMLEvaluation
 
 
 def list_evaluations(function, offset=None, size=None, id=None, task=None,
-                     setup=None, flow=None, uploader=None, tag=None):
+                     setup=None, flow=None, uploader=None, tag=None,
+                     per_fold=None):
     """
     List all run-evaluation pairs matching all of the given filters.
     (Supports large amount of results)
@@ -33,13 +34,19 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
 
     tag : str, optional
 
+    per_fold : bool, optional
+
     Returns
     -------
     dict
     """
+    if per_fold is not None:
+        per_fold = str(per_fold).lower()
 
-    return openml.utils._list_all(_list_evaluations, function, offset=offset, size=size,
-                                  id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)
+    return openml.utils._list_all(_list_evaluations, function, offset=offset,
+                                  size=size, id=id, task=task, setup=setup,
+                                  flow=flow, uploader=uploader, tag=tag,
+                                  per_fold=per_fold)
 
 
 def _list_evaluations(function, id=None, task=None,
@@ -97,24 +104,34 @@ def __list_evaluations(api_call):
     evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
     # Minimalistic check if the XML is useful
     if 'oml:evaluations' not in evals_dict:
-        raise ValueError('Error in return XML, does not contain "oml:evaluations": %s'
-                         % str(evals_dict))
+        raise ValueError('Error in return XML, does not contain '
+                         '"oml:evaluations": %s' % str(evals_dict))
 
     assert type(evals_dict['oml:evaluations']['oml:evaluation']) == list, \
         type(evals_dict['oml:evaluations'])
 
     evals = dict()
     for eval_ in evals_dict['oml:evaluations']['oml:evaluation']:
         run_id = int(eval_['oml:run_id'])
+        value = None
+        values = None
         array_data = None
+        if 'oml:value' in eval_:
+            value = float(eval_['oml:value'])
+        if 'oml:values' in eval_:
+            values = json.loads(eval_['oml:values'])
         if 'oml:array_data' in eval_:
             array_data = eval_['oml:array_data']
 
-        evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']), int(eval_['oml:task_id']),
-                                         int(eval_['oml:setup_id']), int(eval_['oml:flow_id']),
-                                         eval_['oml:flow_name'], eval_['oml:data_id'],
-                                         eval_['oml:data_name'], eval_['oml:function'],
-                                         eval_['oml:upload_time'], float(eval_['oml:value']),
-                                         array_data)
+        evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
+                                         int(eval_['oml:task_id']),
+                                         int(eval_['oml:setup_id']),
+                                         int(eval_['oml:flow_id']),
+                                         eval_['oml:flow_name'],
+                                         eval_['oml:data_id'],
+                                         eval_['oml:data_name'],
+                                         eval_['oml:function'],
+                                         eval_['oml:upload_time'],
+                                         value, values, array_data)
 
     return evals
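Two small mechanics in this change are worth noting: the boolean per_fold argument is lowercased into the 'true'/'false' string passed along with the listing call, and the per-fold measurements arrive as a JSON-encoded array under oml:values, which json.loads turns into a Python list. A standalone sketch of both steps (the example string is an assumed server payload):

    import json

    per_fold = True
    # Filter value as forwarded to openml.utils._list_all / the REST call.
    print(str(per_fold).lower())          # 'true'

    # Assumed example of the text found under 'oml:values' in the response XML.
    values_field = '[0.93, 0.95, 0.91]'
    print(json.loads(values_field))       # [0.93, 0.95, 0.91], one entry per repeat/fold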

openml/runs/functions.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ def _publish_flow_if_necessary(flow):
     except OpenMLServerException as e:
         if e.message == "flow already exists":
             # TODO: JvR: the following lines of code can be replaced by
-            # a pass (after changing the unit test) as run_flow_on_task does
+            # a pass (after changing the unit tests) as run_flow_on_task does
             # not longer rely on it
             flow_id = openml.flows.flow_exists(flow.name,
                                                flow.external_version)

tests/test_evaluations/test_evaluation_functions.py

Lines changed: 45 additions & 3 deletions
@@ -2,6 +2,7 @@
 import openml.evaluations
 from openml.testing import TestBase
 
+
 class TestEvaluationFunctions(TestBase):
     _multiprocess_can_split_ = True
 
@@ -15,6 +16,10 @@ def test_evaluation_list_filter_task(self):
         self.assertGreater(len(evaluations), 100)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].task_id, task_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_uploader_ID_16(self):
         openml.config.server = self.production_server
@@ -23,7 +28,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", uploader=[uploader_id])
 
-        self.assertGreater(len(evaluations), 100)
+        self.assertGreater(len(evaluations), 50)
 
     def test_evaluation_list_filter_uploader_ID_10(self):
         openml.config.server = self.production_server
@@ -32,9 +37,13 @@ def test_evaluation_list_filter_uploader_ID_10(self):
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", setup=[setup_id])
 
-        self.assertGreater(len(evaluations), 100)
+        self.assertGreater(len(evaluations), 50)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].setup_id, setup_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_flow(self):
         openml.config.server = self.production_server
@@ -46,17 +55,25 @@ def test_evaluation_list_filter_flow(self):
         self.assertGreater(len(evaluations), 2)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].flow_id, flow_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_run(self):
         openml.config.server = self.production_server
 
-        run_id = 1
+        run_id = 12
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", id=[run_id])
 
         self.assertEquals(len(evaluations), 1)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].run_id, run_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_limit(self):
         openml.config.server = self.production_server
@@ -70,3 +87,28 @@ def test_list_evaluations_empty(self):
             raise ValueError('UnitTest Outdated, got somehow results')
 
         self.assertIsInstance(evaluations, dict)
+
+    def test_evaluation_list_per_fold(self):
+        openml.config.server = self.production_server
+        size = 1000
+        task_ids = [6]
+        uploader_ids = [1]
+        flow_ids = [6969]
+
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=size, offset=0, task=task_ids,
+            flow=flow_ids, uploader=uploader_ids, per_fold=True)
+
+        self.assertEquals(len(evaluations), size)
+        for run_id in evaluations.keys():
+            self.assertIsNone(evaluations[run_id].value)
+            self.assertIsNotNone(evaluations[run_id].values)
+            # potentially we could also test array values, but these might be
+            # added in the future
+
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=size, offset=0, task=task_ids,
+            flow=flow_ids, uploader=uploader_ids, per_fold=False)
+        for run_id in evaluations.keys():
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)

tests/test_runs/test_run_functions.py

Lines changed: 1 addition & 1 deletion
@@ -999,7 +999,7 @@ def _check_run(self, run):
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
         openml.config.server = self.production_server
-        runs = openml.runs.list_runs(id=[2])
+        runs = openml.runs.list_runs(id=[2], show_errors=True)
        self.assertEqual(len(runs), 1)
        for rid in runs:
            self._check_run(runs[rid])
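The test change suggests that run 2 on the production server is now only listed when errored runs are included; a minimal sketch of the adjusted call (the server URL is an assumption):

    import openml

    openml.config.server = 'https://www.openml.org/api/v1/xml'  # production server (assumption)

    # show_errors=True also returns runs that are flagged with an error state.
    runs = openml.runs.list_runs(id=[2], show_errors=True)
    print(len(runs))  # expected to be 1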
