Commit 91be1ac

Merge pull request #747 from openml/add_#737
Add #737
2 parents e6ee09d + 1065264 · commit 91be1ac

File tree

4 files changed: +131 −4 lines


doc/progress.rst

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ Changelog
 
 0.10.0
 ~~~~~~
+* ADD #737: Add list_evaluations_setups to return hyperparameters along with list of evaluations.
 * FIX #261: Test server is cleared of all files uploaded during unit testing.
 * FIX #447: All files created by unit tests no longer persist in local.
 * FIX #608: Fixing dataset_id referenced before assignment error in get_run function.

openml/evaluations/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,5 @@
 from .evaluation import OpenMLEvaluation
-from .functions import list_evaluations, list_evaluation_measures
+from .functions import list_evaluations, list_evaluation_measures, list_evaluations_setups
 
-__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures']
+__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures',
+           'list_evaluations_setups']
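
With this export in place, the new helper should be importable straight from the evaluations subpackage; a minimal sketch, assuming the package is installed from this branch:

    # the new function sits alongside the existing evaluation listings
    from openml.evaluations import list_evaluations, list_evaluations_setups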

openml/evaluations/functions.py

Lines changed: 91 additions & 2 deletions
@@ -1,12 +1,14 @@
 import json
 import xmltodict
 import pandas as pd
+import numpy as np
 from typing import Union, List, Optional, Dict
 import collections
 
 import openml.utils
 import openml._api_calls
 from ..evaluations import OpenMLEvaluation
+import openml
 
 
 def list_evaluations(
@@ -209,8 +211,8 @@ def __list_evaluations(api_call, output_format='object'):
                  'array_data': array_data}
 
     if output_format == 'dataframe':
-        evals = pd.DataFrame.from_dict(evals, orient='index')
-
+        rows = [value for key, value in evals.items()]
+        evals = pd.DataFrame.from_records(rows, columns=rows[0].keys())
     return evals
 
 
@@ -238,3 +240,90 @@ def list_evaluation_measures() -> List[str]:
                            '"oml:measure" as a list')
     qualities = qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure']
     return qualities
+
+
+def list_evaluations_setups(
+    function: str,
+    offset: Optional[int] = None,
+    size: Optional[int] = None,
+    id: Optional[List] = None,
+    task: Optional[List] = None,
+    setup: Optional[List] = None,
+    flow: Optional[List] = None,
+    uploader: Optional[List] = None,
+    tag: Optional[str] = None,
+    per_fold: Optional[bool] = None,
+    sort_order: Optional[str] = None,
+    output_format: str = 'dataframe'
+) -> Union[Dict, pd.DataFrame]:
+    """
+    List all run-evaluation pairs matching all of the given filters,
+    together with their hyperparameter settings.
+
+    Parameters
+    ----------
+    function : str
+        the evaluation function, e.g., predictive_accuracy
+    offset : int, optional
+        the number of runs to skip, starting from the first
+    size : int, optional
+        the maximum number of runs to show
+    id : list[int], optional
+        the list of evaluation IDs
+    task : list[int], optional
+        the list of task IDs
+    setup : list[int], optional
+        the list of setup IDs
+    flow : list[int], optional
+        the list of flow IDs
+    uploader : list[int], optional
+        the list of uploader IDs
+    tag : str, optional
+        filter evaluations based on the given tag
+    per_fold : bool, optional
+    sort_order : str, optional
+        order of sorting evaluations, ascending ("asc") or descending ("desc")
+    output_format : str, optional (default='dataframe')
+        The parameter decides the format of the output.
+        - If 'dict' the output is a dict of dicts
+        - If 'dataframe' the output is a pandas DataFrame
+
+    Returns
+    -------
+    dict or dataframe with hyperparameter settings as a list of tuples.
+    """
+    # List evaluations
+    evals = list_evaluations(function=function, offset=offset, size=size, id=id, task=task,
+                             setup=setup, flow=flow, uploader=uploader, tag=tag,
+                             per_fold=per_fold, sort_order=sort_order, output_format='dataframe')
+
+    # List setups
+    # Split the setups in evals into chunks of N setups, as list_setups does not support a large size
+    df = pd.DataFrame()
+    if len(evals) != 0:
+        N = 100
+        setup_chunks = np.split(evals['setup_id'].unique(),
+                                ((len(evals['setup_id'].unique()) - 1) // N) + 1)
+        setups = pd.DataFrame()
+        for setup in setup_chunks:
+            result = pd.DataFrame(openml.setups.list_setups(setup=setup, output_format='dataframe'))
+            result.drop('flow_id', axis=1, inplace=True)
+            # Concatenate the resulting setup chunks into a single dataframe
+            setups = pd.concat([setups, result], ignore_index=True)
+        parameters = []
+        # Convert the parameters of each setup into a list of (hyperparameter, value) tuples
+        for parameter_dict in setups['parameters']:
+            if parameter_dict is not None:
+                parameters.append([tuple([param['parameter_name'], param['value']])
+                                   for param in parameter_dict.values()])
+            else:
+                parameters.append([])
+        setups['parameters'] = parameters
+        # Merge setups with evaluations
+        df = pd.merge(evals, setups, on='setup_id', how='left')
+
+    if output_format == 'dataframe':
+        return df
+    else:
+        return df.to_dict(orient='index')
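
For reference, a minimal usage sketch of the new helper (the flow id is the one used by the tests below; the exact columns depend on what the server returns):

    import openml

    # Fetch predictive_accuracy evaluations together with each run's
    # hyperparameter settings; a pandas DataFrame is returned by default.
    results = openml.evaluations.list_evaluations_setups(
        function="predictive_accuracy",
        flow=[405],
        size=10,
        sort_order="desc",
    )

    # Each row carries the evaluation value plus a 'parameters' column holding
    # (hyperparameter, value) tuples for the run's setup.
    print(results[['run_id', 'value', 'parameters']].head())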

tests/test_evaluations/test_evaluation_functions.py

Lines changed: 36 additions & 0 deletions
@@ -6,6 +6,30 @@
 class TestEvaluationFunctions(TestBase):
     _multiprocess_can_split_ = True
 
+    def _check_list_evaluation_setups(self, size, **kwargs):
+        evals_setups = openml.evaluations.list_evaluations_setups("predictive_accuracy",
+                                                                   **kwargs, size=size,
+                                                                   sort_order='desc',
+                                                                   output_format='dataframe')
+        evals = openml.evaluations.list_evaluations("predictive_accuracy",
+                                                    **kwargs, size=size,
+                                                    sort_order='desc',
+                                                    output_format='dataframe')
+
+        # Check that the list is non-empty
+        self.assertGreater(len(evals_setups), 0)
+        # Check that the output is sorted in the requested (descending) order
+        self.assertSequenceEqual(sorted(evals_setups['value'].tolist(), reverse=True),
+                                 evals_setups['value'].tolist())
+
+        # Check that the output and order of list_evaluations is preserved
+        self.assertSequenceEqual(evals_setups['run_id'].tolist(), evals['run_id'].tolist())
+        # Check that the hyper-parameter column matches the parameter settings of each run
+        for index, row in evals_setups.iterrows():
+            params = openml.runs.get_run(row['run_id']).parameter_settings
+            hyper_params = [tuple([param['oml:name'], param['oml:value']]) for param in params]
+            self.assertTrue(sorted(row['parameters']) == sorted(hyper_params))
+
     def test_evaluation_list_filter_task(self):
         openml.config.server = self.production_server
 
@@ -142,3 +166,15 @@ def test_list_evaluation_measures(self):
         measures = openml.evaluations.list_evaluation_measures()
         self.assertEqual(isinstance(measures, list), True)
         self.assertEqual(all([isinstance(s, str) for s in measures]), True)
+
+    def test_list_evaluations_setups_filter_flow(self):
+        openml.config.server = self.production_server
+        flow_id = [405]
+        size = 100
+        self._check_list_evaluation_setups(size, flow=flow_id)
+
+    def test_list_evaluations_setups_filter_task(self):
+        openml.config.server = self.production_server
+        task_id = [6]
+        size = 100
+        self._check_list_evaluation_setups(size, task=task_id)
