openml
diff --git a/‎examples/OpenMLDemo.ipynb‎
Lines changed: 0 additions & 703 deletions b/‎examples/OpenMLDemo.ipynb‎
Lines changed: 0 additions & 703 deletions
diff --git a/‎examples/OpenML_Tutorial.ipynb‎
Lines changed: 1344 additions & 0 deletions b/‎examples/OpenML_Tutorial.ipynb‎
Lines changed: 1344 additions & 0 deletions
diff --git a/‎examples/PyOpenML.ipynb‎
Lines changed: 0 additions & 862 deletions b/‎examples/PyOpenML.ipynb‎
Lines changed: 0 additions & 862 deletions
diff --git a/‎openml/evaluations/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎openml/evaluations/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎openml/evaluations/evaluation.py‎
Lines changed: 40 additions & 0 deletions b/‎openml/evaluations/evaluation.py‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎openml/evaluations/functions.py‎
Lines changed: 93 additions & 0 deletions b/‎openml/evaluations/functions.py‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎openml/runs/functions.py‎
Lines changed: 107 additions & 67 deletions b/‎openml/runs/functions.py‎
Lines changed: 107 additions & 67 deletions
@@ -0,0 +1,2 @@
+from .evaluation import OpenMLEvaluation
+from .functions import list_evaluations
@@ -0,0 +1,40 @@
+
+class OpenMLEvaluation(object):
+    """Meta-information about a single run / evaluation combination.
+
+    Mirrors one record of the evaluation/list API function.
+
+    Parameters
+    ----------
+    run_id : int
+    task_id : int
+    setup_id : int
+    flow_id : int
+    flow_name : str
+    data_id : int
+    data_name : str
+        the name of the dataset
+    function : str
+        the evaluation function of this item (e.g., accuracy)
+    upload_time : str
+        the time of evaluation
+    value : float
+        the value of this evaluation
+    array_data : str, optional
+        list of information per class (e.g., in case of precision, auroc, recall)
+    """
+    def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
+                 data_id, data_name, function, upload_time, value,
+                 array_data=None):
+        # identifiers of the entities this evaluation refers to
+        self.run_id, self.task_id = run_id, task_id
+        self.setup_id, self.flow_id = setup_id, flow_id
+        self.data_id = data_id
+        # names and the evaluation result itself
+        self.flow_name = flow_name
+        self.data_name = data_name
+        self.function = function
+        self.upload_time = upload_time
+        self.value = value
+        self.array_data = array_data
+
@@ -0,0 +1,93 @@
+import xmltodict
+
+from .._api_calls import _perform_api_call
+from ..evaluations import OpenMLEvaluation
+
+def list_evaluations(function, offset=None, size=None, id=None, task=None, setup=None,
+              flow=None, uploader=None, tag=None):
+    """List all run-evaluation pairs matching all of the given filters.
+
+    Performs the API call `evaluation/list/function/{function}/{filters}`.
+
+    Parameters
+    ----------
+    function : str
+        the evaluation function. e.g., predictive_accuracy
+    offset : int, optional
+        the number of runs to skip, starting from the first
+    size : int, optional
+        the maximum number of runs to show
+    id : list, optional
+        run ids to filter on
+    task : list, optional
+        task ids to filter on
+    setup : list, optional
+        setup ids to filter on
+    flow : list, optional
+        flow ids to filter on
+    uploader : list, optional
+        uploader ids to filter on
+    tag : str, optional
+        tag to filter on
+
+    Returns
+    -------
+    dict
+        Found evaluations, keyed by run id.
+    """
+    api_call = "evaluation/list/function/%s" %function
+
+    # paging filters each take a single integer
+    paging = (("/offset/%d", offset), ("/limit/%d", size))
+    for template, number in paging:
+        if number is not None:
+            api_call += template % int(number)
+
+    # id filters each take a collection of integers, sent comma separated
+    id_filters = (("/run/%s", id), ("/task/%s", task), ("/setup/%s", setup),
+                  ("/flow/%s", flow), ("/uploader/%s", uploader))
+    for template, numbers in id_filters:
+        if numbers is not None:
+            api_call += template % ','.join(str(int(number)) for number in numbers)
+
+    # the tag is appended without conversion
+    if tag is not None:
+        api_call += "/tag/%s" % tag
+
+    return _list_evaluations(api_call)
+
+
+def _list_evaluations(api_call):
+    """Perform an evaluation list API call and parse the returned XML.
+
+    Returns a dict mapping run id (int) to `OpenMLEvaluation`. Raises
+    ValueError if the XML lacks "oml:evaluations" and TypeError if its
+    "oml:evaluation" entry is neither a list nor a dict.
+    """
+    xml_string = _perform_api_call(api_call)
+    evals_dict = xmltodict.parse(xml_string)
+    # Minimalistic check if the XML is useful
+    if 'oml:evaluations' not in evals_dict:
+        raise ValueError('Error in return XML, does not contain "oml:evaluations": %s'
+                         % str(evals_dict))
+
+    # the entry may be a single dict instead of a list; normalise to a list
+    evals_list = evals_dict['oml:evaluations']['oml:evaluation']
+    if isinstance(evals_list, dict):
+        evals_list = [evals_list]
+    elif not isinstance(evals_list, list):
+        raise TypeError('"oml:evaluation" should be a list or dict, got %s'
+                        % type(evals_list))
+
+    evals = dict()
+    for eval_ in evals_list:
+        run_id = int(eval_['oml:run_id'])
+        # 'oml:array_data' is optional: .get() stores it, or None if absent
+        evals[run_id] = OpenMLEvaluation(run_id, int(eval_['oml:task_id']),
+                                         int(eval_['oml:setup_id']), int(eval_['oml:flow_id']),
+                                         eval_['oml:flow_name'], int(eval_['oml:data_id']),
+                                         eval_['oml:data_name'], eval_['oml:function'],
+                                         eval_['oml:upload_time'], float(eval_['oml:value']),
+                                         eval_.get('oml:array_data'))
+    return evals
+
@@ -106,7 +106,13 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
                     dataset_id=dataset.dataset_id, model=flow.model, tags=tags)
     run.parameter_settings = OpenMLRun._parse_parameters(flow)
 
-    run.data_content, run.trace_content, run.trace_attributes, run.detailed_evaluations = res
+    run.data_content, run.trace_content, run.trace_attributes, fold_evaluations, sample_evaluations = res
+    # now we need to attach the detailed evaluations
+    if task.task_type_id == 3:
+        run.sample_evaluations = sample_evaluations
+    else:
+        run.fold_evaluations = fold_evaluations
+
 
     config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
 
@@ -299,15 +305,20 @@ def _seed_current_object(current_value):
     return model
 
 
-def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
-                       predicted_probabilities, class_labels, model_classes_mapping):
+def _prediction_to_row(rep_no, fold_no, sample_no, row_id, correct_label,
+                       predicted_label, predicted_probabilities, class_labels,
+                       model_classes_mapping):
     """Util function that turns probability estimates of a classifier for a given
         instance into the right arff format to upload to openml.
 
         Parameters
         ----------
         rep_no : int
+            The repeat of the experiment (0-based; in case of 1 time CV, always 0)
         fold_no : int
+            The fold nr of the experiment (0-based; in case of holdout, always 0)
+        sample_no : int
+            In case of learning curves, the index of the subsample (0-based; in case of no learning curve, always 0)
         row_id : int
             row id in the initial dataset
         correct_label : str
@@ -328,11 +339,12 @@ def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
         """
     if not isinstance(rep_no, (int, np.integer)): raise ValueError('rep_no should be int')
     if not isinstance(fold_no, (int, np.integer)): raise ValueError('fold_no should be int')
+    if not isinstance(sample_no, (int, np.integer)): raise ValueError('sample_no should be int')
     if not isinstance(row_id, (int, np.integer)): raise ValueError('row_id should be int')
     if not len(predicted_probabilities) == len(model_classes_mapping):
         raise ValueError('len(predicted_probabilities) != len(class_labels)')
 
-    arff_line = [rep_no, fold_no, row_id]
+    arff_line = [rep_no, fold_no, sample_no, row_id]
     for class_label_idx in range(len(class_labels)):
         if class_label_idx in model_classes_mapping:
             index = np.where(model_classes_mapping == class_label_idx)[0][0]  # TODO: WHY IS THIS 2D???
@@ -349,82 +361,100 @@ def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
     arff_tracecontent = []
-    user_defined_measures = defaultdict(lambda: defaultdict(dict))
+    # stores fold-based evaluation measures. In case of a sample based task,
+    # this information is overwritten multiple times, but due to the ordering
+    # of the loops, it eventually contains the information based on the full
+    # dataset size
+    user_defined_measures_fold = defaultdict(lambda: defaultdict(dict))
+    # stores sample-based evaluation measures (sublevel of fold-based)
+    # will also be filled on a non sample-based task, but the information
+    # is the same as the fold-based measures, and disregarded in that case
+    user_defined_measures_sample = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
 
-    rep_no = 0
     # sys.version_info returns a tuple, the following line compares the entry of tuples
     # https://docs.python.org/3.6/reference/expressions.html#value-comparisons
     can_measure_runtime = sys.version_info[:2] >= (3, 3) and _check_n_jobs(model)
     # TODO use different iterator to only provide a single iterator (less
     # methods, less maintenance, less confusion)
-    for rep in task.iterate_repeats():
-        fold_no = 0
-        for fold in rep:
-            model_fold = sklearn.base.clone(model, safe=True)
-            train_indices, test_indices = fold
-            trainX = X[train_indices]
-            trainY = Y[train_indices]
-            testX = X[test_indices]
-            testY = Y[test_indices]
-
-            try:
-                # for measuring runtime. Only available since Python 3.3
-                if can_measure_runtime:
-                    modelfit_starttime = time.process_time()
-                model_fold.fit(trainX, trainY)
-
-                if can_measure_runtime:
-                    modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
-                    user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
-            except AttributeError as e:
-                # typically happens when training a regressor on classification task
-                raise PyOpenMLError(str(e))
-            
-            # extract trace, if applicable
-            if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
-                arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
-
-            # search for model classes_ (might differ depending on modeltype)
-            # first, pipelines are a special case (these don't have a classes_
-            # object, but rather borrows it from the last step. We do this manually,
-            # because of the BaseSearch check)
-            if isinstance(model_fold, sklearn.pipeline.Pipeline):
-                used_estimator = model_fold.steps[-1][-1]
-            else:
-                used_estimator = model_fold
+    num_reps, num_folds, num_samples = task.get_split_dimensions()
+
+    for rep_no in range(num_reps):
+        for fold_no in range(num_folds):
+            for sample_no in range(num_samples):
+                model_fold = sklearn.base.clone(model, safe=True)
+                train_indices, test_indices = task.get_train_test_split_indices(repeat=rep_no,
+                                                                                fold=fold_no,
+                                                                                sample=sample_no)
+                trainX = X[train_indices]
+                trainY = Y[train_indices]
+                testX = X[test_indices]
+                testY = Y[test_indices]
+
+                try:
+                    # for measuring runtime. Only available since Python 3.3
+                    if can_measure_runtime:
+                        modelfit_starttime = time.process_time()
+                    model_fold.fit(trainX, trainY)
+
+                    if can_measure_runtime:
+                        modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
+                        user_defined_measures_sample['usercpu_time_millis_training'][rep_no][fold_no][sample_no] = modelfit_duration
+                        user_defined_measures_fold['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
+                except AttributeError as e:
+                    # typically happens when training a regressor on classification task
+                    raise PyOpenMLError(str(e))
+
+                # extract trace, if applicable
+                if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
+                    arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
+
+                # search for model classes_ (might differ depending on modeltype)
+                # first, pipelines are a special case (these don't have a classes_
+                # object, but rather borrows it from the last step. We do this manually,
+                # because of the BaseSearch check)
+                if isinstance(model_fold, sklearn.pipeline.Pipeline):
+                    used_estimator = model_fold.steps[-1][-1]
+                else:
+                    used_estimator = model_fold
 
-            if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
-                model_classes = used_estimator.best_estimator_.classes_
-            else:
-                model_classes = used_estimator.classes_
+                if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
+                    model_classes = used_estimator.best_estimator_.classes_
+                else:
+                    model_classes = used_estimator.classes_
 
-            if can_measure_runtime:
-                modelpredict_starttime = time.process_time()
-            
-            ProbaY = model_fold.predict_proba(testX)
-            PredY = model_fold.predict(testX)
-            if can_measure_runtime:
-                modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
-                user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
-                user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                if can_measure_runtime:
+                    modelpredict_starttime = time.process_time()
 
-            if ProbaY.shape[1] != len(class_labels):
-                warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
+                ProbaY = model_fold.predict_proba(testX)
+                PredY = model_fold.predict(testX)
+                if can_measure_runtime:
+                    modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
+                    user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
+                    user_defined_measures_fold['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis_testing'][rep_no][fold_no][sample_no] = modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis'][rep_no][fold_no][sample_no] = modelfit_duration + modelpredict_duration
 
-            for i in range(0, len(test_indices)):
-                arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
-                arff_datacontent.append(arff_line)
+                if ProbaY.shape[1] != len(class_labels):
+                    warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
 
-            fold_no = fold_no + 1
-        rep_no = rep_no + 1
+                for i in range(0, len(test_indices)):
+                    arff_line = _prediction_to_row(rep_no, fold_no, sample_no,
+                                                   test_indices[i], class_labels[testY[i]],
+                                                   PredY[i], ProbaY[i], class_labels, model_classes)
+                    arff_datacontent.append(arff_line)
 
     if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
         # arff_tracecontent is already set
         arff_trace_attributes = _extract_arfftrace_attributes(model_fold)
     else:
         arff_tracecontent = None
         arff_trace_attributes = None
-    return arff_datacontent, arff_tracecontent, arff_trace_attributes, user_defined_measures
+
+    return arff_datacontent, \
+           arff_tracecontent, \
+           arff_trace_attributes, \
+           user_defined_measures_fold, \
+           user_defined_measures_sample
 
 
 def _extract_arfftrace(model, rep_no, fold_no):
@@ -571,7 +601,8 @@ def _create_run_from_xml(xml):
 
     files = dict()
     evaluations = dict()
-    detailed_evaluations = defaultdict(lambda: defaultdict(dict))
+    fold_evaluations = defaultdict(lambda: defaultdict(dict))
+    sample_evaluations = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
     if 'oml:output_data' not in run:
         raise ValueError('Run does not contain output_data (OpenML server error?)')
     else:
@@ -598,11 +629,18 @@ def _create_run_from_xml(xml):
                 else:
                     raise ValueError('Could not find keys "value" or "array_data" '
                                      'in %s' % str(evaluation_dict.keys()))
-
-                if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+                if '@repeat' in evaluation_dict and '@fold' in evaluation_dict and '@sample' in evaluation_dict:
+                    repeat = int(evaluation_dict['@repeat'])
+                    fold = int(evaluation_dict['@fold'])
+                    sample = int(evaluation_dict['@sample'])
+                    repeat_dict = sample_evaluations[key]
+                    fold_dict = repeat_dict[repeat]
+                    sample_dict = fold_dict[fold]
+                    sample_dict[sample] = value
+                elif '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
                     repeat = int(evaluation_dict['@repeat'])
                     fold = int(evaluation_dict['@fold'])
-                    repeat_dict = detailed_evaluations[key]
+                    repeat_dict = fold_evaluations[key]
                     fold_dict = repeat_dict[repeat]
                     fold_dict[fold] = value
                 else:
@@ -629,7 +667,9 @@ def _create_run_from_xml(xml):
                      parameter_settings=parameters,
                      dataset_id=dataset_id, output_files=files,
                      evaluations=evaluations,
-                     detailed_evaluations=detailed_evaluations, tags=tags)
+                     fold_evaluations=fold_evaluations,
+                     sample_evaluations=sample_evaluations,
+                     tags=tags)
 
 def _create_trace_from_description(xml):
     result_dict = xmltodict.parse(xml)['oml:trace']
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .evaluation import OpenMLEvaluation`
	`2`	`+from .functions import list_evaluations`