
Commit bef46b7

Merge pull request #265 from openml/learningcurves
Support for Learning Curve Tasks
2 parents de66af0 + 993dbea · commit bef46b7

File tree

7 files changed: +290 additions, -167 deletions
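Note: for orientation, a minimal sketch of what this merge enables — running a scikit-learn model on a learning curve task (task_type_id == 3) and reading back per-sample measures. The task id is hypothetical and the sklearn_to_flow helper is an assumption about the surrounding API; only run_flow_on_task and the sample_evaluations attribute come from this diff:

    import openml
    from sklearn.tree import DecisionTreeClassifier

    task = openml.tasks.get_task(1797)  # hypothetical learning curve task id
    flow = openml.flows.sklearn_to_flow(DecisionTreeClassifier())  # assumed helper
    run = openml.runs.run_flow_on_task(task, flow)

    # per this change: sample_evaluations[measure][repeat][fold][sample]
    # (runtime measures require Python >= 3.3, see functions.py below)
    print(run.sample_evaluations['usercpu_time_millis_training'][0][0][0])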

openml/runs/functions.py

Lines changed: 107 additions & 67 deletions
@@ -106,7 +106,13 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
                      dataset_id=dataset.dataset_id, model=flow.model, tags=tags)
     run.parameter_settings = OpenMLRun._parse_parameters(flow)
 
-    run.data_content, run.trace_content, run.trace_attributes, run.detailed_evaluations = res
+    run.data_content, run.trace_content, run.trace_attributes, fold_evaluations, sample_evaluations = res
+    # now we need to attach the detailed evaluations
+    if task.task_type_id == 3:
+        run.sample_evaluations = sample_evaluations
+    else:
+        run.fold_evaluations = fold_evaluations
+
 
     config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
 
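Note: the branch above decides where the detailed measures live. A minimal sketch (helper name invented, not part of this commit) of how downstream code might walk both nestings as defined in this diff:

    def print_user_measures(run, is_learning_curve_task):
        # is_learning_curve_task would be task.task_type_id == 3
        if is_learning_curve_task:
            # sample_evaluations[measure][repeat][fold][sample]
            for measure, reps in run.sample_evaluations.items():
                for rep_no, folds in reps.items():
                    for fold_no, samples in folds.items():
                        for sample_no, value in samples.items():
                            print(measure, rep_no, fold_no, sample_no, value)
        else:
            # fold_evaluations[measure][repeat][fold]
            for measure, reps in run.fold_evaluations.items():
                for rep_no, folds in reps.items():
                    for fold_no, value in folds.items():
                        print(measure, rep_no, fold_no, value)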
@@ -299,15 +305,20 @@ def _seed_current_object(current_value):
     return model
 
 
-def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
-                       predicted_probabilities, class_labels, model_classes_mapping):
+def _prediction_to_row(rep_no, fold_no, sample_no, row_id, correct_label,
+                       predicted_label, predicted_probabilities, class_labels,
+                       model_classes_mapping):
     """Util function that turns probability estimates of a classifier for a given
     instance into the right arff format to upload to openml.
 
     Parameters
     ----------
     rep_no : int
+        The repeat of the experiment (0-based; in case of 1 time CV, always 0)
     fold_no : int
+        The fold nr of the experiment (0-based; in case of holdout, always 0)
+    sample_no : int
+        In case of learning curves, the index of the subsample (0-based; in case of no learning curve, always 0)
     row_id : int
         row id in the initial dataset
     correct_label : str
@@ -328,11 +339,12 @@ def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
     """
     if not isinstance(rep_no, (int, np.integer)): raise ValueError('rep_no should be int')
     if not isinstance(fold_no, (int, np.integer)): raise ValueError('fold_no should be int')
+    if not isinstance(sample_no, (int, np.integer)): raise ValueError('sample_no should be int')
     if not isinstance(row_id, (int, np.integer)): raise ValueError('row_id should be int')
     if not len(predicted_probabilities) == len(model_classes_mapping):
         raise ValueError('len(predicted_probabilities) != len(class_labels)')
 
-    arff_line = [rep_no, fold_no, row_id]
+    arff_line = [rep_no, fold_no, sample_no, row_id]
     for class_label_idx in range(len(class_labels)):
         if class_label_idx in model_classes_mapping:
             index = np.where(model_classes_mapping == class_label_idx)[0][0]  # TODO: WHY IS THIS 2D???
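Note: a toy illustration (labels and values invented) of the widened row layout; the trailing fields follow the ARFF attribute list declared in run.py below:

    import numpy as np

    class_labels = ['negative', 'positive']
    model_classes = np.array([0, 1])  # label indices, as the estimator's classes_

    row = _prediction_to_row(rep_no=0, fold_no=2, sample_no=1, row_id=57,
                             correct_label='positive', predicted_label='positive',
                             predicted_probabilities=[0.2, 0.8],
                             class_labels=class_labels,
                             model_classes_mapping=model_classes)
    # expected, given the attribute order repeat/fold/sample/row_id/confidences/...:
    # [0, 2, 1, 57, 0.2, 0.8, 'positive', 'positive']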
@@ -349,82 +361,100 @@ def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
     arff_tracecontent = []
-    user_defined_measures = defaultdict(lambda: defaultdict(dict))
+    # stores fold-based evaluation measures. In case of a sample based task,
+    # this information is multiple times overwritten, but due to the ordering
+    # of the loops, eventually it contains the information based on the full
+    # dataset size
+    user_defined_measures_fold = defaultdict(lambda: defaultdict(dict))
+    # stores sample-based evaluation measures (sublevel of fold-based)
+    # will also be filled on a non sample-based task, but the information
+    # is the same as the fold-based measures, and disregarded in that case
+    user_defined_measures_sample = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
 
-    rep_no = 0
     # sys.version_info returns a tuple, the following line compares the entry of tuples
     # https://docs.python.org/3.6/reference/expressions.html#value-comparisons
     can_measure_runtime = sys.version_info[:2] >= (3, 3) and _check_n_jobs(model)
     # TODO use different iterator to only provide a single iterator (less
     # methods, less maintenance, less confusion)
-    for rep in task.iterate_repeats():
-        fold_no = 0
-        for fold in rep:
-            model_fold = sklearn.base.clone(model, safe=True)
-            train_indices, test_indices = fold
-            trainX = X[train_indices]
-            trainY = Y[train_indices]
-            testX = X[test_indices]
-            testY = Y[test_indices]
-
-            try:
-                # for measuring runtime. Only available since Python 3.3
-                if can_measure_runtime:
-                    modelfit_starttime = time.process_time()
-                model_fold.fit(trainX, trainY)
-
-                if can_measure_runtime:
-                    modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
-                    user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
-            except AttributeError as e:
-                # typically happens when training a regressor on classification task
-                raise PyOpenMLError(str(e))
-
-            # extract trace, if applicable
-            if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
-                arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
-
-            # search for model classes_ (might differ depending on modeltype)
-            # first, pipelines are a special case (these don't have a classes_
-            # object, but rather borrows it from the last step. We do this manually,
-            # because of the BaseSearch check)
-            if isinstance(model_fold, sklearn.pipeline.Pipeline):
-                used_estimator = model_fold.steps[-1][-1]
-            else:
-                used_estimator = model_fold
+    num_reps, num_folds, num_samples = task.get_split_dimensions()
+
+    for rep_no in range(num_reps):
+        for fold_no in range(num_folds):
+            for sample_no in range(num_samples):
+                model_fold = sklearn.base.clone(model, safe=True)
+                train_indices, test_indices = task.get_train_test_split_indices(repeat=rep_no,
+                                                                                fold=fold_no,
+                                                                                sample=sample_no)
+                trainX = X[train_indices]
+                trainY = Y[train_indices]
+                testX = X[test_indices]
+                testY = Y[test_indices]
+
+                try:
+                    # for measuring runtime. Only available since Python 3.3
+                    if can_measure_runtime:
+                        modelfit_starttime = time.process_time()
+                    model_fold.fit(trainX, trainY)
+
+                    if can_measure_runtime:
+                        modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
+                        user_defined_measures_sample['usercpu_time_millis_training'][rep_no][fold_no][sample_no] = modelfit_duration
+                        user_defined_measures_fold['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
+                except AttributeError as e:
+                    # typically happens when training a regressor on classification task
+                    raise PyOpenMLError(str(e))
+
+                # extract trace, if applicable
+                if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
+                    arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
+
+                # search for model classes_ (might differ depending on modeltype)
+                # first, pipelines are a special case (these don't have a classes_
+                # object, but rather borrows it from the last step. We do this manually,
+                # because of the BaseSearch check)
+                if isinstance(model_fold, sklearn.pipeline.Pipeline):
+                    used_estimator = model_fold.steps[-1][-1]
+                else:
+                    used_estimator = model_fold
 
-            if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
-                model_classes = used_estimator.best_estimator_.classes_
-            else:
-                model_classes = used_estimator.classes_
+                if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
+                    model_classes = used_estimator.best_estimator_.classes_
+                else:
+                    model_classes = used_estimator.classes_
 
-            if can_measure_runtime:
-                modelpredict_starttime = time.process_time()
-
-            ProbaY = model_fold.predict_proba(testX)
-            PredY = model_fold.predict(testX)
-            if can_measure_runtime:
-                modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
-                user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
-                user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                if can_measure_runtime:
+                    modelpredict_starttime = time.process_time()
 
-            if ProbaY.shape[1] != len(class_labels):
-                warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
+                ProbaY = model_fold.predict_proba(testX)
+                PredY = model_fold.predict(testX)
+                if can_measure_runtime:
+                    modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
+                    user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
+                    user_defined_measures_fold['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis_testing'][rep_no][fold_no][sample_no] = modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis'][rep_no][fold_no][sample_no] = modelfit_duration + modelpredict_duration
 
-            for i in range(0, len(test_indices)):
-                arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
-                arff_datacontent.append(arff_line)
+                if ProbaY.shape[1] != len(class_labels):
+                    warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
 
-            fold_no = fold_no + 1
-        rep_no = rep_no + 1
+                for i in range(0, len(test_indices)):
+                    arff_line = _prediction_to_row(rep_no, fold_no, sample_no,
+                                                   test_indices[i], class_labels[testY[i]],
+                                                   PredY[i], ProbaY[i], class_labels, model_classes)
+                    arff_datacontent.append(arff_line)
 
     if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
         # arff_tracecontent is already set
        arff_trace_attributes = _extract_arfftrace_attributes(model_fold)
     else:
         arff_tracecontent = None
         arff_trace_attributes = None
-    return arff_datacontent, arff_tracecontent, arff_trace_attributes, user_defined_measures
+
+    return arff_datacontent, \
+           arff_tracecontent, \
+           arff_trace_attributes, \
+           user_defined_measures_fold, \
+           user_defined_measures_sample
 
 
 def _extract_arfftrace(model, rep_no, fold_no):
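Note: a standalone sketch of the two measure stores used above, showing why the fold-level entry ends up holding the full-size measurement on sample-based tasks:

    from collections import defaultdict

    # same shapes as user_defined_measures_fold / user_defined_measures_sample
    fold_store = defaultdict(lambda: defaultdict(dict))
    sample_store = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    # writes as performed in the loop body (toy values)
    sample_store['usercpu_time_millis_training'][0][1][3] = 12.5  # rep 0, fold 1, sample 3
    fold_store['usercpu_time_millis_training'][0][1] = 12.5      # overwritten once per sample

    # sample_no increases with the subsample size, so the last overwrite of
    # the fold-level entry reflects the full training set, as the comment notes.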
@@ -571,7 +601,8 @@ def _create_run_from_xml(xml):
 
     files = dict()
     evaluations = dict()
-    detailed_evaluations = defaultdict(lambda: defaultdict(dict))
+    fold_evaluations = defaultdict(lambda: defaultdict(dict))
+    sample_evaluations = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
     if 'oml:output_data' not in run:
         raise ValueError('Run does not contain output_data (OpenML server error?)')
     else:
@@ -598,11 +629,18 @@ def _create_run_from_xml(xml):
             else:
                 raise ValueError('Could not find keys "value" or "array_data" '
                                  'in %s' % str(evaluation_dict.keys()))
-
-            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict and '@sample' in evaluation_dict:
+                repeat = int(evaluation_dict['@repeat'])
+                fold = int(evaluation_dict['@fold'])
+                sample = int(evaluation_dict['@sample'])
+                repeat_dict = sample_evaluations[key]
+                fold_dict = repeat_dict[repeat]
+                sample_dict = fold_dict[fold]
+                sample_dict[sample] = value
+            elif '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
                 repeat = int(evaluation_dict['@repeat'])
                 fold = int(evaluation_dict['@fold'])
-                repeat_dict = detailed_evaluations[key]
+                repeat_dict = fold_evaluations[key]
                 fold_dict = repeat_dict[repeat]
                 fold_dict[fold] = value
             else:
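Note: a hedged illustration of the two evaluation_dict shapes this branch distinguishes, as xmltodict would produce them from the run description (measure name real, values invented):

    with_sample = {'@repeat': '0', '@fold': '1', '@sample': '2',
                   'oml:name': 'predictive_accuracy', 'oml:value': '0.8125'}
    without_sample = {'@repeat': '0', '@fold': '1',
                      'oml:name': 'predictive_accuracy', 'oml:value': '0.8125'}
    # the first lands in sample_evaluations['predictive_accuracy'][0][1][2],
    # the second in fold_evaluations['predictive_accuracy'][0][1]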
@@ -629,7 +667,9 @@ def _create_run_from_xml(xml):
                     parameter_settings=parameters,
                     dataset_id=dataset_id, output_files=files,
                     evaluations=evaluations,
-                    detailed_evaluations=detailed_evaluations, tags=tags)
+                    fold_evaluations=fold_evaluations,
+                    sample_evaluations=sample_evaluations,
+                    tags=tags)
 
 def _create_trace_from_description(xml):
     result_dict = xmltodict.parse(xml)['oml:trace']

openml/runs/run.py

Lines changed: 27 additions & 9 deletions
@@ -21,7 +21,7 @@ class OpenMLRun(object):
     """
     def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
                  output_files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
-                 evaluations=None, detailed_evaluations=None,
+                 evaluations=None, fold_evaluations=None, sample_evaluations=None,
                  data_content=None, trace_attributes=None, trace_content=None,
                  model=None, task_type=None, task_evaluation_measure=None, flow_name=None,
                  parameter_settings=None, predictions_url=None, task=None,
@@ -38,7 +38,8 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
         self.parameter_settings = parameter_settings
         self.dataset_id = dataset_id
         self.evaluations = evaluations
-        self.detailed_evaluations = detailed_evaluations
+        self.fold_evaluations = fold_evaluations
+        self.sample_evaluations = sample_evaluations
         self.data_content = data_content
         self.output_files = output_files
         self.trace_attributes = trace_attributes
@@ -72,6 +73,7 @@ def _generate_arff_dict(self):
         arff_dict = {}
         arff_dict['attributes'] = [('repeat', 'NUMERIC'),  # lowercase 'numeric' gives an error
                                    ('fold', 'NUMERIC'),
+                                   ('sample', 'NUMERIC'),
                                    ('row_id', 'NUMERIC')] + \
             [('confidence.' + class_labels[i], 'NUMERIC') for i in range(len(class_labels))] +\
             [('prediction', class_labels),
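Note: with the new attribute, the predictions ARFF header for a toy two-class task would begin as below (a sketch; the remaining attributes, e.g. the correct label, are unchanged and elided):

    class_labels = ['negative', 'positive']
    attributes = [('repeat', 'NUMERIC'),
                  ('fold', 'NUMERIC'),
                  ('sample', 'NUMERIC'),  # new third column
                  ('row_id', 'NUMERIC')] + \
                 [('confidence.' + c, 'NUMERIC') for c in class_labels] + \
                 [('prediction', class_labels)]  # trailing attributes elided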
@@ -154,7 +156,8 @@ def _create_description_xml(self):
             setup_string=_create_setup_string(self.model),
             parameter_settings=self.parameter_settings,
             error_message=self.error_message,
-            detailed_evaluations=self.detailed_evaluations,
+            fold_evaluations=self.fold_evaluations,
+            sample_evaluations=self.sample_evaluations,
             tags=self.tags)
         description_xml = xmltodict.unparse(description, pretty=True)
         return description_xml
@@ -284,7 +287,8 @@ def _get_version_information():
     return [python_version, sklearn_version, numpy_version, scipy_version]
 
 
-def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
+def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings,
+             tags=None, fold_evaluations=None, sample_evaluations=None):
     """ Creates a dictionary corresponding to the xml desired by openML
 
     Parameters
@@ -298,7 +302,11 @@ def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
     tags : array of strings
         information that gives a description of the run, must conform to
         regex ``([a-zA-Z0-9_\-\.])+``
-
+    fold_evaluations : dict mapping from evaluation measure to a dict mapping repeat_nr
+        to a dict mapping from fold nr to a value (double)
+    sample_evaluations : dict mapping from evaluation measure to a dict mapping repeat_nr
+        to a dict mapping from fold nr to a dict mapping from sample nr to a value (double)
+
     Returns
     -------
     result : an array with version information of the above packages
@@ -313,15 +321,25 @@ def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
         description['oml:run']['oml:parameter_setting'] = parameter_settings
     if tags is not None:
         description['oml:run']['oml:tag'] = tags  # Tags describing the run
-    if detailed_evaluations is not None:
+    if fold_evaluations is not None or sample_evaluations is not None:
         description['oml:run']['oml:output_data'] = dict()
         description['oml:run']['oml:output_data']['oml:evaluation'] = list()
-        for measure in detailed_evaluations:
-            for repeat in detailed_evaluations[measure]:
-                for fold, value in detailed_evaluations[measure][repeat].items():
+    if fold_evaluations is not None:
+        for measure in fold_evaluations:
+            for repeat in fold_evaluations[measure]:
+                for fold, value in fold_evaluations[measure][repeat].items():
                     current = OrderedDict([('@repeat', str(repeat)), ('@fold', str(fold)),
                                            ('oml:name', measure), ('oml:value', str(value))])
                     description['oml:run']['oml:output_data']['oml:evaluation'].append(current)
+    if sample_evaluations is not None:
+        for measure in sample_evaluations:
+            for repeat in sample_evaluations[measure]:
+                for fold in sample_evaluations[measure][repeat]:
+                    for sample, value in sample_evaluations[measure][repeat][fold].items():
+                        current = OrderedDict([('@repeat', str(repeat)), ('@fold', str(fold)),
+                                               ('@sample', str(sample)), ('oml:name', measure),
+                                               ('oml:value', str(value))])
+                        description['oml:run']['oml:output_data']['oml:evaluation'].append(current)
     return description
 
 
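Note: a toy check (measure and values invented) of what the sample branch above appends, and the XML that xmltodict.unparse would emit for one such entry:

    from collections import OrderedDict
    import xmltodict

    current = OrderedDict([('@repeat', '0'), ('@fold', '1'), ('@sample', '2'),
                           ('oml:name', 'predictive_accuracy'),
                           ('oml:value', '0.8125')])
    print(xmltodict.unparse({'oml:evaluation': current},
                            pretty=True, full_document=False))
    # <oml:evaluation repeat="0" fold="1" sample="2">
    #         <oml:name>predictive_accuracy</oml:name>
    #         <oml:value>0.8125</oml:value>
    # </oml:evaluation>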