
Commit bef46b7

Merge pull request #265 from openml/learningcurves
Support for Learning Curve Tasks
2 parents de66af0 + 993dbea · commit bef46b7

File tree

7 files changed: +290 additions, -167 deletions
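Note: for orientation, a minimal sketch of what this merge enables — running a scikit-learn model on a learning curve task (task_type_id == 3) and reading back per-sample measures. The task id is hypothetical and the sklearn_to_flow helper is an assumption about the surrounding API; only run_flow_on_task and the sample_evaluations attribute come from this diff:

    import openml
    from sklearn.tree import DecisionTreeClassifier

    task = openml.tasks.get_task(1797)  # hypothetical learning curve task id
    flow = openml.flows.sklearn_to_flow(DecisionTreeClassifier())  # assumed helper
    run = openml.runs.run_flow_on_task(task, flow)

    # per this change: sample_evaluations[measure][repeat][fold][sample]
    # (runtime measures require Python >= 3.3, see functions.py below)
    print(run.sample_evaluations['usercpu_time_millis_training'][0][0][0])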

openml/runs/functions.py

Lines changed: 107 additions & 67 deletions
@@ -106,7 +106,13 @@ def run_flow_on_task(task, flow, avoid_duplicate_runs=True, flow_tags=None,
                      dataset_id=dataset.dataset_id, model=flow.model, tags=tags)
     run.parameter_settings = OpenMLRun._parse_parameters(flow)
 
-    run.data_content, run.trace_content, run.trace_attributes, run.detailed_evaluations = res
+    run.data_content, run.trace_content, run.trace_attributes, fold_evaluations, sample_evaluations = res
+    # now we need to attach the detailed evaluations
+    if task.task_type_id == 3:
+        run.sample_evaluations = sample_evaluations
+    else:
+        run.fold_evaluations = fold_evaluations
+
 
     config.logger.info('Executed Task %d with Flow id: %d' % (task.task_id, run.flow_id))
 
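Note: the branch above decides where the detailed measures live. A minimal sketch (helper name invented, not part of this commit) of how downstream code might walk both nestings as defined in this diff:

    def print_user_measures(run, is_learning_curve_task):
        # is_learning_curve_task would be task.task_type_id == 3
        if is_learning_curve_task:
            # sample_evaluations[measure][repeat][fold][sample]
            for measure, reps in run.sample_evaluations.items():
                for rep_no, folds in reps.items():
                    for fold_no, samples in folds.items():
                        for sample_no, value in samples.items():
                            print(measure, rep_no, fold_no, sample_no, value)
        else:
            # fold_evaluations[measure][repeat][fold]
            for measure, reps in run.fold_evaluations.items():
                for rep_no, folds in reps.items():
                    for fold_no, value in folds.items():
                        print(measure, rep_no, fold_no, value)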
@@ -299,15 +305,20 @@ def _seed_current_object(current_value):
     return model
 
 
-def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
-                       predicted_probabilities, class_labels, model_classes_mapping):
+def _prediction_to_row(rep_no, fold_no, sample_no, row_id, correct_label,
+                       predicted_label, predicted_probabilities, class_labels,
+                       model_classes_mapping):
     """Util function that turns probability estimates of a classifier for a given
     instance into the right arff format to upload to openml.
 
     Parameters
     ----------
     rep_no : int
+        The repeat of the experiment (0-based; in case of 1 time CV, always 0)
     fold_no : int
+        The fold nr of the experiment (0-based; in case of holdout, always 0)
+    sample_no : int
+        In case of learning curves, the index of the subsample (0-based; in case of no learning curve, always 0)
     row_id : int
         row id in the initial dataset
     correct_label : str
@@ -328,11 +339,12 @@ def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
     """
     if not isinstance(rep_no, (int, np.integer)): raise ValueError('rep_no should be int')
     if not isinstance(fold_no, (int, np.integer)): raise ValueError('fold_no should be int')
+    if not isinstance(sample_no, (int, np.integer)): raise ValueError('sample_no should be int')
     if not isinstance(row_id, (int, np.integer)): raise ValueError('row_id should be int')
     if not len(predicted_probabilities) == len(model_classes_mapping):
         raise ValueError('len(predicted_probabilities) != len(class_labels)')
 
-    arff_line = [rep_no, fold_no, row_id]
+    arff_line = [rep_no, fold_no, sample_no, row_id]
     for class_label_idx in range(len(class_labels)):
         if class_label_idx in model_classes_mapping:
             index = np.where(model_classes_mapping == class_label_idx)[0][0]  # TODO: WHY IS THIS 2D???
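Note: a toy illustration (labels and values invented) of the widened row layout; the trailing fields follow the ARFF attribute list declared in run.py below:

    import numpy as np

    class_labels = ['negative', 'positive']
    model_classes = np.array([0, 1])  # label indices, as the estimator's classes_

    row = _prediction_to_row(rep_no=0, fold_no=2, sample_no=1, row_id=57,
                             correct_label='positive', predicted_label='positive',
                             predicted_probabilities=[0.2, 0.8],
                             class_labels=class_labels,
                             model_classes_mapping=model_classes)
    # expected, given the attribute order repeat/fold/sample/row_id/confidences/...:
    # [0, 2, 1, 57, 0.2, 0.8, 'positive', 'positive']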
@@ -349,82 +361,100 @@ def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
     arff_tracecontent = []
-    user_defined_measures = defaultdict(lambda: defaultdict(dict))
+    # stores fold-based evaluation measures. In case of a sample based task,
+    # this information is multiple times overwritten, but due to the ordering
+    # of the loops, eventually it contains the information based on the full
+    # dataset size
+    user_defined_measures_fold = defaultdict(lambda: defaultdict(dict))
+    # stores sample-based evaluation measures (sublevel of fold-based)
+    # will also be filled on a non sample-based task, but the information
+    # is the same as the fold-based measures, and disregarded in that case
+    user_defined_measures_sample = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
 
-    rep_no = 0
     # sys.version_info returns a tuple, the following line compares the entry of tuples
     # https://docs.python.org/3.6/reference/expressions.html#value-comparisons
     can_measure_runtime = sys.version_info[:2] >= (3, 3) and _check_n_jobs(model)
     # TODO use different iterator to only provide a single iterator (less
     # methods, less maintenance, less confusion)
-    for rep in task.iterate_repeats():
-        fold_no = 0
-        for fold in rep:
-            model_fold = sklearn.base.clone(model, safe=True)
-            train_indices, test_indices = fold
-            trainX = X[train_indices]
-            trainY = Y[train_indices]
-            testX = X[test_indices]
-            testY = Y[test_indices]
-
-            try:
-                # for measuring runtime. Only available since Python 3.3
-                if can_measure_runtime:
-                    modelfit_starttime = time.process_time()
-                model_fold.fit(trainX, trainY)
-
-                if can_measure_runtime:
-                    modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
-                    user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
-            except AttributeError as e:
-                # typically happens when training a regressor on classification task
-                raise PyOpenMLError(str(e))
-
-            # extract trace, if applicable
-            if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
-                arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
-
-            # search for model classes_ (might differ depending on modeltype)
-            # first, pipelines are a special case (these don't have a classes_
-            # object, but rather borrows it from the last step. We do this manually,
-            # because of the BaseSearch check)
-            if isinstance(model_fold, sklearn.pipeline.Pipeline):
-                used_estimator = model_fold.steps[-1][-1]
-            else:
-                used_estimator = model_fold
+    num_reps, num_folds, num_samples = task.get_split_dimensions()
+
+    for rep_no in range(num_reps):
+        for fold_no in range(num_folds):
+            for sample_no in range(num_samples):
+                model_fold = sklearn.base.clone(model, safe=True)
+                train_indices, test_indices = task.get_train_test_split_indices(repeat=rep_no,
+                                                                                fold=fold_no,
+                                                                                sample=sample_no)
+                trainX = X[train_indices]
+                trainY = Y[train_indices]
+                testX = X[test_indices]
+                testY = Y[test_indices]
+
+                try:
+                    # for measuring runtime. Only available since Python 3.3
+                    if can_measure_runtime:
+                        modelfit_starttime = time.process_time()
+                    model_fold.fit(trainX, trainY)
+
+                    if can_measure_runtime:
+                        modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
+                        user_defined_measures_sample['usercpu_time_millis_training'][rep_no][fold_no][sample_no] = modelfit_duration
+                        user_defined_measures_fold['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
+                except AttributeError as e:
+                    # typically happens when training a regressor on classification task
+                    raise PyOpenMLError(str(e))
+
+                # extract trace, if applicable
+                if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
+                    arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
+
+                # search for model classes_ (might differ depending on modeltype)
+                # first, pipelines are a special case (these don't have a classes_
+                # object, but rather borrows it from the last step. We do this manually,
+                # because of the BaseSearch check)
+                if isinstance(model_fold, sklearn.pipeline.Pipeline):
+                    used_estimator = model_fold.steps[-1][-1]
+                else:
+                    used_estimator = model_fold
 
-            if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
-                model_classes = used_estimator.best_estimator_.classes_
-            else:
-                model_classes = used_estimator.classes_
+                if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
+                    model_classes = used_estimator.best_estimator_.classes_
+                else:
+                    model_classes = used_estimator.classes_
 
-            if can_measure_runtime:
-                modelpredict_starttime = time.process_time()
-
-            ProbaY = model_fold.predict_proba(testX)
-            PredY = model_fold.predict(testX)
-            if can_measure_runtime:
-                modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
-                user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
-                user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                if can_measure_runtime:
+                    modelpredict_starttime = time.process_time()
 
-            if ProbaY.shape[1] != len(class_labels):
-                warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
+                ProbaY = model_fold.predict_proba(testX)
+                PredY = model_fold.predict(testX)
+                if can_measure_runtime:
+                    modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
+                    user_defined_measures_fold['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
+                    user_defined_measures_fold['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis_testing'][rep_no][fold_no][sample_no] = modelpredict_duration
+                    user_defined_measures_sample['usercpu_time_millis'][rep_no][fold_no][sample_no] = modelfit_duration + modelpredict_duration
 
-            for i in range(0, len(test_indices)):
-                arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
-                arff_datacontent.append(arff_line)
+                if ProbaY.shape[1] != len(class_labels):
+                    warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
 
-            fold_no = fold_no + 1
-        rep_no = rep_no + 1
+                for i in range(0, len(test_indices)):
+                    arff_line = _prediction_to_row(rep_no, fold_no, sample_no,
+                                                   test_indices[i], class_labels[testY[i]],
+                                                   PredY[i], ProbaY[i], class_labels, model_classes)
+                    arff_datacontent.append(arff_line)
 
     if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
         # arff_tracecontent is already set
        arff_trace_attributes = _extract_arfftrace_attributes(model_fold)
     else:
         arff_tracecontent = None
         arff_trace_attributes = None
-    return arff_datacontent, arff_tracecontent, arff_trace_attributes, user_defined_measures
+
+    return arff_datacontent, \
+           arff_tracecontent, \
+           arff_trace_attributes, \
+           user_defined_measures_fold, \
+           user_defined_measures_sample
 
 
 def _extract_arfftrace(model, rep_no, fold_no):
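Note: a standalone sketch of the two measure stores used above, showing why the fold-level entry ends up holding the full-size measurement on sample-based tasks:

    from collections import defaultdict

    # same shapes as user_defined_measures_fold / user_defined_measures_sample
    fold_store = defaultdict(lambda: defaultdict(dict))
    sample_store = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    # writes as performed in the loop body (toy values)
    sample_store['usercpu_time_millis_training'][0][1][3] = 12.5  # rep 0, fold 1, sample 3
    fold_store['usercpu_time_millis_training'][0][1] = 12.5      # overwritten once per sample

    # sample_no increases with the subsample size, so the last overwrite of
    # the fold-level entry reflects the full training set, as the comment notes.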
@@ -571,7 +601,8 @@ def _create_run_from_xml(xml):
 
     files = dict()
     evaluations = dict()
-    detailed_evaluations = defaultdict(lambda: defaultdict(dict))
+    fold_evaluations = defaultdict(lambda: defaultdict(dict))
+    sample_evaluations = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
     if 'oml:output_data' not in run:
         raise ValueError('Run does not contain output_data (OpenML server error?)')
     else:
@@ -598,11 +629,18 @@ def _create_run_from_xml(xml):
             else:
                 raise ValueError('Could not find keys "value" or "array_data" '
                                  'in %s' % str(evaluation_dict.keys()))
-
-            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict and '@sample' in evaluation_dict:
+                repeat = int(evaluation_dict['@repeat'])
+                fold = int(evaluation_dict['@fold'])
+                sample = int(evaluation_dict['@sample'])
+                repeat_dict = sample_evaluations[key]
+                fold_dict = repeat_dict[repeat]
+                sample_dict = fold_dict[fold]
+                sample_dict[sample] = value
+            elif '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
                 repeat = int(evaluation_dict['@repeat'])
                 fold = int(evaluation_dict['@fold'])
-                repeat_dict = detailed_evaluations[key]
+                repeat_dict = fold_evaluations[key]
                 fold_dict = repeat_dict[repeat]
                 fold_dict[fold] = value
             else:
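Note: a hedged illustration of the two evaluation_dict shapes this branch distinguishes, as xmltodict would produce them from the run description (measure name real, values invented):

    with_sample = {'@repeat': '0', '@fold': '1', '@sample': '2',
                   'oml:name': 'predictive_accuracy', 'oml:value': '0.8125'}
    without_sample = {'@repeat': '0', '@fold': '1',
                      'oml:name': 'predictive_accuracy', 'oml:value': '0.8125'}
    # the first lands in sample_evaluations['predictive_accuracy'][0][1][2],
    # the second in fold_evaluations['predictive_accuracy'][0][1]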
@@ -629,7 +667,9 @@ def _create_run_from_xml(xml):
                     parameter_settings=parameters,
                     dataset_id=dataset_id, output_files=files,
                     evaluations=evaluations,
-                    detailed_evaluations=detailed_evaluations, tags=tags)
+                    fold_evaluations=fold_evaluations,
+                    sample_evaluations=sample_evaluations,
+                    tags=tags)
 
 def _create_trace_from_description(xml):
     result_dict = xmltodict.parse(xml)['oml:trace']

openml/runs/run.py

Lines changed: 27 additions & 9 deletions
@@ -21,7 +21,7 @@ class OpenMLRun(object):
     """
     def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
                  output_files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
-                 evaluations=None, detailed_evaluations=None,
+                 evaluations=None, fold_evaluations=None, sample_evaluations=None,
                  data_content=None, trace_attributes=None, trace_content=None,
                  model=None, task_type=None, task_evaluation_measure=None, flow_name=None,
                  parameter_settings=None, predictions_url=None, task=None,
@@ -38,7 +38,8 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
         self.parameter_settings = parameter_settings
         self.dataset_id = dataset_id
         self.evaluations = evaluations
-        self.detailed_evaluations = detailed_evaluations
+        self.fold_evaluations = fold_evaluations
+        self.sample_evaluations = sample_evaluations
         self.data_content = data_content
         self.output_files = output_files
         self.trace_attributes = trace_attributes
@@ -72,6 +73,7 @@ def _generate_arff_dict(self):
         arff_dict = {}
         arff_dict['attributes'] = [('repeat', 'NUMERIC'),  # lowercase 'numeric' gives an error
                                    ('fold', 'NUMERIC'),
+                                   ('sample', 'NUMERIC'),
                                    ('row_id', 'NUMERIC')] + \
             [('confidence.' + class_labels[i], 'NUMERIC') for i in range(len(class_labels))] +\
             [('prediction', class_labels),
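Note: with the new attribute, the predictions ARFF header for a toy two-class task would begin as below (a sketch; the remaining attributes, e.g. the correct label, are unchanged and elided):

    class_labels = ['negative', 'positive']
    attributes = [('repeat', 'NUMERIC'),
                  ('fold', 'NUMERIC'),
                  ('sample', 'NUMERIC'),  # new third column
                  ('row_id', 'NUMERIC')] + \
                 [('confidence.' + c, 'NUMERIC') for c in class_labels] + \
                 [('prediction', class_labels)]  # trailing attributes elided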
@@ -154,7 +156,8 @@ def _create_description_xml(self):
             setup_string=_create_setup_string(self.model),
             parameter_settings=self.parameter_settings,
             error_message=self.error_message,
-            detailed_evaluations=self.detailed_evaluations,
+            fold_evaluations=self.fold_evaluations,
+            sample_evaluations=self.sample_evaluations,
             tags=self.tags)
         description_xml = xmltodict.unparse(description, pretty=True)
         return description_xml
@@ -284,7 +287,8 @@ def _get_version_information():
     return [python_version, sklearn_version, numpy_version, scipy_version]
 
 
-def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
+def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings,
+             tags=None, fold_evaluations=None, sample_evaluations=None):
     """ Creates a dictionary corresponding to the xml desired by openML
 
     Parameters
@@ -298,7 +302,11 @@ def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
     tags : array of strings
         information that gives a description of the run, must conform to
         regex ``([a-zA-Z0-9_\-\.])+``
-
+    fold_evaluations : dict mapping from evaluation measure to a dict mapping repeat_nr
+        to a dict mapping from fold nr to a value (double)
+    sample_evaluations : dict mapping from evaluation measure to a dict mapping repeat_nr
+        to a dict mapping from fold nr to a dict mapping from sample nr to a value (double)
+
     Returns
     -------
     result : an array with version information of the above packages
@@ -313,15 +321,25 @@ def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None):
         description['oml:run']['oml:parameter_setting'] = parameter_settings
     if tags is not None:
         description['oml:run']['oml:tag'] = tags  # Tags describing the run
-    if detailed_evaluations is not None:
+    if fold_evaluations is not None or sample_evaluations is not None:
         description['oml:run']['oml:output_data'] = dict()
         description['oml:run']['oml:output_data']['oml:evaluation'] = list()
-        for measure in detailed_evaluations:
-            for repeat in detailed_evaluations[measure]:
-                for fold, value in detailed_evaluations[measure][repeat].items():
+    if fold_evaluations is not None:
+        for measure in fold_evaluations:
+            for repeat in fold_evaluations[measure]:
+                for fold, value in fold_evaluations[measure][repeat].items():
                     current = OrderedDict([('@repeat', str(repeat)), ('@fold', str(fold)),
                                            ('oml:name', measure), ('oml:value', str(value))])
                     description['oml:run']['oml:output_data']['oml:evaluation'].append(current)
+    if sample_evaluations is not None:
+        for measure in sample_evaluations:
+            for repeat in sample_evaluations[measure]:
+                for fold in sample_evaluations[measure][repeat]:
+                    for sample, value in sample_evaluations[measure][repeat][fold].items():
+                        current = OrderedDict([('@repeat', str(repeat)), ('@fold', str(fold)),
+                                               ('@sample', str(sample)), ('oml:name', measure),
+                                               ('oml:value', str(value))])
+                        description['oml:run']['oml:output_data']['oml:evaluation'].append(current)
     return description
 
 
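Note: a toy check (measure and values invented) of what the sample branch above appends, and the XML that xmltodict.unparse would emit for one such entry:

    from collections import OrderedDict
    import xmltodict

    current = OrderedDict([('@repeat', '0'), ('@fold', '1'), ('@sample', '2'),
                           ('oml:name', 'predictive_accuracy'),
                           ('oml:value', '0.8125')])
    print(xmltodict.unparse({'oml:evaluation': current},
                            pretty=True, full_document=False))
    # <oml:evaluation repeat="0" fold="1" sample="2">
    #         <oml:name>predictive_accuracy</oml:name>
    #         <oml:value>0.8125</oml:value>
    # </oml:evaluation>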