@@ -166,22 +166,32 @@ def _check_fold_evaluations(self, fold_evaluations, num_repeats, num_folds, max_time_allowed=60000):
         condition outside of this function.)
         default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
         '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from OpenML measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
         self.assertIsInstance(fold_evaluations, dict)
         if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(fold_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some measures are missing)
+            self.assertEquals(set(fold_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in fold_evaluations:
                 num_rep_entrees = len(fold_evaluations[measure])
                 self.assertEquals(num_rep_entrees, num_repeats)
+                min_val = check_measures[measure][0]
+                max_val = check_measures[measure][1]
                 for rep in range(num_rep_entrees):
                     num_fold_entrees = len(fold_evaluations[measure][rep])
                     self.assertEquals(num_fold_entrees, num_folds)
                     for fold in range(num_fold_entrees):
                         evaluation = fold_evaluations[measure][rep][fold]
                         self.assertIsInstance(evaluation, float)
-                        self.assertGreater(evaluation, 0)  # should take at least one millisecond (?)
-                        self.assertLess(evaluation, max_time_allowed)
+                        self.assertGreaterEqual(evaluation, min_val)
+                        self.assertLessEqual(evaluation, max_val)
 
 
     def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
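For readers following along outside the diff: `fold_evaluations` is a nested mapping measure -> repeat index -> fold index -> float value, as the indexing `fold_evaluations[measure][rep][fold]` above shows. The standalone sketch below illustrates the bounds check the new code performs; the evaluation values in the dict are made up for illustration.

```python
# Minimal sketch of the structure _check_fold_evaluations validates:
# measure -> repeat index -> fold index -> float value.
max_time_allowed = 60000  # per fold, in milliseconds

check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
                  'usercpu_time_millis_training': (0, max_time_allowed),
                  'usercpu_time_millis': (0, max_time_allowed),
                  'predictive_accuracy': (0, 1)}

# illustrative evaluations for 1 repeat and 2 folds
fold_evaluations = {'predictive_accuracy': {0: {0: 0.93, 1: 0.91}},
                    'usercpu_time_millis': {0: {0: 12.0, 1: 14.5}}}

for measure, (min_val, max_val) in check_measures.items():
    if measure in fold_evaluations:  # some measures may be absent, e.g. when timing is unavailable
        for rep, folds in fold_evaluations[measure].items():
            for fold, value in folds.items():
                assert min_val <= value <= max_val
```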
@@ -193,12 +203,20 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats, num_folds, num_samples, max_time_allowed=60000):
         condition outside of this function.)
         default max_time_allowed (per fold, in milliseconds) = 1 minute, quite pessimistic
         '''
-        timing_measures = {'usercpu_time_millis_testing', 'usercpu_time_millis_training', 'usercpu_time_millis'}
+
+        # a dict mapping from OpenML measure to a tuple with the minimum and maximum allowed value
+        check_measures = {'usercpu_time_millis_testing': (0, max_time_allowed),
+                          'usercpu_time_millis_training': (0, max_time_allowed),  # should take at least one millisecond (?)
+                          'usercpu_time_millis': (0, max_time_allowed),
+                          'predictive_accuracy': (0, 1)}
 
         self.assertIsInstance(sample_evaluations, dict)
         if sys.version_info[:2] >= (3, 3):
-            self.assertEquals(set(sample_evaluations.keys()), timing_measures)
-        for measure in timing_measures:
+            # this only holds if we are allowed to record time (otherwise some measures are missing)
+            self.assertEquals(set(sample_evaluations.keys()), set(check_measures.keys()))
+
+        for measure in check_measures.keys():
+            if measure in sample_evaluations:
                 num_rep_entrees = len(sample_evaluations[measure])
                 self.assertEquals(num_rep_entrees, num_repeats)
                 for rep in range(num_rep_entrees):
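The sample-wise variant is analogous but adds one more nesting level; the hunk above does not show the inner loops, so the sketch below is an assumption about the layout (measure -> repeat -> fold -> sample -> value) inferred from the extra `num_samples` parameter.

```python
# assumed layout for learning-curve style tasks: measure -> repeat -> fold -> sample -> value
sample_evaluations = {
    'predictive_accuracy': {0: {0: {0: 0.78, 1: 0.85}}},  # 1 repeat, 1 fold, 2 samples
}

min_val, max_val = 0, 1
for rep, folds in sample_evaluations['predictive_accuracy'].items():
    for fold, samples in folds.items():
        for sample, value in samples.items():
            assert min_val <= value <= max_val
```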
@@ -309,6 +327,16 @@ def test_run_and_upload(self):
         for clf, rsv in zip(clfs, random_state_fixtures):
             run = self._perform_run(task_id, num_test_instances, clf,
                                     random_state_value=rsv)
+
+            # obtain accuracy scores using get_metric_fn:
+            accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+            # compare with the scores in the user defined measures
+            accuracy_scores_provided = []
+            for rep in run.fold_evaluations['predictive_accuracy'].keys():
+                for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                    accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+            self.assertEquals(sum(accuracy_scores_provided), sum(accuracy_scores))
+
             if isinstance(clf, BaseSearchCV):
                 if isinstance(clf, GridSearchCV):
                     grid_iterations = 1
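The pattern added here, flattening the stored `predictive_accuracy` values and comparing them to scores recomputed via `get_metric_fn`, recurs later in `_test_local_evaluations`. A small standalone sketch of that pattern, with a hypothetical helper name, assuming `run` is a run object whose `fold_evaluations` are populated:

```python
import sklearn.metrics


def flatten_provided_accuracies(run):
    """Flatten the stored predictive_accuracy values (repeat -> fold -> value)
    into a flat list, mirroring the loop in the test above."""
    provided = []
    for rep in run.fold_evaluations['predictive_accuracy'].keys():
        for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
            provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
    return provided


# usage (illustrative; `run` must already exist):
#   recomputed = run.get_metric_fn(sklearn.metrics.accuracy_score)
#   assert abs(sum(flatten_provided_accuracies(run)) - sum(recomputed)) < 1e-8
```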
@@ -385,6 +413,49 @@ def test_initialize_cv_from_run(self):
         self.assertEquals(modelS.cv.random_state, 62501)
         self.assertEqual(modelR.cv.random_state, 62501)
 
+    def _test_local_evaluations(self, run):
+
+        # compare with the scores in the user defined measures
+        accuracy_scores_provided = []
+        for rep in run.fold_evaluations['predictive_accuracy'].keys():
+            for fold in run.fold_evaluations['predictive_accuracy'][rep].keys():
+                accuracy_scores_provided.append(run.fold_evaluations['predictive_accuracy'][rep][fold])
+        accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
+        np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores)
+
+        # also check if we can obtain some other scores:  # TODO: how to do AUC?
+        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
+                 (sklearn.metrics.auc, {'reorder': True}),
+                 (sklearn.metrics.average_precision_score, {}),
+                 (sklearn.metrics.jaccard_similarity_score, {}),
+                 (sklearn.metrics.precision_score, {'average': 'macro'}),
+                 (sklearn.metrics.brier_score_loss, {})]
+        for test_idx, test in enumerate(tests):
+            alt_scores = run.get_metric_fn(test[0], test[1])
+            self.assertEquals(len(alt_scores), 10)
+            for idx in range(len(alt_scores)):
+                self.assertGreaterEqual(alt_scores[idx], 0)
+                self.assertLessEqual(alt_scores[idx], 1)
+
+    def test_local_run_metric_score(self):
+
+        # construct a scikit-learn classifier
+        clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), ('estimator', RandomForestClassifier())])
+
+        # download the task
+        task = openml.tasks.get_task(7)
+
+        # invoke an OpenML run
+        run = openml.runs.run_model_on_task(task, clf)
+
+        self._test_local_evaluations(run)
+
+    def test_online_run_metric_score(self):
+        openml.config.server = self.production_server
+        run = openml.runs.get_run(5965513)  # important to use a binary classification task, due to the assertions
+        self._test_local_evaluations(run)
+
+
     def test_initialize_model_from_run(self):
         clf = sklearn.pipeline.Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                                                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
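As a usage note outside the diff itself: `run.get_metric_fn(metric, kwargs)` recomputes a per-fold score locally from the run's predictions, passing metric keyword arguments as a dict, which is the calling pattern the new tests exercise. The sketch below uses the same run id as `test_online_run_metric_score`; the server URL and the availability of the prediction file are assumptions, so treat it as illustrative only.

```python
import sklearn.metrics
import openml

# assumed production endpoint; the test sets this via self.production_server
openml.config.server = 'https://www.openml.org/api/v1/xml'

# run 5965513 is the binary classification run used by the online test above
run = openml.runs.get_run(5965513)

# recompute per-fold accuracy locally from the run's predictions
accuracies = run.get_metric_fn(sklearn.metrics.accuracy_score)

# keyword arguments for the metric are passed as a dict, as in the `tests` list above
macro_precision = run.get_metric_fn(sklearn.metrics.precision_score, {'average': 'macro'})
print(accuracies, macro_precision)
```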