Skip to content

Commit bbf09b3

Browse files
Fix: correctly order the ground truth and prediction for ARFF files in run.data_content (#1209)
* add test and fix for switch of ground truth and predictions
* undo import optimization
* fix bug with model passing to function
* fix order in other tests
* update progress.rst
* new unit test for run consistency and bug fixed
* clarify new assert
* minor loop refactor
* refactor default to None
* directly test prediction data equal
* Update tests/test_runs/test_run.py

Co-authored-by: Pieter Gijsbers <[email protected]>

* Mark sklearn tests (#1202)
* Add sklearn marker
* Mark tests that use scikit-learn
* Only run scikit-learn tests multiple times

  The generic tests that don't use scikit-learn should only be tested once (per platform).

* Rename for correct variable
* Add sklearn mark for filesystem test
* Remove quotes around sklearn
* Instead include sklearn in the matrix definition
* Update jobnames
* Add explicit false to jobname
* Remove space
* Add function inside of expression?
* Do string testing instead
* Add missing ${{
* Add explicit true to old sklearn tests
* Add instruction to add pytest marker for sklearn tests
* add test and fix for switch of ground truth and predictions
* undo import optimization
* fix mask error resulting from rebase
* make dummy classifier strategy consistent to avoid problems as a result of the random state problems for sklearn < 0.24

---------

Co-authored-by: Pieter Gijsbers <[email protected]>
1 parent c590b3a commit bbf09b3

File tree

5 files changed

+188
-53
lines changed

5 files changed

+188
-53
lines changed

doc/progress.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,9 @@ Changelog
99
0.13.1
1010
~~~~~~
1111

12+
* FIX #1197 #559 #1131: Fix the order of ground truth and predictions in the ``OpenMLRun`` object and in ``format_prediction``.
1213
* FIX #1198: Support numpy 1.24 and higher.
1314

14-
1515
0.13.0
1616
~~~~~~
1717

openml/runs/functions.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -155,7 +155,6 @@ def run_flow_on_task(
155155
dataset_format: str = "dataframe",
156156
n_jobs: Optional[int] = None,
157157
) -> OpenMLRun:
158-
159158
"""Run the model provided by the flow on the dataset defined by task.
160159
161160
Takes the flow and repeat information into account.
@@ -515,13 +514,13 @@ def _calculate_local_measure(sklearn_fn, openml_name):
515514
else pred_y[i]
516515
)
517516
if isinstance(test_y, pd.Series):
518-
test_prediction = (
517+
truth = (
519518
task.class_labels[test_y.iloc[i]]
520519
if isinstance(test_y.iloc[i], int)
521520
else test_y.iloc[i]
522521
)
523522
else:
524-
test_prediction = (
523+
truth = (
525524
task.class_labels[test_y[i]]
526525
if isinstance(test_y[i], (int, np.integer))
527526
else test_y[i]
@@ -535,7 +534,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
535534
sample=sample_no,
536535
index=tst_idx,
537536
prediction=prediction,
538-
truth=test_prediction,
537+
truth=truth,
539538
proba=dict(zip(task.class_labels, pred_prob)),
540539
)
541540
else:
@@ -552,14 +551,14 @@ def _calculate_local_measure(sklearn_fn, openml_name):
552551
elif isinstance(task, OpenMLRegressionTask):
553552

554553
for i, _ in enumerate(test_indices):
555-
test_prediction = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
554+
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
556555
arff_line = format_prediction(
557556
task=task,
558557
repeat=rep_no,
559558
fold=fold_no,
560559
index=test_indices[i],
561560
prediction=pred_y[i],
562-
truth=test_prediction,
561+
truth=truth,
563562
)
564563

565564
arff_datacontent.append(arff_line)
@@ -920,9 +919,10 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
920919
parameter_settings=parameters,
921920
dataset_id=dataset_id,
922921
output_files=files,
923-
evaluations=evaluations,
924-
fold_evaluations=fold_evaluations,
925-
sample_evaluations=sample_evaluations,
922+
# Make sure default values are used where needed to keep run objects identical
923+
evaluations=evaluations or None,
924+
fold_evaluations=fold_evaluations or None,
925+
sample_evaluations=sample_evaluations or None,
926926
tags=tags,
927927
predictions_url=predictions_url,
928928
run_details=run_details,
@@ -1186,6 +1186,10 @@ def format_prediction(
11861186
-------
11871187
A list with elements for the prediction results of a run.
11881188
1189+
The returned order of the elements is (if available):
1190+
[repeat, fold, sample, index, prediction, truth, *probabilities]
1191+
1192+
This order follows the R Client API.
11891193
"""
11901194
if isinstance(task, OpenMLClassificationTask):
11911195
if proba is None:
@@ -1200,8 +1204,8 @@ def format_prediction(
12001204
else:
12011205
sample = 0
12021206
probabilities = [proba[c] for c in task.class_labels]
1203-
return [repeat, fold, sample, index, *probabilities, truth, prediction]
1207+
return [repeat, fold, sample, index, prediction, truth, *probabilities]
12041208
elif isinstance(task, OpenMLRegressionTask):
1205-
return [repeat, fold, index, truth, prediction]
1209+
return [repeat, fold, index, prediction, truth]
12061210
else:
12071211
raise NotImplementedError(f"Formatting for {type(task)} is not supported.")

openml/runs/run.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -304,6 +304,8 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
304304
305305
Assumes that the run has been executed.
306306
307+
The order of the attributes follows the order defined by the Client API for R.
308+
307309
Returns
308310
-------
309311
arf_dict : dict
@@ -337,11 +339,11 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
337339
if class_labels is not None:
338340
arff_dict["attributes"] = (
339341
arff_dict["attributes"]
342+
+ [("prediction", class_labels), ("correct", class_labels)]
340343
+ [
341344
("confidence." + class_labels[i], "NUMERIC")
342345
for i in range(len(class_labels))
343346
]
344-
+ [("prediction", class_labels), ("correct", class_labels)]
345347
)
346348
else:
347349
raise ValueError("The task has no class labels")
@@ -362,7 +364,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
362364
]
363365
prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
364366
arff_dict["attributes"] = (
365-
arff_dict["attributes"] + prediction_confidences + prediction_and_true
367+
arff_dict["attributes"] + prediction_and_true + prediction_confidences
366368
)
367369
else:
368370
raise ValueError("The task has no class labels")

tests/test_runs/test_run.py

Lines changed: 164 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,11 @@
77

88
import xmltodict
99
from sklearn.dummy import DummyClassifier
10+
from sklearn.linear_model import LinearRegression
1011
from sklearn.tree import DecisionTreeClassifier
1112
from sklearn.model_selection import GridSearchCV
1213
from sklearn.pipeline import Pipeline
14+
from sklearn.base import clone
1315

1416
from openml import OpenMLRun
1517
from openml.testing import TestBase, SimpleImputer
@@ -39,6 +41,25 @@ def test_tagging(self):
3941
run_list = openml.runs.list_runs(tag=tag)
4042
self.assertEqual(len(run_list), 0)
4143

44+
@staticmethod
45+
def _test_prediction_data_equal(run, run_prime):
46+
# Determine which attributes are numeric and which not
47+
num_cols = np.array(
48+
[d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]]
49+
)
50+
# Get run data consistently
51+
# (For run from server, .data_content does not exist)
52+
run_data_content = run.predictions.values
53+
run_prime_data_content = run_prime.predictions.values
54+
55+
# Assert numeric and string parts separately
56+
numeric_part = np.array(run_data_content[:, num_cols], dtype=float)
57+
numeric_part_prime = np.array(run_prime_data_content[:, num_cols], dtype=float)
58+
string_part = run_data_content[:, ~num_cols]
59+
string_part_prime = run_prime_data_content[:, ~num_cols]
60+
np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
61+
np.testing.assert_array_equal(string_part, string_part_prime)
62+
4263
def _test_run_obj_equals(self, run, run_prime):
4364
for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
4465
if getattr(run, dictionary) is not None:
@@ -49,14 +70,9 @@ def _test_run_obj_equals(self, run, run_prime):
4970
if other is not None:
5071
self.assertDictEqual(other, dict())
5172
self.assertEqual(run._to_xml(), run_prime._to_xml())
73+
self._test_prediction_data_equal(run, run_prime)
5274

53-
numeric_part = np.array(np.array(run.data_content)[:, 0:-2], dtype=float)
54-
numeric_part_prime = np.array(np.array(run_prime.data_content)[:, 0:-2], dtype=float)
55-
string_part = np.array(run.data_content)[:, -2:]
56-
string_part_prime = np.array(run_prime.data_content)[:, -2:]
57-
np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
58-
np.testing.assert_array_equal(string_part, string_part_prime)
59-
75+
# Test trace
6076
if run.trace is not None:
6177
run_trace_content = run.trace.trace_to_arff()["data"]
6278
else:
@@ -192,6 +208,73 @@ def test_to_from_filesystem_no_model(self):
192208
with self.assertRaises(ValueError, msg="Could not find model.pkl"):
193209
openml.runs.OpenMLRun.from_filesystem(cache_path)
194210

211+
@staticmethod
212+
def _get_models_tasks_for_tests():
213+
model_clf = Pipeline(
214+
[
215+
("imputer", SimpleImputer(strategy="mean")),
216+
("classifier", DummyClassifier(strategy="prior")),
217+
]
218+
)
219+
model_reg = Pipeline(
220+
[
221+
("imputer", SimpleImputer(strategy="mean")),
222+
(
223+
"regressor",
224+
# LR because dummy does not produce enough float-like values
225+
LinearRegression(),
226+
),
227+
]
228+
)
229+
230+
task_clf = openml.tasks.get_task(119) # diabetes; hold out validation
231+
task_reg = openml.tasks.get_task(733) # quake; crossvalidation
232+
233+
return [(model_clf, task_clf), (model_reg, task_reg)]
234+
235+
@staticmethod
236+
def assert_run_prediction_data(task, run, model):
237+
# -- Get y_pred and y_true as it should be stored in the run
238+
n_repeats, n_folds, n_samples = task.get_split_dimensions()
239+
if (n_repeats > 1) or (n_samples > 1):
240+
raise ValueError("Test does not support this task type's split dimensions.")
241+
242+
X, y = task.get_X_and_y()
243+
244+
# Check correctness of y_true and y_pred in run
245+
for fold_id in range(n_folds):
246+
# Get data for fold
247+
_, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
248+
train_mask = np.full(len(X), True)
249+
train_mask[test_indices] = False
250+
251+
# Get train / test
252+
X_train = X[train_mask]
253+
y_train = y[train_mask]
254+
X_test = X[~train_mask]
255+
y_test = y[~train_mask]
256+
257+
# Get y_pred
258+
y_pred = model.fit(X_train, y_train).predict(X_test)
259+
260+
# Get stored data for fold
261+
saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
262+
by="row_id"
263+
)
264+
saved_y_pred = saved_fold_data["prediction"].values
265+
gt_key = "truth" if "truth" in list(saved_fold_data) else "correct"
266+
saved_y_test = saved_fold_data[gt_key].values
267+
268+
assert_method = np.testing.assert_array_almost_equal
269+
if task.task_type == "Supervised Classification":
270+
y_pred = np.take(task.class_labels, y_pred)
271+
y_test = np.take(task.class_labels, y_test)
272+
assert_method = np.testing.assert_array_equal
273+
274+
# Assert correctness
275+
assert_method(y_pred, saved_y_pred)
276+
assert_method(y_test, saved_y_test)
277+
195278
@pytest.mark.sklearn
196279
def test_publish_with_local_loaded_flow(self):
197280
"""
@@ -200,40 +283,85 @@ def test_publish_with_local_loaded_flow(self):
200283
"""
201284
extension = openml.extensions.sklearn.SklearnExtension()
202285

203-
model = Pipeline(
204-
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
205-
)
206-
task = openml.tasks.get_task(119) # diabetes; crossvalidation
286+
for model, task in self._get_models_tasks_for_tests():
287+
# Make sure the flow does not exist on the server yet.
288+
flow = extension.model_to_flow(model)
289+
self._add_sentinel_to_flow_name(flow)
290+
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
291+
292+
run = openml.runs.run_flow_on_task(
293+
flow=flow,
294+
task=task,
295+
add_local_measures=False,
296+
avoid_duplicate_runs=False,
297+
upload_flow=False,
298+
)
207299

208-
# Make sure the flow does not exist on the server yet.
209-
flow = extension.model_to_flow(model)
210-
self._add_sentinel_to_flow_name(flow)
211-
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
300+
# Make sure that the flow has not been uploaded as requested.
301+
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
212302

213-
run = openml.runs.run_flow_on_task(
214-
flow=flow,
215-
task=task,
216-
add_local_measures=False,
217-
avoid_duplicate_runs=False,
218-
upload_flow=False,
219-
)
303+
# Make sure that the prediction data stored in the run is correct.
304+
self.assert_run_prediction_data(task, run, clone(model))
220305

221-
# Make sure that the flow has not been uploaded as requested.
222-
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
306+
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
307+
run.to_filesystem(cache_path)
308+
# obtain run from filesystem
309+
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
310+
loaded_run.publish()
223311

224-
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
225-
run.to_filesystem(cache_path)
226-
# obtain run from filesystem
227-
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
228-
loaded_run.publish()
229-
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
230-
TestBase.logger.info(
231-
"collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
232-
)
312+
# Clean up
313+
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
314+
TestBase.logger.info(
315+
"collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
316+
)
317+
318+
# make sure the flow is published as part of publishing the run.
319+
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
320+
openml.runs.get_run(loaded_run.run_id)
321+
322+
@pytest.mark.sklearn
323+
def test_offline_and_online_run_identical(self):
324+
325+
extension = openml.extensions.sklearn.SklearnExtension()
326+
327+
for model, task in self._get_models_tasks_for_tests():
328+
# Make sure the flow does not exist on the server yet.
329+
flow = extension.model_to_flow(model)
330+
self._add_sentinel_to_flow_name(flow)
331+
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
332+
333+
run = openml.runs.run_flow_on_task(
334+
flow=flow,
335+
task=task,
336+
add_local_measures=False,
337+
avoid_duplicate_runs=False,
338+
upload_flow=False,
339+
)
233340

234-
# make sure the flow is published as part of publishing the run.
235-
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
236-
openml.runs.get_run(loaded_run.run_id)
341+
# Make sure that the flow has not been uploaded as requested.
342+
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
343+
344+
# Load from filesystem
345+
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
346+
run.to_filesystem(cache_path)
347+
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
348+
349+
# Assert identical for offline - offline
350+
self._test_run_obj_equals(run, loaded_run)
351+
352+
# Publish and test for offline - online
353+
run.publish()
354+
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
355+
356+
try:
357+
online_run = openml.runs.get_run(run.run_id, ignore_cache=True)
358+
self._test_prediction_data_equal(run, online_run)
359+
finally:
360+
# Clean up
361+
TestBase._mark_entity_for_removal("run", run.run_id)
362+
TestBase.logger.info(
363+
"collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
364+
)
237365

238366
def test_run_setup_string_included_in_xml(self):
239367
SETUP_STRING = "setup-string"

tests/test_runs/test_run_functions.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1308,10 +1308,11 @@ def test__run_task_get_arffcontent(self):
13081308
# check row id
13091309
self.assertGreaterEqual(arff_line[2], 0)
13101310
self.assertLessEqual(arff_line[2], num_instances - 1)
1311+
# check prediction and ground truth columns
1312+
self.assertIn(arff_line[4], ["won", "nowin"])
1313+
self.assertIn(arff_line[5], ["won", "nowin"])
13111314
# check confidences
1312-
self.assertAlmostEqual(sum(arff_line[4:6]), 1.0)
1313-
self.assertIn(arff_line[6], ["won", "nowin"])
1314-
self.assertIn(arff_line[7], ["won", "nowin"])
1315+
self.assertAlmostEqual(sum(arff_line[6:]), 1.0)
13151316

13161317
def test__create_trace_from_arff(self):
13171318
with open(self.static_cache_dir + "/misc/trace.arff", "r") as arff_file:

0 commit comments

Comments
 (0)