Fix: correctly order the ground truth and prediction for ARFF files in run.data_content (#1209)

LennartPurucker · PGijsbers · web-flow · commit bbf09b344d53 · 2023-02-24T09:48:10.000+01:00
* add test and fix for switch of ground truth and predictions * undo import optimization * fix bug with model passing to function * fix order in other tests * update progress.rst * new unit test for run consistency and bug fixed * clarify new assert * minor loop refactor * refactor default to None * directly test prediction data equal * Update tests/test_runs/test_run.py Co-authored-by: Pieter Gijsbers <p.gijsbers@tue.nl> * Mark sklearn tests (#1202) * Add sklearn marker * Mark tests that use scikit-learn * Only run scikit-learn tests multiple times The generic tests that don't use scikit-learn should only be tested once (per platform). * Rename for correct variable * Add sklearn mark for filesystem test * Remove quotes around sklearn * Instead include sklearn in the matrix definition * Update jobnames * Add explicit false to jobname * Remove space * Add function inside of expression? * Do string testing instead * Add missing ${{ * Add explicit true to old sklearn tests * Add instruction to add pytest marker for sklearn tests * add test and fix for switch of ground truth and predictions * undo import optimization * fix mask error resulting from rebase * make dummy classifier strategy consistent to avoid problems as a result of the random state problems for sklearn < 0.24 --------- Co-authored-by: Pieter Gijsbers <p.gijsbers@tue.nl>
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -9,9 +9,9 @@ Changelog
 0.13.1
 ~~~~~~
 
+ * FIX #1197 #559 #1131: Fix the order of ground truth and predictions in the ``OpenMLRun`` object and in ``format_prediction``.
  * FIX #1198: Support numpy 1.24 and higher.
 
-
 0.13.0
 ~~~~~~
 
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -155,7 +155,6 @@ def run_flow_on_task(
     dataset_format: str = "dataframe",
     n_jobs: Optional[int] = None,
 ) -> OpenMLRun:
-
     """Run the model provided by the flow on the dataset defined by task.
 
     Takes the flow and repeat information into account.
@@ -515,13 +514,13 @@ def _calculate_local_measure(sklearn_fn, openml_name):
                         else pred_y[i]
                     )
                     if isinstance(test_y, pd.Series):
-                        test_prediction = (
+                        truth = (
                             task.class_labels[test_y.iloc[i]]
                             if isinstance(test_y.iloc[i], int)
                             else test_y.iloc[i]
                         )
                     else:
-                        test_prediction = (
+                        truth = (
                             task.class_labels[test_y[i]]
                             if isinstance(test_y[i], (int, np.integer))
                             else test_y[i]
@@ -535,7 +534,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
                         sample=sample_no,
                         index=tst_idx,
                         prediction=prediction,
-                        truth=test_prediction,
+                        truth=truth,
                         proba=dict(zip(task.class_labels, pred_prob)),
                     )
                 else:
@@ -552,14 +551,14 @@ def _calculate_local_measure(sklearn_fn, openml_name):
         elif isinstance(task, OpenMLRegressionTask):
 
             for i, _ in enumerate(test_indices):
-                test_prediction = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
+                truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
                 arff_line = format_prediction(
                     task=task,
                     repeat=rep_no,
                     fold=fold_no,
                     index=test_indices[i],
                     prediction=pred_y[i],
-                    truth=test_prediction,
+                    truth=truth,
                 )
 
                 arff_datacontent.append(arff_line)
@@ -920,9 +919,10 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
         parameter_settings=parameters,
         dataset_id=dataset_id,
         output_files=files,
-        evaluations=evaluations,
-        fold_evaluations=fold_evaluations,
-        sample_evaluations=sample_evaluations,
+        # Make sure default values are used where needed to keep run objects identical
+        evaluations=evaluations or None,
+        fold_evaluations=fold_evaluations or None,
+        sample_evaluations=sample_evaluations or None,
         tags=tags,
         predictions_url=predictions_url,
         run_details=run_details,
@@ -1186,6 +1186,10 @@ def format_prediction(
     -------
     A list with elements for the prediction results of a run.
 
+    The returned order of the elements is (if available):
+        [repeat, fold, sample, index, prediction, truth, *probabilities]
+
+    This order follows the R Client API.
     """
     if isinstance(task, OpenMLClassificationTask):
         if proba is None:
@@ -1200,8 +1204,8 @@ def format_prediction(
             else:
                 sample = 0
         probabilities = [proba[c] for c in task.class_labels]
-        return [repeat, fold, sample, index, *probabilities, truth, prediction]
+        return [repeat, fold, sample, index, prediction, truth, *probabilities]
     elif isinstance(task, OpenMLRegressionTask):
-        return [repeat, fold, index, truth, prediction]
+        return [repeat, fold, index, prediction, truth]
     else:
         raise NotImplementedError(f"Formatting for {type(task)} is not supported.")
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -304,6 +304,8 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
 
         Assumes that the run has been executed.
 
+        The order of the attributes follows the order defined by the Client API for R.
+
         Returns
         -------
         arf_dict : dict
@@ -337,11 +339,11 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
             if class_labels is not None:
                 arff_dict["attributes"] = (
                     arff_dict["attributes"]
+                    + [("prediction", class_labels), ("correct", class_labels)]
                     + [
                         ("confidence." + class_labels[i], "NUMERIC")
                         for i in range(len(class_labels))
                     ]
-                    + [("prediction", class_labels), ("correct", class_labels)]
                 )
             else:
                 raise ValueError("The task has no class labels")
@@ -362,7 +364,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
                 ]
                 prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
                 arff_dict["attributes"] = (
-                    arff_dict["attributes"] + prediction_confidences + prediction_and_true
+                    arff_dict["attributes"] + prediction_and_true + prediction_confidences
                 )
             else:
                 raise ValueError("The task has no class labels")
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
@@ -7,9 +7,11 @@
 
 import xmltodict
 from sklearn.dummy import DummyClassifier
+from sklearn.linear_model import LinearRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
+from sklearn.base import clone
 
 from openml import OpenMLRun
 from openml.testing import TestBase, SimpleImputer
@@ -39,6 +41,25 @@ def test_tagging(self):
         run_list = openml.runs.list_runs(tag=tag)
         self.assertEqual(len(run_list), 0)
 
+    @staticmethod
+    def _test_prediction_data_equal(run, run_prime):
+        # Determine which attributes are numeric and which not
+        num_cols = np.array(
+            [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]]
+        )
+        # Get run data consistently
+        #   (For run from server, .data_content does not exist)
+        run_data_content = run.predictions.values
+        run_prime_data_content = run_prime.predictions.values
+
+        # Assert numeric and string parts separately
+        numeric_part = np.array(run_data_content[:, num_cols], dtype=float)
+        numeric_part_prime = np.array(run_prime_data_content[:, num_cols], dtype=float)
+        string_part = run_data_content[:, ~num_cols]
+        string_part_prime = run_prime_data_content[:, ~num_cols]
+        np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
+        np.testing.assert_array_equal(string_part, string_part_prime)
+
     def _test_run_obj_equals(self, run, run_prime):
         for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
             if getattr(run, dictionary) is not None:
@@ -49,14 +70,9 @@ def _test_run_obj_equals(self, run, run_prime):
                 if other is not None:
                     self.assertDictEqual(other, dict())
         self.assertEqual(run._to_xml(), run_prime._to_xml())
+        self._test_prediction_data_equal(run, run_prime)
 
-        numeric_part = np.array(np.array(run.data_content)[:, 0:-2], dtype=float)
-        numeric_part_prime = np.array(np.array(run_prime.data_content)[:, 0:-2], dtype=float)
-        string_part = np.array(run.data_content)[:, -2:]
-        string_part_prime = np.array(run_prime.data_content)[:, -2:]
-        np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
-        np.testing.assert_array_equal(string_part, string_part_prime)
-
+        # Test trace
         if run.trace is not None:
             run_trace_content = run.trace.trace_to_arff()["data"]
         else:
@@ -192,6 +208,73 @@ def test_to_from_filesystem_no_model(self):
         with self.assertRaises(ValueError, msg="Could not find model.pkl"):
             openml.runs.OpenMLRun.from_filesystem(cache_path)
 
+    @staticmethod
+    def _get_models_tasks_for_tests():
+        model_clf = Pipeline(
+            [
+                ("imputer", SimpleImputer(strategy="mean")),
+                ("classifier", DummyClassifier(strategy="prior")),
+            ]
+        )
+        model_reg = Pipeline(
+            [
+                ("imputer", SimpleImputer(strategy="mean")),
+                (
+                    "regressor",
+                    # LR because dummy does not produce enough float-like values
+                    LinearRegression(),
+                ),
+            ]
+        )
+
+        task_clf = openml.tasks.get_task(119)  # diabetes; hold out validation
+        task_reg = openml.tasks.get_task(733)  # quake; crossvalidation
+
+        return [(model_clf, task_clf), (model_reg, task_reg)]
+
+    @staticmethod
+    def assert_run_prediction_data(task, run, model):
+        # -- Get y_pred and y_true as it should be stored in the run
+        n_repeats, n_folds, n_samples = task.get_split_dimensions()
+        if (n_repeats > 1) or (n_samples > 1):
+            raise ValueError("Test does not support this task type's split dimensions.")
+
+        X, y = task.get_X_and_y()
+
+        # Check correctness of y_true and y_pred in run
+        for fold_id in range(n_folds):
+            # Get data for fold
+            _, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
+            train_mask = np.full(len(X), True)
+            train_mask[test_indices] = False
+
+            # Get train / test
+            X_train = X[train_mask]
+            y_train = y[train_mask]
+            X_test = X[~train_mask]
+            y_test = y[~train_mask]
+
+            # Get y_pred
+            y_pred = model.fit(X_train, y_train).predict(X_test)
+
+            # Get stored data for fold
+            saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
+                by="row_id"
+            )
+            saved_y_pred = saved_fold_data["prediction"].values
+            gt_key = "truth" if "truth" in list(saved_fold_data) else "correct"
+            saved_y_test = saved_fold_data[gt_key].values
+
+            assert_method = np.testing.assert_array_almost_equal
+            if task.task_type == "Supervised Classification":
+                y_pred = np.take(task.class_labels, y_pred)
+                y_test = np.take(task.class_labels, y_test)
+                assert_method = np.testing.assert_array_equal
+
+            # Assert correctness
+            assert_method(y_pred, saved_y_pred)
+            assert_method(y_test, saved_y_test)
+
     @pytest.mark.sklearn
     def test_publish_with_local_loaded_flow(self):
         """
@@ -200,40 +283,85 @@ def test_publish_with_local_loaded_flow(self):
         """
         extension = openml.extensions.sklearn.SklearnExtension()
 
-        model = Pipeline(
-            [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
-        )
-        task = openml.tasks.get_task(119)  # diabetes; crossvalidation
+        for model, task in self._get_models_tasks_for_tests():
+            # Make sure the flow does not exist on the server yet.
+            flow = extension.model_to_flow(model)
+            self._add_sentinel_to_flow_name(flow)
+            self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+
+            run = openml.runs.run_flow_on_task(
+                flow=flow,
+                task=task,
+                add_local_measures=False,
+                avoid_duplicate_runs=False,
+                upload_flow=False,
+            )
 
-        # Make sure the flow does not exist on the server yet.
-        flow = extension.model_to_flow(model)
-        self._add_sentinel_to_flow_name(flow)
-        self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+            # Make sure that the flow has not been uploaded as requested.
+            self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
 
-        run = openml.runs.run_flow_on_task(
-            flow=flow,
-            task=task,
-            add_local_measures=False,
-            avoid_duplicate_runs=False,
-            upload_flow=False,
-        )
+            # Make sure that the prediction data stored in the run is correct.
+            self.assert_run_prediction_data(task, run, clone(model))
 
-        # Make sure that the flow has not been uploaded as requested.
-        self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+            cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+            run.to_filesystem(cache_path)
+            # obtain run from filesystem
+            loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
+            loaded_run.publish()
 
-        cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
-        run.to_filesystem(cache_path)
-        # obtain run from filesystem
-        loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
-        loaded_run.publish()
-        TestBase._mark_entity_for_removal("run", loaded_run.run_id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
-        )
+            # Clean up
+            TestBase._mark_entity_for_removal("run", loaded_run.run_id)
+            TestBase.logger.info(
+                "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
+            )
+
+            # make sure the flow is published as part of publishing the run.
+            self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
+            openml.runs.get_run(loaded_run.run_id)
+
+    @pytest.mark.sklearn
+    def test_offline_and_online_run_identical(self):
+
+        extension = openml.extensions.sklearn.SklearnExtension()
+
+        for model, task in self._get_models_tasks_for_tests():
+            # Make sure the flow does not exist on the server yet.
+            flow = extension.model_to_flow(model)
+            self._add_sentinel_to_flow_name(flow)
+            self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+
+            run = openml.runs.run_flow_on_task(
+                flow=flow,
+                task=task,
+                add_local_measures=False,
+                avoid_duplicate_runs=False,
+                upload_flow=False,
+            )
 
-        # make sure the flow is published as part of publishing the run.
-        self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
-        openml.runs.get_run(loaded_run.run_id)
+            # Make sure that the flow has not been uploaded as requested.
+            self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+
+            # Load from filesystem
+            cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+            run.to_filesystem(cache_path)
+            loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
+
+            # Assert identical for offline - offline
+            self._test_run_obj_equals(run, loaded_run)
+
+            # Publish and test for offline - online
+            run.publish()
+            self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
+
+            try:
+                online_run = openml.runs.get_run(run.run_id, ignore_cache=True)
+                self._test_prediction_data_equal(run, online_run)
+            finally:
+                # Clean up
+                TestBase._mark_entity_for_removal("run", run.run_id)
+                TestBase.logger.info(
+                    "collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
+                )
 
     def test_run_setup_string_included_in_xml(self):
         SETUP_STRING = "setup-string"
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -1308,10 +1308,11 @@ def test__run_task_get_arffcontent(self):
             # check row id
             self.assertGreaterEqual(arff_line[2], 0)
             self.assertLessEqual(arff_line[2], num_instances - 1)
+            # check prediction and ground truth columns
+            self.assertIn(arff_line[4], ["won", "nowin"])
+            self.assertIn(arff_line[5], ["won", "nowin"])
             # check confidences
-            self.assertAlmostEqual(sum(arff_line[4:6]), 1.0)
-            self.assertIn(arff_line[6], ["won", "nowin"])
-            self.assertIn(arff_line[7], ["won", "nowin"])
+            self.assertAlmostEqual(sum(arff_line[6:]), 1.0)
 
     def test__create_trace_from_arff(self):
         with open(self.static_cache_dir + "/misc/trace.arff", "r") as arff_file: