Commit bd8ae14
Fix 1013: Store run setup_string (#1015)
* Test setup_string is stored and retrievable
* Add setup_string to run dictionary representation
* Add fix to release notes
* Test setup_string in xml without roundtrip.
  Also moved the test to OpenMLRun, since it mainly tests the OpenMLRun behavior, not a function from openml.runs.functions.
* Serialize run_details
* Update with merged PRs since 0.11.0
* Prepare for run_details being provided by the server
* Remove pipeline code from setup_string.
  Long pipelines (e.g. grid searches) could lead to setup strings that were too long, which prevented run uploads.
  Also add mypy ignores for old errors which weren't yet vetted by mypy.
1 parent: f94672e

File tree

5 files changed: +70 -16 lines changed
doc/progress.rst (26 additions, 5 deletions)

@@ -8,12 +8,33 @@ Changelog

 0.11.1
 ~~~~~~
-* MAINT #1018: Refactor data loading and storage. Data is now compressed on the first call to `get_data`.
-* MAINT #891: Changed the way that numerical features are stored. Numerical features that range from 0 to 255 are now stored as uint8, which reduces the storage space required as well as storing and loading times.
-* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
-* FIX #964: Validate ``ignore_attribute``, ``default_target_attribute``, ``row_id_attribute`` are set to attributes that exist on the dataset when calling ``create_dataset``.
-* DOC #973: Change the task used in the welcome page example so it no longer fails using a numerical dataset.
+* ADD #964: Validate ``ignore_attribute``, ``default_target_attribute``, ``row_id_attribute`` are set to attributes that exist on the dataset when calling ``create_dataset``.
+* ADD #979: Dataset features and qualities are now also cached in pickle format.
+* ADD #982: Add helper functions for column transformers.
+* ADD #989: ``run_model_on_task`` will now warn the user that the model passed has already been fitted.
 * ADD #1009: Give possibility to not download the dataset qualities. The cached version is used even if the download attribute is false.
+* ADD #1016: Add scikit-learn 0.24 support.
+* ADD #1020: Add option to parallelize evaluation of tasks with joblib.
+* ADD #1022: Allow minimum version of dependencies to be listed for a flow, use more accurate minimum versions for scikit-learn dependencies.
+* ADD #1023: Add admin-only calls for adding topics to datasets.
+* ADD #1029: Add support for fetching dataset from a minio server in parquet format.
+* ADD #1031: Generally improve runtime measurements, add them for some previously unsupported flows (e.g. BaseSearchCV derived flows).
+* DOC #973: Change the task used in the welcome page example so it no longer fails using a numerical dataset.
+* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
+* MAINT #891: Changed the way that numerical features are stored. Numerical features that range from 0 to 255 are now stored as uint8, which reduces the storage space required as well as storing and loading times.
+* MAINT #975, #988: Add CI through GitHub Actions.
+* MAINT #977: Allow ``short`` and ``long`` scenarios for unit tests. Reduce the workload for some unit tests.
+* MAINT #985, #1000: Improve unit test stability and output readability, and add load balancing.
+* MAINT #1018: Refactor data loading and storage. Data is now compressed on the first call to ``get_data``.
+* MAINT #1024: Remove flaky decorator for study unit test.
+* FIX #883 #884 #906 #972: Various improvements to the caching system.
+* FIX #980: Speed up ``check_datasets_active``.
+* FIX #984: Add a retry mechanism when the server encounters a database issue.
+* FIX #1004: Fixed an issue that prevented installation on some systems (e.g. Ubuntu).
+* FIX #1013: Fixes a bug where ``OpenMLRun.setup_string`` was not uploaded to the server; prepares for ``run_details`` being sent from the server.
+* FIX #1021: Fixes an issue that could occur when running unit tests and openml-python was not in PATH.
+* FIX #1037: Fixes a bug where a dataset could not be loaded if a categorical value had listed nan-like as a possible category.
+
 0.11.0
 ~~~~~~
 * ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.

openml/extensions/sklearn/extension.py (11 additions, 9 deletions)

@@ -52,7 +52,10 @@

 SIMPLE_NUMPY_TYPES = [
-    nptype for type_cat, nptypes in np.sctypes.items() for nptype in nptypes if type_cat != "others"
+    nptype
+    for type_cat, nptypes in np.sctypes.items()
+    for nptype in nptypes  # type: ignore
+    if type_cat != "others"
 ]
 SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES)

@@ -546,7 +549,7 @@ def get_version_information(self) -> List[str]:
         major, minor, micro, _, _ = sys.version_info
         python_version = "Python_{}.".format(".".join([str(major), str(minor), str(micro)]))
         sklearn_version = "Sklearn_{}.".format(sklearn.__version__)
-        numpy_version = "NumPy_{}.".format(numpy.__version__)
+        numpy_version = "NumPy_{}.".format(numpy.__version__)  # type: ignore
         scipy_version = "SciPy_{}.".format(scipy.__version__)

         return [python_version, sklearn_version, numpy_version, scipy_version]

@@ -563,8 +566,7 @@ def create_setup_string(self, model: Any) -> str:
         str
         """
         run_environment = " ".join(self.get_version_information())
-        # fixme str(model) might contain (...)
-        return run_environment + " " + str(model)
+        return run_environment

     def _is_cross_validator(self, o: Any) -> bool:
         return isinstance(o, sklearn.model_selection.BaseCrossValidator)

@@ -1237,11 +1239,11 @@ def _check_dependencies(self, dependencies: str, strict_version: bool = True) ->
     def _serialize_type(self, o: Any) -> "OrderedDict[str, str]":
         mapping = {
             float: "float",
-            np.float: "np.float",
+            np.float: "np.float",  # type: ignore
             np.float32: "np.float32",
             np.float64: "np.float64",
             int: "int",
-            np.int: "np.int",
+            np.int: "np.int",  # type: ignore
             np.int32: "np.int32",
             np.int64: "np.int64",
         }

@@ -1253,11 +1255,11 @@ def _serialize_type(self, o: Any) -> "OrderedDict[str, str]":
     def _deserialize_type(self, o: str) -> Any:
         mapping = {
             "float": float,
-            "np.float": np.float,
+            "np.float": np.float,  # type: ignore
             "np.float32": np.float32,
             "np.float64": np.float64,
             "int": int,
-            "np.int": np.int,
+            "np.int": np.int,  # type: ignore
             "np.int32": np.int32,
             "np.int64": np.int64,
         }

@@ -1675,7 +1677,7 @@ def _run_model_on_fold(
         """

         def _prediction_to_probabilities(
-            y: np.ndarray, model_classes: List[Any], class_labels: Optional[List[str]]
+            y: Union[np.ndarray, List], model_classes: List[Any], class_labels: Optional[List[str]]
         ) -> pd.DataFrame:
             """Transforms predicted probabilities to match with OpenML class indices.
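The `create_setup_string` change above stops appending `str(model)` to the setup string, because the repr of a large pipeline (e.g. a grid search over many parameters) could exceed the length the server accepts and block the run upload. A minimal self-contained sketch of the before/after behavior (the helper names and hard-coded version strings below are stand-ins, not the library's actual code):

```python
# Hedged sketch: why dropping str(model) bounds the setup string length.
# All names here are illustrative stand-ins, not openml-python APIs.

def get_version_information():
    # Stand-in for the extension method that reports Python, sklearn,
    # NumPy and SciPy versions (hard-coded example values).
    return ["Python_3.8.0.", "Sklearn_0.24.0.", "NumPy_1.19.0.", "SciPy_1.5.0."]

def create_setup_string_old(model_repr: str) -> str:
    # Old behavior: version info plus str(model) -- unbounded length.
    return " ".join(get_version_information()) + " " + model_repr

def create_setup_string_new() -> str:
    # New behavior: version info only, so the length stays small.
    return " ".join(get_version_information())

# A grid search over many parameters can have a huge repr:
huge_model_repr = "GridSearchCV(" + "param," * 10000 + ")"
print(len(create_setup_string_old(huge_model_repr)))  # tens of thousands of characters
print(len(create_setup_string_new()))                 # a short, bounded string
```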

openml/runs/functions.py (5 additions, 0 deletions)

@@ -805,6 +805,9 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
     flow_name = obtain_field(run, "oml:flow_name", from_server)
     setup_id = obtain_field(run, "oml:setup_id", from_server, cast=int)
     setup_string = obtain_field(run, "oml:setup_string", from_server)
+    # run_details is currently not sent by the server, so we need to retrieve it safely.
+    # Whenever that's resolved, we can enforce it being present (OpenML#1087).
+    run_details = obtain_field(run, "oml:run_details", from_server=False)

     if "oml:input_data" in run:
         dataset_id = int(run["oml:input_data"]["oml:dataset"]["oml:did"])

@@ -827,6 +830,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
     if "oml:output_data" not in run:
         if from_server:
             raise ValueError("Run does not contain output_data (OpenML server error?)")
+        predictions_url = None
     else:
         output_data = run["oml:output_data"]
         predictions_url = None

@@ -911,6 +915,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
         sample_evaluations=sample_evaluations,
         tags=tags,
         predictions_url=predictions_url,
+        run_details=run_details,
     )
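The new `run_details` lookup passes `from_server=False` so the absent field yields `None` instead of an error. A simplified sketch of the `obtain_field` contract this relies on (assuming, as the surrounding code suggests, that a mandatory server field raises when missing; this is an illustration, not the exact library implementation):

```python
# Simplified stand-in for obtain_field: a field is mandatory when it
# comes from the server, optional otherwise.
def obtain_field(xml_obj, fieldname, from_server, cast=None):
    if fieldname in xml_obj:
        value = xml_obj[fieldname]
        return cast(value) if cast is not None else value
    if from_server:
        raise ValueError("Run XML is missing required field %s." % fieldname)
    return None

run = {"oml:setup_id": "7", "oml:setup_string": "Python_3.8.0. Sklearn_0.24.0."}
print(obtain_field(run, "oml:setup_id", True, cast=int))        # 7
print(obtain_field(run, "oml:run_details", from_server=False))  # None
```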

openml/runs/run.py (10 additions, 2 deletions)

@@ -57,7 +57,9 @@ class OpenMLRun(OpenMLBase):
     run_id: int
     description_text: str, optional
         Description text to add to the predictions file.
-        If left None,
+        If left None, is set to the time the arff file is generated.
+    run_details: str, optional (default=None)
+        Description of the run stored in the run meta-data.
     """

     def __init__(

@@ -86,6 +88,7 @@ def __init__(
         flow=None,
         run_id=None,
         description_text=None,
+        run_details=None,
     ):
         self.uploader = uploader
         self.uploader_name = uploader_name

@@ -112,6 +115,7 @@ def __init__(
         self.tags = tags
         self.predictions_url = predictions_url
         self.description_text = description_text
+        self.run_details = run_details

     @property
     def id(self) -> Optional[int]:

@@ -543,11 +547,15 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
         description["oml:run"]["@xmlns:oml"] = "http://openml.org/openml"
         description["oml:run"]["oml:task_id"] = self.task_id
         description["oml:run"]["oml:flow_id"] = self.flow_id
+        if self.setup_string is not None:
+            description["oml:run"]["oml:setup_string"] = self.setup_string
         if self.error_message is not None:
             description["oml:run"]["oml:error_message"] = self.error_message
+        if self.run_details is not None:
+            description["oml:run"]["oml:run_details"] = self.run_details
         description["oml:run"]["oml:parameter_setting"] = self.parameter_settings
         if self.tags is not None:
-            description["oml:run"]["oml:tag"] = self.tags  # Tags describing the run
+            description["oml:run"]["oml:tag"] = self.tags
         if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or (
             self.sample_evaluations is not None and len(self.sample_evaluations) > 0
         ):

tests/test_runs/test_run.py (18 additions, 0 deletions)

@@ -5,11 +5,13 @@
 import os
 from time import time

+import xmltodict
 from sklearn.dummy import DummyClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline

+from openml import OpenMLRun
 from openml.testing import TestBase, SimpleImputer
 import openml
 import openml.extensions.sklearn

@@ -215,3 +217,19 @@ def test_publish_with_local_loaded_flow(self):
         # make sure the flow is published as part of publishing the run.
         self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
         openml.runs.get_run(loaded_run.run_id)
+
+    def test_run_setup_string_included_in_xml(self):
+        SETUP_STRING = "setup-string"
+        run = OpenMLRun(
+            task_id=0,
+            flow_id=None,  # if not None, flow parameters are required.
+            dataset_id=0,
+            setup_string=SETUP_STRING,
+        )
+        xml = run._to_xml()
+        run_dict = xmltodict.parse(xml)["oml:run"]
+        assert "oml:setup_string" in run_dict
+        assert run_dict["oml:setup_string"] == SETUP_STRING
+
+        recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False)
+        assert recreated_run.setup_string == SETUP_STRING
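The new test depends on `xmltodict` and the `OpenMLRun` internals. The same check, that the generated run XML carries the setup string, can be sketched with only the standard library (a hypothetical stand-in with a hand-built XML snippet, not the project's test):

```python
import xml.etree.ElementTree as ET

NS = "http://openml.org/openml"
SETUP_STRING = "setup-string"

# A hand-built run description in the shape the run XML uses
# (abbreviated; only the fields needed for this check).
xml = (
    '<oml:run xmlns:oml="%s">' % NS
    + "<oml:task_id>0</oml:task_id>"
    + "<oml:setup_string>%s</oml:setup_string>" % SETUP_STRING
    + "</oml:run>"
)

root = ET.fromstring(xml)
# ElementTree expands the oml: prefix to the full namespace URI.
parsed = root.find("{%s}setup_string" % NS).text
print(parsed)  # setup-string
```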

0 commit comments