Merge pull request #117 from dataiku/enhancement/dss90-better-suport-custom-plugin-algos

nicolasservel · web-flow · commit 3e60674637df · 2021-01-22T15:35:11.000+01:00
Better support custom & plugin algos
diff --git a/dataikuapi/dss/analysis.py b/dataikuapi/dss/analysis.py
@@ -188,8 +188,9 @@ def create_prediction_ml_task(self,
         return mltask
 
     def create_clustering_ml_task(self,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
 
 
         """Creates a new clustering task in a new visual analysis lab
@@ -205,6 +206,10 @@ def create_clustering_ml_task(self,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -214,7 +219,11 @@ def create_clustering_ml_task(self,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
-        return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """
diff --git a/dataikuapi/dss/dataset.py b/dataikuapi/dss/dataset.py
@@ -385,8 +385,9 @@ def create_prediction_ml_task(self, target_variable,
              guess_policy = guess_policy, prediction_type = prediction_type, wait_guess_complete = wait_guess_complete)
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
 
@@ -400,9 +401,13 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
-        return self.project.create_clustering_ml_task(self.dataset_name, 
-            ml_backend_type = ml_backend_type, guess_policy = guess_policy)
+        return self.project.create_clustering_ml_task(self.dataset_name, ml_backend_type=ml_backend_type, guess_policy=guess_policy,
+                                                      wait_guess_complete=wait_guess_complete)
 
     def create_analysis(self):
         """
diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
@@ -238,6 +238,16 @@ def use_feature(self, feature_name):
     def get_algorithm_settings(self, algorithm_name):
         raise NotImplementedError()
 
+    def _get_custom_algorithm_settings(self, algorithm_name):
+        # returns the first algorithm with this name
+        for algo in self.mltask_settings["modeling"]["custom_mllib"]:
+            if algorithm_name == algo["name"]:
+                return algo
+        for algo in self.mltask_settings["modeling"]["custom_python"]:
+            if algorithm_name == algo["name"]:
+                return algo
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_diagnostics_settings(self):
         """
         Gets the diagnostics settings for a mltask. This returns a reference to the
@@ -307,31 +317,38 @@ def disable_all_algorithms(self):
             custom_mllib["enabled"] = False
         for custom_python in self.mltask_settings["modeling"]["custom_python"]:
             custom_python["enabled"] = False
-        for plugin in self.mltask_settings["modeling"]["plugin_python"].values():
+        for plugin in self.mltask_settings["modeling"].get("plugin_python", {}).values():
             plugin["enabled"] = False
 
     def get_all_possible_algorithm_names(self):
         """
         Returns the list of possible algorithm names, i.e. the list of valid
         identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
 
-        This does not include Custom Python models, Custom MLLib models, plugin models.
         This includes all possible algorithms, regardless of the prediction kind (regression/classification)
         or engine, so some algorithms may be irrelevant
 
         :returns: the list of algorithm names as a list of strings
         :rtype: list of string
         """
-        return list(self.__class__.algorithm_remap.keys())
+        return list(self.__class__.algorithm_remap.keys()) + self._get_custom_algorithm_names()
+
+    def _get_custom_algorithm_names(self):
+        """
+        Returns the list of names of defined custom models (Python & MLlib backends)
+
+        :returns: the list of custom models names
+        :rtype: list of string
+        """
+        return ([algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]
+                + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]])
 
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
-        algo_names = [algo_name for algo_name in algos.keys() if self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]]
-        return algo_names
+        return [algo_name for algo_name in self.get_all_possible_algorithm_names() if self.get_algorithm_settings(algo_name).get("enabled", False)]
 
     def get_enabled_algorithm_settings(self):
         """
@@ -356,6 +373,32 @@ def set_metric(self, metric=None, custom_metric=None, custom_metric_greater_is_b
         self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricGIB"] = custom_metric_greater_is_better
         self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricNeedsProba"] = custom_metric_use_probas
 
+    def add_custom_python_model(self, name="Custom Python Model", code=""):
+        """
+        Adds a new custom python model
+
+        :param str name: name of the custom model
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_python"].append({
+            "name": name,
+            "code": code,
+            "enabled": True
+        })
+
+    def add_custom_mllib_model(self, name="Custom MLlib Model", code=""):
+        """
+        Adds a new custom MLlib model
+
+        :param str name: name of the custom model
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_mllib"].append({
+            "name": name,
+            "initializationCode": code,
+            "enabled": True
+        })
+
     def save(self):
         """Saves back these settings to the ML Task"""
 
@@ -1310,7 +1353,6 @@ def __init__(self, raw_settings, hyperparameter_search_params):
 
         self.cache_node_ids = self._register_simple_parameter("cache_node_ids")
         self.checkpoint_interval = self._register_single_value_hyperparameter("checkpoint_interval", accepted_types=[int])
-        self.impurity = self._register_single_category_hyperparameter("impurity", accepted_values=["gini", "entropy", "variance"])  # TODO: distinguish between regression and classif
         self.max_bins = self._register_single_value_hyperparameter("max_bins", accepted_types=[int])
         self.max_memory_mb = self._register_simple_parameter("max_memory_mb")
         self.min_info_gain = self._register_single_value_hyperparameter("min_info_gain", accepted_types=[int, float])
@@ -1395,20 +1437,41 @@ def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)
     def get_prediction_type(self):
         return self.mltask_settings['predictionType']
 
+    def get_all_possible_algorithm_names(self):
+        """
+        Returns the list of possible algorithm names, i.e. the list of valid
+        identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
+
+        This includes all possible algorithms, regardless of the prediction kind (regression/classification)
+        or engine, so some algorithms may be irrelevant
+
+        :returns: the list of algorithm names as a list of strings
+        :rtype: list of string
+        """
+        return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names()
+
+    def _get_plugin_algorithm_names(self):
+        return list(self.mltask_settings["modeling"]["plugin_python"].keys())
+
+    def _get_plugin_algorithm_settings(self, algorithm_name):
+        if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]:
+                return self.mltask_settings["modeling"]["plugin_python"][algorithm_name]
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
+        algo_names = super(DSSPredictionMLTaskSettings, self).get_enabled_algorithm_names()
+
         # Hide either "XGBOOST_CLASSIFICATION" or "XGBOOST_REGRESSION" which point to the same key "xgboost"
         if self.mltask_settings["predictionType"] == "REGRESSION":
-            excluded_name = {"XGBOOST_CLASSIFICATION"}
+            excluded_names = {"XGBOOST_CLASSIFICATION"}
         else:
-            excluded_name = {"XGBOOST_REGRESSION"}
-        algo_names = [algo_name for algo_name in algos.keys() if (self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]
-                                                                  and algo_name not in excluded_name)]
-        return algo_names
+            excluded_names = {"XGBOOST_REGRESSION"}
+
+        return [algo_name for algo_name in algo_names if algo_name not in excluded_names]
 
     def get_algorithm_settings(self, algorithm_name):
         """
@@ -1442,6 +1505,10 @@ def get_algorithm_settings(self, algorithm_name):
                 # Subsequent calls get the same object
                 self.mltask_settings["modeling"][algorithm_name.lower()] = algorithm_settings
             return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        elif algorithm_name in self._get_plugin_algorithm_names():
+            return self._get_plugin_algorithm_settings(algorithm_name)
         else:
             raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
@@ -1590,8 +1657,11 @@ def get_algorithm_settings(self, algorithm_name):
         """
         if algorithm_name in self.__class__.algorithm_remap:
             algorithm_name = self.__class__.algorithm_remap[algorithm_name]
-
-        return self.mltask_settings["modeling"][algorithm_name.lower()]
+            return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        else:
+            raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
 
 class DSSTrainedModelDetails(object):
diff --git a/dataikuapi/dss/project.py b/dataikuapi/dss/project.py
@@ -532,9 +532,9 @@ def create_prediction_ml_task(self, input_dataset, target_variable,
         return ret
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
-
+                                  ml_backend_type = "PY_MEMORY",
+                                  guess_policy = "KMEANS",
+                                  wait_guess_complete=True):
 
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
@@ -549,6 +549,10 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -559,7 +563,11 @@ def create_clustering_ml_task(self, input_dataset,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj)
-        return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """