diff --git a/dataikuapi/dss/analysis.py b/dataikuapi/dss/analysis.py
index e08e85c2..ae10f702 100644
--- a/dataikuapi/dss/analysis.py
+++ b/dataikuapi/dss/analysis.py
@@ -188,8 +188,9 @@ def create_prediction_ml_task(self,
         return mltask
 
     def create_clustering_ml_task(self,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
 
         """Creates a new clustering task in a new visual analysis lab
 
@@ -205,6 +206,10 @@ def create_clustering_ml_task(self,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -214,7 +219,11 @@ def create_clustering_ml_task(self,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
-        return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """
diff --git a/dataikuapi/dss/dataset.py b/dataikuapi/dss/dataset.py
index 574ca125..91cfb3da 100644
--- a/dataikuapi/dss/dataset.py
+++ b/dataikuapi/dss/dataset.py
@@ -385,8 +385,9 @@ def create_prediction_ml_task(self, target_variable,
             guess_policy = guess_policy, prediction_type = prediction_type, wait_guess_complete = wait_guess_complete)
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
 
@@ -400,9 +401,13 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
-        return self.project.create_clustering_ml_task(self.dataset_name,
-            ml_backend_type = ml_backend_type, guess_policy = guess_policy)
+        return self.project.create_clustering_ml_task(self.dataset_name, ml_backend_type=ml_backend_type, guess_policy=guess_policy,
+                                                      wait_guess_complete=wait_guess_complete)
 
     def create_analysis(self):
         """
diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
index 18309f0e..6088bdea 100644
--- a/dataikuapi/dss/ml.py
+++ b/dataikuapi/dss/ml.py
@@ -238,6 +238,16 @@ def use_feature(self, feature_name):
     def get_algorithm_settings(self, algorithm_name):
         raise NotImplementedError()
 
+    def _get_custom_algorithm_settings(self, algorithm_name):
+        """Returns the settings of the first custom algorithm (MLlib first, then Python) with this name"""
+        for algo in self.mltask_settings["modeling"]["custom_mllib"]:
+            if algorithm_name == algo["name"]:
+                return algo
+        for algo in self.mltask_settings["modeling"]["custom_python"]:
+            if algorithm_name == algo["name"]:
+                return algo
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_diagnostics_settings(self):
         """
         Gets the diagnostics settings for a mltask. This returns a reference to the
@@ -307,7 +317,7 @@ def disable_all_algorithms(self):
             custom_mllib["enabled"] = False
         for custom_python in self.mltask_settings["modeling"]["custom_python"]:
             custom_python["enabled"] = False
-        for plugin in self.mltask_settings["modeling"]["plugin_python"].values():
+        for plugin in self.mltask_settings["modeling"].get("plugin_python", {}).values():
             plugin["enabled"] = False
 
     def get_all_possible_algorithm_names(self):
@@ -315,23 +325,30 @@ def get_all_possible_algorithm_names(self):
         Returns the list of possible algorithm names, i.e. the list of valid
         identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
 
-        This does not include Custom Python models, Custom MLLib models, plugin models.
         This includes all possible algorithms, regardless of the prediction kind (regression/classification)
         or engine, so some algorithms may be irrelevant
 
         :returns: the list of algorithm names as a list of strings
         :rtype: list of string
         """
-        return list(self.__class__.algorithm_remap.keys())
+        return list(self.__class__.algorithm_remap.keys()) + self._get_custom_algorithm_names()
+
+    def _get_custom_algorithm_names(self):
+        """
+        Returns the list of names of defined custom models (Python & MLlib backends)
+
+        :returns: the list of custom models names
+        :rtype: list of string
+        """
+        return ([algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]] +
+                [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]])
 
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
-        algo_names = [algo_name for algo_name in algos.keys() if self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]]
-        return algo_names
+        return [algo_name for algo_name in self.get_all_possible_algorithm_names() if self.get_algorithm_settings(algo_name).get("enabled", False)]
 
     def get_enabled_algorithm_settings(self):
         """
@@ -356,6 +373,32 @@ def set_metric(self, metric=None, custom_metric=None, custom_metric_greater_is_b
             self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricGIB"] = custom_metric_greater_is_better
             self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricNeedsProba"] = custom_metric_use_probas
 
+    def add_custom_python_model(self, name="Custom Python Model", code=""):
+        """
+        Adds a new custom python model
+
+        :param str name: name of the custom model
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_python"].append({
+            "name": name,
+            "code": code,
+            "enabled": True
+        })
+
+    def add_custom_mllib_model(self, name="Custom MLlib Model", code=""):
+        """
+        Adds a new custom MLlib model
+
+        :param str name: name of the custom model
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_mllib"].append({
+            "name": name,
+            "initializationCode": code,
+            "enabled": True
+        })
+
     def save(self):
         """Saves back these settings to the ML Task"""
 
@@ -1310,7 +1353,6 @@ def __init__(self, raw_settings, hyperparameter_search_params):
 
         self.cache_node_ids = self._register_simple_parameter("cache_node_ids")
         self.checkpoint_interval = self._register_single_value_hyperparameter("checkpoint_interval", accepted_types=[int])
-        self.impurity = self._register_single_category_hyperparameter("impurity", accepted_values=["gini", "entropy", "variance"])  # TODO: distinguish between regression and classif
         self.max_bins = self._register_single_value_hyperparameter("max_bins", accepted_types=[int])
         self.max_memory_mb = self._register_simple_parameter("max_memory_mb")
         self.min_info_gain = self._register_single_value_hyperparameter("min_info_gain", accepted_types=[int, float])
@@ -1395,20 +1437,43 @@ def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)
     def get_prediction_type(self):
         return self.mltask_settings['predictionType']
 
+    def get_all_possible_algorithm_names(self):
+        """
+        Returns the list of possible algorithm names, i.e. the list of valid
+        identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
+
+        This includes all possible algorithms, regardless of the prediction kind (regression/classification)
+        or engine, so some algorithms may be irrelevant
+
+        :returns: the list of algorithm names as a list of strings
+        :rtype: list of string
+        """
+        return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names()
+
+    def _get_plugin_algorithm_names(self):
+        """Returns the list of plugin algorithm names (empty if the settings have no "plugin_python" entry)"""
+        return list(self.mltask_settings["modeling"].get("plugin_python", {}).keys())
+
+    def _get_plugin_algorithm_settings(self, algorithm_name):
+        """Returns the settings of the plugin algorithm with this name, or raises ValueError if there is none"""
+        if algorithm_name in self.mltask_settings["modeling"].get("plugin_python", {}):
+            return self.mltask_settings["modeling"]["plugin_python"][algorithm_name]
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
+        algo_names = super(DSSPredictionMLTaskSettings, self).get_enabled_algorithm_names()
 
+        # Hide either "XGBOOST_CLASSIFICATION" or "XGBOOST_REGRESSION" which point to the same key "xgboost"
         if self.mltask_settings["predictionType"] == "REGRESSION":
-            excluded_name = {"XGBOOST_CLASSIFICATION"}
+            excluded_names = {"XGBOOST_CLASSIFICATION"}
         else:
-            excluded_name = {"XGBOOST_REGRESSION"}
-        algo_names = [algo_name for algo_name in algos.keys() if (self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]
-                                                                  and algo_name not in excluded_name)]
-        return algo_names
+            excluded_names = {"XGBOOST_REGRESSION"}
+
+        return [algo_name for algo_name in algo_names if algo_name not in excluded_names]
 
     def get_algorithm_settings(self, algorithm_name):
         """
@@ -1442,6 +1507,10 @@ def get_algorithm_settings(self, algorithm_name):
                 # Subsequent calls get the same object
                 self.mltask_settings["modeling"][algorithm_name.lower()] = algorithm_settings
             return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        elif algorithm_name in self._get_plugin_algorithm_names():
+            return self._get_plugin_algorithm_settings(algorithm_name)
         else:
             raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
@@ -1590,8 +1659,11 @@ def get_algorithm_settings(self, algorithm_name):
         """
         if algorithm_name in self.__class__.algorithm_remap:
             algorithm_name = self.__class__.algorithm_remap[algorithm_name]
-
-        return self.mltask_settings["modeling"][algorithm_name.lower()]
+            return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        else:
+            raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
 
 class DSSTrainedModelDetails(object):
diff --git a/dataikuapi/dss/project.py b/dataikuapi/dss/project.py
index 7422b829..6557e8b6 100644
--- a/dataikuapi/dss/project.py
+++ b/dataikuapi/dss/project.py
@@ -532,9 +532,9 @@ def create_prediction_ml_task(self, input_dataset, target_variable,
         return ret
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
-
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
 
@@ -549,6 +549,10 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -559,7 +563,11 @@ def create_clustering_ml_task(self, input_dataset,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj)
-        return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """