Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions dataikuapi/dss/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,9 @@ def create_prediction_ml_task(self,
return mltask

def create_clustering_ml_task(self,
ml_backend_type = "PY_MEMORY",
guess_policy = "KMEANS"):
ml_backend_type="PY_MEMORY",
guess_policy="KMEANS",
wait_guess_complete=True):


"""Creates a new clustering task in a new visual analysis lab
Expand All @@ -205,6 +206,10 @@ def create_clustering_ml_task(self,

:param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
:param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
:param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
You should wait for the guessing to be completed by calling
``wait_guess_complete`` on the returned object before doing anything
else (in particular calling ``train`` or ``get_settings``)
"""

obj = {
Expand All @@ -214,7 +219,11 @@ def create_clustering_ml_task(self,
}

ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])

if wait_guess_complete:
mltask.wait_guess_complete()
return mltask

def list_ml_tasks(self):
"""
Expand Down
13 changes: 9 additions & 4 deletions dataikuapi/dss/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,9 @@ def create_prediction_ml_task(self, target_variable,
guess_policy = guess_policy, prediction_type = prediction_type, wait_guess_complete = wait_guess_complete)

def create_clustering_ml_task(self, input_dataset,
ml_backend_type = "PY_MEMORY",
guess_policy = "KMEANS"):
ml_backend_type="PY_MEMORY",
guess_policy="KMEANS",
wait_guess_complete=True):
"""Creates a new clustering task in a new visual analysis lab
for a dataset.

Expand All @@ -400,9 +401,13 @@ def create_clustering_ml_task(self, input_dataset,

:param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
:param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
:param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
You should wait for the guessing to be completed by calling
``wait_guess_complete`` on the returned object before doing anything
else (in particular calling ``train`` or ``get_settings``)
"""
return self.project.create_clustering_ml_task(self.dataset_name,
ml_backend_type = ml_backend_type, guess_policy = guess_policy)
return self.project.create_clustering_ml_task(self.dataset_name, ml_backend_type=ml_backend_type, guess_policy=guess_policy,
wait_guess_complete=wait_guess_complete)

def create_analysis(self):
"""
Expand Down
100 changes: 85 additions & 15 deletions dataikuapi/dss/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,16 @@ def use_feature(self, feature_name):
def get_algorithm_settings(self, algorithm_name):
raise NotImplementedError()

def _get_custom_algorithm_settings(self, algorithm_name):
# returns the first algorithm with this name
for algo in self.mltask_settings["modeling"]["custom_mllib"]:
if algorithm_name == algo["name"]:
return algo
for algo in self.mltask_settings["modeling"]["custom_python"]:
if algorithm_name == algo["name"]:
return algo
raise ValueError("Unknown algorithm: {}".format(algorithm_name))

def get_diagnostics_settings(self):
"""
Gets the diagnostics settings for a mltask. This returns a reference to the
Expand Down Expand Up @@ -307,31 +317,38 @@ def disable_all_algorithms(self):
custom_mllib["enabled"] = False
for custom_python in self.mltask_settings["modeling"]["custom_python"]:
custom_python["enabled"] = False
for plugin in self.mltask_settings["modeling"]["plugin_python"].values():
for plugin in self.mltask_settings["modeling"].get("plugin_python", {}).values():
plugin["enabled"] = False

def get_all_possible_algorithm_names(self):
"""
Returns the list of possible algorithm names, i.e. the list of valid
identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`

This does not include Custom Python models, Custom MLLib models, plugin models.
This includes all possible algorithms, regardless of the prediction kind (regression/classification)
or engine, so some algorithms may be irrelevant

:returns: the list of algorithm names as a list of strings
:rtype: list of string
"""
return list(self.__class__.algorithm_remap.keys())
return list(self.__class__.algorithm_remap.keys()) + self._get_custom_algorithm_names()

def _get_custom_algorithm_names(self):
"""
Returns the list of names of defined custom models (Python & MLlib backends)

:returns: the list of custom models names
:rtype: list of string
"""
return ([algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]
+ [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]])

def get_enabled_algorithm_names(self):
"""
:returns: the list of enabled algorithm names as a list of strings
:rtype: list of string
"""
algos = self.__class__.algorithm_remap
algo_names = [algo_name for algo_name in algos.keys() if self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]]
return algo_names
return [algo_name for algo_name in self.get_all_possible_algorithm_names() if self.get_algorithm_settings(algo_name).get("enabled", False)]

def get_enabled_algorithm_settings(self):
"""
Expand All @@ -356,6 +373,32 @@ def set_metric(self, metric=None, custom_metric=None, custom_metric_greater_is_b
self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricGIB"] = custom_metric_greater_is_better
self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricNeedsProba"] = custom_metric_use_probas

def add_custom_python_model(self, name="Custom Python Model", code=""):
    """Append a new custom Python model to the task's modeling settings.

    The model is marked as enabled as soon as it is added.

    :param str name: display name of the custom model
    :param str code: Python source code of the custom model
    """
    new_model = {"name": name, "code": code, "enabled": True}
    self.mltask_settings["modeling"]["custom_python"].append(new_model)

def add_custom_mllib_model(self, name="Custom MLlib Model", code=""):
    """Append a new custom MLlib model to the task's modeling settings.

    The model is marked as enabled as soon as it is added.  Note that for
    MLlib models the code goes into the "initializationCode" field, not
    "code" as for Python models.

    :param str name: display name of the custom model
    :param str code: initialization code of the custom model
    """
    new_model = {"name": name, "initializationCode": code, "enabled": True}
    self.mltask_settings["modeling"]["custom_mllib"].append(new_model)

def save(self):
"""Saves back these settings to the ML Task"""

Expand Down Expand Up @@ -1310,7 +1353,6 @@ def __init__(self, raw_settings, hyperparameter_search_params):

self.cache_node_ids = self._register_simple_parameter("cache_node_ids")
self.checkpoint_interval = self._register_single_value_hyperparameter("checkpoint_interval", accepted_types=[int])
self.impurity = self._register_single_category_hyperparameter("impurity", accepted_values=["gini", "entropy", "variance"]) # TODO: distinguish between regression and classif
self.max_bins = self._register_single_value_hyperparameter("max_bins", accepted_types=[int])
self.max_memory_mb = self._register_simple_parameter("max_memory_mb")
self.min_info_gain = self._register_single_value_hyperparameter("min_info_gain", accepted_types=[int, float])
Expand Down Expand Up @@ -1395,20 +1437,41 @@ def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)
def get_prediction_type(self):
return self.mltask_settings['predictionType']

def get_all_possible_algorithm_names(self):
    """
    Returns the list of possible algorithm names, i.e. the list of valid
    identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`

    This includes all possible algorithms, regardless of the prediction kind (regression/classification)
    or engine, so some algorithms may be irrelevant

    :returns: the list of algorithm names as a list of strings
    :rtype: list of string
    """
    # Base class contributes built-in + custom model names; plugin models
    # only exist for prediction tasks, so they are appended here.
    base_names = super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names()
    plugin_names = self._get_plugin_algorithm_names()
    return base_names + plugin_names

def _get_plugin_algorithm_names(self):
return list(self.mltask_settings["modeling"]["plugin_python"].keys())

def _get_plugin_algorithm_settings(self, algorithm_name):
if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]:
return self.mltask_settings["modeling"]["plugin_python"][algorithm_name]
raise ValueError("Unknown algorithm: {}".format(algorithm_name))

def get_enabled_algorithm_names(self):
"""
:returns: the list of enabled algorithm names as a list of strings
:rtype: list of string
"""
algos = self.__class__.algorithm_remap
algo_names = super(DSSPredictionMLTaskSettings, self).get_enabled_algorithm_names()

# Hide either "XGBOOST_CLASSIFICATION" or "XGBOOST_REGRESSION" which point to the same key "xgboost"
if self.mltask_settings["predictionType"] == "REGRESSION":
excluded_name = {"XGBOOST_CLASSIFICATION"}
excluded_names = {"XGBOOST_CLASSIFICATION"}
else:
excluded_name = {"XGBOOST_REGRESSION"}
algo_names = [algo_name for algo_name in algos.keys() if (self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]
and algo_name not in excluded_name)]
return algo_names
excluded_names = {"XGBOOST_REGRESSION"}

return [algo_name for algo_name in algo_names if algo_name not in excluded_names]

def get_algorithm_settings(self, algorithm_name):
"""
Expand Down Expand Up @@ -1442,6 +1505,10 @@ def get_algorithm_settings(self, algorithm_name):
# Subsequent calls get the same object
self.mltask_settings["modeling"][algorithm_name.lower()] = algorithm_settings
return self.mltask_settings["modeling"][algorithm_name.lower()]
elif algorithm_name in self._get_custom_algorithm_names():
return self._get_custom_algorithm_settings(algorithm_name)
elif algorithm_name in self._get_plugin_algorithm_names():
return self._get_plugin_algorithm_settings(algorithm_name)
else:
raise ValueError("Unknown algorithm: {}".format(algorithm_name))

Expand Down Expand Up @@ -1590,8 +1657,11 @@ def get_algorithm_settings(self, algorithm_name):
"""
if algorithm_name in self.__class__.algorithm_remap:
algorithm_name = self.__class__.algorithm_remap[algorithm_name]

return self.mltask_settings["modeling"][algorithm_name.lower()]
return self.mltask_settings["modeling"][algorithm_name.lower()]
elif algorithm_name in self._get_custom_algorithm_names():
return self._get_custom_algorithm_settings(algorithm_name)
else:
raise ValueError("Unknown algorithm: {}".format(algorithm_name))


class DSSTrainedModelDetails(object):
Expand Down
16 changes: 12 additions & 4 deletions dataikuapi/dss/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,9 @@ def create_prediction_ml_task(self, input_dataset, target_variable,
return ret

def create_clustering_ml_task(self, input_dataset,
ml_backend_type = "PY_MEMORY",
guess_policy = "KMEANS"):

ml_backend_type = "PY_MEMORY",
guess_policy = "KMEANS",
wait_guess_complete=True):

"""Creates a new clustering task in a new visual analysis lab
for a dataset.
Expand All @@ -549,6 +549,10 @@ def create_clustering_ml_task(self, input_dataset,

:param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
:param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
:param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
You should wait for the guessing to be completed by calling
``wait_guess_complete`` on the returned object before doing anything
else (in particular calling ``train`` or ``get_settings``)
"""

obj = {
Expand All @@ -559,7 +563,11 @@ def create_clustering_ml_task(self, input_dataset,
}

ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj)
return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
mltask = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])

if wait_guess_complete:
mltask.wait_guess_complete()
return mltask

def list_ml_tasks(self):
"""
Expand Down