Skip to content

Commit 0165dd0

Browse files
authored
Merge pull request #202 from dataiku/feature/mlflow-experiment-tracking
MLflow experiment tracking client API
2 parents f4a66f5 + 324f25e commit 0165dd0

File tree

9 files changed

+643
-41
lines changed

9 files changed

+643
-41
lines changed

dataikuapi/dss/mlflow.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import json


class DSSMLflowExtension(object):
    """
    A handle to interact with specific endpoints of the DSS MLflow integration.

    Do not create this directly, use :meth:`dataikuapi.dss.DSSProject.get_mlflow_extension`
    """

    def __init__(self, client, project_key):
        self.client = client
        self.project = client.get_project(project_key)
        self.project_key = project_key

    def list_models(self, run_id):
        """
        Returns the list of models of given run

        :param run_id: run_id for which to return a list of models
        :type run_id: str
        """
        response = self.client._perform_http(
            "GET", "/api/2.0/mlflow/extension/models/{}".format(run_id),
            headers={"x-dku-mlflow-project-key": self.project_key}
        )
        return response.json()

    def list_experiments(self, view_type="ACTIVE_ONLY", max_results=1000):
        """
        Returns the list of experiments in the DSS project for which MLflow integration
        is setup

        :param view_type: ACTIVE_ONLY, DELETED_ONLY or ALL
        :type view_type: str
        :param max_results: max results count
        :type max_results: int
        :rtype: dict
        """
        response = self.client._perform_http(
            "GET", "/api/2.0/mlflow/experiments/list?view_type={view_type}&max_results={max_results}".format(view_type=view_type, max_results=max_results),
            headers={"x-dku-mlflow-project-key": self.project_key}
        )
        return response.json()

    def rename_experiment(self, experiment_id, new_name):
        """
        Renames an experiment

        :param experiment_id: experiment id
        :type experiment_id: str
        :param new_name: new name
        :type new_name: str
        """
        response = self.client._perform_http(
            "POST", "/api/2.0/mlflow/experiments/update",
            headers={"x-dku-mlflow-project-key": self.project_key},
            body={"experiment_id": experiment_id, "new_name": new_name}
        )
        return response.json()

    def restore_experiment(self, experiment_id):
        """
        Restores a deleted experiment

        :param experiment_id: experiment id
        :type experiment_id: str
        """
        response = self.client._perform_http(
            "POST", "/api/2.0/mlflow/experiments/restore",
            headers={"x-dku-mlflow-project-key": self.project_key},
            body={"experiment_id": experiment_id}
        )
        return response.json()

    def restore_run(self, run_id):
        """
        Restores a deleted run

        :param run_id: run id
        :type run_id: str
        """
        response = self.client._perform_http(
            "POST", "/api/2.0/mlflow/runs/restore",
            headers={"x-dku-mlflow-project-key": self.project_key},
            body={"run_id": run_id}
        )
        return response.json()

    def garbage_collect(self):
        """
        Permanently deletes the experiments and runs marked as "Deleted"
        """
        # NOTE(review): the backend exposes this destructive operation as a GET
        # endpoint; kept as-is to match the server-side API contract.
        self.client._perform_http(
            "GET", "/api/2.0/mlflow/extension/garbage-collect",
            headers={"x-dku-mlflow-project-key": self.project_key}
        )

    def create_experiment_tracking_dataset(self, dataset_name, experiment_ids=None, view_type="ACTIVE_ONLY", filter_expr="", order_by=None, format="LONG"):
        """
        Creates a virtual dataset exposing experiment tracking data.

        :param dataset_name: name of the dataset
        :type dataset_name: str
        :param experiment_ids: list of ids of experiments to filter on. No filtering if empty
        :type experiment_ids: list(str)
        :param view_type: one of ACTIVE_ONLY, DELETED_ONLY and ALL. Default is ACTIVE_ONLY
        :type view_type: str
        :param filter_expr: MLflow search expression
        :type filter_expr: str
        :param order_by: list of order by clauses. Default is ordered by start_time, then runId
        :type order_by: list(str)
        :param format: LONG or JSON. Default is LONG
        :type format: str
        """
        # Use None sentinels instead of mutable default arguments ([] shared
        # across calls); normalize to fresh lists here so behavior is unchanged.
        if experiment_ids is None:
            experiment_ids = []
        if order_by is None:
            order_by = []
        self.client._perform_http(
            "POST", "/api/2.0/mlflow/extension/create-project-experiments-dataset",
            headers={"x-dku-mlflow-project-key": self.project_key},
            body={
                "datasetName": dataset_name,
                "experimentIds": experiment_ids,
                "viewType": view_type,
                "filter": filter_expr,
                "orderBy": order_by,
                "format": format
            }
        )

    def clean_experiment_tracking_db(self):
        """
        Cleans the experiments, runs, params, metrics, tags, etc. for this project

        This call requires an API key with admin rights
        """
        self.client._perform_raw("DELETE", "/api/2.0/mlflow/extension/clean-db/%s" % self.project_key)

    def set_run_inference_info(self, run_id, model_type, classes=None, code_env_name=None, target=None):
        """
        Sets the type of the model, and optionally other information useful to deploy or evaluate it.

        model_type must be one of:
        - REGRESSION
        - BINARY_CLASSIFICATION
        - MULTICLASS
        - OTHER

        Classes must be specified if and only if the model is a BINARY_CLASSIFICATION or MULTICLASS model.

        This information is leveraged to filter saved models on their prediction type and prefill the classes
        when deploying using the GUI an MLflow model as a version of a DSS Saved Model.

        :param model_type: prediction type (see doc)
        :type model_type: str
        :param run_id: run_id for which to set the classes
        :type run_id: str
        :param classes: ordered list of classes (not for all prediction types, see doc)
        :type classes: list(str)
        :param code_env_name: name of an adequate DSS python code environment
        :type code_env_name: str
        :param target: name of the target
        :type target: str
        :raises ValueError: if model_type is unknown, classes are inconsistent with
            model_type, or code_env_name / target are not strings
        """
        if model_type not in {"REGRESSION", "BINARY_CLASSIFICATION", "MULTICLASS", "OTHER"}:
            raise ValueError('Invalid prediction type: {}'.format(model_type))

        if classes and model_type not in {"BINARY_CLASSIFICATION", "MULTICLASS"}:
            raise ValueError('Classes can be specified only for BINARY_CLASSIFICATION or MULTICLASS prediction types')
        if model_type in {"BINARY_CLASSIFICATION", "MULTICLASS"}:
            if not classes:
                raise ValueError('Classes must be specified for {} prediction type'.format(model_type))
            if not isinstance(classes, list):
                raise ValueError('Wrong type for classes: {}'.format(type(classes)))
            for cur_class in classes:
                if cur_class is None:
                    raise ValueError('class can not be None')
                if not isinstance(cur_class, str):
                    raise ValueError('Wrong type for class {}: {}'.format(cur_class, type(cur_class)))

        if code_env_name and not isinstance(code_env_name, str):
            raise ValueError('code_env_name must be a string')
        if target and not isinstance(target, str):
            raise ValueError('target must be a string')

        params = {
            "run_id": run_id,
            "prediction_type": model_type
        }

        # Classes are serialized to a JSON string, as expected by the backend.
        if classes:
            params["classes"] = json.dumps(classes)
        if code_env_name:
            params["code_env_name"] = code_env_name
        if target:
            params["target"] = target

        self.client._perform_http(
            "POST", "/api/2.0/mlflow/extension/set-run-inference-info",
            headers={"x-dku-mlflow-project-key": self.project_key},
            body=params
        )

dataikuapi/dss/project.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
import time, warnings, sys, os.path as osp
1+
import warnings, os.path as osp
2+
3+
from ..dss_plugin_mlflow import MLflowHandle
4+
25
from .dataset import DSSDataset, DSSDatasetListItem, DSSManagedDatasetCreationHelper
36
from .modelcomparison import DSSModelComparison
47
from .jupyternotebook import DSSJupyterNotebook, DSSJupyterNotebookListItem
@@ -9,6 +12,7 @@
912
from .managedfolder import DSSManagedFolder
1013
from .savedmodel import DSSSavedModel
1114
from .modelevaluationstore import DSSModelEvaluationStore
15+
from .mlflow import DSSMLflowExtension
1216
from .job import DSSJob, DSSJobWaiter
1317
from .scenario import DSSScenario, DSSScenarioListItem
1418
from .continuousactivity import DSSContinuousActivity
@@ -30,8 +34,8 @@ class DSSProject(object):
3034
Do not create this class directly, instead use :meth:`dataikuapi.DSSClient.get_project`
3135
"""
3236
def __init__(self, client, project_key):
33-
self.client = client
34-
self.project_key = project_key
37+
self.client = client
38+
self.project_key = project_key
3539

3640
def get_summary(self):
3741
"""
@@ -1589,7 +1593,7 @@ def list_hive_tables(self, hive_database):
15891593
"""
15901594
connection_name = "@virtual(hive-jdbc):" + hive_database
15911595
ret = self.client._perform_json("GET", "/projects/%s/datasets/tables-import/actions/list-tables" % (self.project_key),
1592-
params = {"connectionName": connection_name} )
1596+
params={"connectionName": connection_name} )
15931597

15941598
def to_schema_table_pair(x):
15951599
return {"schema":x.get("databaseName", None), "table":x["table"]}
@@ -1598,11 +1602,29 @@ def to_schema_table_pair(x):
15981602
########################################################
15991603
# App designer
16001604
########################################################
1601-
16021605
def get_app_manifest(self):
16031606
raw_data = self.client._perform_json("GET", "/projects/%s/app-manifest" % self.project_key)
16041607
return DSSAppManifest(self.client, raw_data, self.project_key)
16051608

1609+
# MLflow experiment tracking
1610+
########################################################
1611+
def setup_mlflow(self, managed_folder, host=None):
    """
    Set up the dss-plugin for MLflow.

    :param object managed_folder: a :class:`dataikuapi.dss.DSSManagedFolder` where MLflow artifacts should be stored.
    :param str host: setup a custom host if the backend used is not DSS
    :returns: an MLflow integration handle bound to this project
    """
    handle = MLflowHandle(
        client=self.client,
        project=self,
        managed_folder=managed_folder,
        host=host,
    )
    return handle
1619+
1620+
def get_mlflow_extension(self):
    """
    Get a handle to interact with the extension of MLflow provided by DSS.

    :returns: A :class:`dataikuapi.dss.mlflow.DSSMLflowExtension` MLflow extension handle
    """
    extension = DSSMLflowExtension(client=self.client, project_key=self.project_key)
    return extension
16061628

16071629
class TablesImportDefinition(object):
16081630
"""

dataikuapi/dss/savedmodel.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def get_origin_ml_task(self):
122122
if fmi is not None:
123123
return DSSMLTask.from_full_model_id(self.client, fmi, project_key=self.project_key)
124124

125-
def import_mlflow_version_from_path(self, version_id, path, code_env_name="INHERIT"):
125+
def import_mlflow_version_from_path(self, version_id, path, code_env_name="INHERIT", container_exec_config_name="INHERIT"):
126126
"""
127127
Create a new version for this saved model from a path containing a MLFlow model.
128128
@@ -133,6 +133,10 @@ def import_mlflow_version_from_path(self, version_id, path, code_env_name="INHER
133133
:param str code_env_name: Name of the code env to use for this model version. The code env must contain at least
134134
mlflow and the package(s) corresponding to the used MLFlow-compatible frameworks.
135135
If value is "INHERIT", the default active code env of the project will be used
136+
:param str container_exec_config_name: Name of the containerized execution configuration to use while creating
137+
this model version.
138+
If value is "INHERIT", the container execution configuration of the project will be used.
139+
If value is "NONE", local execution will be used (no container)
136140
:return a :class:MLFlowVersionHandler in order to interact with the new MLFlow model version
137141
"""
138142
# TODO: Add a check that it's indeed a MLFlow model folder
@@ -144,13 +148,13 @@ def import_mlflow_version_from_path(self, version_id, path, code_env_name="INHER
144148
archive_filename = _make_zipfile(os.path.join(archive_temp_dir, "tmpmodel.zip"), path)
145149

146150
with open(archive_filename, "rb") as fp:
147-
self.client._perform_empty("POST", "/projects/%s/savedmodels/%s/versions/%s?codeEnvName=%s" % (self.project_key, self.sm_id, version_id, code_env_name),
151+
self.client._perform_empty("POST", "/projects/%s/savedmodels/%s/versions/%s?codeEnvName=%s&containerExecConfigName=%s" % (self.project_key, self.sm_id, version_id, code_env_name, container_exec_config_name),
148152
files={"file": (archive_filename, fp)})
149153
return self.get_mlflow_version_handler(version_id)
150154
finally:
151155
shutil.rmtree(archive_temp_dir)
152156

153-
def import_mlflow_version_from_managed_folder(self, version_id, managed_folder, path, code_env_name="INHERIT"):
157+
def import_mlflow_version_from_managed_folder(self, version_id, managed_folder, path, code_env_name="INHERIT", container_exec_config_name="INHERIT"):
154158
"""
155159
Create a new version for this saved model from a path containing a MLFlow model in a managed folder.
156160
@@ -162,6 +166,10 @@ def import_mlflow_version_from_managed_folder(self, version_id, managed_folder,
162166
:param str code_env_name: Name of the code env to use for this model version. The code env must contain at least
163167
mlflow and the package(s) corresponding to the used MLFlow-compatible frameworks.
164168
If value is "INHERIT", the default active code env of the project will be used
169+
:param str container_exec_config_name: Name of the containerized execution configuration to use for evaluating
170+
this model version.
171+
If value is "INHERIT", the container execution configuration of the project will be used.
172+
If value is "NONE", local execution will be used (no container)
165173
:return a :class:MLFlowVersionHandler in order to interact with the new MLFlow model version
166174
"""
167175
# TODO: Add a check that it's indeed a MLFlow model folder
@@ -172,8 +180,8 @@ def import_mlflow_version_from_managed_folder(self, version_id, managed_folder,
172180
folder_ref = managed_folder
173181

174182
self.client._perform_empty(
175-
"POST", "/projects/{project_id}/savedmodels/{saved_model_id}/versions/{version_id}?codeEnvName={codeEnvName}".format(
176-
project_id=self.project_key, saved_model_id=self.sm_id, version_id=version_id, codeEnvName=code_env_name
183+
"POST", "/projects/{project_id}/savedmodels/{saved_model_id}/versions/{version_id}?codeEnvName={codeEnvName}&containerExecConfigName={containerExecConfigName}".format(
184+
project_id=self.project_key, saved_model_id=self.sm_id, version_id=version_id, codeEnvName=code_env_name, containerExecConfigName=container_exec_config_name
177185
),
178186
params={"folderRef": folder_ref, "path": path},
179187
files={"file": (None, None)} # required for backend-mandated multipart request
@@ -343,21 +351,24 @@ def set_core_metadata(self,
343351
"/projects/%s/savedmodels/%s/versions/%s/external-ml/metadata" % (self.saved_model.project_key, self.saved_model.sm_id, self.version_id),
344352
body=metadata)
345353

346-
def evaluate(self, dataset_ref):
354+
def evaluate(self, dataset_ref, container_exec_config_name="INHERIT"):
347355
"""
348356
Evaluates the performance of this model version on a particular dataset.
349357
After calling this, the "result screens" of the MLFlow model version will be available
350358
(confusion matrix, error distribution, performance metrics, ...)
351359
and more information will be available when calling :meth:`DSSSavedModel.get_version_details`
352360
353361
:meth:`set_core_metadata` must be called before you can evaluate a dataset
354-
355362
:param str dataset_ref: Evaluation dataset to use (either a dataset name, "PROJECT.datasetName", :class:`DSSDataset` instance or :class:`dataiku.Dataset` instance)
363+
:param str container_exec_config_name: Name of the containerized execution configuration to use for running the evaluation process.
364+
If value is "INHERIT", the container execution configuration of the project will be used.
365+
If value is "NONE", local execution will be used (no container)
356366
"""
357367
if hasattr(dataset_ref, 'name'):
358368
dataset_ref = dataset_ref.name
359369
req = {
360-
"datasetRef" : dataset_ref
370+
"datasetRef": dataset_ref,
371+
"containerExecConfigName": container_exec_config_name
361372
}
362373
self.saved_model.client._perform_empty("POST",
363374
"/projects/%s/savedmodels/%s/versions/%s/external-ml/actions/evaluate" % (self.saved_model.project_key, self.saved_model.sm_id, self.version_id),
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .utils import MLflowHandle

0 commit comments

Comments
 (0)