
Commit 354efea

Merge remote-tracking branch 'origin/master' into perso/aqueru/govern-bundle

2 parents: 9bb43da + c84bec6

File tree: 12 files changed, +794 −41 lines


dataikuapi/dss/dataset.py (19 additions, 0 deletions)

@@ -737,6 +737,25 @@ def add_time_partitioning_dimension(self, dim_name, period="DAY"):
     def add_raw_schema_column(self, column):
         self.settings["schema"]["columns"].append(column)

+    @property
+    def is_feature_group(self):
+        """
+        Indicates whether the dataset is defined as a feature group, available in the Feature Store.
+
+        :rtype: bool
+        """
+        return self.settings["featureGroup"]
+
+    def set_feature_group(self, status):
+        """
+        (Un)sets the dataset as a feature group, available in the Feature Store.
+        Changes to this property are applied when calling :meth:`save` and require the "Manage Feature Store" permission.
+
+        :param status: whether the dataset should be defined as a feature group
+        :type status: bool
+        """
+        self.settings["featureGroup"] = status
+
     def save(self):
         self.dataset.client._perform_empty(
             "PUT", "/projects/%s/datasets/%s" % (self.dataset.project_key, self.dataset.dataset_name),

dataikuapi/dss/feature_store.py (40 additions, 0 deletions, new file)

@@ -0,0 +1,40 @@
+from dataikuapi.dss.dataset import DSSDataset
+
+
+class DSSFeatureGroupListItem(object):
+    def __init__(self, client, project_key, name):
+        self.client = client
+        self.project_key = project_key
+        self.name = name
+
+    @property
+    def id(self):
+        """The id of the feature group, in the form "<project key>.<dataset name>"."""
+        return self.project_key + "." + self.name
+
+    def get_as_dataset(self):
+        """
+        Gets the feature group as a dataset.
+
+        :return: a handle on the dataset
+        :rtype: :class:`dataikuapi.dss.dataset.DSSDataset`
+        """
+        return DSSDataset(self.client, self.project_key, self.name)
+
+
+class DSSFeatureStore(object):
+    def __init__(self, client):
+        """
+        A handle on the Feature Store.
+        Do not create this class directly, use :meth:`dataikuapi.DSSClient.get_feature_store`
+        """
+        self.client = client
+
+    def list_feature_groups(self):
+        """
+        Gets the list of feature groups on which the user has at least read permission.
+
+        :return: the list of feature groups
+        :rtype: list of :class:`dataikuapi.dss.feature_store.DSSFeatureGroupListItem`
+        """
+        items = self.client._perform_json("GET", "/feature-store/feature-groups")
+        return [DSSFeatureGroupListItem(self.client, item["projectKey"], item["name"]) for item in items]
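A short sketch of the intended call pattern, assuming `DSSClient.get_feature_store` returns the `DSSFeatureStore` handle, as its constructor docstring indicates (host, API key, and names are placeholders):

import dataikuapi

client = dataikuapi.DSSClient("https://dss.example.com", "my-api-key")  # placeholders

# List the feature groups this user can read, then open one as a dataset
fs = client.get_feature_store()
for fg in fs.list_feature_groups():
    print(fg.id)                   # "<project key>.<dataset name>"
    dataset = fg.get_as_dataset()  # a regular DSSDataset handle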

dataikuapi/dss/mlflow.py (200 additions, 0 deletions, new file)

@@ -0,0 +1,200 @@
+import json
+
+
+class DSSMLflowExtension(object):
+    """
+    A handle to interact with specific endpoints of the DSS MLflow integration.
+
+    Do not create this directly, use :meth:`dataikuapi.dss.project.DSSProject.get_mlflow_extension`
+    """
+
+    def __init__(self, client, project_key):
+        self.client = client
+        self.project = client.get_project(project_key)
+        self.project_key = project_key
+
+    def list_models(self, run_id):
+        """
+        Returns the list of models of the given run.
+
+        :param run_id: run id for which to return the list of models
+        :type run_id: str
+        """
+        response = self.client._perform_http(
+            "GET", "/api/2.0/mlflow/extension/models/{}".format(run_id),
+            headers={"x-dku-mlflow-project-key": self.project_key}
+        )
+        return response.json()
+
+    def list_experiments(self, view_type="ACTIVE_ONLY", max_results=1000):
+        """
+        Returns the list of experiments in the DSS project for which MLflow integration
+        is set up.
+
+        :param view_type: ACTIVE_ONLY, DELETED_ONLY or ALL
+        :type view_type: str
+        :param max_results: maximum number of results
+        :type max_results: int
+        :rtype: dict
+        """
+        response = self.client._perform_http(
+            "GET", "/api/2.0/mlflow/experiments/list?view_type={view_type}&max_results={max_results}".format(view_type=view_type, max_results=max_results),
+            headers={"x-dku-mlflow-project-key": self.project_key}
+        )
+        return response.json()
+
+    def rename_experiment(self, experiment_id, new_name):
+        """
+        Renames an experiment.
+
+        :param experiment_id: experiment id
+        :type experiment_id: str
+        :param new_name: new name
+        :type new_name: str
+        """
+        response = self.client._perform_http(
+            "POST", "/api/2.0/mlflow/experiments/update",
+            headers={"x-dku-mlflow-project-key": self.project_key},
+            body={"experiment_id": experiment_id, "new_name": new_name}
+        )
+        return response.json()
+
+    def restore_experiment(self, experiment_id):
+        """
+        Restores a deleted experiment.
+
+        :param experiment_id: experiment id
+        :type experiment_id: str
+        """
+        response = self.client._perform_http(
+            "POST", "/api/2.0/mlflow/experiments/restore",
+            headers={"x-dku-mlflow-project-key": self.project_key},
+            body={"experiment_id": experiment_id}
+        )
+        return response.json()
+
+    def restore_run(self, run_id):
+        """
+        Restores a deleted run.
+
+        :param run_id: run id
+        :type run_id: str
+        """
+        response = self.client._perform_http(
+            "POST", "/api/2.0/mlflow/runs/restore",
+            headers={"x-dku-mlflow-project-key": self.project_key},
+            body={"run_id": run_id}
+        )
+        return response.json()
+
+    def garbage_collect(self):
+        """
+        Permanently deletes the experiments and runs marked as "Deleted".
+        """
+        self.client._perform_http(
+            "GET", "/api/2.0/mlflow/extension/garbage-collect",
+            headers={"x-dku-mlflow-project-key": self.project_key}
+        )
+
+    def create_experiment_tracking_dataset(self, dataset_name, experiment_ids=[], view_type="ACTIVE_ONLY", filter_expr="", order_by=[], format="LONG"):
+        """
+        Creates a virtual dataset exposing experiment tracking data.
+
+        :param dataset_name: name of the dataset
+        :type dataset_name: str
+        :param experiment_ids: list of ids of experiments to filter on; no filtering if empty
+        :type experiment_ids: list(str)
+        :param view_type: one of ACTIVE_ONLY, DELETED_ONLY and ALL; default is ACTIVE_ONLY
+        :type view_type: str
+        :param filter_expr: MLflow search expression
+        :type filter_expr: str
+        :param order_by: list of order-by clauses; default is ordered by start_time, then runId
+        :type order_by: list(str)
+        :param format: LONG or JSON; default is LONG
+        :type format: str
+        """
+        self.client._perform_http(
+            "POST", "/api/2.0/mlflow/extension/create-project-experiments-dataset",
+            headers={"x-dku-mlflow-project-key": self.project_key},
+            body={
+                "datasetName": dataset_name,
+                "experimentIds": experiment_ids,
+                "viewType": view_type,
+                "filter": filter_expr,
+                "orderBy": order_by,
+                "format": format
+            }
+        )
+
+    def clean_experiment_tracking_db(self):
+        """
+        Cleans the experiments, runs, params, metrics, tags, etc. for this project.
+
+        This call requires an API key with admin rights.
+        """
+        self.client._perform_raw("DELETE", "/api/2.0/mlflow/extension/clean-db/%s" % self.project_key)
+
+    def set_run_inference_info(self, run_id, model_type, classes=None, code_env_name=None, target=None):
+        """
+        Sets the type of the model, and optionally other information useful to deploy or evaluate it.
+
+        model_type must be one of:
+        - REGRESSION
+        - BINARY_CLASSIFICATION
+        - MULTICLASS
+        - OTHER
+
+        Classes must be specified if and only if the model is a BINARY_CLASSIFICATION or MULTICLASS model.
+
+        This information is used to filter saved models on their prediction type, and to prefill the classes
+        when deploying an MLflow model as a version of a DSS Saved Model through the GUI.
+
+        :param run_id: run id for which to set the inference information
+        :type run_id: str
+        :param model_type: prediction type (see above)
+        :type model_type: str
+        :param classes: ordered list of classes (only for BINARY_CLASSIFICATION and MULTICLASS prediction types)
+        :type classes: list(str)
+        :param code_env_name: name of an adequate DSS Python code environment
+        :type code_env_name: str
+        :param target: name of the target
+        :type target: str
+        """
+        if model_type not in {"REGRESSION", "BINARY_CLASSIFICATION", "MULTICLASS", "OTHER"}:
+            raise ValueError('Invalid prediction type: {}'.format(model_type))
+
+        if classes and model_type not in {"BINARY_CLASSIFICATION", "MULTICLASS"}:
+            raise ValueError('Classes can be specified only for BINARY_CLASSIFICATION or MULTICLASS prediction types')
+        if model_type in {"BINARY_CLASSIFICATION", "MULTICLASS"}:
+            if not classes:
+                raise ValueError('Classes must be specified for {} prediction type'.format(model_type))
+            if not isinstance(classes, list):
+                raise ValueError('Wrong type for classes: {}'.format(type(classes)))
+            for cur_class in classes:
+                if cur_class is None:
+                    raise ValueError('class can not be None')
+                if not isinstance(cur_class, str):
+                    raise ValueError('Wrong type for class {}: {}'.format(cur_class, type(cur_class)))
+
+        if code_env_name and not isinstance(code_env_name, str):
+            raise ValueError('code_env_name must be a string')
+        if target and not isinstance(target, str):
+            raise ValueError('target must be a string')
+
+        params = {
+            "run_id": run_id,
+            "prediction_type": model_type
+        }
+        if classes:
+            params["classes"] = json.dumps(classes)
+        if code_env_name:
+            params["code_env_name"] = code_env_name
+        if target:
+            params["target"] = target
+
+        self.client._perform_http(
+            "POST", "/api/2.0/mlflow/extension/set-run-inference-info",
+            headers={"x-dku-mlflow-project-key": self.project_key},
+            body=params
+        )
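A hedged sketch of how these endpoints combine, using only the methods above; the project key, run id, and class labels are hypothetical:

import dataikuapi

client = dataikuapi.DSSClient("https://dss.example.com", "my-api-key")  # placeholders
ext = client.get_project("MYPROJECT").get_mlflow_extension()

# Browse tracked experiments and the models logged under one run
experiments = ext.list_experiments(view_type="ALL", max_results=100)
models = ext.list_models(run_id="0123456789abcdef")  # hypothetical run id

# Declare the run's model a binary classifier so DSS can prefill the
# classes when it is deployed as a Saved Model version
ext.set_run_inference_info(
    run_id="0123456789abcdef",
    model_type="BINARY_CLASSIFICATION",
    classes=["negative", "positive"],
)

# Expose the project's experiment tracking data as a virtual dataset
ext.create_experiment_tracking_dataset("experiments_long")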

dataikuapi/dss/project.py (27 additions, 5 deletions)

@@ -1,4 +1,7 @@
-import time, warnings, sys, os.path as osp
+import warnings, os.path as osp
+
+from ..dss_plugin_mlflow import MLflowHandle
+
 from .dataset import DSSDataset, DSSDatasetListItem, DSSManagedDatasetCreationHelper
 from .modelcomparison import DSSModelComparison
 from .jupyternotebook import DSSJupyterNotebook, DSSJupyterNotebookListItem

@@ -9,6 +12,7 @@
 from .managedfolder import DSSManagedFolder
 from .savedmodel import DSSSavedModel
 from .modelevaluationstore import DSSModelEvaluationStore
+from .mlflow import DSSMLflowExtension
 from .job import DSSJob, DSSJobWaiter
 from .scenario import DSSScenario, DSSScenarioListItem
 from .continuousactivity import DSSContinuousActivity

@@ -30,8 +34,8 @@ class DSSProject(object):
     Do not create this class directly, instead use :meth:`dataikuapi.DSSClient.get_project`
     """
     def __init__(self, client, project_key):
-        self.client = client
-        self.project_key = project_key
+        self.client = client
+        self.project_key = project_key
(whitespace-only change)

@@ -1589,7 +1593,7 @@ def list_hive_tables(self, hive_database):
         """
         connection_name = "@virtual(hive-jdbc):" + hive_database
         ret = self.client._perform_json("GET", "/projects/%s/datasets/tables-import/actions/list-tables" % (self.project_key),
-                params = {"connectionName": connection_name} )
+                params={"connectionName": connection_name} )

         def to_schema_table_pair(x):
             return {"schema":x.get("databaseName", None), "table":x["table"]}

@@ -1598,11 +1602,29 @@ def to_schema_table_pair(x):
     ########################################################
     # App designer
     ########################################################
-
     def get_app_manifest(self):
         raw_data = self.client._perform_json("GET", "/projects/%s/app-manifest" % self.project_key)
         return DSSAppManifest(self.client, raw_data, self.project_key)

+    # MLflow experiment tracking
+    ########################################################
+
+    def setup_mlflow(self, managed_folder, host=None):
+        """
+        Sets up the dss-plugin for MLflow.
+
+        :param object managed_folder: the :class:`dataikuapi.dss.managedfolder.DSSManagedFolder` where MLflow artifacts should be stored
+        :param str host: set a custom host if the backend used is not DSS
+        """
+        return MLflowHandle(client=self.client, project=self, managed_folder=managed_folder, host=host)
+
+    def get_mlflow_extension(self):
+        """
+        Gets a handle to interact with the MLflow extension provided by DSS.
+
+        :returns: an MLflow extension handle
+        :rtype: :class:`dataikuapi.dss.mlflow.DSSMLflowExtension`
+        """
+        return DSSMLflowExtension(client=self.client, project_key=self.project_key)

 class TablesImportDefinition(object):
     """

dataikuapi/dss/recipe.py (29 additions, 0 deletions)

@@ -3,6 +3,10 @@
 from .discussion import DSSObjectDiscussions
 import json, logging, warnings
 from .utils import DSSTaggableObjectListItem, DSSTaggableObjectSettings
+try:
+    basestring
+except NameError:
+    basestring = str

 class DSSRecipeListItem(DSSTaggableObjectListItem):
     """An item in a list of recipes. Do not instantiate this class, use :meth:`dataikuapi.dss.project.DSSProject.list_recipes`"""

@@ -34,6 +38,11 @@ def __init__(self, client, project_key, recipe_name):
         self.project_key = project_key
         self.recipe_name = recipe_name

+    @property
+    def id(self):
+        """The id of the recipe"""
+        return self.recipe_name
+
     @property
     def name(self):
         """The name of the recipe"""

@@ -225,6 +234,16 @@ def get_continuous_activity(self):
         from .continuousactivity import DSSContinuousActivity
         return DSSContinuousActivity(self.client, self.project_key, self.recipe_name)

+    def move_to_zone(self, zone):
+        """
+        Moves this object to a flow zone.
+
+        :param object zone: the :class:`dataikuapi.dss.flow.DSSFlowZone` to move the object to, or its identifier
+        """
+        if isinstance(zone, basestring):
+            zone = self.client.get_project(self.project_key).get_flow().get_zone(zone)
+        zone.add_item(self)
+
 class DSSRecipeStatus(object):
     """Status of a recipe.
     Do not create that directly, use :meth:`DSSRecipe.get_status`"""

@@ -1411,6 +1430,12 @@ class StandaloneEvaluationRecipeCreator(DSSRecipeCreator):
         builder.with_input("scored_dataset_to_evaluate")
         builder.with_output_evaluation_store(evaluation_store_id)

+        # Add a reference dataset (optional) to compute data drift
+        builder.with_reference_dataset("reference_dataset")
+
+        # Finish creation of the recipe
         new_recipe = builder.create()

         # Modify the model parameters in the SER settings

@@ -1465,6 +1490,10 @@ def with_output_evaluation_store(self, mes_id):
         """Sets the output model evaluation store"""
         return self._with_output(mes_id, role="main")

+    def with_reference_dataset(self, dataset_name):
+        """Sets the dataset to use as a reference in data drift computation (optional)."""
+        return self._with_input(dataset_name, self.project.project_key, role="reference")
+

 class ClusteringScoringRecipeCreator(SingleOutputRecipeCreator):
     """
