Skip to content

Commit c73465d

Browse files
authored
Ai examples (#79)
* Added ai example files * Updated Readme * Added formatting changes * Updated examples * Added formatting changes * Removed unneeded lines * Made fixes * Added changes to call open ai by mlflow gateway and creating gateway routes * Removed extra code and added example notebook for creating mlflow gateway routes * formatting changes * fixed name * Updates notes * Made changes as per Review comments --------- Co-authored-by: souravg-db <souravg-db>
1 parent 7c462df commit c73465d

File tree

4 files changed

+393
-0
lines changed

4 files changed

+393
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ Operations are applied concurrently across multiple tables
1717
* Visualise quantity of data written per table per period
1818
* **Governance**
1919
* PII detection with Presidio ([example notebook](examples/pii_detection_presidio.py))
20+
* Text Analysis with MosaicML and Databricks MLflow ([example notebook](examples/text_analysis_mosaicml_mlflow.py))
21+
* Text Analysis with OpenAI GPT ([example notebook](examples/text_analysis_openai_gpt.py))
2022
* [GDPR right of access: extract user data from all tables at once](docs/GDPR_RoA.md)
2123
* [GDPR right of erasure: delete user data from all tables at once](docs/GDPR_RoE.md)
2224
* [Search in any column](docs/Search.md)
@@ -26,6 +28,7 @@ Operations are applied concurrently across multiple tables
2628
* [Delete data based on semantic classes](docs/Delete_by_class.md)
2729
* **Custom**
2830
* [Arbitrary SQL template execution across multiple tables](docs/Arbitrary_multi-table_SQL.md)
31+
* Create MLflow gateway routes for MosaicML and OpenAI ([example notebook](examples/mlflow_gateway_routes_examples.py))
2932

3033
## Getting started
3134

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Databricks notebook source
2+
# MAGIC %md
3+
# MAGIC # Create MLflow Gateway Routes for MosaicML & OpenAI
4+
# MAGIC This notebook provides examples of creating mlflow gateway routes for MosaicML & OpenAI
5+
# MAGIC
6+
# MAGIC **NOTE**:
7+
# MAGIC - This notebook requires >= DBR 13.3 LTS ML Runtime
8+
# MAGIC - Please refer to [configuring-the-ai-gateway](https://mlflow.org/docs/latest/gateway/index.html#configuring-the-ai-gateway) for more info
9+
10+
# COMMAND ----------
11+
12+
# MAGIC %md
13+
# MAGIC ### Install dependencies
14+
15+
# COMMAND ----------
16+
17+
# MAGIC %pip install mlflow[gateway]
18+
# MAGIC dbutils.library.restartPython()
19+
20+
# COMMAND ----------
21+
22+
# MAGIC %md
23+
# MAGIC ## Setup widgets
24+
25+
# COMMAND ----------
26+
27+
# Text widgets for the two gateway route names: (widget name, default value, label).
for widget_name, default_value, widget_label in (
    ("moasicml_route_name", "discoverx-mosaicml-llama2-70b-completions", "mosaicml route name"),
    ("openai_route_name", "discoverx-openai-gpt-3.5-completions", "openai route name"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)
29+
30+
# COMMAND ----------
31+
32+
# MAGIC %md
33+
# MAGIC ## Import required libs and initialize variables
34+
35+
# COMMAND ----------
36+
37+
import mlflow
38+
from mlflow import gateway
39+
40+
# COMMAND ----------
41+
42+
# Read the route names currently set in the notebook widgets.
moasicml_route_name, openai_route_name = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("moasicml_route_name", "openai_route_name")
)
44+
45+
# COMMAND ----------
46+
47+
# MAGIC %md
48+
# MAGIC ### Create MLflow gateway route for MosaicML (llama2 model)
49+
50+
# COMMAND ----------
51+
52+
# Get the MosaicML completions route, creating it when the lookup fails.
import mlflow
from mlflow import gateway

gateway.set_gateway_uri(gateway_uri="databricks")

try:
    route = gateway.get_route(moasicml_route_name)
except Exception:
    # A failed lookup is taken to mean the route does not exist yet.
    # `Exception` instead of a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate, so the cell can still be cancelled.
    # NOTE(review): narrow this to the gateway's "route not found" error
    # once its exact type is confirmed.
    print(f"Creating the route {moasicml_route_name}")
    # Create a completions route backed by MosaicML; keep the result in
    # `route` so the variable is bound on both paths.
    route = gateway.create_route(
        name=moasicml_route_name,
        route_type="llm/v1/completions",
        model={
            "name": "llama2-70b-chat",
            "provider": "mosaicml",
            "mosaicml_config": {
                # The API key is read from a Databricks secret scope rather
                # than being hard-coded in the notebook.
                "mosaicml_api_key": dbutils.secrets.get(scope="discoverx", key="mosaic_ml_api_key")
            },
        },
    )
    print(route)
76+
77+
# COMMAND ----------
78+
79+
# MAGIC %md
80+
# MAGIC ### Create MLflow gateway route for Open AI (GPT 3.5 model)
81+
82+
# COMMAND ----------
83+
84+
# Get the OpenAI completions route, creating it when the lookup fails.
import mlflow
from mlflow import gateway

gateway.set_gateway_uri(gateway_uri="databricks")

try:
    route = gateway.get_route(openai_route_name)
except Exception:
    # A failed lookup is taken to mean the route does not exist yet.
    # `Exception` instead of a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate, so the cell can still be cancelled.
    # NOTE(review): narrow this to the gateway's "route not found" error
    # once its exact type is confirmed.
    print(f"Creating the route {openai_route_name}")
    # Create a completions route backed by OpenAI (Azure API type); keep the
    # result in `route` so the variable is bound on both paths.
    route = gateway.create_route(
        name=openai_route_name,
        route_type="llm/v1/completions",
        model={
            "name": "gpt-35-turbo",
            "provider": "openai",
            "openai_config": {
                # Credentials and endpoint details are read from a Databricks
                # secret scope rather than being hard-coded in the notebook.
                "openai_api_key": dbutils.secrets.get(scope="discoverx", key="openaikey"),
                "openai_api_base": dbutils.secrets.get(scope="discoverx", key="openaibase"),
                "openai_deployment_name": dbutils.secrets.get(scope="discoverx", key="openai_deployment_name"),
                "openai_api_type": "azure",
                "openai_api_version": "2023-05-15",
            },
        },
    )
    print(route)
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# Databricks notebook source
2+
# MAGIC %md
3+
# MAGIC # Text analysis with DiscoverX, MosaicML & Databricks MLflow
4+
# MAGIC
5+
# MAGIC This notebook uses [DiscoverX](https://github.com/databrickslabs/discoverx) to analyze text with [MosaicML](https://www.mosaicml.com/blog/llama2-inference) over a set of tables in Unity Catalog.
6+
# MAGIC
7+
# MAGIC The notebook will:
8+
# MAGIC 1. Use DiscoverX to sample a set of tables from Unity Catalog and unpivot all string columns into a long format dataset
9+
# MAGIC 2. Run text analysis with MosaicML llama2-70b model & Databricks MLflow
10+
# MAGIC
11+
# MAGIC **NOTE**:
12+
# MAGIC - This notebook requires >= DBR 13.3 LTS ML Runtime
13+
# MAGIC - This notebook requires an MLflow gateway route for MosaicML. For examples of creating routes, please refer to the [README.md](https://github.com/databrickslabs/discoverx/blob/master/README.md) file.
14+
# MAGIC - For detailed information about the cost of API hits, please refer to [MosaicML Inference](https://www.mosaicml.com/inference)
15+
16+
# COMMAND ----------
17+
18+
# MAGIC %md
19+
# MAGIC ## Install dependencies
20+
21+
# COMMAND ----------
22+
23+
# MAGIC %pip install mlflow[gateway]
24+
# MAGIC dbutils.library.restartPython()
25+
26+
# COMMAND ----------
27+
28+
# MAGIC %md
29+
# MAGIC ## Setup widgets
30+
31+
# COMMAND ----------
32+
33+
# Text widgets for the notebook inputs: (widget name, default value, label).
for widget_name, default_value, widget_label in (
    ("from_tables", "discoverx_sample.*.*", "from tables"),
    ("moasicml_route_name", "discoverx-mosaicml-llama2-70b-completions", "mosaicml route name"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)
35+
36+
# COMMAND ----------
37+
38+
# MAGIC %md
39+
# MAGIC ## Import required libs and initialize variables
40+
41+
# COMMAND ----------
42+
43+
import pandas as pd
44+
from pyspark.sql.functions import (
45+
pandas_udf,
46+
col,
47+
concat,
48+
lit,
49+
explode,
50+
count,
51+
avg,
52+
min,
53+
max,
54+
sum,
55+
collect_set,
56+
concat_ws,
57+
)
58+
from pyspark.sql.types import ArrayType, StringType, StructType, FloatType, StructField
59+
from typing import Iterator
60+
61+
# COMMAND ----------
62+
63+
# Sample rows size, passed to the unpivot step further down.
sample_size = 100

# Table pattern and gateway route name as currently set in the widgets.
from_tables, moasicml_route_name = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("from_tables", "moasicml_route_name")
)
68+
69+
# COMMAND ----------
70+
71+
# MAGIC %md
72+
# MAGIC ## Initialize discoverx
73+
74+
# COMMAND ----------
75+
76+
from discoverx import DX
77+
78+
# Create the DiscoverX entry point; `dx.from_tables(...)` is used later in this notebook.
dx = DX()
79+
80+
# COMMAND ----------
81+
82+
# MAGIC %md
83+
# MAGIC ## Transform all sampled tables
84+
85+
# COMMAND ----------
86+
87+
# Select the tables matching `from_tables` and unpivot their string columns,
# sampling with `sample_size`.
_unpivot_plan = dx.from_tables(from_tables).unpivot_string_columns(sample_size=sample_size)

# Checkpointing to reduce the query plan size
unpivoted_df = _unpivot_plan.apply().localCheckpoint()
93+
94+
# COMMAND ----------
95+
96+
# Show the unpivoted result in the notebook output.
display(unpivoted_df)
97+
98+
# COMMAND ----------
99+
100+
# MAGIC %md
101+
# MAGIC ### Define udf to use MosaicML model
102+
103+
# COMMAND ----------
104+
105+
import mlflow
106+
from mlflow import gateway
107+
108+
109+
@pandas_udf(StringType())
def predict_value_udf(s: pd.Series) -> pd.Series:
    """Ask the MLflow gateway route whether each text is about an acquisition/merger.

    Args:
        s: Batch of text values to classify.

    Returns:
        A series aligned with ``s`` holding the text of the first candidate
        returned by the route for each value, or ``None`` where the input
        value is null.
    """

    def predict_value(text):
        # Skip null values: formatting them into the prompt would send the
        # literal string "None" to the endpoint.
        if pd.isna(text):
            return None

        data = {
            "prompt": f""" [INST]
<<SYS>>
Reply with either YES or NO
<</SYS>>
Is this news article related to aquisition/merger ?
News Article: {text}
[/INST]
"""
        }
        r = mlflow.gateway.query(route=moasicml_route_name, data=data)
        # Only the first candidate's text is kept.
        return r["candidates"][0]["text"]

    return s.apply(predict_value)
126+
127+
128+
# COMMAND ----------
129+
130+
# MAGIC %md
131+
# MAGIC ### Run Predictions
132+
133+
# COMMAND ----------
134+
135+
# Apply the UDF to the `string_value` column and store its reply in a new column.
# NOTE(review): the column name is misspelled ("realted", "aquisition"); it is kept
# unchanged here because renaming it would alter the output schema.
df_with_prediction = unpivoted_df.withColumn("is_realted_to_aquisition", predict_value_udf(col("string_value")))
136+
137+
# COMMAND ----------
138+
139+
# Show the rows together with the model's reply in the notebook output.
display(df_with_prediction)

0 commit comments

Comments
 (0)