Skip to content

Commit 53d6201

Browse files
Added new Python tutorial
1 parent 2dda902 commit 53d6201

File tree

2 files changed

+220
-0
lines changed

2 files changed

+220
-0
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
USE [tpcxbb_1gb]
GO

-- Stored procedure that performs customer clustering using Python and SQL Server ML Services.
--
-- Purpose:
--   Builds per-customer return metrics from store_sales / store_returns, runs K-Means on
--   them inside sp_execute_external_script, and returns one row per customer with the
--   metrics and the assigned cluster label.
--
-- Parameters:
--   @n_clusters  Number of K-Means clusters. Defaults to 4, so callers that execute the
--                procedure without arguments get the same clustering as before.
--
-- Result set:
--   Customer (bigint), orderRatio, itemsRatio, monetaryRatio, frequency (float), cluster (int).
--   The Customer and cluster types are declared to match the [dbo].[py_customer_clusters]
--   table that this script fills with INSERT ... EXEC.
DROP PROCEDURE IF EXISTS [dbo].[py_generate_customer_return_clusters]
GO

CREATE PROCEDURE [dbo].[py_generate_customer_return_clusters]
    @n_clusters INT = 4
AS
BEGIN
    -- Reject cluster counts that K-Means cannot work with before starting the external runtime.
    IF @n_clusters IS NULL OR @n_clusters < 1
    BEGIN
        RAISERROR(N'@n_clusters must be a positive integer.', 16, 1);
        RETURN;
    END;

    DECLARE
        -- Input query to generate the purchase history & return metrics.
        -- NULLIF(..., 0) guards the divisions; COALESCE turns customers without
        -- any returns (no match in the LEFT JOIN) into ratio / frequency 0.
        @input_query NVARCHAR(MAX) = N'
SELECT
    ss_customer_sk AS customer,
    CAST( (ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) ) AS FLOAT) AS orderRatio,
    CAST( (ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) ) AS FLOAT) AS itemsRatio,
    CAST( (ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) ) AS FLOAT) AS monetaryRatio,
    CAST( (COALESCE(returns_count, 0)) AS FLOAT) AS frequency
FROM
(
    SELECT
        ss_customer_sk,
        -- number of distinct orders
        COUNT(DISTINCT ss_ticket_number) AS orders_count,
        -- number of ordered items
        COUNT(ss_item_sk) AS orders_items,
        -- total amount paid
        SUM(ss_net_paid) AS orders_money
    FROM store_sales s
    GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
    SELECT
        sr_customer_sk,
        -- number of distinct returned orders
        COUNT(DISTINCT sr_ticket_number) AS returns_count,
        -- number of returned items
        COUNT(sr_item_sk) AS returns_items,
        -- total amount returned
        SUM(sr_return_amt) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk
) returned ON ss_customer_sk = sr_customer_sk
';

    -- The Python script below must stay at column 0: it is passed verbatim to the interpreter.
    EXEC sp_execute_external_script
          @language = N'Python'
        , @script = N'
import pandas as pd
from sklearn.cluster import KMeans

# Data produced by the input query
customer_data = my_input_data

# n_clusters is bound from the @n_clusters procedure parameter through @params

# Perform clustering on the four return metrics
est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderRatio","itemsRatio","monetaryRatio","frequency"]])
clusters = est.labels_
customer_data["cluster"] = clusters

OutputDataSet = customer_data
'
        , @input_data_1 = @input_query
        , @input_data_1_name = N'my_input_data'
        , @params = N'@n_clusters INT'
        , @n_clusters = @n_clusters
    WITH RESULT SETS (("Customer" BIGINT, "orderRatio" FLOAT, "itemsRatio" FLOAT, "monetaryRatio" FLOAT, "frequency" FLOAT, "cluster" INT));
END;
GO
74+
75+
76+
77+
-- Recreate the table that stores the clustering data.
DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
GO

-- One row per customer: the customer key, the four metrics and the assigned cluster.
-- Filled further down in this script from [dbo].[py_generate_customer_return_clusters].
CREATE TABLE [dbo].[py_customer_clusters]
(
    [Customer]      BIGINT NULL,
    [OrderRatio]    FLOAT  NULL,
    [itemsRatio]    FLOAT  NULL,
    [monetaryRatio] FLOAT  NULL,
    [frequency]     FLOAT  NULL,
    [cluster]       INT    NULL
) ON [PRIMARY]
GO
90+
91+
-- Execute the clustering and insert the results into the table.
-- The explicit column list documents how the procedure's result set maps onto the
-- table, so the load does not silently depend on the table's column order.
INSERT INTO [dbo].[py_customer_clusters]
    ([Customer], [OrderRatio], [itemsRatio], [monetaryRatio], [frequency], [cluster])
EXEC [dbo].[py_generate_customer_return_clusters];

-- Select contents of the table
SELECT
    [Customer],
    [OrderRatio],
    [itemsRatio],
    [monetaryRatio],
    [frequency],
    [cluster]
FROM [dbo].[py_customer_clusters];

-- Get email addresses of customers in cluster 0
SELECT
    customer.[c_email_address],
    customer.c_customer_sk
FROM dbo.customer
INNER JOIN [dbo].[py_customer_clusters] AS c
    ON c.Customer = customer.c_customer_sk
WHERE c.cluster = 0;
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# Load packages.
2+
import pandas as pd
3+
from revoscalepy import RxInSqlServer, RxSqlServerData, RxComputeContext, rx_import
4+
from sklearn.cluster import KMeans
5+
from sklearn.decomposition import PCA
6+
import matplotlib.pyplot as plt
7+
from mpl_toolkits.mplot3d import Axes3D
8+
from scipy.spatial.distance import cdist, pdist
9+
import numpy as np
10+
11+
12+
def perform_clustering():
    """Cluster customers by their return behaviour and print a per-cluster summary.

    Steps:
      1. Run a query against the ``tpcxbb_1gb`` database on ``localhost`` that yields,
         per customer, the order/items/monetary return ratios and the return frequency.
      2. Plot an elbow curve (k = 1..19) of the mean distance to the nearest centroid.
      3. Fit K-Means with 4 clusters on the four metrics, attach the label as a
         ``cluster`` column, and print the cluster sizes and per-cluster means.

    Takes no arguments and returns ``None``; results are printed and plotted.
    ``plt.show()`` is called for the elbow plot before the final model is fitted.
    """
    ##########################################################################################################################################
    ## Connect to DB and select data
    ##########################################################################################################################################

    # Connection string to connect to SQL Server named instance
    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

    # NULLIF(..., 0) guards the divisions; COALESCE maps customers without returns to 0.
    input_query = '''SELECT
ss_customer_sk AS customer,
ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
COALESCE(returns_count, 0) AS frequency
FROM
(
  SELECT
    ss_customer_sk,
    -- return order ratio
    COUNT(distinct(ss_ticket_number)) AS orders_count,
    -- return ss_item_sk ratio
    COUNT(ss_item_sk) AS orders_items,
    -- return monetary amount ratio
    SUM( ss_net_paid ) AS orders_money
  FROM store_sales s
  GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
  SELECT
    sr_customer_sk,
    -- return order ratio
    count(distinct(sr_ticket_number)) as returns_count,
    -- return ss_item_sk ratio
    COUNT(sr_item_sk) as returns_items,
    -- return monetary amount ratio
    SUM( sr_return_amt ) AS returns_money
  FROM store_returns
  GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # The metrics the model is trained on. The customer key is an identifier,
    # not a feature, so it is kept out of every distance computation.
    feature_columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]

    # Define the columns we wish to import. The ratios are fractional, so they are
    # imported as "numeric" (float64); typing them as "integer" would truncate them.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "numeric"},
        "itemsRatio": {"type": "numeric"},
        "monetaryRatio": {"type": "numeric"},
        "frequency": {"type": "integer"}
    }

    # The keyword is `column_info` (lower case); the previous `column_Info` spelling
    # is not the documented parameter name.
    data_source = RxSqlServerData(sql_query=input_query, column_info=column_info, connection_string=conn_str)
    # NOTE(review): this compute context is constructed but never assigned or activated
    # in this function - confirm whether it is still needed.
    RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)
    # import data source and convert to pandas dataframe
    customer_data = pd.DataFrame(rx_import(data_source))
    print("Data frame:", customer_data.head(n=20))

    ##########################################################################################################################################
    ## Determine number of clusters using the Elbow method
    ##########################################################################################################################################

    # Use the same feature columns (and the same seed) as the final model, so the
    # elbow curve describes the clustering that is actually performed below.
    cdata = customer_data[feature_columns]
    K = range(1, 20)
    KM = [KMeans(n_clusters=k, random_state=111).fit(cdata) for k in K]
    centroids = [model.cluster_centers_ for model in KM]

    # For every k: distance of each customer to its nearest centroid, averaged over customers.
    D_k = [cdist(cdata, cent, 'euclidean') for cent in centroids]
    dist = [np.min(D, axis=1) for D in D_k]
    avg_distance = [sum(d) / cdata.shape[0] for d in dist]
    plt.plot(K, avg_distance, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    # The plotted value is a mean Euclidean distance, not a sum of squares.
    plt.ylabel('Average distance to nearest centroid')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ##########################################################################################################################################
    ## Perform clustering using Kmeans
    ##########################################################################################################################################

    # It looks like k=4 is a good number to use based on the elbow graph
    n_clusters = 4

    est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[feature_columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{0}(n={1}):'.format(c, len(cluster_members)))
        print('-------------------')

    # Print mean values per cluster
    print(customer_data.groupby(['cluster']).mean())


perform_clustering()

0 commit comments

Comments
 (0)