Skip to content

Commit 7171589

Browse files
authored
Merge pull request #271 from NelGson/master
New Python clustering tutorial
2 parents 95a6346 + 21a5601 commit 7171589

File tree

3 files changed

+225
-2
lines changed

3 files changed

+225
-2
lines changed
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Load packages.
2+
import matplotlib.pyplot as plt
3+
import numpy as np
4+
import pandas as pd
5+
import revoscalepy as revoscale
6+
from scipy.spatial import distance as sci_distance
7+
from sklearn import cluster as sk_cluster
8+
9+
10+
11+
def perform_clustering():
    """Cluster customers by their return behaviour and print a summary.

    The function runs the whole workflow in order:

    1. Query per-customer order/return metrics from the ``tpcxbb_1gb``
       database on the local SQL Server instance through ``revoscalepy``.
    2. Plot an elbow curve for k = 1..19 to help pick the number of clusters.
    3. Fit K-means with four clusters on the ratio and frequency columns and
       store each customer's label in a new ``cluster`` column.
    4. Print the size of every cluster and the per-cluster column means.

    Returns:
        None. Results are reported through ``print`` and a matplotlib figure.
    """
    ############################################################################
    ## Connect to DB and select data
    ############################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

    # One row per customer in store_sales; customers without any return are
    # kept by the LEFT OUTER JOIN and get 0 for every metric via COALESCE.
    input_query = '''SELECT
ss_customer_sk AS customer,
ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
COALESCE(returns_count, 0) AS frequency
FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    # NOTE(review): the ratio columns are declared as "integer" and
    # monetaryRatio is not listed, although the query rounds the ratios to
    # seven decimals -- confirm the intended types against the
    # RxSqlServerData column_info documentation.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "integer"},
        "itemsRatio": {"type": "integer"},
        "frequency": {"type": "integer"}
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info,
                                            connection_string=conn_str)

    # Import the data source and convert it to a pandas DataFrame. The package
    # is imported under the alias ``revoscale``, so the call has to go through
    # that alias; the bare name ``revoscalepy`` is not bound in this module.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=20))

    ############################################################################
    ## Determine number of clusters using the Elbow method
    ############################################################################

    # NOTE(review): the elbow models are fitted on every imported column
    # (including the customer id), while the final model below only uses the
    # four metric columns -- confirm that this is intended.
    cdata = customer_data
    k_values = range(1, 20)

    # Lazy pipeline: fit one model per k, take its centroids, measure the
    # distance of every row to every centroid and keep the nearest one.
    models = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in k_values)
    centroids = (model.cluster_centers_ for model in models)
    distances = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
    nearest = (np.min(d, axis=1) for d in distances)

    # Mean distance of a row to its nearest centroid, one value per k.
    avg_within = [sum(d) / cdata.shape[0] for d in nearest]

    plt.plot(k_values, avg_within, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ############################################################################
    ## Perform clustering using Kmeans
    ############################################################################

    # It looks like k=4 is a good number to use based on the elbow graph.
    n_clusters = 4

    # A fixed random_state keeps the cluster assignment reproducible.
    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    customer_data['cluster'] = est.labels_

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())
116+
117+
118+
# Run the full clustering workflow as soon as this module is loaded.
perform_clustering()
119+
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
USE [tpcxbb_1gb]
2+
GO
3+
4+
-- Customer clustering with Python through SQL Server ML Services.
-- The procedure takes no parameters. It feeds the per-customer return metrics
-- to an embedded Python script and returns one row per customer: the four
-- metrics plus the K-means cluster label.
CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters]
AS
BEGIN
    -- Query that builds the purchase-history and return metrics per customer.
    -- Customers without returns are kept by the LEFT OUTER JOIN and get 0.
    DECLARE @input_query NVARCHAR(MAX) = N'
SELECT
ss_customer_sk AS customer,
CAST( (ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) ) AS FLOAT) AS orderRatio,
CAST( (ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) ) AS FLOAT) AS itemsRatio,
CAST( (ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) ) AS FLOAT) AS monetaryRatio,
CAST( (COALESCE(returns_count, 0)) AS FLOAT) AS frequency
FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk
) returned ON ss_customer_sk=sr_customer_sk
';

    -- The embedded Python must stay at column 0: it is executed verbatim.
    -- The input query arrives in the script as the data frame customer_data,
    -- and the same (now labelled) data frame is handed back as the output.
    EXEC sp_execute_external_script
          @language = N'Python'
        , @script = N'

import pandas as pd
from sklearn.cluster import KMeans

#We concluded in step2 in the tutorial that 4 would be a good number of clusters
n_clusters = 4

#Perform clustering
est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderRatio","itemsRatio","monetaryRatio","frequency"]])
clusters = est.labels_
customer_data["cluster"] = clusters

#OutputDataSet = customer_data
'
        , @input_data_1 = @input_query
        , @input_data_1_name = N'customer_data'
        , @output_data_1_name = N'customer_data'
    WITH RESULT SETS (("Customer" int, "orderRatio" float, "itemsRatio" float, "monetaryRatio" float, "frequency" float, "cluster" float));
END;
69+
GO
70+
71+
72+
-- Start from a clean slate: remove any earlier copy of the results table.
DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
GO

-- Results table: one row per customer with the four return metrics and the
-- cluster the customer was assigned to.
CREATE TABLE [dbo].[py_customer_clusters] (
    [Customer]      [bigint] NULL,
    [OrderRatio]    [float]  NULL,
    [itemsRatio]    [float]  NULL,
    [monetaryRatio] [float]  NULL,
    [frequency]     [float]  NULL,
    [cluster]       [int]    NULL
) ON [PRIMARY]
GO

-- Run the clustering procedure and persist its result set.
INSERT INTO py_customer_clusters
EXEC [dbo].[py_generate_customer_return_clusters];

-- Inspect the stored rows.
SELECT * FROM py_customer_clusters;

-- Email addresses of the customers assigned to cluster 0.
SELECT customer.[c_email_address], customer.c_customer_sk
FROM dbo.customer
JOIN [dbo].[py_customer_clusters] AS c
    ON c.Customer = customer.c_customer_sk
WHERE c.cluster = 0;
100+

samples/features/readme.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management
1010

1111
[R Services](r-services)
1212

13-
SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics.
13+
SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database.
14+
15+
[ML Services](ml-services)
16+
17+
SQL Server ML Services (SQL Server 2017) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database.
1418

1519
[JSON Support](json)
1620

@@ -28,4 +32,4 @@ Graph tables enable you to add a non-relational capability to your database.
2832

2933
[Reporting Services (SSRS)](reporting-services)
3034

31-
Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.
35+
Reporting Services provides reporting capabilities for your organization. Reporting Services can be integrated with SharePoint Server or used as a standalone service.

0 commit comments

Comments
 (0)