Skip to content

Commit 104c56a

Browse files
Updated customer clustering .py and .sql files
1 parent 53d6201 commit 104c56a

File tree

7 files changed

+138
-127
lines changed

7 files changed

+138
-127
lines changed

.vs/VSWorkspaceState.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"ExpandedNodes": [
3+
"",
4+
"\\samples",
5+
"\\samples\\features"
6+
],
7+
"SelectedNode": "\\samples\\features\\readme.md",
8+
"PreviewInSolutionExplorer": false
9+
}

.vs/slnx.sqlite

5.54 MB
Binary file not shown.

.vs/slnx.sqlite-journal

918 KB
Binary file not shown.
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Load packages.
2+
import matplotlib.pyplot as plt
3+
import numpy as np
4+
import pandas as pd
5+
import revoscalepy as revoscale
6+
from scipy.spatial import distance as sci_distance
7+
from sklearn import cluster as sk_cluster
8+
9+
10+
11+
def perform_clustering():
    """Cluster customers of the tpcxbb_1gb database by their return behaviour.

    The function takes no arguments and performs these steps in order:

    1. Run a query against SQL Server (through revoscalepy) that yields, per
       customer, the ratio of returns to orders by count, by items and by
       money, plus the number of returns.
    2. Plot an elbow curve for k = 1..19 to help choose the cluster count.
    3. Fit KMeans with four clusters on the feature columns and store the
       labels in a new ``cluster`` column.
    4. Print the size of every cluster and the per-cluster column means.

    Returns:
        None. Output is printed and the elbow chart is shown with matplotlib.
    """
    ################################################################################################

    ## Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

    # NULLIF(..., 0) avoids division by zero; COALESCE turns the NULLs produced
    # by the LEFT OUTER JOIN (customers without returns) into 0.
    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
      FROM store_returns
      GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    # NOTE(review): the query rounds the ratio columns to 7 decimals, yet they
    # are declared "integer" here, and monetaryRatio is not declared at all --
    # confirm the intended types against the revoscalepy documentation.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "integer"},
        "itemsRatio": {"type": "integer"},
        "frequency": {"type": "integer"}
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info,
                                            connection_string=conn_str)

    # Import the data source and convert to a pandas dataframe.
    # The package is imported under the alias `revoscale`; the bare name
    # `revoscalepy` is not defined in this module.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=20))

    # Columns used as clustering features; the customer key is only an
    # identifier and is deliberately excluded.
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]

    ################################################################################################

    ## Determine number of clusters using the Elbow method

    ################################################################################################

    # Evaluate the same feature columns the final model is trained on, so the
    # elbow curve describes the space that is actually clustered.
    features = customer_data[columns]
    K = range(1, 20)
    avgWithinSS = []
    for k in K:
        model = sk_cluster.KMeans(n_clusters=k).fit(features)
        # Distance of every row to every centroid; keep the nearest one.
        distances = sci_distance.cdist(features, model.cluster_centers_, 'euclidean')
        nearest = np.min(distances, axis=1)
        # The plotted value is the mean distance to the nearest centroid.
        avgWithinSS.append(nearest.sum() / features.shape[0])

    plt.plot(K, avgWithinSS, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ################################################################################################

    ## Perform clustering using Kmeans

    ################################################################################################

    # It looks like k=4 is a good number to use based on the elbow graph.
    n_clusters = 4

    # A fixed random_state makes the cluster assignment repeatable.
    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-'* 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())
116+
117+
118+
# Run the full clustering workflow when this script is executed.
perform_clustering()
119+

samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@ USE [tpcxbb_1gb]
22
GO
33

44
-- Stored procedure that performs customer clustering using Python and SQL Server ML Services
5-
DROP PROCEDURE IF EXISTS [dbo].[py_generate_customer_return_clusters]
6-
GO
7-
CREATE procedure [dbo].[py_generate_customer_return_clusters]
5+
CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters]
86
AS
97

108
BEGIN
@@ -53,9 +51,6 @@ EXEC sp_execute_external_script
5351
import pandas as pd
5452
from sklearn.cluster import KMeans
5553
56-
#get data from input query
57-
customer_data = my_input_data
58-
5954
#We concluded in step2 in the tutorial that 4 would be a good number of clusters
6055
n_clusters = 4
6156
@@ -64,16 +59,16 @@ est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderR
6459
clusters = est.labels_
6560
customer_data["cluster"] = clusters
6661
67-
OutputDataSet = customer_data
62+
#OutputDataSet = customer_data
6863
'
6964
, @input_data_1 = @input_query
70-
, @input_data_1_name = N'my_input_data'
65+
, @input_data_1_name = N'customer_data'
66+
,@output_data_1_name = N'customer_data'
7167
with result sets (("Customer" int, "orderRatio" float,"itemsRatio" float,"monetaryRatio" float,"frequency" float,"cluster" float));
7268
END;
7369
GO
7470

7571

76-
7772
--Creating a table for storing the clustering data
7873
DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
7974
GO
@@ -101,4 +96,4 @@ SELECT customer.[c_email_address], customer.c_customer_sk
10196
JOIN
10297
[dbo].[py_customer_clusters] as c
10398
ON c.Customer = customer.c_customer_sk
104-
WHERE c.cluster = 0;
99+
WHERE c.cluster = 0;

samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering_ng.py

Lines changed: 0 additions & 116 deletions
This file was deleted.

samples/features/readme.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management
1010

1111
[R Services](r-services)
1212

13-
SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics.
13+
SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database.
14+
15+
[ML Services](ml-services)
16+
17+
SQL Server ML Services (in SQL Server 2017 and above) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database.
1418

1519
[JSON Support](json)
1620

0 commit comments

Comments
 (0)