1
+ # Load packages.
2
+ import pandas as pd
3
+ from revoscalepy import RxInSqlServer , RxSqlServerData , RxComputeContext , rx_import
4
+ from sklearn .cluster import KMeans
5
+ from sklearn .decomposition import PCA
6
+ import matplotlib .pyplot as plt
7
+ from mpl_toolkits .mplot3d import Axes3D
8
+ from scipy .spatial .distance import cdist , pdist
9
+ import numpy as np
10
+
11
+
12
# Default connection string for the SQL Server instance holding the tpcxbb_1gb database.
_DEFAULT_CONNECTION_STRING = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

# Columns fed to KMeans. "customer" is deliberately excluded: it is a key, not a
# behavioural measurement, and including it would let the ID magnitude dominate
# every Euclidean distance.
_FEATURE_COLUMNS = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]

# Largest k tried by the elbow analysis (k = 1 .. _MAX_ELBOW_CLUSTERS).
_MAX_ELBOW_CLUSTERS = 19

# Per-customer return behaviour: purchases (store_sales) are left-joined to
# returns (store_returns), so customers without any return get ratios and a
# frequency of 0 through COALESCE; NULLIF guards the divisions against a zero
# denominator.
_CUSTOMER_RETURNS_QUERY = '''SELECT
  ss_customer_sk AS customer,
  ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
  ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
  ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
  COALESCE(returns_count, 0) AS frequency
FROM
  (
    SELECT
      ss_customer_sk,
      -- return order ratio
      COUNT(distinct(ss_ticket_number)) AS orders_count,
      -- return ss_item_sk ratio
      COUNT(ss_item_sk) AS orders_items,
      -- return monetary amount ratio
      SUM( ss_net_paid ) AS orders_money
    FROM store_sales s
    GROUP BY ss_customer_sk
  ) orders
LEFT OUTER JOIN
  (
    SELECT
      sr_customer_sk,
      -- return order ratio
      count(distinct(sr_ticket_number)) as returns_count,
      -- return ss_item_sk ratio
      COUNT(sr_item_sk) as returns_items,
      -- return monetary amount ratio
      SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''


def _load_customer_data(connection_string):
    """Run the customer-returns query and return the result as a pandas DataFrame."""
    # The ratio columns are ROUND(..., 7) values, so they must not be declared
    # as integers; monetaryRatio is selected by the query and is listed as well.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "numeric"},
        "itemsRatio": {"type": "numeric"},
        "monetaryRatio": {"type": "numeric"},
        "frequency": {"type": "integer"},
    }

    # NOTE(review): the keyword is spelled ``column_Info`` exactly as in the
    # original call. The documented parameter appears to be ``column_info``;
    # confirm how the installed revoscalepy treats this spelling before
    # renaming it, since that could change how columns are typed on import.
    data_source = RxSqlServerData(
        sql_query=_CUSTOMER_RETURNS_QUERY,
        column_Info=column_info,
        connection_string=connection_string,
    )

    # NOTE(review): the compute context object is created but its result is
    # discarded and nothing in this function activates it. Kept as in the
    # original; confirm whether it is needed at all.
    RxInSqlServer(connection_string=connection_string, num_tasks=1, auto_cleanup=False)

    # Import the data source and convert it to a pandas DataFrame.
    return pd.DataFrame(rx_import(data_source))


def _plot_elbow(features, random_state):
    """Plot the average within-cluster sum of squares for k = 1.._MAX_ELBOW_CLUSTERS."""
    cluster_counts = list(range(1, _MAX_ELBOW_CLUSTERS + 1))
    n_samples = features.shape[0]

    # ``inertia_`` is the sum of squared distances of the samples to their
    # closest centroid, which is the quantity named on the y axis.
    avg_within_ss = [
        KMeans(n_clusters=k, random_state=random_state).fit(features).inertia_ / n_samples
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, avg_within_ss, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()


def _print_cluster_summary(customer_data, n_clusters):
    """Print the member count of every cluster, then the per-cluster column means."""
    for label in range(n_clusters):
        member_count = int((customer_data['cluster'] == label).sum())
        print('Cluster{0}(n={1}):'.format(label, member_count))
        print('-------------------')

    # Mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())


def perform_clustering(connection_string=_DEFAULT_CONNECTION_STRING, n_clusters=4, random_state=111):
    """Cluster customers by their return behaviour using KMeans.

    The function loads per-customer return ratios from SQL Server, shows an
    elbow plot to help judge the number of clusters, fits the final KMeans
    model, and prints a short summary of the resulting clusters.

    Args:
        connection_string: Connection string for the SQL Server database that
            contains the ``store_sales`` and ``store_returns`` tables.
        n_clusters: Number of clusters for the final model. The default of 4
            is the value the original author picked from the elbow graph.
        random_state: Seed passed to every KMeans fit so that both the elbow
            plot and the final labels are reproducible.

    Returns:
        The customer DataFrame with an added ``cluster`` column holding the
        label assigned to each row.
    """
    customer_data = _load_customer_data(connection_string)
    print("Data frame:", customer_data.head(n=20))

    # The elbow analysis and the final model use the same feature columns;
    # otherwise the plot would describe a different clustering problem than
    # the one that is actually solved below.
    features = customer_data[_FEATURE_COLUMNS]
    _plot_elbow(features, random_state)

    est = KMeans(n_clusters=n_clusters, random_state=random_state).fit(features)
    customer_data['cluster'] = est.labels_

    _print_cluster_summary(customer_data, n_clusters)
    return customer_data
114
+
115
+
116
# Script entry point: run the whole clustering workflow when this file is executed.
perform_clustering()
0 commit comments