|
| 1 | +# Load packages. |
| 2 | +import matplotlib.pyplot as plt |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +import revoscalepy as revoscale |
| 6 | +from scipy.spatial import distance as sci_distance |
| 7 | +from sklearn import cluster as sk_cluster |
| 8 | + |
| 9 | + |
| 10 | + |
def perform_clustering():
    """Cluster customers by their return behaviour and print a summary.

    Steps, in order:
      1. Import per-customer return statistics from SQL Server.
      2. Show an elbow chart for k = 1..19 to help choose a cluster count.
      3. Fit a 4-cluster KMeans model on the ratio/frequency columns, store
         the label of each customer in a new ``cluster`` column, and print
         the size of every cluster plus the per-cluster column means.

    Returns:
        None. All results are printed or plotted.
    """
    # The columns that describe return behaviour. ``customer`` is only an
    # identifier and must not take part in any distance computation.
    feature_columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]

    customer_data = _load_customer_data()
    print("Data frame:", customer_data.head(n=20))

    # Use the same feature columns for the elbow chart as for the final
    # model, so the chart actually describes the model being fitted.
    _plot_elbow(customer_data[feature_columns], range(1, 20))

    # It looks like k=4 is a good number to use based on the elbow graph.
    n_clusters = 4
    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    est = means_cluster.fit(customer_data[feature_columns])
    customer_data['cluster'] = est.labels_

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())


def _load_customer_data():
    """Return the per-customer return statistics as a pandas DataFrame."""
    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
        ss_customer_sk AS customer,
        ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
        ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
        ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
        COALESCE(returns_count, 0) AS frequency
        FROM
        (
          SELECT
            ss_customer_sk,
            -- return order ratio
            COUNT(distinct(ss_ticket_number)) AS orders_count,
            -- return ss_item_sk ratio
            COUNT(ss_item_sk) AS orders_items,
            -- return monetary amount ratio
            SUM( ss_net_paid ) AS orders_money
          FROM store_sales s
          GROUP BY ss_customer_sk
        ) orders
        LEFT OUTER JOIN
        (
          SELECT
            sr_customer_sk,
            -- return order ratio
            count(distinct(sr_ticket_number)) as returns_count,
            -- return ss_item_sk ratio
            COUNT(sr_item_sk) as returns_items,
            -- return monetary amount ratio
            SUM( sr_return_amt ) AS returns_money
          FROM store_returns
          GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import. The three ratios are fractional
    # values, so they are imported as "numeric" rather than "integer"
    # (NOTE(review): "numeric" is the float64 column type according to the
    # revoscalepy documentation - confirm against the installed version).
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "numeric"},
        "itemsRatio": {"type": "numeric"},
        "monetaryRatio": {"type": "numeric"},
        "frequency": {"type": "integer"},
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_info=column_info,
                                            connection_string=conn_str)

    # Import the data source and convert it to a pandas DataFrame. The
    # package is bound to the name ``revoscale`` by the file's import line.
    return pd.DataFrame(revoscale.rx_import(data_source))


def _plot_elbow(features, k_values, random_state=111):
    """Plot the mean distance to the nearest centroid for each k.

    Args:
        features: DataFrame holding only the columns to cluster on.
        k_values: iterable of cluster counts to try.
        random_state: seed passed to every KMeans fit so that repeated runs
            draw the same curve.
    """
    k_values = list(k_values)
    n_rows = features.shape[0]

    avg_within = []
    for k in k_values:
        model = sk_cluster.KMeans(n_clusters=k, random_state=random_state).fit(features)
        # Distance from every row to every centroid; the row-wise minimum is
        # the distance to the centroid the row is closest to.
        distances = sci_distance.cdist(features, model.cluster_centers_, 'euclidean')
        avg_within.append(np.min(distances, axis=1).sum() / n_rows)

    # NOTE: the plotted quantity is a mean Euclidean distance; the y-axis
    # label text is kept unchanged from the original script.
    plt.plot(k_values, avg_within, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()
| 116 | + |
| 117 | + |
# Run the analysis only when this file is executed as a script, so that
# importing the module does not start a database query and open a plot.
if __name__ == "__main__":
    perform_clustering()
| 119 | + |
0 commit comments