Merge pull request #271 from NelGson/master

uc-msft · web-flow · commit 7171589b8d88 · 2017-08-17T22:17:00.000-07:00
New Python clustering tutorial
diff --git a/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py
@@ -0,0 +1,119 @@
+# Load packages.
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import revoscalepy as revoscale
+from scipy.spatial import distance as sci_distance
+from sklearn import cluster as sk_cluster
+
+
+
+def perform_clustering():
+    ################################################################################################
+
+    ##	Connect to DB and select data
+
+    ################################################################################################
+
+    # Connection string to connect to SQL Server named instance.
+    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'
+
+    input_query = '''SELECT
+    ss_customer_sk AS customer,
+    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
+    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
+    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
+    COALESCE(returns_count, 0) AS frequency
+    FROM
+    (
+      SELECT
+        ss_customer_sk,
+        -- return order ratio
+        COUNT(distinct(ss_ticket_number)) AS orders_count,
+        -- return ss_item_sk ratio
+        COUNT(ss_item_sk) AS orders_items,
+        -- return monetary amount ratio
+        SUM( ss_net_paid ) AS orders_money
+      FROM store_sales s
+      GROUP BY ss_customer_sk
+    ) orders
+    LEFT OUTER JOIN
+    (
+      SELECT
+        sr_customer_sk,
+        -- return order ratio
+        count(distinct(sr_ticket_number)) as returns_count,
+        -- return ss_item_sk ratio
+        COUNT(sr_item_sk) as returns_items,
+        -- return monetary amount ratio
+        SUM( sr_return_amt ) AS returns_money
+    FROM store_returns
+    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''
+
+
+    # Define the columns we wish to import.
+    column_info = {
+        "customer": {"type": "integer"},
+        "orderRatio": {"type": "integer"},
+        "itemsRatio": {"type": "integer"},
+        "frequency": {"type": "integer"}
+    }
+
+    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info,
+                                              connection_string=conn_str)
+    
+    # import data source and convert to pandas dataframe.
+    customer_data = pd.DataFrame(revoscalepy.rx_import(data_source))
+    print("Data frame:", customer_data.head(n=20))
+
+    ################################################################################################
+
+    ##	Determine number of clusters using the Elbow method
+
+    ################################################################################################
+
+    cdata = customer_data
+    K = range(1, 20)
+    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
+    centroids = (k.cluster_centers_ for k in KM)
+
+    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
+    dist = (np.min(D, axis=1) for D in D_k)
+    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
+    plt.plot(K, avgWithinSS, 'b*-')
+    plt.grid(True)
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Average within-cluster sum of squares')
+    plt.title('Elbow for KMeans clustering')
+    plt.show()
+
+
+    ################################################################################################
+
+    ##	Perform clustering using Kmeans
+
+    ################################################################################################
+
+    # It looks like k=4 is a good number to use based on the elbow graph.
+    n_clusters = 4
+
+    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
+    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
+    est = means_cluster.fit(customer_data[columns])
+    clusters = est.labels_
+    customer_data['cluster'] = clusters
+
+    # Print some data about the clusters:
+
+    # For each cluster, count the members.
+    for c in range(n_clusters):
+        cluster_members=customer_data[customer_data['cluster'] == c][:]
+        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
+        print('-'* 17)
+
+    # Print mean values per cluster.
+    print(customer_data.groupby(['cluster']).mean())
+
+
+perform_clustering()
+
diff --git a/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql
@@ -0,0 +1,100 @@
+USE [tpcxbb_1gb]
+GO
+
+-- Stored procedure that performs customer clustering using Python and SQL Server ML Services
+CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters]
+AS
+
+BEGIN
+	DECLARE
+
+-- Input query to generate the purchase history & return metrics
+	 @input_query NVARCHAR(MAX) = N'
+SELECT
+  ss_customer_sk AS customer,
+  CAST( (ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) ) AS FLOAT) AS orderRatio,
+  CAST( (ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) ) AS FLOAT) AS itemsRatio,
+  CAST( (ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) ) AS FLOAT) AS monetaryRatio,
+  CAST( (COALESCE(returns_count, 0)) AS FLOAT) AS frequency  
+FROM
+  (
+    SELECT
+      ss_customer_sk,
+      -- return order ratio
+      COUNT(distinct(ss_ticket_number)) AS orders_count,
+      -- return ss_item_sk ratio
+      COUNT(ss_item_sk) AS orders_items,
+      -- return monetary amount ratio
+      SUM( ss_net_paid ) AS orders_money
+    FROM store_sales s
+    GROUP BY ss_customer_sk
+  ) orders
+  LEFT OUTER JOIN
+  (
+    SELECT
+      sr_customer_sk,
+      -- return order ratio
+      count(distinct(sr_ticket_number)) as returns_count,
+      -- return ss_item_sk ratio
+      COUNT(sr_item_sk) as returns_items,
+      -- return monetary amount ratio
+      SUM( sr_return_amt ) AS returns_money
+    FROM store_returns
+    GROUP BY sr_customer_sk
+  ) returned ON ss_customer_sk=sr_customer_sk 
+ '
+
+EXEC sp_execute_external_script
+	  @language = N'Python'
+	, @script = N'
+
+import pandas as pd
+from sklearn.cluster import KMeans
+
+#We concluded in step2 in the tutorial that 4 would be a good number of clusters
+n_clusters = 4
+
+#Perform clustering
+est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderRatio","itemsRatio","monetaryRatio","frequency"]])
+clusters = est.labels_
+customer_data["cluster"] = clusters
+
+#OutputDataSet = customer_data
+'
+	, @input_data_1 = @input_query
+	, @input_data_1_name = N'customer_data'
+	,@output_data_1_name = N'customer_data'
+			 with result sets (("Customer" int, "orderRatio" float,"itemsRatio" float,"monetaryRatio" float,"frequency" float,"cluster" float));
+END;
+GO
+
+
+--Creating a table for storing the clustering data
+DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
+GO
+--Create a table to store the predictions in
+CREATE TABLE [dbo].[py_customer_clusters](
+ [Customer] [bigint] NULL,
+ [OrderRatio] [float] NULL,
+ [itemsRatio] [float] NULL,
+ [monetaryRatio] [float] NULL,
+ [frequency] [float] NULL,
+ [cluster] [int] NULL,
+ ) ON [PRIMARY]
+GO
+
+--Execute the clustering and insert results into table
+INSERT INTO py_customer_clusters
+EXEC [dbo].[py_generate_customer_return_clusters];
+
+-- Select contents of the table
+SELECT * FROM py_customer_clusters;
+
+--Get email addresses of customers in cluster 0 
+SELECT customer.[c_email_address], customer.c_customer_sk
+  FROM dbo.customer
+  JOIN
+  [dbo].[py_customer_clusters] as c
+  ON c.Customer = customer.c_customer_sk
+  WHERE c.cluster = 0;
+  
diff --git a/samples/features/readme.md b/samples/features/readme.md
@@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management
 
 [R Services](r-services)
 
-SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics.
+SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database.
+
+[ML Services](ml-services)
+
+SQL Server ML Services (SQL Server 2017) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database.
 
 [JSON Support](json)
 
@@ -28,4 +32,4 @@ Graph tables enable you to add a non-relational capability to your database.
 
 [Reporting Services (SSRS)](reporting-services)
 
-Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.
+Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.