microsoft
diff --git a/‎.vs/VSWorkspaceState.json‎
Lines changed: 9 additions & 0 deletions b/‎.vs/VSWorkspaceState.json‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎.vs/slnx.sqlite‎
5.54 MB b/‎.vs/slnx.sqlite‎
5.54 MB
diff --git a/‎.vs/slnx.sqlite-journal‎
918 KB b/‎.vs/slnx.sqlite-journal‎
918 KB
diff --git a/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py‎
Lines changed: 119 additions & 0 deletions b/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql‎
Lines changed: 5 additions & 10 deletions b/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering_ng.py‎
Lines changed: 0 additions & 116 deletions b/‎samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering_ng.py‎
Lines changed: 0 additions & 116 deletions
diff --git a/‎samples/features/readme.md‎
Lines changed: 5 additions & 1 deletion b/‎samples/features/readme.md‎
Lines changed: 5 additions & 1 deletion
@@ -0,0 +1,9 @@
+{
+  "ExpandedNodes": [
+    "",
+    "\\samples",
+    "\\samples\\features"
+  ],
+  "SelectedNode": "\\samples\\features\\readme.md",
+  "PreviewInSolutionExplorer": false
+}
@@ -0,0 +1,119 @@
+# Load packages.
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import revoscalepy as revoscale
+from scipy.spatial import distance as sci_distance
+from sklearn import cluster as sk_cluster
+
+
+
+def perform_clustering():
+    ################################################################################################
+
+    ##	Connect to DB and select data
+
+    ################################################################################################
+
+    # Connection string to connect to SQL Server named instance.
+    conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'
+
+    input_query = '''SELECT
+    ss_customer_sk AS customer,
+    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
+    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
+    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
+    COALESCE(returns_count, 0) AS frequency
+    FROM
+    (
+      SELECT
+        ss_customer_sk,
+        -- return order ratio
+        COUNT(distinct(ss_ticket_number)) AS orders_count,
+        -- return ss_item_sk ratio
+        COUNT(ss_item_sk) AS orders_items,
+        -- return monetary amount ratio
+        SUM( ss_net_paid ) AS orders_money
+      FROM store_sales s
+      GROUP BY ss_customer_sk
+    ) orders
+    LEFT OUTER JOIN
+    (
+      SELECT
+        sr_customer_sk,
+        -- return order ratio
+        count(distinct(sr_ticket_number)) as returns_count,
+        -- return ss_item_sk ratio
+        COUNT(sr_item_sk) as returns_items,
+        -- return monetary amount ratio
+        SUM( sr_return_amt ) AS returns_money
+    FROM store_returns
+    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''
+
+
+    # Define the columns we wish to import.
+    column_info = {
+        "customer": {"type": "integer"},
+        "orderRatio": {"type": "integer"},
+        "itemsRatio": {"type": "integer"},
+        "frequency": {"type": "integer"}
+    }
+
+    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info,
+                                              connection_string=conn_str)
+    
+    # import data source and convert to pandas dataframe.
+    customer_data = pd.DataFrame(revoscalepy.rx_import(data_source))
+    print("Data frame:", customer_data.head(n=20))
+
+    ################################################################################################
+
+    ##	Determine number of clusters using the Elbow method
+
+    ################################################################################################
+
+    cdata = customer_data
+    K = range(1, 20)
+    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
+    centroids = (k.cluster_centers_ for k in KM)
+
+    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
+    dist = (np.min(D, axis=1) for D in D_k)
+    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
+    plt.plot(K, avgWithinSS, 'b*-')
+    plt.grid(True)
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Average within-cluster sum of squares')
+    plt.title('Elbow for KMeans clustering')
+    plt.show()
+
+
+    ################################################################################################
+
+    ##	Perform clustering using Kmeans
+
+    ################################################################################################
+
+    # It looks like k=4 is a good number to use based on the elbow graph.
+    n_clusters = 4
+
+    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
+    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
+    est = means_cluster.fit(customer_data[columns])
+    clusters = est.labels_
+    customer_data['cluster'] = clusters
+
+    # Print some data about the clusters:
+
+    # For each cluster, count the members.
+    for c in range(n_clusters):
+        cluster_members=customer_data[customer_data['cluster'] == c][:]
+        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
+        print('-'* 17)
+
+    # Print mean values per cluster.
+    print(customer_data.groupby(['cluster']).mean())
+
+
+perform_clustering()  # Module-level call: runs the whole sample as soon as this file is executed or imported.
+
@@ -2,9 +2,7 @@ USE [tpcxbb_1gb]
 GO
 
 -- Stored procedure that performs customer clustering using Python and SQL Server ML Services
-DROP PROCEDURE IF EXISTS [dbo].[py_generate_customer_return_clusters]
-GO
-CREATE procedure [dbo].[py_generate_customer_return_clusters]
+CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters]
 AS
 
 BEGIN
@@ -53,9 +51,6 @@ EXEC sp_execute_external_script
 import pandas as pd
 from sklearn.cluster import KMeans
 
-#get data from input query
-customer_data = my_input_data
-
 #We concluded in step2 in the tutorial that 4 would be a good number of clusters
 n_clusters = 4
 
@@ -64,16 +59,16 @@ est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderR
 clusters = est.labels_
 customer_data["cluster"] = clusters
 
-OutputDataSet = customer_data
+#OutputDataSet = customer_data
 '
 	, @input_data_1 = @input_query
-	, @input_data_1_name = N'my_input_data'
+	, @input_data_1_name = N'customer_data'
+	,@output_data_1_name = N'customer_data'
 			 with result sets (("Customer" int, "orderRatio" float,"itemsRatio" float,"monetaryRatio" float,"frequency" float,"cluster" float));
 END;
 GO
 
 
-
 --Creating a table for storing the clustering data
 DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
 GO
@@ -101,4 +96,4 @@ SELECT customer.[c_email_address], customer.c_customer_sk
   JOIN
   [dbo].[py_customer_clusters] as c
   ON c.Customer = customer.c_customer_sk
-  WHERE c.cluster = 0;
+  WHERE c.cluster = 0;
@@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management
 
 [R Services](r-services)
 
-SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics.
+SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database.
+
+[ML Services](ml-services)
+
+SQL Server ML Services (in SQL Server 2017 and above) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database.
 
 [JSON Support](json)