Skip to content

Commit 69d78b7

Browse files
authored
Merge pull request #171 from NelGson/master
getting started samples for Rservices added
2 parents 871c0e6 + d1e2ec3 commit 69d78b7

File tree

8 files changed

+512
-0
lines changed

8 files changed

+512
-0
lines changed
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
2+
3+
4+
# Define the connection string.
# The database name must match the database restored by Setup.sql and used by
# the T-SQL sample (tpcxbb_1gb). Replace "MyServer" with your own instance.
connStr <- paste0(
  "Driver=SQL Server;Server=", "MyServer",
  ";Database=", "tpcxbb_1gb",
  ";Trusted_Connection=true;"
)
6+
7+
# Input Query
# One row per customer in store_sales, with return-to-purchase ratios
# (orders, items, money) and the number of distinct returns (frequency).
# Customers without any returns get 0 for every metric via the LEFT OUTER JOIN.
#
# The numerators are cast to float before dividing. Casting to nchar(10), as
# was done previously, is converted straight back to int by SQL Server's
# data type precedence rules, so the count-based ratios were computed with
# integer division and collapsed to 0 or 1.
input_query <- "
SELECT
  ss_customer_sk AS customer,
  round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (cast(returns_count as float) / orders_count) END, 7) AS orderRatio,
  round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (cast(returns_items as float) / orders_items) END, 7) AS itemsRatio,
  round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (cast(returns_money as float) / orders_money) END, 7) AS monetaryRatio,
  round(CASE WHEN ( returns_count IS NULL ) THEN 0.0 ELSE returns_count END, 0) AS frequency
FROM
  (
    SELECT
      ss_customer_sk,
      -- return order ratio
      COUNT(distinct(ss_ticket_number)) AS orders_count,
      -- return ss_item_sk ratio
      COUNT(ss_item_sk) AS orders_items,
      -- return monetary amount ratio
      SUM( ss_net_paid ) AS orders_money
    FROM store_sales s
    GROUP BY ss_customer_sk
  ) orders
  LEFT OUTER JOIN
  (
    SELECT
      sr_customer_sk,
      -- return order ratio
      count(distinct(sr_ticket_number)) as returns_count,
      -- return ss_item_sk ratio
      COUNT(sr_item_sk) as returns_items,
      -- return monetary amount ratio
      SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk
  ) returned ON ss_customer_sk=sr_customer_sk
"
43+
# Data source for the customer metrics to be classified: the result set of
# input_query, with every returned column read as numeric.
customer_returns <- RxSqlServerData(
  sqlQuery = input_query,
  colClasses = setNames(
    rep("numeric", 5L),
    c("customer", "orderRatio", "itemsRatio", "monetaryRatio", "frequency")
  ),
  connectionString = connStr
)

# Pull the query result from SQL Server into a local data frame
customer_data <- rxDataStep(inData = customer_returns)

# Look at the first rows of the data we just loaded
head(customer_data, n = 5)
53+
54+
# Determine number of clusters
# Plot the total within-groups sum of squares (WSS) against the number of
# clusters and look for a bend: the "elbow" marks an appropriate cluster count.

# Cluster on the same four metrics that the rxKmeans formula in this script
# uses. The customer key is an identifier, not a feature, and its large values
# would otherwise dominate the distances.
cluster_features <- customer_data[
  , c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")
]
max_clusters <- 20L

# kmeans() starts from randomly chosen centers; seed so the plot is repeatable
set.seed(10)

wss <- numeric(max_clusters)
# With a single cluster the WSS is the total sum of squares around the mean
wss[1] <- (nrow(cluster_features) - 1) *
  sum(vapply(cluster_features, var, numeric(1)))
for (i in 2:max_clusters) {
  # Fit once per cluster count; tot.withinss is sum(withinss)
  wss[i] <- kmeans(cluster_features, centers = i)$tot.withinss
}

plot(seq_len(max_clusters), wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")
63+
64+
# Output table to hold the customer group mappings
return_cluster <- RxSqlServerData(
  table = "return_cluster",
  connectionString = connStr
)

# Seed the random number generator for predictability
set.seed(10)

# Generate clusters using rxKmeans and write key / cluster to a table in
# SQL Server called return_cluster
clust <- rxKmeans(
  formula = ~ orderRatio + itemsRatio + monetaryRatio + frequency,
  data = customer_returns,
  numClusters = 4,
  outFile = return_cluster,
  outColName = "cluster",
  extraVarsToWrite = c("customer"),
  overwrite = TRUE
)

# Read the customer returns cluster table
customer_cluster <- rxDataStep(inData = return_cluster)
76+
77+
# Plot the clusters (requires the "cluster" package)
# install.packages("cluster")
library("cluster")

# Rows read back from a SQL table carry no guaranteed order, so look up each
# customer's cluster by key instead of relying on row position.
cluster_by_customer <- customer_cluster$cluster[
  match(customer_data$customer, customer_cluster$customer)
]
clusplot(customer_data, cluster_by_customer, color = TRUE, shade = TRUE,
         labels = 4, lines = 0, plotchar = TRUE)

# Look at the clustering details and analyze results
clust
84+
85+
86+
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
USE [tpcxbb_1gb]
GO

DROP PROC IF EXISTS generate_customer_return_clusters;
GO

CREATE procedure [dbo].[generate_customer_return_clusters]
AS
/*
    This procedure uses R to classify customers into different groups based on their
    purchase & return history.

    Parameters: none.
    Effect:     runs rxKmeans with 4 clusters over the per-customer return
                metrics produced by @input_query and writes the customer key,
                the model variables and the assigned cluster to the table
                customer_return_clusters (overwrite = TRUE).
*/
BEGIN
    DECLARE
        -- @@SERVERNAME is sysname, i.e. nvarchar(128)
          @instance_name NVARCHAR(128) = @@SERVERNAME
        , @database_name NVARCHAR(128) = db_name()

        -- Input query to generate the purchase history & return metrics.
        -- Numerators are cast to float: a cast to nchar(10) is converted back
        -- to int by data type precedence, which made the count-based ratios
        -- use integer division.
        , @input_query NVARCHAR(MAX) = N'
SELECT
  ss_customer_sk AS customer,
  round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (cast(returns_count as float) / orders_count) END, 7) AS orderRatio,
  round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (cast(returns_items as float) / orders_items) END, 7) AS itemsRatio,
  round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (cast(returns_money as float) / orders_money) END, 7) AS monetaryRatio,
  round(CASE WHEN ( returns_count IS NULL ) THEN 0.0 ELSE returns_count END, 0) AS frequency
FROM
  (
    SELECT
      ss_customer_sk,
      -- return order ratio
      COUNT(distinct(ss_ticket_number)) AS orders_count,
      -- return ss_item_sk ratio
      COUNT(ss_item_sk) AS orders_items,
      -- return monetary amount ratio
      SUM( ss_net_paid ) AS orders_money
    FROM store_sales s
    GROUP BY ss_customer_sk
  ) orders
  LEFT OUTER JOIN
  (
    SELECT
      sr_customer_sk,
      -- return order ratio
      count(distinct(sr_ticket_number)) as returns_count,
      -- return ss_item_sk ratio
      COUNT(sr_item_sk) as returns_items,
      -- return monetary amount ratio
      SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk
  ) returned ON ss_customer_sk=sr_customer_sk
';

    EXEC sp_execute_external_script
          @language = N'R'
        , @script = N'
# Define the connection string
connStr <- paste0("Driver=SQL Server;Server=", instance_name, ";Database=", database_name, ";Trusted_Connection=true;");

# Input customer data that needs to be classified. This is the result we get from our query
customer_returns <- RxSqlServerData(sqlQuery = input_query,
                                    colClasses = c(customer = "numeric", orderRatio = "numeric", itemsRatio = "numeric", monetaryRatio = "numeric", frequency = "numeric"),
                                    connectionString = connStr);

# Output table to hold the customer cluster mappings
return_cluster <- RxSqlServerData(table = "customer_return_clusters", connectionString = connStr);

# Seed the random number generator for predictability
set.seed(10);

# Generate clusters using rxKmeans and output clusters to a table called "customer_return_clusters".
clust <- rxKmeans( ~ orderRatio + itemsRatio + monetaryRatio + frequency, customer_returns, numClusters = 4
                 , outFile = return_cluster, outColName = "cluster", writeModelVars = TRUE, extraVarsToWrite = c("customer"), overwrite = TRUE);
'
        , @input_data_1 = N''
        , @params = N'@instance_name nvarchar(128), @database_name nvarchar(128), @input_query nvarchar(max)'
        , @instance_name = @instance_name
        , @database_name = @database_name
        , @input_query = @input_query;
END;

GO
85+
86+
87+
-- Empty the results table before running the stored procedure.
-- Guarded so the script also runs when the table has not been created yet.
IF OBJECT_ID(N'dbo.customer_return_clusters', N'U') IS NOT NULL
    TRUNCATE TABLE dbo.customer_return_clusters;

-- Execute the clustering. This loads the table customer_return_clusters with cluster mappings
EXEC [dbo].[generate_customer_return_clusters];

-- Select from customer_return_clusters to verify that the clustering data was loaded
SELECT * FROM dbo.customer_return_clusters;

-- Select email addresses of customers in cluster 1
SELECT c.[c_email_address], c.c_customer_sk
FROM dbo.customer AS c
INNER JOIN [dbo].[customer_return_clusters] AS r
    ON r.customer = c.c_customer_sk
WHERE r.cluster = 1;
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Perform customer clustering with SQL Server R Services
2+
3+
In this sample, we are going to get ourselves familiar with clustering.
4+
Clustering can be explained as organizing data into groups where members of a group are similar in some way.
5+
6+
### Contents
7+
8+
[About this sample](#about-this-sample)<br/>
9+
[Before you begin](#before-you-begin)<br/>
10+
[Sample details](#sample-details)<br/>
11+
[Related links](#related-links)<br/>
12+
13+
14+
<a name=about-this-sample></a>
15+
16+
## About this sample
17+
18+
We will be using the Kmeans algorithm to cluster customers. This can be used, for example, to target a specific group of customers for marketing efforts.
19+
Kmeans clustering is an unsupervised learning algorithm that tries to group data based on similarities. Unsupervised learning means that there is no outcome to be predicted, and the algorithm just tries to find patterns in the data.
20+
21+
In this sample, you will learn how to perform Kmeans clustering in R and deploy the solution in SQL Server 2016.
22+
23+
Follow the step by step tutorial [here](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) to walk through this sample.
24+
25+
<!-- Delete the ones that don't apply -->
26+
- **Applies to:** SQL Server 2016 (or higher)
27+
- **Key features:**
28+
- **Workload:** SQL Server R Services
29+
- **Programming Language:** T-SQL, R
30+
- **Authors:** Nellie Gustafsson
31+
- **Update history:** Getting started tutorial for R Services
32+
33+
<a name=before-you-begin></a>
34+
35+
## Before you begin
36+
37+
To run this sample, you need the following prerequisites.
38+
Section 1 in the [tutorial](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) covers the prerequisites.
39+
After that, you can download a DB backup file and restore it using Setup.sql. [Download DB](https://deve2e.azureedge.net/sqlchoice/static/tpcxbb_1gb.bak)
40+
41+
**Software prerequisites:**
42+
43+
<!-- Examples -->
44+
1. SQL Server 2016 (or higher) with R Services installed
45+
2. SQL Server Management Studio
46+
3. An R IDE, such as Visual Studio
47+
48+
49+
<a name=sample-details></a>
50+
## Sample Details
51+
52+
### Customer Clustering.R
53+
54+
The R script that performs clustering.
55+
56+
### Customer Clustering.SQL
57+
58+
The SQL code to create the stored procedure that performs clustering, and queries to verify the results and take further actions.
59+
60+
61+
<a name=related-links></a>
62+
63+
## Related Links
64+
<!-- Links to more articles. Remember to delete "en-us" from the link path. -->
65+
66+
For additional content, see these articles:
67+
68+
[SQL Server R Services - Upgrade and Installation FAQ](https://msdn.microsoft.com/en-us/library/mt653951.aspx)
69+
70+
[Other SQL Server R Services Tutorials](https://msdn.microsoft.com/en-us/library/mt591993.aspx)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Before starting the tutorial, restore the sample database.
-- Step 1: Download the compressed backup file and save it in a location that
--         SQL Server can access, for example:
--         C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\
-- Step 2: In a new query window in SSMS, execute the restore statement below.
--         REMEMBER TO CHANGE THE FILE PATHS to match the directories of your
--         installation!
USE master;
GO

RESTORE DATABASE tpcxbb_1gb
    FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\tpcxbb_1gb.bak'
    WITH
        -- Relocate the data file and the log file of the backup
        MOVE 'tpcxbb_1gb' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.mdf',
        MOVE 'tpcxbb_1gb_log' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.ldf';
GO
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Connection string used to reach SQL Server (trusted connection).
# Replace "MyServer" with the name of your own instance.
connStr <- paste0(
  "Driver=SQL Server; Server=", "MyServer",
  ";Database=", "tutorialdb",
  ";Trusted_Connection=true;"
)
4+
5+
# Data source pointing at the dbo.rental_data table in SQL Server
SQL_rentaldata <- RxSqlServerData(
  table = "dbo.rental_data",
  connectionString = connStr,
  returnDataFrame = TRUE
)

# Import the table contents into a local data frame
rentaldata <- rxImport(inData = SQL_rentaldata)

# Inspect the top rows and the structure of the imported data
head(rentaldata)
str(rentaldata)
15+
16+
# Convert the three categorical columns to factors.
# This tells the modelling functions explicitly that these values are
# categories rather than numbers.
rentaldata[c("Holiday", "Snow", "WeekDay")] <- lapply(
  rentaldata[c("Holiday", "Snow", "WeekDay")],
  factor
)

# Check the structure of the dataset after the change
str(rentaldata)
24+
25+
# Split the dataset in two: rows before 2015 train the model, rows from 2015
# validate it.
train_data <- rentaldata[rentaldata[["Year"]] < 2015, ]
test_data <- rentaldata[rentaldata[["Year"]] == 2015, ]

# Actual rental counts of the validation rows, for checking prediction quality
actual_counts <- test_data[["RentalCount"]]
32+
33+
# Model 1: linear regression (rxLinMod), fitted on the training data set
model_linmod <- rxLinMod(
  formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
  data = train_data
)

# Model 2: decision tree (rxDTree), fitted on the same training data set
model_dtree <- rxDTree(
  formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
  data = train_data
)

# Score the test data set with both models.
# writeModelVars = TRUE keeps the model variables next to each prediction, so
# the predicted RentalCount of each model can be compared with the actual
# values in the test data set.
predict_linmod <- rxPredict(
  modelObject = model_linmod,
  data = test_data,
  writeModelVars = TRUE
)
predict_dtree <- rxPredict(
  modelObject = model_dtree,
  data = test_data,
  writeModelVars = TRUE
)

# Look at the top rows of the two prediction data sets
head(predict_linmod)
head(predict_dtree)
48+
49+
# Visualize the results of the predictions.
# Each panel shows predicted minus actual RentalCount for one model, so the
# accuracy of the two models can be compared.
par(mfrow = c(2, 1))
plot(
  x = predict_linmod$RentalCount_Pred - predict_linmod$RentalCount,
  main = "Difference between actual and predicted. rxLinmod"
)
plot(
  x = predict_dtree$RentalCount_Pred - predict_dtree$RentalCount,
  main = "Difference between actual and predicted. rxDTree"
)
54+

0 commit comments

Comments
 (0)