Skip to content

Commit 2cb6de8

Browse files
getting started samples for Rservices added
1 parent f6307ec commit 2cb6de8

File tree

4 files changed

+345
-0
lines changed

4 files changed

+345
-0
lines changed
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
2+
# Input query: one row per store_sales customer with return-behaviour metrics.
#   orderRatio    - distinct return tickets / distinct sales tickets
#   itemsRatio    - returned line items / purchased line items
#   monetaryRatio - SUM(sr_return_amt) / SUM(ss_net_paid)
#   frequency     - number of distinct return tickets
# A customer with no returns, or with a zero/NULL denominator, gets 0 for the metric.
# The numerators are cast to float so the ratios are real fractions: the counts are
# integers, and an integer division would truncate nearly every ratio to 0.
# The output column names must stay in sync with the colClasses passed to RxSqlServerData.
input_query <- "
SELECT
    orders.ss_customer_sk AS customer,
    ROUND(COALESCE(CAST(returned.returns_count AS float) / NULLIF(orders.orders_count, 0), 0.0), 7) AS orderRatio,
    ROUND(COALESCE(CAST(returned.returns_items AS float) / NULLIF(orders.orders_items, 0), 0.0), 7) AS itemsRatio,
    ROUND(COALESCE(CAST(returned.returns_money AS float) / NULLIF(orders.orders_money, 0), 0.0), 7) AS monetaryRatio,
    CAST(COALESCE(returned.returns_count, 0) AS float) AS frequency
FROM
(
    SELECT
        ss_customer_sk,
        -- denominator of the order ratio
        COUNT(DISTINCT ss_ticket_number) AS orders_count,
        -- denominator of the item ratio
        COUNT(ss_item_sk) AS orders_items,
        -- denominator of the monetary ratio
        SUM(ss_net_paid) AS orders_money
    FROM store_sales
    GROUP BY ss_customer_sk
) AS orders
LEFT OUTER JOIN
(
    SELECT
        sr_customer_sk,
        -- numerator of the order ratio
        COUNT(DISTINCT sr_ticket_number) AS returns_count,
        -- numerator of the item ratio
        COUNT(sr_item_sk) AS returns_items,
        -- numerator of the monetary ratio
        SUM(sr_return_amt) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk
) AS returned
    ON orders.ss_customer_sk = returned.sr_customer_sk
"
38+
39+
# Connection string for the sample database (trusted connection, no credentials embedded)
connStr <- paste0("Driver=SQL Server;Server=", "NELLIELAPTOP", ";Database=", "tpcx1b", ";Trusted_Connection=true;")

# SQL Server data source over input_query; every result column is read as numeric
customer_returns <- RxSqlServerData(
    sqlQuery = input_query,
    colClasses = c(
        customer = "numeric",
        orderRatio = "numeric",
        itemsRatio = "numeric",
        monetaryRatio = "numeric",
        frequency = "numeric"
    ),
    connectionString = connStr
)

# Run the data source through rxDataStep and keep the result as customer_data
customer_data <- rxDataStep(customer_returns)

# Show the first five rows that came back from SQL Server
head(customer_data, n = 5)
52+
53+
# Determine number of clusters
# Plot the within-groups sum of squares against the number of clusters and look for
# the "elbow" (bend) in the curve: that is the appropriate number of clusters.

# Use only the columns that the clustering formula uses; the customer key is an
# identifier, not a feature, and would otherwise dominate the variance.
cluster_features <- customer_data[, c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")]

max_clusters <- 20
wss <- numeric(max_clusters)

# With a single cluster, the within-groups sum of squares is the total sum of squares.
wss[1] <- (nrow(cluster_features) - 1) * sum(apply(cluster_features, 2, var))

for (i in 2:max_clusters) {
    # Fit once per cluster count so the printed diagnostic and the recorded
    # sum of squares come from the same (randomly initialised) fit.
    fit <- kmeans(cluster_features, centers = i)
    # ifault is the diagnostic code kmeans reports for the fit; a non-zero value hints at a problem
    print(fit$ifault)
    wss[i] <- sum(fit$withinss)
}

plot(1:max_clusters, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
63+
64+
# Output table to hold the customer group mappings
return_cluster <- RxSqlServerData(table = "return_cluster", connectionString = connStr)

# Seed the random number generator for predictability
set.seed(10)

# Generate clusters using rxKmeans and write key / cluster to a SQL Server table called return_cluster
clust <- rxKmeans(~ orderRatio + itemsRatio + monetaryRatio + frequency, customer_returns,
                  numClusters = 4, outFile = return_cluster, outColName = "cluster",
                  extraVarsToWrite = c("customer"), overwrite = TRUE)

# Read the customer returns cluster table
customer_cluster <- rxDataStep(return_cluster)

# Plot the clusters (needs the "cluster" package)
# install.packages("cluster")
library("cluster")

# customer_data and customer_cluster come from two separate SQL reads with no ORDER BY,
# so their rows are not guaranteed to line up. Join them on the customer key instead of
# pairing them by position, and plot only the features that were actually clustered.
clustered <- merge(customer_data, customer_cluster[, c("customer", "cluster")], by = "customer")
clusplot(clustered[, c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")], clustered$cluster,
         color = TRUE, shade = TRUE, labels = 4, lines = 0, plotchar = TRUE)

# Look at the clustering details and analyze results
clust
84+
85+
86+
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
USE [tpcx1b];
GO

-- Schema-qualified so the DROP targets the same object that the CREATE below defines.
DROP PROCEDURE IF EXISTS [dbo].[generate_customer_return_clusters];
GO

CREATE PROCEDURE [dbo].[generate_customer_return_clusters]
AS
/*
    This procedure uses R to classify customers into different groups based on their
    purchase & return history.

    Parameters : none.
    Effect     : runs rxKmeans (4 clusters) over the metrics produced by @input_query and
                 writes the result to the table customer_return_clusters (overwrite = TRUE).
    Result set : none.
*/
BEGIN
    DECLARE @instance_name NVARCHAR(100) = @@SERVERNAME
          , @database_name NVARCHAR(128) = DB_NAME()

          -- Input query to generate the purchase history & return metrics.
          -- Numerators are cast to float so the ratios are real fractions; an integer
          -- division of the counts would truncate nearly every ratio to 0.
          , @input_query NVARCHAR(MAX) = N'
SELECT
    orders.ss_customer_sk AS customer,
    ROUND(COALESCE(CAST(returned.returns_count AS float) / NULLIF(orders.orders_count, 0), 0.0), 7) AS orderRatio,
    ROUND(COALESCE(CAST(returned.returns_items AS float) / NULLIF(orders.orders_items, 0), 0.0), 7) AS itemsRatio,
    ROUND(COALESCE(CAST(returned.returns_money AS float) / NULLIF(orders.orders_money, 0), 0.0), 7) AS monetaryRatio,
    CAST(COALESCE(returned.returns_count, 0) AS float) AS frequency
FROM
(
    SELECT
        ss_customer_sk,
        -- denominator of the order ratio
        COUNT(DISTINCT ss_ticket_number) AS orders_count,
        -- denominator of the item ratio
        COUNT(ss_item_sk) AS orders_items,
        -- denominator of the monetary ratio
        SUM(ss_net_paid) AS orders_money
    FROM store_sales
    GROUP BY ss_customer_sk
) AS orders
LEFT OUTER JOIN
(
    SELECT
        sr_customer_sk,
        -- numerator of the order ratio
        COUNT(DISTINCT sr_ticket_number) AS returns_count,
        -- numerator of the item ratio
        COUNT(sr_item_sk) AS returns_items,
        -- numerator of the monetary ratio
        SUM(sr_return_amt) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk
) AS returned
    ON orders.ss_customer_sk = returned.sr_customer_sk
';

    EXEC sp_execute_external_script
          @language = N'R'
        , @script = N'
# Define the connection string
connStr <- paste("Driver=SQL Server;Server=", instance_name, ";Database=", database_name, ";Trusted_Connection=true;", sep="");

# Input customer data that needs to be classified. This is the result we get from our query
customer_returns <- RxSqlServerData(sqlQuery = input_query,
    colClasses = c(customer = "numeric", orderRatio = "numeric", itemsRatio = "numeric", monetaryRatio = "numeric", frequency = "numeric"),
    connectionString = connStr);

# Output table to hold the customer cluster mappings
return_cluster = RxSqlServerData(table = "customer_return_clusters", connectionString = connStr);

# set.seed for random number generator for predictability
set.seed(10);

# generate clusters using rxKmeans and output clusters to a table called "customer_return_clusters".
clust <- rxKmeans( ~ orderRatio + itemsRatio + monetaryRatio + frequency, customer_returns, numClusters = 4
    , outFile = return_cluster, outColName = "cluster", writeModelVars = TRUE , extraVarsToWrite = c("customer"), overwrite = TRUE);
'
        , @input_data_1 = N''
        , @params = N'@instance_name nvarchar(100), @database_name nvarchar(128), @input_query nvarchar(max)'
        , @instance_name = @instance_name
        , @database_name = @database_name
        , @input_query = @input_query;
END;

GO
85+
86+
87+
-- Empty the results table before running the stored procedure.
-- Guarded with OBJECT_ID so a first run, when the table does not exist yet, does not fail.
-- NOTE(review): no CREATE TABLE for customer_return_clusters is visible in this script;
-- presumably rxKmeans creates it through outFile - confirm.
IF OBJECT_ID(N'dbo.customer_return_clusters', N'U') IS NOT NULL
    TRUNCATE TABLE dbo.customer_return_clusters;

-- Execute the clustering. This loads the table customer_return_clusters with cluster mappings.
EXEC [dbo].[generate_customer_return_clusters];

-- Verify that the clustering data was loaded.
-- SELECT * is deliberate here: the column set is produced by the R script.
SELECT * FROM dbo.customer_return_clusters;

-- Select email addresses of customers in cluster 1
SELECT
    c.c_email_address,
    c.c_customer_sk
FROM dbo.customer AS c
INNER JOIN dbo.customer_return_clusters AS r
    ON r.customer = c.c_customer_sk
WHERE r.cluster = 1;
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Connection string for the tutorial database (trusted connection)
connStr <- paste0("Driver=SQL Server; Server=", "MyServer", ";Database=", "tutorialdb", ";Trusted_Connection=true;")

# Data source that points at the dbo.rental_data table
SQL_rentaldata <- RxSqlServerData(
    table = "dbo.rental_data",
    connectionString = connStr,
    returnDataFrame = TRUE
)

# Import the table into rentaldata
rentaldata <- rxImport(SQL_rentaldata)

# Inspect the top rows and the structure of the imported data
head(rentaldata)
str(rentaldata)

# Convert the three categorical columns to factors.
# This tells the modelling functions explicitly that these values are categories.
rentaldata[["Holiday"]] <- factor(rentaldata[["Holiday"]])
rentaldata[["Snow"]] <- factor(rentaldata[["Snow"]])
rentaldata[["WeekDay"]] <- factor(rentaldata[["WeekDay"]])

# Inspect the structure again after the conversion
str(rentaldata)
24+
25+
# Split the data in two: rows before 2015 train the models, rows from 2015 validate them
train_data <- rentaldata[rentaldata$Year < 2015, ]
test_data <- rentaldata[rentaldata$Year == 2015, ]

# Actual rental counts of the test set, kept for checking prediction quality
actual_counts <- test_data$RentalCount

# Model 1: linear regression (rxLinMod) fitted on the training set
model_linmod <- rxLinMod(
    RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
    data = train_data
)

# Model 2: decision tree (rxDTree) fitted on the training set
model_dtree <- rxDTree(
    RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
    data = train_data
)

# Score the test set with both models.
# writeModelVars = TRUE keeps the model variables next to the predictions,
# so predicted and actual RentalCount can be compared per row.
predict_linmod <- rxPredict(model_linmod, test_data, writeModelVars = TRUE)
predict_dtree <- rxPredict(model_dtree, test_data, writeModelVars = TRUE)

# Look at the top rows of the two prediction data sets
head(predict_linmod)
head(predict_dtree)

# Visualize the prediction errors: one panel per model, stacked vertically,
# each showing predicted minus actual rental count
par(mfrow = c(2, 1))
plot(predict_linmod$RentalCount_Pred - predict_linmod$RentalCount, main = "Difference between actual and predicted. rxLinmod")
plot(predict_dtree$RentalCount_Pred - predict_dtree$RentalCount, main = "Difference between actual and predicted. rxDTree")
54+
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
    Before we start, the database for this tutorial has to be restored.

    Step 1: Download the compressed backup file and save it in a location SQL Server
            can access, for example:
            C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\
    Step 2: In a new query window in SSMS, execute the restore statement below.
            REMEMBER TO CHANGE THE FILE PATHS to match the directories of your installation!
*/
USE master;
GO

RESTORE DATABASE TutorialDB
    FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\TutorialDB.bak'
    WITH
        MOVE 'TutorialDB' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\TutorialDB.mdf',
        MOVE 'TutorialDB_log' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\TutorialDB.ldf';
GO

-- Sanity check: the restored database should expose the rental data table
USE tutorialdb;
SELECT *
FROM dbo.rental_data;
17+
18+
19+
-- Operationalize: everything below runs in the tutorial database
USE tutorialdb;
GO

-- Set up the model table: one row per stored model, keyed by model name,
-- with the serialized model kept as a binary blob
DROP TABLE IF EXISTS rental_rx_models;
GO

CREATE TABLE rental_rx_models
(
    model_name VARCHAR(30)    NOT NULL DEFAULT ('default model'),
    model      VARBINARY(MAX) NOT NULL,
    PRIMARY KEY (model_name)
);
GO
30+
31+
-- Stored procedure that trains a decision tree (rxDTree) model on rental_data and
-- returns the serialized model.
--   @trained_model (OUTPUT): the model serialized to VARBINARY(MAX), ready to be stored in a table.
-- Training rows are those of dbo.rental_data with Year < 2015.
DROP PROCEDURE IF EXISTS generate_rental_rx_model;
GO
CREATE PROCEDURE generate_rental_rx_model (@trained_model VARBINARY(MAX) OUTPUT)
AS
BEGIN
    EXECUTE sp_execute_external_script
          @language = N'R'
        , @script = N'
# library() stops with an error if the package is missing; require() would only warn
library("RevoScaleR");

# Treat the categorical columns as factors
rental_train_data$Holiday = factor(rental_train_data$Holiday);
rental_train_data$Snow = factor(rental_train_data$Snow);
rental_train_data$WeekDay = factor(rental_train_data$WeekDay);

# Create a dtree model and train it using the training data set
model_dtree <- rxDTree(RentalCount ~ Month + Day + WeekDay + Snow + Holiday, data = rental_train_data);

# Before saving the model to the DB table, we need to serialize it
trained_model <- as.raw(serialize(model_dtree, connection=NULL));'

        -- Bracketed identifiers do not depend on the QUOTED_IDENTIFIER setting
        , @input_data_1 = N'SELECT [RentalCount], [Month], [Day], [WeekDay], [Snow], [Holiday] FROM dbo.rental_data WHERE [Year] < 2015'
        , @input_data_1_name = N'rental_train_data'
        , @params = N'@trained_model varbinary(max) OUTPUT'
        , @trained_model = @trained_model OUTPUT;
END;
GO
57+
-- Start from an empty model table
TRUNCATE TABLE rental_rx_models;

-- Call the stored procedure that generates the rxDTree model,
-- then save the returned model in the SQL Server table under the name 'rxDTree'
DECLARE @model VARBINARY(MAX);

EXECUTE generate_rental_rx_model @trained_model = @model OUTPUT;

INSERT INTO rental_rx_models (model_name, model)
VALUES ('rxDTree', @model);

-- Show what was stored
SELECT
    model_name,
    model
FROM rental_rx_models;
GO
64+
65+
-- Stored procedure that takes a model name and new data as input parameters and
-- predicts the rental count for the new data.
--   @model : name of a model stored in rental_rx_models (column model_name).
--   @q     : query text whose result set is the data to score. It is passed to
--            sp_execute_external_script as the input query and executed as-is.
--            NOTE(review): never build @q from untrusted input.
-- Returns one result set with a single column, RentalCount_Predicted (FLOAT).
-- Raises error 50001 when no model with the given name exists.
DROP PROCEDURE IF EXISTS predict_rentals;
GO
CREATE PROCEDURE predict_rentals (@model VARCHAR(100), @q NVARCHAR(MAX))
AS
BEGIN
    DECLARE @rx_model VARBINARY(MAX) = (SELECT model FROM rental_rx_models WHERE model_name = @model);

    -- Fail with a clear message here instead of letting unserialize() fail on a NULL model inside R
    IF @rx_model IS NULL
    BEGIN
        DECLARE @error_message NVARCHAR(400) =
            CONCAT(N'No model named ''', @model, N''' was found in rental_rx_models.');
        THROW 50001, @error_message, 1;
    END;

    EXECUTE sp_execute_external_script
          @language = N'R'
        , @script = N'
# library() stops with an error if the package is missing; require() would only warn
library("RevoScaleR");

# The InputDataSet contains the new data passed to this stored proc. We will use this data to predict.
rentals = InputDataSet;

# Convert types to factors
rentals$Holiday = factor(rentals$Holiday);
rentals$Snow = factor(rentals$Snow);
rentals$WeekDay = factor(rentals$WeekDay);

# Before using the model to predict, we need to unserialize it
rental_model = unserialize(rx_model);

# Call prediction function
rental_predictions = rxPredict(rental_model, rentals);'
        , @input_data_1 = @q
        , @output_data_1_name = N'rental_predictions'
        , @params = N'@rx_model varbinary(max)'
        , @rx_model = @rx_model
    WITH RESULT SETS (([RentalCount_Predicted] FLOAT));

END;
GO
98+
99+
-- Run predict_rentals: pass the model name plus a query string that yields
-- the set of features we want a rental count prediction for
EXECUTE dbo.predict_rentals
      @model = 'rxDTree'
    , @q = 'SELECT CONVERT(INT, 3) AS Month, CONVERT(INT, 24) AS Day, CONVERT(INT, 4) AS WeekDay, CONVERT(INT, 1) AS Snow, CONVERT(INT, 1) AS Holiday';
GO
103+

0 commit comments

Comments
 (0)