@@ -102,18 +102,10 @@ test_that("spark.glm and predict", {
102
102
})
103
103
104
104
test_that(" spark.glm summary" , {
105
- # prepare dataset
106
- Sepal.Length <- c(2.0 , 1.5 , 1.8 , 3.4 , 5.1 , 1.8 , 1.0 , 2.3 )
107
- Sepal.Width <- c(2.1 , 2.3 , 5.4 , 4.7 , 3.1 , 2.1 , 3.1 , 5.5 )
108
- Petal.Length <- c(1.8 , 2.1 , 7.1 , 2.5 , 3.7 , 6.3 , 2.2 , 7.2 )
109
- Species <- c(" setosa" , " versicolor" , " versicolor" , " versicolor" , " virginica" , " virginica" ,
110
- " versicolor" , " virginica" )
111
- dataset <- data.frame (Sepal.Length , Sepal.Width , Petal.Length , Species , stringsAsFactors = TRUE )
112
-
113
105
# gaussian family
114
- training <- suppressWarnings(createDataFrame(dataset ))
106
+ training <- suppressWarnings(createDataFrame(iris ))
115
107
stats <- summary(spark.glm(training , Sepal_Width ~ Sepal_Length + Species ))
116
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = dataset ))
108
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = iris ))
117
109
118
110
# test summary coefficients return matrix type
119
111
expect_true(class(stats $ coefficients ) == " matrix" )
@@ -134,15 +126,15 @@ test_that("spark.glm summary", {
134
126
135
127
out <- capture.output(print(stats ))
136
128
expect_match(out [2 ], " Deviance Residuals:" )
137
- expect_true(any(grepl(" AIC: 35.84 " , out )))
129
+ expect_true(any(grepl(" AIC: 59.22 " , out )))
138
130
139
131
# binomial family
140
- df <- suppressWarnings(createDataFrame(dataset ))
132
+ df <- suppressWarnings(createDataFrame(iris ))
141
133
training <- df [df $ Species %in% c(" versicolor" , " virginica" ), ]
142
134
stats <- summary(spark.glm(training , Species ~ Sepal_Length + Sepal_Width ,
143
135
family = binomial(link = " logit" )))
144
136
145
- rTraining <- dataset [ dataset $ Species %in% c(" versicolor" , " virginica" ), ]
137
+ rTraining <- iris [ iris $ Species %in% c(" versicolor" , " virginica" ), ]
146
138
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width , data = rTraining ,
147
139
family = binomial(link = " logit" )))
148
140
@@ -182,17 +174,17 @@ test_that("spark.glm summary", {
182
174
expect_equal(stats $ aic , rStats $ aic )
183
175
184
176
# Test spark.glm works with offset
185
- training <- suppressWarnings(createDataFrame(dataset ))
177
+ training <- suppressWarnings(createDataFrame(iris ))
186
178
stats <- summary(spark.glm(training , Sepal_Width ~ Sepal_Length + Species ,
187
179
family = poisson(), offsetCol = " Petal_Length" ))
188
180
rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species ,
189
- data = dataset , family = poisson(), offset = dataset $ Petal.Length )))
181
+ data = iris , family = poisson(), offset = iris $ Petal.Length )))
190
182
expect_true(all(abs(rStats $ coefficients - stats $ coefficients ) < 1e-3 ))
191
183
192
184
# Test summary works on base GLM models
193
- baseModel <- stats :: glm(Sepal.Width ~ Sepal.Length + Species , data = dataset )
185
+ baseModel <- stats :: glm(Sepal.Width ~ Sepal.Length + Species , data = iris )
194
186
baseSummary <- summary(baseModel )
195
- expect_true(abs(baseSummary $ deviance - 11.84013 ) < 1e-4 )
187
+ expect_true(abs(baseSummary $ deviance - 12.19313 ) < 1e-4 )
196
188
197
189
# Test spark.glm works with regularization parameter
198
190
data <- as.data.frame(cbind(a1 , a2 , b ))
@@ -308,19 +300,11 @@ test_that("glm and predict", {
308
300
})
309
301
310
302
test_that(" glm summary" , {
311
- # prepare dataset
312
- Sepal.Length <- c(2.0 , 1.5 , 1.8 , 3.4 , 5.1 , 1.8 , 1.0 , 2.3 )
313
- Sepal.Width <- c(2.1 , 2.3 , 5.4 , 4.7 , 3.1 , 2.1 , 3.1 , 5.5 )
314
- Petal.Length <- c(1.8 , 2.1 , 7.1 , 2.5 , 3.7 , 6.3 , 2.2 , 7.2 )
315
- Species <- c(" setosa" , " versicolor" , " versicolor" , " versicolor" , " virginica" , " virginica" ,
316
- " versicolor" , " virginica" )
317
- dataset <- data.frame (Sepal.Length , Sepal.Width , Petal.Length , Species , stringsAsFactors = TRUE )
318
-
319
303
# gaussian family
320
- training <- suppressWarnings(createDataFrame(dataset ))
304
+ training <- suppressWarnings(createDataFrame(iris ))
321
305
stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species , data = training ))
322
306
323
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = dataset ))
307
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species , data = iris ))
324
308
325
309
coefs <- stats $ coefficients
326
310
rCoefs <- rStats $ coefficients
@@ -336,12 +320,12 @@ test_that("glm summary", {
336
320
expect_equal(stats $ aic , rStats $ aic )
337
321
338
322
# binomial family
339
- df <- suppressWarnings(createDataFrame(dataset ))
323
+ df <- suppressWarnings(createDataFrame(iris ))
340
324
training <- df [df $ Species %in% c(" versicolor" , " virginica" ), ]
341
325
stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width , data = training ,
342
326
family = binomial(link = " logit" )))
343
327
344
- rTraining <- dataset [ dataset $ Species %in% c(" versicolor" , " virginica" ), ]
328
+ rTraining <- iris [ iris $ Species %in% c(" versicolor" , " virginica" ), ]
345
329
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width , data = rTraining ,
346
330
family = binomial(link = " logit" )))
347
331
0 commit comments