Commit b28e76a

Author: Robert Kruszewski (committed)
Revert "[SPARK-11215][ML] Add multiple columns support to StringIndexer"
This reverts commit 3310789.
1 parent 734627b commit b28e76a

10 files changed: 113 additions & 533 deletions

R/pkg/tests/fulltests/test_mllib_classification.R

Lines changed: 3 additions & 3 deletions
@@ -313,7 +313,7 @@ test_that("spark.mlp", {
   # Test predict method
   mlpTestDF <- df
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
+  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))

   # Test model save/load
   if (windows_with_hadoop()) {
@@ -348,12 +348,12 @@ test_that("spark.mlp", {

   # Test random seed
   # default seed
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
   # seed equals 10
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100, seed = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

R/pkg/tests/fulltests/test_mllib_regression.R

Lines changed: 13 additions & 29 deletions
@@ -102,18 +102,10 @@ test_that("spark.glm and predict", {
 })

 test_that("spark.glm summary", {
-  # prepare dataset
-  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
-  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
-  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
-  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
-               "versicolor", "virginica")
-  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
-
   # gaussian family
-  training <- suppressWarnings(createDataFrame(dataset))
+  training <- suppressWarnings(createDataFrame(iris))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

   # test summary coefficients return matrix type
   expect_true(class(stats$coefficients) == "matrix")
@@ -134,15 +126,15 @@ test_that("spark.glm summary", {

   out <- capture.output(print(stats))
   expect_match(out[2], "Deviance Residuals:")
-  expect_true(any(grepl("AIC: 35.84", out)))
+  expect_true(any(grepl("AIC: 59.22", out)))

   # binomial family
-  df <- suppressWarnings(createDataFrame(dataset))
+  df <- suppressWarnings(createDataFrame(iris))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
                              family = binomial(link = "logit")))

-  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))

@@ -182,17 +174,17 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)

   # Test spark.glm works with offset
-  training <- suppressWarnings(createDataFrame(dataset))
+  training <- suppressWarnings(createDataFrame(iris))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                              family = poisson(), offsetCol = "Petal_Length"))
   rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
-                                         data = dataset, family = poisson(), offset = dataset$Petal.Length)))
+                                         data = iris, family = poisson(), offset = iris$Petal.Length)))
   expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))

   # Test summary works on base GLM models
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)
+  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
   baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 11.84013) < 1e-4)
+  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)

   # Test spark.glm works with regularization parameter
   data <- as.data.frame(cbind(a1, a2, b))
@@ -308,19 +300,11 @@ test_that("glm and predict", {
 })

 test_that("glm summary", {
-  # prepare dataset
-  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
-  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
-  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
-  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
-               "versicolor", "virginica")
-  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
-
   # gaussian family
-  training <- suppressWarnings(createDataFrame(dataset))
+  training <- suppressWarnings(createDataFrame(iris))
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))

-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

   coefs <- stats$coefficients
   rCoefs <- rStats$coefficients
@@ -336,12 +320,12 @@ test_that("glm summary", {
   expect_equal(stats$aic, rStats$aic)

   # binomial family
-  df <- suppressWarnings(createDataFrame(dataset))
+  df <- suppressWarnings(createDataFrame(iris))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
                        family = binomial(link = "logit")))

-  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))

docs/ml-features.md

Lines changed: 2 additions & 4 deletions
@@ -585,13 +585,11 @@ for more details on the API.
 ## StringIndexer

 `StringIndexer` encodes a string column of labels to a column of label indices.
-`StringIndexer` can encode multiple columns. The indices are in `[0, numLabels)`, and four ordering options are supported:
+The indices are in `[0, numLabels)`, and four ordering options are supported:
 "frequencyDesc": descending order by label frequency (most frequent label assigned 0),
 "frequencyAsc": ascending order by label frequency (least frequent label assigned 0),
 "alphabetDesc": descending alphabetical order, and "alphabetAsc": ascending alphabetical order
-(default = "frequencyDesc"). Note that in case of equal frequency when under
-"frequencyDesc"/"frequencyAsc", the strings are further sorted by alphabet.
-
+(default = "frequencyDesc").
 The unseen labels will be put at index numLabels if user chooses to keep them.
 If the input column is numeric, we cast it to string and index the string
 values. When downstream pipeline components such as `Estimator` or
docs/ml-guide.md

Lines changed: 0 additions & 11 deletions
@@ -104,17 +104,6 @@ MLlib is under active development.
 The APIs marked `Experimental`/`DeveloperApi` may change in future releases,
 and the migration guide below will explain all changes between releases.

-## From 2.4 to 3.0
-
-### Changes of behavior
-
-* [SPARK-11215](https://issues.apache.org/jira/browse/SPARK-11215):
-  In Spark 2.4 and previous versions, when specifying `frequencyDesc` or `frequencyAsc` as
-  `stringOrderType` param in `StringIndexer`, in case of equal frequency, the order of
-  strings is undefined. Since Spark 3.0, the strings with equal frequency are further
-  sorted by alphabet. And since Spark 3.0, `StringIndexer` supports encoding multiple
-  columns.
-
 ## From 2.2 to 2.3

 ### Breaking changes
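The migration note removed above covered the two behavior changes from SPARK-11215: alphabetical tie-breaking for labels with equal frequency, and multi-column support. As a hedged illustration of what this revert removes, the multi-column usage introduced by that change looked roughly like the sketch below; the `setInputCols`/`setOutputCols` setters and the column names are assumptions based on SPARK-11215 and are no longer available after this commit.

// Rough sketch of the reverted multi-column API (assumed from SPARK-11215);
// after this commit only the single-column setInputCol/setOutputCol form remains.
import org.apache.spark.ml.feature.StringIndexer

val indexer = new StringIndexer()
  .setInputCols(Array("colA", "colB"))            // hypothetical column names
  .setOutputCols(Array("colAIndex", "colBIndex"))
  .setStringOrderType("frequencyDesc")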
