
Commit 66338b8

Merge remote-tracking branch 'origin/master' into palantir-master
Conflicts:
    core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
    dev/deps/spark-deps-hadoop-2.6
    dev/deps/spark-deps-hadoop-2.7
    mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
    pom.xml
    sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
    sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
    sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
    sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
    sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
2 parents: 0d4903d + e47f48c

251 files changed: +6923 −3391 lines


R/pkg/DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ Package: SparkR
 Type: Package
 Version: 2.3.0
 Title: R Frontend for Apache Spark
-Description: The SparkR package provides an R Frontend for Apache Spark.
+Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
              email = "[email protected]"),
              person("Xiangrui", "Meng", role = "aut",

R/pkg/R/DataFrame.R

Lines changed: 40 additions & 4 deletions
@@ -2930,7 +2930,7 @@ setMethod("saveAsTable",
             invisible(callJMethod(write, "saveAsTable", tableName))
           })

-#' summary
+#' describe
 #'
 #' Computes statistics for numeric and string columns.
 #' If no columns are given, this function computes statistics for all numerical or string columns.
@@ -2941,7 +2941,7 @@ setMethod("saveAsTable",
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
 #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @export
 #' @examples
@@ -2953,6 +2953,7 @@ setMethod("saveAsTable",
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
 #' }
+#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
 #' @note describe(SparkDataFrame, character) since 1.4.0
 setMethod("describe",
           signature(x = "SparkDataFrame", col = "character"),
@@ -2962,7 +2963,7 @@ setMethod("describe",
             dataFrame(sdf)
           })

-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @aliases describe,SparkDataFrame-method
 #' @note describe(SparkDataFrame) since 1.4.0
@@ -2973,15 +2974,50 @@ setMethod("describe",
             dataFrame(sdf)
           })

+#' summary
+#'
+#' Computes specified statistics for numeric and string columns. Available statistics are:
+#' \itemize{
+#' \item count
+#' \item mean
+#' \item stddev
+#' \item min
+#' \item max
+#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%")
+#' }
+#' If no statistics are given, this function computes count, mean, stddev, min,
+#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+#' This function is meant for exploratory data analysis, as we make no guarantee about the
+#' backward compatibility of the schema of the resulting Dataset. If you want to
+#' programmatically compute summary statistics, use the \code{agg} function instead.
+#'
+#'
 #' @param object a SparkDataFrame to be summarized.
+#' @param ... (optional) statistics to be computed for all columns.
+#' @return A SparkDataFrame.
+#' @family SparkDataFrame functions
 #' @rdname summary
 #' @name summary
 #' @aliases summary,SparkDataFrame-method
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' summary(df)
+#' summary(df, "min", "25%", "75%", "max")
+#' summary(select(df, "age", "height"))
+#' }
 #' @note summary(SparkDataFrame) since 1.5.0
+#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults.
+#' @seealso \link{describe}
 setMethod("summary",
           signature(object = "SparkDataFrame"),
           function(object, ...) {
-            describe(object)
+            statisticsList <- list(...)
+            sdf <- callJMethod(object@sdf, "summary", statisticsList)
+            dataFrame(sdf)
           })

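To make the behavioural change above concrete, here is a small illustrative SparkR session (not part of the commit; it assumes an active `sparkR.session()` and a numeric column named `age`): `describe` keeps the pre-2.3.0 five statistics, while `summary` now defaults to adding approximate quartiles and forwards any explicitly requested statistics to the JVM `summary` method.

```r
# Illustrative sketch only -- assumes SparkR is attached and a session is running.
df <- createDataFrame(data.frame(age = c(19, 30, 31)))

collect(describe(df, "age"))                       # count, mean, stddev, min, max (unchanged)
collect(summary(df))                               # now also includes 25%, 50%, 75% by default
collect(summary(df, "min", "25%", "75%", "max"))   # caller-chosen statistics
```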

R/pkg/R/generics.R

Lines changed: 1 addition & 1 deletion
@@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
 # @export
 setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })

-#' @rdname summary
+#' @rdname describe
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

R/pkg/R/install.R

Lines changed: 5 additions & 1 deletion
@@ -270,7 +270,11 @@ sparkCachePath <- function() {
   if (is_windows()) {
     winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
     if (is.na(winAppPath)) {
-      stop(paste("%LOCALAPPDATA% not found.",
+      message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
+      winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
+    }
+    if (is.na(winAppPath)) {
+      stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
                  "Please define the environment variable",
                  "or restart and enter an installation path in localDir."))
     } else {
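The hunk above only touches the Windows branch of `sparkCachePath()`; read end to end, the new resolution order is roughly the sketch below (illustrative only — the helper name is hypothetical and the non-Windows branches and exact cache sub-path are omitted).

```r
# Rough sketch of the new Windows fallback, not the actual sparkCachePath() body.
windows_cache_base <- function() {
  base <- Sys.getenv("LOCALAPPDATA", unset = NA)
  if (is.na(base)) {
    message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
    base <- Sys.getenv("USERPROFILE", unset = NA)
  }
  if (is.na(base)) {
    stop("%LOCALAPPDATA% and %USERPROFILE% not found.")
  }
  base
}
```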

R/pkg/tests/fulltests/test_mllib_tree.R

Lines changed: 19 additions & 17 deletions
@@ -66,7 +66,7 @@ test_that("spark.gbt", {
   # label must be binary - GBTClassifier currently only supports binary classification.
   iris2 <- iris[iris$Species != "virginica", ]
   data <- suppressWarnings(createDataFrame(iris2))
-  model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
+  model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$numTrees, 20)
@@ -94,7 +94,7 @@ test_that("spark.gbt", {

   iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
   df <- suppressWarnings(createDataFrame(iris2))
-  m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+  m <- spark.gbt(df, NumericSpecies ~ ., type = "classification", seed = 12)
   s <- summary(m)
   # test numeric prediction values
   expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
@@ -106,7 +106,7 @@ test_that("spark.gbt", {
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.gbt(data, label ~ features, "classification")
+    model <- spark.gbt(data, label ~ features, "classification", seed = 12)
     expect_equal(summary(model)$numFeatures, 692)
   }
@@ -117,10 +117,11 @@ test_that("spark.gbt", {
   trainidxs <- base::sample(nrow(data), nrow(data) * 0.7)
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
-  model <- spark.gbt(traindf, clicked ~ ., type = "classification")
+  model <- spark.gbt(traindf, clicked ~ ., type = "classification", seed = 23)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
-  model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep")
+  model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep",
+                     seed = 23)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")
 })
@@ -129,7 +130,7 @@ test_that("spark.randomForest", {
   # regression
   data <- suppressWarnings(createDataFrame(longley))
   model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
-                              numTrees = 1)
+                              numTrees = 1, seed = 1)

   predictions <- collect(predict(model, data))
   expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -177,7 +178,7 @@ test_that("spark.randomForest", {
   # classification
   data <- suppressWarnings(createDataFrame(iris))
   model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 123)

   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
@@ -215,7 +216,7 @@ test_that("spark.randomForest", {
   iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
   data <- suppressWarnings(createDataFrame(iris[-5]))
   model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 123)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$numTrees, 20)
@@ -234,28 +235,29 @@ test_that("spark.randomForest", {
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
   model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 10, maxBins = 10, numTrees = 10)
+                              maxDepth = 10, maxBins = 10, numTrees = 10, seed = 123)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
   model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
                               maxDepth = 10, maxBins = 10, numTrees = 10,
-                              handleInvalid = "keep")
+                              handleInvalid = "keep", seed = 123)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")

   # spark.randomForest classification can work on libsvm data
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.randomForest(data, label ~ features, "classification")
+    model <- spark.randomForest(data, label ~ features, "classification", seed = 123)
     expect_equal(summary(model)$numFeatures, 4)
   }
 })

 test_that("spark.decisionTree", {
   # regression
   data <- suppressWarnings(createDataFrame(longley))
-  model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
+  model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+                              seed = 42)

   predictions <- collect(predict(model, data))
   expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -288,7 +290,7 @@ test_that("spark.decisionTree", {
   # classification
   data <- suppressWarnings(createDataFrame(iris))
   model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 43)

   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
@@ -325,7 +327,7 @@ test_that("spark.decisionTree", {
   iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
   data <- suppressWarnings(createDataFrame(iris[-5]))
   model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 44)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$maxDepth, 5)
@@ -339,7 +341,7 @@ test_that("spark.decisionTree", {
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.decisionTree(data, label ~ features, "classification")
+    model <- spark.decisionTree(data, label ~ features, "classification", seed = 45)
     expect_equal(summary(model)$numFeatures, 4)
   }

@@ -351,11 +353,11 @@ test_that("spark.decisionTree", {
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
   model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 46)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
   model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 5, maxBins = 16, handleInvalid = "keep")
+                              maxDepth = 5, maxBins = 16, handleInvalid = "keep", seed = 46)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")
 })
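Every change in this file just pins a `seed` so the tree-based fits are reproducible between test runs; a minimal sketch of why that matters (a hypothetical check, not part of the suite, assuming an active SparkR session):

```r
# With the same seed, two fits on the same data should agree, which keeps the
# suite's expect_equal() assertions on predictions and summaries deterministic.
data <- suppressWarnings(createDataFrame(iris[iris$Species != "virginica", ]))
m1 <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
m2 <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
identical(collect(predict(m1, data))$prediction,
          collect(predict(m2, data))$prediction)   # expected to be TRUE
```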

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 13 additions & 6 deletions
@@ -2497,7 +2497,7 @@ test_that("read/write text files - compression option", {
   unlink(textPath)
 })

-test_that("describe() and summarize() on a DataFrame", {
+test_that("describe() and summary() on a DataFrame", {
   df <- read.json(jsonPath)
   stats <- describe(df, "age")
   expect_equal(collect(stats)[1, "summary"], "count")
@@ -2508,8 +2508,15 @@ test_that("describe() and summary() on a DataFrame", {
   expect_equal(collect(stats)[5, "age"], "30")

   stats2 <- summary(df)
-  expect_equal(collect(stats2)[4, "summary"], "min")
-  expect_equal(collect(stats2)[5, "age"], "30")
+  expect_equal(collect(stats2)[5, "summary"], "25%")
+  expect_equal(collect(stats2)[5, "age"], "30.0")
+
+  stats3 <- summary(df, "min", "max", "55.1%")
+
+  expect_equal(collect(stats3)[1, "summary"], "min")
+  expect_equal(collect(stats3)[2, "summary"], "max")
+  expect_equal(collect(stats3)[3, "summary"], "55.1%")
+  expect_equal(collect(stats3)[3, "age"], "30.0")

   # SPARK-16425: SparkR summary() fails on column of type logical
   df <- withColumn(df, "boolean", df$age == 30)
@@ -2742,15 +2749,15 @@ test_that("attach() on a DataFrame", {
   expected_age <- data.frame(age = c(NA, 30, 19))
   expect_equal(head(age), expected_age)
   stat <- summary(age)
-  expect_equal(collect(stat)[5, "age"], "30")
+  expect_equal(collect(stat)[8, "age"], "30")
   age <- age$age + 1
   expect_is(age, "Column")
   rm(age)
   stat2 <- summary(age)
-  expect_equal(collect(stat2)[5, "age"], "30")
+  expect_equal(collect(stat2)[8, "age"], "30")
   detach("df")
   stat3 <- summary(df[, "age", drop = F])
-  expect_equal(collect(stat3)[5, "age"], "30")
+  expect_equal(collect(stat3)[8, "age"], "30")
   expect_error(age)
 })
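The row indices in the `attach()` assertions move from 5 to 8 because the default `summary()` output now has eight rows; a quick illustrative check (assuming the same `df` with an `age` column as in the tests above):

```r
# Default summary rows in 2.3.0 are count, mean, stddev, min, 25%, 50%, 75%, max,
# so "max" (the "30" asserted above) is expected at row 8 rather than row 5.
collect(summary(df))[, "summary"]
# expected order: "count" "mean" "stddev" "min" "25%" "50%" "75%" "max"
```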

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,17 @@ vignette: >
   limitations under the License.
 -->

+```{r setup, include=FALSE}
+library(knitr)
+opts_hooks$set(eval = function(options) {
+  # override eval to FALSE only on windows
+  if (.Platform$OS.type == "windows") {
+    options$eval = FALSE
+  }
+  options
+})
+```
+
 ## Overview

 SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/).

common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ public int size() {
   public <T> InMemoryView<T> view(Class<T> type) {
     Preconditions.checkArgument(ti.type().equals(type), "Unexpected type: %s", type);
     Collection<T> all = (Collection<T>) data.values();
-    return new InMemoryView(type, all, ti);
+    return new InMemoryView<>(type, all, ti);
   }

 }

common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreIterator.java

Lines changed: 2 additions & 1 deletion
@@ -17,6 +17,7 @@

 package org.apache.spark.util.kvstore;

+import java.io.Closeable;
 import java.util.Iterator;
 import java.util.List;

@@ -31,7 +32,7 @@
  * </p>
  */
 @Private
-public interface KVStoreIterator<T> extends Iterator<T>, AutoCloseable {
+public interface KVStoreIterator<T> extends Iterator<T>, Closeable {

   /**
    * Retrieve multiple elements from the store.

common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java

Lines changed: 8 additions & 0 deletions
@@ -210,6 +210,14 @@ private TransportClient createClient(InetSocketAddress address)
       .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, conf.connectionTimeoutMs())
       .option(ChannelOption.ALLOCATOR, pooledAllocator);

+    if (conf.receiveBuf() > 0) {
+      bootstrap.option(ChannelOption.SO_RCVBUF, conf.receiveBuf());
+    }
+
+    if (conf.sendBuf() > 0) {
+      bootstrap.option(ChannelOption.SO_SNDBUF, conf.sendBuf());
+    }
+
     final AtomicReference<TransportClient> clientRef = new AtomicReference<>();
     final AtomicReference<Channel> channelRef = new AtomicReference<>();
