
Commit 105e1b6

Merge pull request apache-spark-on-k8s#253 from palantir/resync-apache
2 parents 72d0f1e + 388cb48 commit 105e1b6

538 files changed: +15328 / -5937 lines changed

R/pkg/DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ Package: SparkR
 Type: Package
 Version: 2.3.0
 Title: R Frontend for Apache Spark
-Description: The SparkR package provides an R Frontend for Apache Spark.
+Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
                     email = "[email protected]"),
              person("Xiangrui", "Meng", role = "aut",

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -169,6 +169,7 @@ exportMethods("arrange",
               "transform",
               "union",
               "unionAll",
+              "unionByName",
               "unique",
               "unpersist",
               "where",

R/pkg/R/DataFrame.R

Lines changed: 76 additions & 6 deletions
@@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #' @rdname union
 #' @name union
 #' @aliases union,SparkDataFrame,SparkDataFrame-method
-#' @seealso \link{rbind}
+#' @seealso \link{rbind} \link{unionByName}
 #' @export
 #' @examples
 #'\dontrun{
@@ -2714,6 +2714,40 @@ setMethod("unionAll",
             union(x, y)
           })

+#' Return a new SparkDataFrame containing the union of rows, matched by column names
+#'
+#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
+#' and another SparkDataFrame. This is different from the \code{union} function, and from
+#' both \code{UNION ALL} and \code{UNION DISTINCT} in SQL, as column positions are not
+#' taken into account. Input SparkDataFrames can have different data types in the schema.
+#'
+#' Note: this does not remove duplicate rows across the two SparkDataFrames.
+#' This function resolves columns by name (not by position).
+#'
+#' @param x A SparkDataFrame
+#' @param y A SparkDataFrame
+#' @return A SparkDataFrame containing the result of the union.
+#' @family SparkDataFrame functions
+#' @rdname unionByName
+#' @name unionByName
+#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
+#' @seealso \link{rbind} \link{union}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
+#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
+#' head(unionByName(df1, df2))
+#' }
+#' @note unionByName since 2.3.0
+setMethod("unionByName",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
+            dataFrame(unioned)
+          })
+
 #' Union two or more SparkDataFrames
 #'
 #' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
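The roxygen example above can be run as-is; as a quick illustration of why the method exists, the following sketch (assuming a local Spark session started via sparkR.session()) contrasts positional union with name-based unionByName:

# Sketch, assuming an active local Spark session.
library(SparkR)
df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")  # column order: carb, am, gear
df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")  # same columns, different order
# union() pairs columns by position, so df2's am values would land in df1's carb column;
# unionByName() resolves columns by name, keeping the data aligned.
head(unionByName(df1, df2))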
@@ -2730,7 +2764,7 @@ setMethod("unionAll",
 #' @aliases rbind,SparkDataFrame-method
 #' @rdname rbind
 #' @name rbind
-#' @seealso \link{union}
+#' @seealso \link{union} \link{unionByName}
 #' @export
 #' @examples
 #'\dontrun{
@@ -2930,7 +2964,7 @@ setMethod("saveAsTable",
             invisible(callJMethod(write, "saveAsTable", tableName))
           })

-#' summary
+#' describe
 #'
 #' Computes statistics for numeric and string columns.
 #' If no columns are given, this function computes statistics for all numerical or string columns.
@@ -2941,7 +2975,7 @@ setMethod("saveAsTable",
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
 #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @export
 #' @examples
@@ -2953,6 +2987,7 @@ setMethod("saveAsTable",
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
 #' }
+#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
 #' @note describe(SparkDataFrame, character) since 1.4.0
 setMethod("describe",
           signature(x = "SparkDataFrame", col = "character"),
@@ -2962,7 +2997,7 @@ setMethod("describe",
             dataFrame(sdf)
           })

-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @aliases describe,SparkDataFrame-method
 #' @note describe(SparkDataFrame) since 1.4.0
@@ -2973,15 +3008,50 @@ setMethod("describe",
             dataFrame(sdf)
           })

+#' summary
+#'
+#' Computes specified statistics for numeric and string columns. Available statistics are:
+#' \itemize{
+#' \item count
+#' \item mean
+#' \item stddev
+#' \item min
+#' \item max
+#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75%")
+#' }
+#' If no statistics are given, this function computes count, mean, stddev, min,
+#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+#' This function is meant for exploratory data analysis, as we make no guarantee about the
+#' backward compatibility of the schema of the resulting Dataset. If you want to
+#' programmatically compute summary statistics, use the \code{agg} function instead.
+#'
+#'
 #' @param object a SparkDataFrame to be summarized.
+#' @param ... (optional) statistics to be computed for all columns.
+#' @return A SparkDataFrame.
+#' @family SparkDataFrame functions
 #' @rdname summary
 #' @name summary
 #' @aliases summary,SparkDataFrame-method
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' summary(df)
+#' summary(df, "min", "25%", "75%", "max")
+#' summary(select(df, "age", "height"))
+#' }
 #' @note summary(SparkDataFrame) since 1.5.0
+#' @note The statistics provided by \code{summary} were changed in 2.3.0; use \link{describe} for the previous defaults.
+#' @seealso \link{describe}
 setMethod("summary",
           signature(object = "SparkDataFrame"),
           function(object, ...) {
-            describe(object)
+            statisticsList <- list(...)
+            sdf <- callJMethod(object@sdf, "summary", statisticsList)
+            dataFrame(sdf)
           })

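To illustrate the behavior change, a minimal sketch (the toy "age" column is hypothetical) contrasting the two entry points:

# Sketch, assuming an active Spark session; the data is hypothetical.
df <- createDataFrame(data.frame(age = c(21, 35, 44, 58)))
describe(df)                             # count, mean, stddev, min, max (the pre-2.3.0 summary defaults)
summary(df)                              # as of 2.3.0, also includes approximate quartiles (25%, 50%, 75%)
summary(df, "min", "25%", "75%", "max")  # or request specific statistics, including percentiles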

R/pkg/R/functions.R

Lines changed: 13 additions & 3 deletions
@@ -176,7 +176,8 @@ NULL
 #'
 #' @param x Column to compute on. Note the difference in the following methods:
 #' \itemize{
-#'      \item \code{to_json}: it is the column containing the struct or array of the structs.
+#'      \item \code{to_json}: it is the column containing the struct, array of the structs,
+#'            the map or array of maps.
 #'      \item \code{from_json}: it is the column containing the JSON string.
 #' }
 #' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
@@ -1700,8 +1701,9 @@ setMethod("to_date",
           })

 #' @details
-#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType}
-#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered.
+#' \code{to_json}: Converts a column containing a \code{structType}, array of \code{structType},
+#' a \code{mapType} or array of \code{mapType} into a Column of JSON string.
+#' Resolving the Column can fail if an unsupported type is encountered.
 #'
 #' @rdname column_collection_functions
 #' @aliases to_json to_json,Column-method
@@ -1715,6 +1717,14 @@ setMethod("to_date",
 #'
 #' # Converts an array of structs into a JSON array
 #' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
+#' df2 <- mutate(df2, people_json = to_json(df2$people))
+#'
+#' # Converts a map into a JSON object
+#' df2 <- sql("SELECT map('name', 'Bob') as people")
+#' df2 <- mutate(df2, people_json = to_json(df2$people))
+#'
+#' # Converts an array of maps into a JSON array
+#' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
 #' df2 <- mutate(df2, people_json = to_json(df2$people))}
 #' @note to_json since 2.2.0
 setMethod("to_json", signature(x = "Column"),

R/pkg/R/generics.R

Lines changed: 5 additions & 1 deletion
@@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
 # @export
 setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })

-#' @rdname summary
+#' @rdname describe
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

@@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
 #' @export
 setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

+#' @rdname unionByName
+#' @export
+setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })
+
 #' @rdname unpersist
 #' @export
 setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })

R/pkg/R/install.R

Lines changed: 5 additions & 1 deletion
@@ -270,7 +270,11 @@ sparkCachePath <- function() {
   if (is_windows()) {
     winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
     if (is.na(winAppPath)) {
-      stop(paste("%LOCALAPPDATA% not found.",
+      message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
+      winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
+    }
+    if (is.na(winAppPath)) {
+      stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
                  "Please define the environment variable",
                  "or restart and enter an installation path in localDir."))
     } else {
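The new control flow is a plain environment-variable fallback chain; a standalone sketch of the same pattern (the final cache subdirectory shown is an assumption for illustration, not taken from this hunk):

# Sketch of the fallback used above (Windows-only branch).
winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
if (is.na(winAppPath)) {
  # Some Windows accounts (e.g. service accounts) may not define %LOCALAPPDATA%.
  message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
  winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
}
if (is.na(winAppPath)) {
  stop("Neither %LOCALAPPDATA% nor %USERPROFILE% is set; supply localDir instead.")
}
cachePath <- file.path(winAppPath, "Apache", "Spark", "Cache")  # assumed layout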

R/pkg/tests/fulltests/test_mllib_tree.R

Lines changed: 19 additions & 17 deletions
@@ -66,7 +66,7 @@ test_that("spark.gbt", {
   # label must be binary - GBTClassifier currently only supports binary classification.
   iris2 <- iris[iris$Species != "virginica", ]
   data <- suppressWarnings(createDataFrame(iris2))
-  model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
+  model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$numTrees, 20)
@@ -94,7 +94,7 @@ test_that("spark.gbt", {

   iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
   df <- suppressWarnings(createDataFrame(iris2))
-  m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+  m <- spark.gbt(df, NumericSpecies ~ ., type = "classification", seed = 12)
   s <- summary(m)
   # test numeric prediction values
   expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
@@ -106,7 +106,7 @@ test_that("spark.gbt", {
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.gbt(data, label ~ features, "classification")
+    model <- spark.gbt(data, label ~ features, "classification", seed = 12)
     expect_equal(summary(model)$numFeatures, 692)
   }

@@ -117,10 +117,11 @@ test_that("spark.gbt", {
   trainidxs <- base::sample(nrow(data), nrow(data) * 0.7)
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
-  model <- spark.gbt(traindf, clicked ~ ., type = "classification")
+  model <- spark.gbt(traindf, clicked ~ ., type = "classification", seed = 23)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
-  model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep")
+  model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep",
+                     seed = 23)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")
 })
@@ -129,7 +130,7 @@ test_that("spark.randomForest", {
   # regression
   data <- suppressWarnings(createDataFrame(longley))
   model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
-                              numTrees = 1)
+                              numTrees = 1, seed = 1)

   predictions <- collect(predict(model, data))
   expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -177,7 +178,7 @@ test_that("spark.randomForest", {
   # classification
   data <- suppressWarnings(createDataFrame(iris))
   model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 123)

   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
@@ -215,7 +216,7 @@ test_that("spark.randomForest", {
   iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
   data <- suppressWarnings(createDataFrame(iris[-5]))
   model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 123)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$numTrees, 20)
@@ -234,28 +235,29 @@ test_that("spark.randomForest", {
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
   model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 10, maxBins = 10, numTrees = 10)
+                              maxDepth = 10, maxBins = 10, numTrees = 10, seed = 123)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
   model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
                               maxDepth = 10, maxBins = 10, numTrees = 10,
-                              handleInvalid = "keep")
+                              handleInvalid = "keep", seed = 123)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")

   # spark.randomForest classification can work on libsvm data
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.randomForest(data, label ~ features, "classification")
+    model <- spark.randomForest(data, label ~ features, "classification", seed = 123)
     expect_equal(summary(model)$numFeatures, 4)
   }
 })

 test_that("spark.decisionTree", {
   # regression
   data <- suppressWarnings(createDataFrame(longley))
-  model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
+  model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+                              seed = 42)

   predictions <- collect(predict(model, data))
   expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -288,7 +290,7 @@ test_that("spark.decisionTree", {
   # classification
   data <- suppressWarnings(createDataFrame(iris))
   model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 43)

   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
@@ -325,7 +327,7 @@ test_that("spark.decisionTree", {
   iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
   data <- suppressWarnings(createDataFrame(iris[-5]))
   model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 44)
   stats <- summary(model)
   expect_equal(stats$numFeatures, 2)
   expect_equal(stats$maxDepth, 5)
@@ -339,7 +341,7 @@ test_that("spark.decisionTree", {
   if (windows_with_hadoop()) {
     data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                     source = "libsvm")
-    model <- spark.decisionTree(data, label ~ features, "classification")
+    model <- spark.decisionTree(data, label ~ features, "classification", seed = 45)
     expect_equal(summary(model)$numFeatures, 4)
   }

@@ -351,11 +353,11 @@ test_that("spark.decisionTree", {
   traindf <- as.DataFrame(data[trainidxs, ])
   testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
   model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 5, maxBins = 16)
+                              maxDepth = 5, maxBins = 16, seed = 46)
   predictions <- predict(model, testdf)
   expect_error(collect(predictions))
   model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
-                              maxDepth = 5, maxBins = 16, handleInvalid = "keep")
+                              maxDepth = 5, maxBins = 16, handleInvalid = "keep", seed = 46)
   predictions <- predict(model, testdf)
   expect_equal(class(collect(predictions)$clicked[1]), "character")
 })
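The seeds threaded through these tests make tree training deterministic, so exact assertions stay stable across runs; a brief sketch of the property being relied on:

# Sketch, assuming an active Spark session: an identical configuration plus an
# identical seed yields identical models, so predictions match exactly.
data <- suppressWarnings(createDataFrame(longley))
m1 <- spark.randomForest(data, Employed ~ ., "regression", numTrees = 1, seed = 1)
m2 <- spark.randomForest(data, Employed ~ ., "regression", numTrees = 1, seed = 1)
p1 <- collect(predict(m1, data))$prediction
p2 <- collect(predict(m2, data))$prediction
stopifnot(identical(p1, p2))  # deterministic given a fixed seed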
