Skip to content

Commit 68ef3f5

Browse files
authored
Merge pull request apache-spark-on-k8s#230 from palantir/rk/upstream-no-squash
Upstream merge
2 parents 6eae452 + d56829a commit 68ef3f5

File tree

466 files changed

+11621
-9075
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

466 files changed

+11621
-9075
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ R-unit-tests.log
2525
R/unit-tests.out
2626
R/cran-check.out
2727
R/pkg/vignettes/sparkr-vignettes.html
28+
R/pkg/tests/fulltests/Rplots.pdf
2829
build/*.jar
2930
build/apache-maven*
3031
build/scala*

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
263263
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
264264
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
265265
(The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
266-
(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/)
266+
(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/)
267267
(Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
268268
(BSD licence) sbt and sbt-launch-lib.bash
269269
(BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ export("structField",
429429
"structField.character",
430430
"print.structField",
431431
"structType",
432+
"structType.character",
432433
"structType.jobj",
433434
"structType.structField",
434435
"print.structType")
@@ -465,5 +466,6 @@ S3method(print, summary.GBTRegressionModel)
465466
S3method(print, summary.GBTClassificationModel)
466467
S3method(structField, character)
467468
S3method(structField, jobj)
469+
S3method(structType, character)
468470
S3method(structType, jobj)
469471
S3method(structType, structField)

R/pkg/R/DataFrame.R

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,6 +1391,10 @@ setMethod("summarize",
13911391
})
13921392

13931393
dapplyInternal <- function(x, func, schema) {
1394+
if (is.character(schema)) {
1395+
schema <- structType(schema)
1396+
}
1397+
13941398
packageNamesArr <- serialize(.sparkREnv[[".packages"]],
13951399
connection = NULL)
13961400

@@ -1408,6 +1412,8 @@ dapplyInternal <- function(x, func, schema) {
14081412
dataFrame(sdf)
14091413
}
14101414

1415+
setClassUnion("characterOrstructType", c("character", "structType"))
1416+
14111417
#' dapply
14121418
#'
14131419
#' Apply a function to each partition of a SparkDataFrame.
@@ -1418,10 +1424,11 @@ dapplyInternal <- function(x, func, schema) {
14181424
#' to each partition will be passed.
14191425
#' The output of func should be a R data.frame.
14201426
#' @param schema The schema of the resulting SparkDataFrame after the function is applied.
1421-
#' It must match the output of func.
1427+
#' It must match the output of func. Since Spark 2.3, the DDL-formatted string
1428+
#' is also supported for the schema.
14221429
#' @family SparkDataFrame functions
14231430
#' @rdname dapply
1424-
#' @aliases dapply,SparkDataFrame,function,structType-method
1431+
#' @aliases dapply,SparkDataFrame,function,characterOrstructType-method
14251432
#' @name dapply
14261433
#' @seealso \link{dapplyCollect}
14271434
#' @export
@@ -1444,6 +1451,17 @@ dapplyInternal <- function(x, func, schema) {
14441451
#' y <- cbind(y, y[1] + 1L)
14451452
#' },
14461453
#' schema)
1454+
#'
1455+
#' # The schema also can be specified in a DDL-formatted string.
1456+
#' schema <- "a INT, b DOUBLE, c STRING, d INT"
1457+
#' df1 <- dapply(
1458+
#' df,
1459+
#' function(x) {
1460+
#' y <- x[x[1] > 1, ]
1461+
#' y <- cbind(y, y[1] + 1L)
1462+
#' },
1463+
#' schema)
1464+
#'
14471465
#' collect(df1)
14481466
#' # the result
14491467
#' # a b c d
@@ -1452,7 +1470,7 @@ dapplyInternal <- function(x, func, schema) {
14521470
#' }
14531471
#' @note dapply since 2.0.0
14541472
setMethod("dapply",
1455-
signature(x = "SparkDataFrame", func = "function", schema = "structType"),
1473+
signature(x = "SparkDataFrame", func = "function", schema = "characterOrstructType"),
14561474
function(x, func, schema) {
14571475
dapplyInternal(x, func, schema)
14581476
})
@@ -1522,6 +1540,7 @@ setMethod("dapplyCollect",
15221540
#' @param schema the schema of the resulting SparkDataFrame after the function is applied.
15231541
#' The schema must match to output of \code{func}. It has to be defined for each
15241542
#' output column with preferred output column name and corresponding data type.
1543+
#' Since Spark 2.3, the DDL-formatted string is also supported for the schema.
15251544
#' @return A SparkDataFrame.
15261545
#' @family SparkDataFrame functions
15271546
#' @aliases gapply,SparkDataFrame-method
@@ -1541,7 +1560,7 @@ setMethod("dapplyCollect",
15411560
#'
15421561
#' Here our output contains three columns, the key which is a combination of two
15431562
#' columns with data types integer and string and the mean which is a double.
1544-
#' schema <- structType(structField("a", "integer"), structField("c", "string"),
1563+
#' schema <- structType(structField("a", "integer"), structField("c", "string"),
15451564
#' structField("avg", "double"))
15461565
#' result <- gapply(
15471566
#' df,
@@ -1550,6 +1569,15 @@ setMethod("dapplyCollect",
15501569
#' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
15511570
#' }, schema)
15521571
#'
1572+
#' The schema also can be specified in a DDL-formatted string.
1573+
#' schema <- "a INT, c STRING, avg DOUBLE"
1574+
#' result <- gapply(
1575+
#' df,
1576+
#' c("a", "c"),
1577+
#' function(key, x) {
1578+
#' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
1579+
#' }, schema)
1580+
#'
15531581
#' We can also group the data and afterwards call gapply on GroupedData.
15541582
#' For Example:
15551583
#' gdf <- group_by(df, "a", "c")

0 commit comments

Comments
 (0)