
Commit 771a5c7

Merge pull request apache-spark-on-k8s#386 from palantir/rk/more-upstream
2 parents: 1c56118 + 1930e67

598 files changed: +25140 -5622 lines


LICENSE

Lines changed: 1 addition & 0 deletions
@@ -237,6 +237,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 
      (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
      (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model)
+     (BSD 3 Clause) jmock (org.jmock:jmock-junit4:2.8.4 - http://jmock.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) ANTLR 4.5.2-1 (org.antlr:antlr4:4.5.2-1 - http://wwww.antlr.org/)
      (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org)

R/pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions
@@ -201,10 +201,13 @@ exportMethods("%<=>%",
               "approxCountDistinct",
               "approxQuantile",
               "array_contains",
+              "array_join",
               "array_max",
               "array_min",
               "array_position",
+              "array_repeat",
               "array_sort",
+              "arrays_overlap",
               "asc",
               "ascii",
               "asin",
@@ -302,6 +305,7 @@ exportMethods("%<=>%",
               "lower",
               "lpad",
               "ltrim",
+              "map_entries",
               "map_keys",
               "map_values",
               "max",

R/pkg/R/DataFrame.R

Lines changed: 2 additions & 0 deletions
@@ -2297,6 +2297,8 @@ setMethod("rename",
 
 setClassUnion("characterOrColumn", c("character", "Column"))
 
+setClassUnion("numericOrColumn", c("numeric", "Column"))
+
 #' Arrange Rows by Variables
 #'
 #' Sort a SparkDataFrame by the specified column(s).
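
The new "numericOrColumn" union lets a single S4 method signature accept either a plain number or a Column; functions.R below uses it for array_repeat's count argument. A minimal standalone sketch of the mechanism, with toy names (ToyColumn, rep_count) standing in for SparkR's real classes:

    library(methods)
    # Stand-in for SparkR's Column class (hypothetical, for illustration only).
    setClass("ToyColumn", slots = c(jc = "character"))
    setClassUnion("numericOrToyColumn", c("numeric", "ToyColumn"))
    setGeneric("rep_count", function(count) standardGeneric("rep_count"))
    # One method serves both member classes of the union:
    setMethod("rep_count", signature(count = "numericOrToyColumn"), function(count) {
      if (is(count, "ToyColumn")) count@jc else as.integer(count)
    })
    rep_count(3.0)                          # 3L
    rep_count(new("ToyColumn", jc = "hp"))  # "hp"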

R/pkg/R/client.R

Lines changed: 3 additions & 2 deletions
@@ -63,7 +63,7 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
 checkJavaVersion <- function() {
   javaBin <- "java"
   javaHome <- Sys.getenv("JAVA_HOME")
-  javaReqs <- utils::packageDescription(utils::packageName(), fields=c("SystemRequirements"))
+  javaReqs <- utils::packageDescription(utils::packageName(), fields = c("SystemRequirements"))
   sparkJavaVersion <- as.numeric(tail(strsplit(javaReqs, "[(=)]")[[1]], n = 1L))
   if (javaHome != "") {
     javaBin <- file.path(javaHome, "bin", javaBin)
@@ -90,7 +90,8 @@ checkJavaVersion <- function() {
   # Extract 8 from it to compare to sparkJavaVersion
   javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2])
   if (javaVersionNum != sparkJavaVersion) {
-    stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:", javaVersionStr))
+    stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:",
+               javaVersionStr))
   }
 }
 
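
The check above keys off the second dot-separated field of the JVM version string; a quick illustration of that parsing with an assumed sample value (the real string comes from running java -version):

    javaVersionStr <- "1.8.0_162"  # hypothetical java -version output for a Java 8 JVM
    javaVersionNum <- as.integer(strsplit(javaVersionStr, "[.]")[[1L]][2])
    javaVersionNum                 # 8, compared against sparkJavaVersion from DESCRIPTION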

R/pkg/R/functions.R

Lines changed: 80 additions & 9 deletions
@@ -189,6 +189,7 @@ NULL
 #' the map or array of maps.
 #' \item \code{from_json}: it is the column containing the JSON string.
 #' }
+#' @param y Column to compute on.
 #' @param value A value to compute on.
 #' \itemize{
 #' \item \code{array_contains}: a value to be checked if contained in the column.
@@ -207,7 +208,7 @@ NULL
 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
 #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
 #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1)))
-#' head(select(tmp, array_position(tmp$v1, 21), array_sort(tmp$v1)))
+#' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
 #' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1)))
 #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
 #' head(tmp2)
@@ -216,12 +217,13 @@ NULL
 #' head(select(tmp, sort_array(tmp$v1)))
 #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
 #' tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
-#' head(select(tmp3, map_keys(tmp3$v3)))
-#' head(select(tmp3, map_values(tmp3$v3)))
+#' head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3)))
 #' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
-#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$hp))
-#' head(select(tmp4, concat(tmp4$v4, tmp4$v5)))
-#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))}
+#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
+#' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
+#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
+#' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
+#' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))}
 NULL
 
 #' Window functions for Column operations
@@ -1976,7 +1978,7 @@ setMethod("levenshtein", signature(y = "Column"),
           })
 
 #' @details
-#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}. 
+#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
 #' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x}
 #' are on the same day of month, or both are the last day of month, time of day will be ignored.
 #' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits.
@@ -3006,6 +3008,27 @@ setMethod("array_contains",
             column(jc)
           })
 
+#' @details
+#' \code{array_join}: Concatenates the elements of column using the delimiter.
+#' Null values are replaced with nullReplacement if set, otherwise they are ignored.
+#'
+#' @param delimiter a character string that is used to concatenate the elements of column.
+#' @param nullReplacement an optional character string that is used to replace the Null values.
+#' @rdname column_collection_functions
+#' @aliases array_join array_join,Column-method
+#' @note array_join since 2.4.0
+setMethod("array_join",
+          signature(x = "Column", delimiter = "character"),
+          function(x, delimiter, nullReplacement = NULL) {
+            jc <- if (is.null(nullReplacement)) {
+              callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter)
+            } else {
+              callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter,
+                          as.character(nullReplacement))
+            }
+            column(jc)
+          })
+
 #' @details
 #' \code{array_max}: Returns the maximum value of the array.
 #'
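
A quick usage sketch of the new method, mirroring the array_join() tests added in test_sparkSQL.R below (assumes an active SparkR session):

    df <- createDataFrame(list(list(list("Hello", NA, "World!"))))
    collect(select(df, array_join(df[[1]], "#")))[[1]]               # "Hello#World!" (NA ignored)
    collect(select(df, array_join(df[[1]], "#", "Beautiful")))[[1]]  # "Hello#Beautiful#World!"
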
@@ -3048,6 +3071,26 @@ setMethod("array_position",
             column(jc)
           })
 
+#' @details
+#' \code{array_repeat}: Creates an array containing \code{x} repeated the number of times
+#' given by \code{count}.
+#'
+#' @param count a Column or constant determining the number of repetitions.
+#' @rdname column_collection_functions
+#' @aliases array_repeat array_repeat,Column,numericOrColumn-method
+#' @note array_repeat since 2.4.0
+setMethod("array_repeat",
+          signature(x = "Column", count = "numericOrColumn"),
+          function(x, count) {
+            if (class(count) == "Column") {
+              count <- count@jc
+            } else {
+              count <- as.integer(count)
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_repeat", x@jc, count)
+            column(jc)
+          })
+
 #' @details
 #' \code{array_sort}: Sorts the input array in ascending order. The elements of the input array
 #' must be orderable. NA elements will be placed at the end of the returned array.
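
Because count is declared as numericOrColumn (the union added in DataFrame.R above), both a literal and a Column work; this mirrors the tests committed below (assumes an active SparkR session):

    df <- createDataFrame(list(list("a", 3L), list("b", 2L)))
    collect(select(df, array_repeat(df[[1]], df[[2]])))[[1]]  # list("a", "a", "a"), list("b", "b")
    collect(select(df, array_repeat(df[[1]], 2L)))[[1]]       # list("a", "a"), list("b", "b")
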
@@ -3062,6 +3105,21 @@ setMethod("array_sort",
             column(jc)
           })
 
+#' @details
+#' \code{arrays_overlap}: Returns true if the input arrays have at least one non-null element in
+#' common. If not and both arrays are non-empty and any of them contains a null, it returns null.
+#' It returns false otherwise.
+#'
+#' @rdname column_collection_functions
+#' @aliases arrays_overlap arrays_overlap,Column-method
+#' @note arrays_overlap since 2.4.0
+setMethod("arrays_overlap",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "arrays_overlap", x@jc, y@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{flatten}: Creates a single array from an array of arrays.
 #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
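
The result is three-valued, which is easy to miss; this sketch reuses the test data committed below (assumes an active SparkR session):

    df <- createDataFrame(list(list(list(1L, 2L), list(3L, 1L)),   # share 1L         -> TRUE
                               list(list(1L, 2L), list(3L, 4L)),   # disjoint, no NAs -> FALSE
                               list(list(1L, NA), list(3L, 4L))))  # disjoint, has NA -> NA
    collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]]     # TRUE FALSE NA
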
@@ -3076,6 +3134,19 @@ setMethod("flatten",
             column(jc)
           })
 
+#' @details
+#' \code{map_entries}: Returns an unordered array of all entries in the given map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_entries map_entries,Column-method
+#' @note map_entries since 2.4.0
+setMethod("map_entries",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_entries", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{map_keys}: Returns an unordered array containing the keys of the map.
 #'
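
Each map entry comes back as a (key, value) struct; mirroring the committed test (assumes an active SparkR session):

    df <- createDataFrame(list(list(map = as.environment(list(x = 1, y = 2)))))
    collect(select(df, map_entries(df$map)))[[1]]
    # one row holding a list of structs: (key = "x", value = 1), (key = "y", value = 2)
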
@@ -3149,8 +3220,8 @@ setMethod("size",
 #' (or starting from the end if start is negative) with the specified length.
 #'
 #' @rdname column_collection_functions
-#' @param start an index indicating the first element occuring in the result.
-#' @param length a number of consecutive elements choosen to the result.
+#' @param start an index indicating the first element occurring in the result.
+#' @param length a number of consecutive elements chosen to the result.
 #' @aliases slice slice,Column-method
 #' @note slice since 2.4.0
 setMethod("slice",

R/pkg/R/generics.R

Lines changed: 17 additions & 1 deletion
@@ -624,7 +624,7 @@ setGeneric("summarize", function(x, ...) { standardGeneric("summarize") })
 #' @rdname summary
 setGeneric("summary", function(object, ...) { standardGeneric("summary") })
 
-setGeneric("toJSON", function(x) { standardGeneric("toJSON") })
+setGeneric("toJSON", function(x, ...) { standardGeneric("toJSON") })
 
 setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
 
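The added "..." matters because S4 only lets a method declare extra named arguments when the generic's formals include "..."; with the old (x)-only signature, setMethod() would reject such a method. A toy sketch of the pattern (my_toJSON and the jsonlite call are illustrative, not SparkR's implementation):

    setGeneric("my_toJSON", function(x, ...) standardGeneric("my_toJSON"))
    # The "..." in the generic is what permits the extra pretty argument here:
    setMethod("my_toJSON", signature(x = "data.frame"),
              function(x, pretty = FALSE, ...) jsonlite::toJSON(x, pretty = pretty))
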
@@ -757,6 +757,10 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") })
 #' @name NULL
 setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_max", function(x) { standardGeneric("array_max") })
@@ -769,10 +773,18 @@ setGeneric("array_min", function(x) { standardGeneric("array_min") })
 #' @name NULL
 setGeneric("array_position", function(x, value) { standardGeneric("array_position") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("arrays_overlap", function(x, y) { standardGeneric("arrays_overlap") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("ascii", function(x) { standardGeneric("ascii") })
@@ -1034,6 +1046,10 @@ setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") })
 #' @name NULL
 setGeneric("ltrim", function(x, trimString) { standardGeneric("ltrim") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("map_entries", function(x) { standardGeneric("map_entries") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("map_keys", function(x) { standardGeneric("map_keys") })

R/pkg/R/sparkR.R

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ sparkR.sparkContext <- function(
 
     # Don't use readString() so that we can provide a useful
     # error message if the R and Java versions are mismatched.
-    authSecretLen = readInt(f)
+    authSecretLen <- readInt(f)
     if (length(authSecretLen) == 0 || authSecretLen == 0) {
       stop("Unexpected EOF in JVM connection data. Mismatched versions?")
     }

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 36 additions & 1 deletion
@@ -1503,6 +1503,36 @@ test_that("column functions", {
   result <- collect(select(df2, reverse(df2[[1]])))[[1]]
   expect_equal(result, "cba")
 
+  # Test array_repeat()
+  df <- createDataFrame(list(list("a", 3L), list("b", 2L)))
+  result <- collect(select(df, array_repeat(df[[1]], df[[2]])))[[1]]
+  expect_equal(result, list(list("a", "a", "a"), list("b", "b")))
+
+  result <- collect(select(df, array_repeat(df[[1]], 2L)))[[1]]
+  expect_equal(result, list(list("a", "a"), list("b", "b")))
+
+  # Test arrays_overlap()
+  df <- createDataFrame(list(list(list(1L, 2L), list(3L, 1L)),
+                             list(list(1L, 2L), list(3L, 4L)),
+                             list(list(1L, NA), list(3L, 4L))))
+  result <- collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]]
+  expect_equal(result, c(TRUE, FALSE, NA))
+
+  # Test array_join()
+  df <- createDataFrame(list(list(list("Hello", "World!"))))
+  result <- collect(select(df, array_join(df[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+  df2 <- createDataFrame(list(list(list("Hello", NA, "World!"))))
+  result <- collect(select(df2, array_join(df2[[1]], "#", "Beautiful")))[[1]]
+  expect_equal(result, "Hello#Beautiful#World!")
+  result <- collect(select(df2, array_join(df2[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+  df3 <- createDataFrame(list(list(list("Hello", NULL, "World!"))))
+  result <- collect(select(df3, array_join(df3[[1]], "#", "Beautiful")))[[1]]
+  expect_equal(result, "Hello#Beautiful#World!")
+  result <- collect(select(df3, array_join(df3[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+
   # Test array_sort() and sort_array()
   df <- createDataFrame(list(list(list(2L, 1L, 3L, NA)), list(list(NA, 6L, 5L, NA, 4L))))
 
@@ -1531,8 +1561,13 @@ test_that("column functions", {
   result <- collect(select(df, flatten(df[[1]])))[[1]]
   expect_equal(result, list(list(1L, 2L, 3L, 4L), list(5L, 6L, 7L, 8L)))
 
-  # Test map_keys(), map_values() and element_at()
+  # Test map_entries(), map_keys(), map_values() and element_at()
   df <- createDataFrame(list(list(map = as.environment(list(x = 1, y = 2)))))
+  result <- collect(select(df, map_entries(df$map)))[[1]]
+  expected_entries <- list(listToStruct(list(key = "x", value = 1)),
+                           listToStruct(list(key = "y", value = 2)))
+  expect_equal(result, list(expected_entries))
+
   result <- collect(select(df, map_keys(df$map)))[[1]]
   expect_equal(result, list(list("x", "y")))
 

README.md

Lines changed: 2 additions & 0 deletions
@@ -99,6 +99,8 @@ can be run using:
 Please see the guidance on how to
 [run tests for a module, or individual tests](http://spark.apache.org/developer-tools.html#individual-tests).
 
+There is also a Kubernetes integration test, see resource-managers/kubernetes/integration-tests/README.md
+
 ## A Note About Hadoop Versions
 
 Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported

bin/docker-image-tool.sh

Lines changed: 24 additions & 9 deletions
@@ -63,16 +63,25 @@ function build {
   if [ ! -d "$IMG_PATH" ]; then
     error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
   fi
-
-  local DOCKERFILE=${DOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
-
-  docker build "${BUILD_ARGS[@]}" \
+  local BINDING_BUILD_ARGS=(
+    --build-arg
+    base_img=$(image_ref spark)
+  )
+  local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
+  local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}
+
+  docker build $NOCACHEARG "${BUILD_ARGS[@]}" \
     -t $(image_ref spark) \
-    -f "$DOCKERFILE" .
+    -f "$BASEDOCKERFILE" .
+
+  docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \
+    -t $(image_ref spark-py) \
+    -f "$PYDOCKERFILE" .
 }
 
 function push {
   docker push "$(image_ref spark)"
+  docker push "$(image_ref spark-py)"
 }
 
 function usage {
@@ -86,10 +95,12 @@ Commands:
   push        Push a pre-built image to a registry. Requires a repository address to be provided.
 
 Options:
-  -f file     Dockerfile to build. By default builds the Dockerfile shipped with Spark.
+  -f file     Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
+  -p file     Dockerfile with Python baked in. By default builds the Dockerfile shipped with Spark.
   -r repo     Repository address.
   -t tag      Tag to apply to the built image, or to identify the image to be pushed.
   -m          Use minikube's Docker daemon.
+  -n          Build docker image with --no-cache
 
 Using minikube when building images will do so directly into minikube's Docker daemon.
 There is no need to push the images into minikube in that case, they'll be automatically
@@ -116,14 +127,18 @@ fi
 
 REPO=
 TAG=
-DOCKERFILE=
-while getopts f:mr:t: option
+BASEDOCKERFILE=
+PYDOCKERFILE=
+NOCACHEARG=
+while getopts f:mr:t:n option
 do
   case "${option}"
   in
-  f) DOCKERFILE=${OPTARG};;
+  f) BASEDOCKERFILE=${OPTARG};;
+  p) PYDOCKERFILE=${OPTARG};;
   r) REPO=${OPTARG};;
   t) TAG=${OPTARG};;
+  n) NOCACHEARG="--no-cache";;
   m)
     if ! which minikube 1>/dev/null; then
       error "Cannot find minikube."
