
Commit a41d41c

Merge pull request apache-spark-on-k8s#473 from palantir/rk/merge
Small merge from upstream
2 parents: d172c9c + ab6746e

173 files changed: +1993 −1937 lines

R/pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions
@@ -195,6 +195,7 @@ exportMethods("%<=>%",
               "acos",
               "add_months",
               "alias",
+              "approx_count_distinct",
               "approxCountDistinct",
               "approxQuantile",
               "array_contains",
@@ -253,6 +254,7 @@ exportMethods("%<=>%",
               "dayofweek",
               "dayofyear",
               "decode",
+              "degrees",
               "dense_rank",
               "desc",
               "element_at",
@@ -335,6 +337,7 @@ exportMethods("%<=>%",
               "posexplode",
               "posexplode_outer",
               "quarter",
+              "radians",
               "rand",
               "randn",
               "rank",
@@ -381,6 +384,7 @@ exportMethods("%<=>%",
               "tanh",
               "toDegrees",
               "toRadians",
+              "to_csv",
               "to_date",
               "to_json",
               "to_timestamp",

R/pkg/R/functions.R

Lines changed: 90 additions & 14 deletions
@@ -112,7 +112,7 @@ NULL
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
 #' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp),
 #'               v3 = bround(df$wt, 1), v4 = bin(df$cyl),
-#'               v5 = hex(df$wt), v6 = toDegrees(df$gear),
+#'               v5 = hex(df$wt), v6 = degrees(df$gear),
 #'               v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am),
 #'               v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1),
 #'               v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5),
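The documentation example above is the first place the rename shows up: v6 now uses degrees() rather than the deprecated toDegrees(). For orientation, a minimal hedged sketch of the new names in a live session (assumes a running SparkR session; the mtcars-based frame mirrors the doc example):

    library(SparkR)
    sparkR.session()
    df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
    # the aggregate must stand alone in select(); the math functions can be combined
    head(select(df, approx_count_distinct(df$gear)))
    head(select(df, degrees(df$gear), radians(df$gear)))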
@@ -187,6 +187,7 @@ NULL
 #' \itemize{
 #' \item \code{to_json}: it is the column containing the struct, array of the structs,
 #'       the map or array of maps.
+#' \item \code{to_csv}: it is the column containing the struct.
 #' \item \code{from_json}: it is the column containing the JSON string.
 #' \item \code{from_csv}: it is the column containing the CSV string.
 #' }
@@ -204,11 +205,11 @@ NULL
 #' also supported for the schema.
 #' \item \code{from_csv}: a DDL-formatted string
 #' }
-#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
-#'            additional named properties to control how it is converted, accepts the same
-#'            options as the JSON data source. Additionally \code{to_json} supports the "pretty"
-#'            option which enables pretty JSON generation. In \code{arrays_zip}, this contains
-#'            additional Columns of arrays to be merged.
+#' @param ... additional argument(s). In \code{to_json}, \code{to_csv} and \code{from_json},
+#'            this contains additional named properties to control how it is converted, accepts
+#'            the same options as the JSON/CSV data source. Additionally \code{to_json} supports
+#'            the "pretty" option which enables pretty JSON generation. In \code{arrays_zip},
+#'            this contains additional Columns of arrays to be merged.
 #' @name column_collection_functions
 #' @rdname column_collection_functions
 #' @family collection functions
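The reworded @param above documents how these extra arguments travel: named arguments after the column are collected and passed to the JVM as data source options. A short sketch under that reading (the struct column here is illustrative):

    df <- sql("SELECT named_struct('name', 'Bob', 'age', 30) as person")
    # "pretty" is the to_json-specific option named in the @param text above
    head(select(df, to_json(df$person, pretty = TRUE)))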
@@ -319,23 +320,37 @@ setMethod("acos",
           })
 
 #' @details
-#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group.
+#' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
 #'
 #' @rdname column_aggregate_functions
-#' @aliases approxCountDistinct approxCountDistinct,Column-method
+#' @aliases approx_count_distinct approx_count_distinct,Column-method
 #' @examples
 #'
 #' \dontrun{
-#' head(select(df, approxCountDistinct(df$gear)))
-#' head(select(df, approxCountDistinct(df$gear, 0.02)))
+#' head(select(df, approx_count_distinct(df$gear)))
+#' head(select(df, approx_count_distinct(df$gear, 0.02)))
 #' head(select(df, countDistinct(df$gear, df$cyl)))
 #' head(select(df, n_distinct(df$gear)))
 #' head(distinct(select(df, "gear")))}
+#' @note approx_count_distinct(Column) since 3.0.0
+setMethod("approx_count_distinct",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases approxCountDistinct approxCountDistinct,Column-method
 #' @note approxCountDistinct(Column) since 1.4.0
 setMethod("approxCountDistinct",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc)
+            .Deprecated("approx_count_distinct")
+            jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc)
             column(jc)
           })
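Note the shape of the deprecation: the camelCase method still works, but it now calls .Deprecated() and forwards to the same JVM function as the snake_case method. What a caller sees, sketched assuming base R's standard deprecation message format:

    head(select(df, approxCountDistinct(df$gear)))
    # Warning message:
    # 'approxCountDistinct' is deprecated.
    # Use 'approx_count_distinct' instead.
    # See help("Deprecated")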

@@ -1650,7 +1665,22 @@ setMethod("tanh",
 setMethod("toDegrees",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "toDegrees", x@jc)
+            .Deprecated("degrees")
+            jc <- callJStatic("org.apache.spark.sql.functions", "degrees", x@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{degrees}: Converts an angle measured in radians to an approximately equivalent angle
+#' measured in degrees.
+#'
+#' @rdname column_math_functions
+#' @aliases degrees degrees,Column-method
+#' @note degrees since 3.0.0
+setMethod("degrees",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "degrees", x@jc)
             column(jc)
           })

@@ -1664,7 +1694,22 @@ setMethod("toDegrees",
 setMethod("toRadians",
           signature(x = "Column"),
           function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "toRadians", x@jc)
+            .Deprecated("radians")
+            jc <- callJStatic("org.apache.spark.sql.functions", "radians", x@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{radians}: Converts an angle measured in degrees to an approximately equivalent angle
+#' measured in radians.
+#'
+#' @rdname column_math_functions
+#' @aliases radians radians,Column-method
+#' @note radians since 3.0.0
+setMethod("radians",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "radians", x@jc)
             column(jc)
           })
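degrees and radians follow the same deprecate-and-forward pattern as approx_count_distinct. A quick round-trip sketch (the angle column is illustrative; the identity holds only up to floating-point error):

    df <- createDataFrame(data.frame(angle = c(0, 90, 180)))
    # degrees(radians(x)) should recover x, modulo floating-point rounding
    head(select(df, radians(df$angle), degrees(radians(df$angle))))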

@@ -1740,6 +1785,26 @@ setMethod("to_json", signature(x = "Column"),
             column(jc)
           })
 
+#' @details
+#' \code{to_csv}: Converts a column containing a \code{structType} into a Column of CSV string.
+#' Resolving the Column can fail if an unsupported type is encountered.
+#'
+#' @rdname column_collection_functions
+#' @aliases to_csv to_csv,Column-method
+#' @examples
+#'
+#' \dontrun{
+#' # Converts a struct into a CSV string
+#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
+#' select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy'))}
+#' @note to_csv since 3.0.0
+setMethod("to_csv", signature(x = "Column"),
+          function(x, ...) {
+            options <- varargsToStrEnv(...)
+            jc <- callJStatic("org.apache.spark.sql.functions", "to_csv", x@jc, options)
+            column(jc)
+          })
+
 #' @details
 #' \code{to_timestamp}: Converts the column into a TimestampType. You may optionally specify
 #' a format according to the rules in:
@@ -2044,13 +2109,24 @@ setMethod("pmod", signature(y = "Column"),
 
 #' @param rsd maximum estimation error allowed (default = 0.05).
 #'
+#' @rdname column_aggregate_functions
+#' @aliases approx_count_distinct,Column-method
+#' @note approx_count_distinct(Column, numeric) since 3.0.0
+setMethod("approx_count_distinct",
+          signature(x = "Column"),
+          function(x, rsd = 0.05) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc, rsd)
+            column(jc)
+          })
+
 #' @rdname column_aggregate_functions
 #' @aliases approxCountDistinct,Column-method
 #' @note approxCountDistinct(Column, numeric) since 1.4.0
 setMethod("approxCountDistinct",
           signature(x = "Column"),
           function(x, rsd = 0.05) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd)
+            .Deprecated("approx_count_distinct")
+            jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc, rsd)
             column(jc)
           })
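Pulling the functions.R changes together, a hedged end-to-end sketch (the tighter 0.02 rsd and the expected "01/01/2000" output are illustrative, not asserted by this commit):

    # struct -> CSV string, with a data source option passed through "..."
    df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
    collect(select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy')))  # expect "01/01/2000"
    # approximate distinct count with a tighter error bound than the 0.05 default
    head(select(df, approx_count_distinct(df$gear, 0.02)))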

R/pkg/R/generics.R

Lines changed: 16 additions & 0 deletions
@@ -749,6 +749,10 @@ setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy"
 #' @name NULL
 setGeneric("add_months", function(y, x) { standardGeneric("add_months") })
 
+#' @rdname column_aggregate_functions
+#' @name NULL
+setGeneric("approx_count_distinct", function(x, ...) { standardGeneric("approx_count_distinct") })
+
 #' @rdname column_aggregate_functions
 #' @name NULL
 setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") })
@@ -1290,10 +1294,18 @@ setGeneric("substring_index", function(x, delim, count) { standardGeneric("subst
 #' @name NULL
 setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
 
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("degrees", function(x) { standardGeneric("degrees") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") })
 
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("radians", function(x) { standardGeneric("radians") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
@@ -1306,6 +1318,10 @@ setGeneric("to_date", function(x, format) { standardGeneric("to_date") })
 #' @name NULL
 setGeneric("to_json", function(x, ...) { standardGeneric("to_json") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("to_csv", function(x, ...) { standardGeneric("to_csv") })
+
 #' @rdname column_datetime_functions
 #' @name NULL
 setGeneric("to_timestamp", function(x, format) { standardGeneric("to_timestamp") })
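Each setGeneric here only declares a dispatch point; the Column implementations are attached in functions.R via setMethod. The same S4 split in miniature, with toy names that are not part of Spark:

    # toy generic + method mirroring the setGeneric/setMethod pattern above
    setGeneric("shout", function(x, ...) { standardGeneric("shout") })
    setMethod("shout", signature(x = "character"),
              function(x, ...) { toupper(x) })
    shout("hello")  # returns "HELLO"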

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 7 additions & 2 deletions
@@ -1379,7 +1379,7 @@ test_that("column operators", {
 
 test_that("column functions", {
   c <- column("a")
-  c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
+  c1 <- abs(c) + acos(c) + approx_count_distinct(c) + ascii(c) + asin(c) + atan(c)
   c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
   c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
   c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
@@ -1388,7 +1388,7 @@ test_that("column functions", {
   c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
   c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
   c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
-  c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
+  c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
   c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
   c12 <- variance(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
   c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
@@ -1689,6 +1689,11 @@ test_that("column functions", {
     expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")
   }
 
+  # Test to_csv()
+  df <- sql("SELECT named_struct('name', 'Bob') as people")
+  j <- collect(select(df, alias(to_csv(df$people), "csv")))
+  expect_equal(j[order(j$csv), ][1], "Bob")
+
   # Test create_array() and create_map()
   df <- as.DataFrame(data.frame(
     x = c(1.0, 2.0), y = c(-1.0, 3.0), z = c(-2.0, 5.0)

bin/docker-image-tool.sh

Lines changed: 3 additions & 0 deletions
@@ -197,6 +197,9 @@ do
     if ! which minikube 1>/dev/null; then
       error "Cannot find minikube."
     fi
+    if ! minikube status 1>/dev/null; then
+      error "Cannot contact minikube. Make sure it's running."
+    fi
     eval $(minikube docker-env)
     ;;
 esac

bin/spark-shell

Lines changed: 4 additions & 1 deletion
@@ -32,7 +32,10 @@ if [ -z "${SPARK_HOME}" ]; then
   source "$(dirname "$0")"/find-spark-home
 fi
 
-export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]"
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]
+
+Scala REPL options:
+  -I <file>                   preload <file>, enforcing line-by-line interpretation"
 
 # SPARK-4161: scala does not assume use of the java classpath,
 # so we need to add the "-Dscala.usejavacp=true" flag manually. We

bin/spark-shell2.cmd

Lines changed: 7 additions & 1 deletion
@@ -20,7 +20,13 @@ rem
 rem Figure out where the Spark framework is installed
 call "%~dp0find-spark-home.cmd"
 
-set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]
+set LF=^
+
+
+rem two empty lines are required
+set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]^%LF%%LF%^%LF%%LF%^
+Scala REPL options:^%LF%%LF%^
+  -I ^<file^>                  preload ^<file^>, enforcing line-by-line interpretation
 
 rem SPARK-4161: scala does not assume use of the java classpath,
 rem so we need to add the "-Dscala.usejavacp=true" flag manually. We

core/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@
     </dependency>
     <dependency>
       <groupId>org.apache.xbean</groupId>
-      <artifactId>xbean-asm6-shaded</artifactId>
+      <artifactId>xbean-asm7-shaded</artifactId>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>

core/src/main/java/org/apache/spark/ExecutorPlugin.java

Lines changed: 3 additions & 3 deletions
@@ -20,18 +20,18 @@
 import org.apache.spark.annotation.DeveloperApi;
 
 /**
- * A plugin which can be automaticaly instantiated within each Spark executor. Users can specify
+ * A plugin which can be automatically instantiated within each Spark executor. Users can specify
  * plugins which should be created with the "spark.executor.plugins" configuration. An instance
 * of each plugin will be created for every executor, including those created by dynamic allocation,
 * before the executor starts running any tasks.
 *
 * The specific api exposed to the end users still considered to be very unstable. We will
- * hopefully be able to keep compatability by providing default implementations for any methods
+ * hopefully be able to keep compatibility by providing default implementations for any methods
 * added, but make no guarantees this will always be possible across all Spark releases.
 *
 * Spark does nothing to verify the plugin is doing legitimate things, or to manage the resources
 * it uses. A plugin acquires the same privileges as the user running the task. A bad plugin
- * could also intefere with task execution and make the executor fail in unexpected ways.
+ * could also interfere with task execution and make the executor fail in unexpected ways.
 */
 @DeveloperApi
 public interface ExecutorPlugin {

core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java

Lines changed: 4 additions & 1 deletion
@@ -42,7 +42,10 @@ public final class UnsafeSorterSpillWriter {
 
   private final SparkConf conf = new SparkConf();
 
-  /** The buffer size to use when writing the sorted records to an on-disk file */
+  /**
+   * The buffer size to use when writing the sorted records to an on-disk file, and
+   * this space used by prefix + len + recordLength must be greater than 4 + 8 bytes.
+   */
   private final int diskWriteBufferSize =
     (int) (long) conf.get(package$.MODULE$.SHUFFLE_DISK_WRITE_BUFFER_SIZE());
