[SPARK-25007][R] Add array_intersect/array_except/array_union/shuffle to SparkR

huaxingao · Felix Cheung · commit a481794ca9a5 · 2018-09-02T00:06:19.000-07:00
## What changes were proposed in this pull request? Add the R version of array_intersect/array_except/array_union/shuffle ## How was this patch tested? Add test in test_sparkSQL.R Author: Huaxin Gao <huaxing@us.ibm.com> Closes apache#22291 from huaxingao/spark-25007.
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -204,6 +204,8 @@ exportMethods("%<=>%",
               "approxQuantile",
               "array_contains",
               "array_distinct",
+              "array_except",
+              "array_intersect",
               "array_join",
               "array_max",
               "array_min",
@@ -212,6 +214,7 @@ exportMethods("%<=>%",
               "array_repeat",
               "array_sort",
               "arrays_overlap",
+              "array_union",
               "arrays_zip",
               "asc",
               "ascii",
@@ -355,6 +358,7 @@ exportMethods("%<=>%",
               "shiftLeft",
               "shiftRight",
               "shiftRightUnsigned",
+              "shuffle",
               "sd",
               "sign",
               "signum",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
@@ -208,7 +208,7 @@ NULL
 #' # Dataframe used throughout this doc
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
-#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
+#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1)))
 #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
 #' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
 #' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1), array_remove(tmp$v1, 21)))
@@ -223,6 +223,8 @@ NULL
 #' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
 #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
 #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
+#' head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5)))
+#' head(select(tmp4, array_union(tmp4$v4, tmp4$v5)))
 #' head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5), map_from_arrays(tmp4$v4, tmp4$v5)))
 #' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
 #' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
@@ -3024,6 +3026,34 @@ setMethod("array_distinct",
             column(jc)
           })
 
+#' @details
+#' \code{array_except}: Returns an array of the elements in the first array but not in the second
+#'  array, without duplicates. The order of elements in the result is not determined.
+#'
+#' @rdname column_collection_functions
+#' @aliases array_except array_except,Column-method
+#' @note array_except since 2.4.0
+setMethod("array_except",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_except", x@jc, y@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{array_intersect}: Returns an array of the elements in the intersection of the given two
+#'  arrays, without duplicates.
+#'
+#' @rdname column_collection_functions
+#' @aliases array_intersect array_intersect,Column-method
+#' @note array_intersect since 2.4.0
+setMethod("array_intersect",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_intersect", x@jc, y@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{array_join}: Concatenates the elements of column using the delimiter.
 #' Null values are replaced with nullReplacement if set, otherwise they are ignored.
@@ -3149,6 +3179,20 @@ setMethod("arrays_overlap",
             column(jc)
           })
 
+#' @details
+#' \code{array_union}: Returns an array of the elements in the union of the given two arrays,
+#'  without duplicates.
+#'
+#' @rdname column_collection_functions
+#' @aliases array_union array_union,Column-method
+#' @note array_union since 2.4.0
+setMethod("array_union",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_union", x@jc, y@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{arrays_zip}: Returns a merged array of structs in which the N-th struct contains all N-th
 #' values of input arrays.
@@ -3167,6 +3211,19 @@ setMethod("arrays_zip",
             column(jc)
           })
 
+#' @details
+#' \code{shuffle}: Returns a random permutation of the given array.
+#'
+#' @rdname column_collection_functions
+#' @aliases shuffle shuffle,Column-method
+#' @note shuffle since 2.4.0
+setMethod("shuffle",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "shuffle", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{flatten}: Creates a single array from an array of arrays.
 #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -767,6 +767,14 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain
 #' @name NULL
 setGeneric("array_distinct", function(x) { standardGeneric("array_distinct") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_except", function(x, y) { standardGeneric("array_except") })
+
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_intersect", function(x, y) { standardGeneric("array_intersect") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
@@ -799,6 +807,10 @@ setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
 #' @name NULL
 setGeneric("arrays_overlap", function(x, y) { standardGeneric("arrays_overlap") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_union", function(x, y) { standardGeneric("array_union") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("arrays_zip", function(x, ...) { standardGeneric("arrays_zip") })
@@ -1220,6 +1232,10 @@ setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") })
 #' @name NULL
 setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("shuffle", function(x) { standardGeneric("shuffle") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("signum", function(x) { standardGeneric("signum") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1598,6 +1598,25 @@ test_that("column functions", {
   result <- collect(select(df, element_at(df$map, "y")))[[1]]
   expect_equal(result, 2)
 
+  # Test array_except(), array_intersect() and array_union()
+  df <- createDataFrame(list(list(list(1L, 2L, 3L), list(3L, 1L)),
+                             list(list(1L, 2L), list(3L, 4L)),
+                             list(list(1L, 2L, 3L), list(3L, 4L))))
+  result1 <- collect(select(df, array_except(df[[1]], df[[2]])))[[1]]
+  expect_equal(result1, list(list(2L), list(1L, 2L), list(1L, 2L)))
+
+  result2 <- collect(select(df, array_intersect(df[[1]], df[[2]])))[[1]]
+  expect_equal(result2, list(list(1L, 3L), list(), list(3L)))
+
+  result3 <- collect(select(df, array_union(df[[1]], df[[2]])))[[1]]
+  expect_equal(result3, list(list(1L, 2L, 3L), list(1L, 2L, 3L, 4L), list(1L, 2L, 3L, 4L)))
+
+  # Test shuffle()
+  df <- createDataFrame(list(list(list(1L, 20L, 3L, 5L)), list(list(4L, 5L, 6L, 7L))))
+  result <- collect(select(df, shuffle(df[[1]])))[[1]]
+  expect_true(setequal(result[[1]], c(1L, 20L, 3L, 5L)))
+  expect_true(setequal(result[[2]], c(4L, 5L, 6L, 7L)))
+
   # Test that stats::lag is working
   expect_equal(length(lag(ldeaths, 12)), 72)