Commit a28728a

goldmedal authored and HyukjinKwon committed

[SPARK-21513][SQL][FOLLOWUP] Allow UDF to_json support converting MapType to json for PySpark and SparkR
## What changes were proposed in this pull request?

In the previous work SPARK-21513, we have allowed `MapType` and `ArrayType` of `MapType`s to be converted to a JSON string, but only for the Scala API. In this follow-up PR, we make Spark SQL support it for PySpark and SparkR, too. We also fix a few small bugs and comments from the previous work.

### For PySpark

```
>>> data = [(1, {"name": "Alice"})]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'{"name":"Alice"}')]
>>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
```

### For SparkR

```
# Converts a map into a JSON object
df2 <- sql("SELECT map('name', 'Bob') as people")
df2 <- mutate(df2, people_json = to_json(df2$people))

# Converts an array of maps into a JSON array
df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
```

## How was this patch tested?

Added unit test cases.

cc viirya HyukjinKwon

Author: goldmedal <[email protected]>

Closes apache#19223 from goldmedal/SPARK-21513-fp-PySaprkAndSparkR.
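For completeness, the Scala API added in the original SPARK-21513 work already covers the same cases; here is a minimal sketch, assuming an active `SparkSession` named `spark` (the column names are illustrative):

```scala
import org.apache.spark.sql.functions.to_json

// A map column renders as a JSON object; an array of maps renders as a JSON array.
val df = spark.sql(
  "SELECT map('name', 'Bob') AS people, " +
  "array(map('name', 'Bob'), map('name', 'Alice')) AS people_list")

df.select(
  to_json(df("people")).alias("people_json"),
  to_json(df("people_list")).alias("people_list_json")
).show(false)
// people_json:      {"name":"Bob"}
// people_list_json: [{"name":"Bob"},{"name":"Alice"}]
```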
1 parent 054ddb2 · commit a28728a

File tree

6 files changed (+46, -18 lines)

R/pkg/R/functions.R

Lines changed: 13 additions & 3 deletions
```
@@ -176,7 +176,8 @@ NULL
 #'
 #' @param x Column to compute on. Note the difference in the following methods:
 #' \itemize{
-#' \item \code{to_json}: it is the column containing the struct or array of the structs.
+#' \item \code{to_json}: it is the column containing the struct, array of the structs,
+#' the map or array of maps.
 #' \item \code{from_json}: it is the column containing the JSON string.
 #' }
 #' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
@@ -1700,8 +1701,9 @@ setMethod("to_date",
           })
 
 #' @details
-#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType}
-#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered.
+#' \code{to_json}: Converts a column containing a \code{structType}, array of \code{structType},
+#' a \code{mapType} or array of \code{mapType} into a Column of JSON string.
+#' Resolving the Column can fail if an unsupported type is encountered.
 #'
 #' @rdname column_collection_functions
 #' @aliases to_json to_json,Column-method
@@ -1715,6 +1717,14 @@ setMethod("to_date",
 #'
 #' # Converts an array of structs into a JSON array
 #' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
+#' df2 <- mutate(df2, people_json = to_json(df2$people))
+#'
+#' # Converts a map into a JSON object
+#' df2 <- sql("SELECT map('name', 'Bob') as people")
+#' df2 <- mutate(df2, people_json = to_json(df2$people))
+#'
+#' # Converts an array of maps into a JSON array
+#' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
 #' df2 <- mutate(df2, people_json = to_json(df2$people))}
 #' @note to_json since 2.2.0
 setMethod("to_json", signature(x = "Column"),
```

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 8 additions & 0 deletions
```
@@ -1491,6 +1491,14 @@ test_that("column functions", {
   j <- collect(select(df, alias(to_json(df$people), "json")))
   expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
 
+  df <- sql("SELECT map('name', 'Bob') as people")
+  j <- collect(select(df, alias(to_json(df$people), "json")))
+  expect_equal(j[order(j$json), ][1], "{\"name\":\"Bob\"}")
+
+  df <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
+  j <- collect(select(df, alias(to_json(df$people), "json")))
+  expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
+
   df <- read.json(mapTypeJsonPath)
   j <- collect(select(df, alias(to_json(df$info), "json")))
   expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
```

python/pyspark/sql/functions.py

Lines changed: 16 additions & 6 deletions
```
@@ -1884,9 +1884,9 @@ def json_tuple(col, *fields):
 @since(2.1)
 def from_json(col, schema, options={}):
     """
-    Parses a column containing a JSON string into a [[StructType]] or [[ArrayType]]
-    of [[StructType]]s with the specified schema. Returns `null`, in the case of an unparseable
-    string.
+    Parses a column containing a JSON string into a :class:`StructType` or :class:`ArrayType`
+    of :class:`StructType`\\s with the specified schema. Returns `null`, in the case of an
+    unparseable string.
 
     :param col: string column in json format
     :param schema: a StructType or ArrayType of StructType to use when parsing the json column.
@@ -1921,10 +1921,12 @@ def from_json(col, schema, options={}):
 @since(2.1)
 def to_json(col, options={}):
     """
-    Converts a column containing a [[StructType]] or [[ArrayType]] of [[StructType]]s into a
-    JSON string. Throws an exception, in the case of an unsupported type.
+    Converts a column containing a :class:`StructType`, :class:`ArrayType` of
+    :class:`StructType`\\s, a :class:`MapType` or :class:`ArrayType` of :class:`MapType`\\s
+    into a JSON string. Throws an exception, in the case of an unsupported type.
 
-    :param col: name of column containing the struct or array of the structs
+    :param col: name of column containing the struct, array of the structs, the map or
+        array of the maps.
     :param options: options to control converting. accepts the same options as the json datasource
 
     >>> from pyspark.sql import Row
@@ -1937,6 +1939,14 @@ def to_json(col, options={}):
     >>> df = spark.createDataFrame(data, ("key", "value"))
     >>> df.select(to_json(df.value).alias("json")).collect()
     [Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]
+    >>> data = [(1, {"name": "Alice"})]
+    >>> df = spark.createDataFrame(data, ("key", "value"))
+    >>> df.select(to_json(df.value).alias("json")).collect()
+    [Row(json=u'{"name":"Alice"}')]
+    >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
+    >>> df = spark.createDataFrame(data, ("key", "value"))
+    >>> df.select(to_json(df.value).alias("json")).collect()
+    [Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
     """
 
     sc = SparkContext._active_spark_context
```
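The `options` parameter mirrors the Scala overload `to_json(e, options)` and accepts the same options as the JSON data source; a minimal Scala sketch of the timestamp-format case from the built-in documentation, assuming an active `SparkSession` named `spark`:

```scala
import org.apache.spark.sql.functions.to_json

// One row with a struct column holding a timestamp value.
val df = spark.sql(
  "SELECT named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')) AS d")

// Options are passed as a Map and use the JSON data source option names.
df.select(to_json(df("d"), Map("timestampFormat" -> "dd/MM/yyyy")).alias("json"))
  .show(false)
// json: {"time":"26/08/2015"}
```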

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala

Lines changed: 4 additions & 4 deletions
```
@@ -618,13 +618,13 @@ case class JsonToStructs(
        {"time":"26/08/2015"}
       > SELECT _FUNC_(array(named_struct('a', 1, 'b', 2));
        [{"a":1,"b":2}]
-      > SELECT _FUNC_(map('a',named_struct('b',1)));
+      > SELECT _FUNC_(map('a', named_struct('b', 1)));
        {"a":{"b":1}}
-      > SELECT _FUNC_(map(named_struct('a',1),named_struct('b',2)));
+      > SELECT _FUNC_(map(named_struct('a', 1),named_struct('b', 2)));
        {"[1]":{"b":2}}
-      > SELECT _FUNC_(map('a',1));
+      > SELECT _FUNC_(map('a', 1));
        {"a":1}
-      > SELECT _FUNC_(array((map('a',1))));
+      > SELECT _FUNC_(array((map('a', 1))));
        [{"a":1}]
   """,
   since = "2.2.0")
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala

Lines changed: 1 addition & 1 deletion
```
@@ -43,7 +43,7 @@ private[sql] class JacksonGenerator(
   private type ValueWriter = (SpecializedGetters, Int) => Unit
 
   // `JackGenerator` can only be initialized with a `StructType` or a `MapType`.
-  require(dataType.isInstanceOf[StructType] | dataType.isInstanceOf[MapType],
+  require(dataType.isInstanceOf[StructType] || dataType.isInstanceOf[MapType],
     "JacksonGenerator only supports to be initialized with a StructType " +
       s"or MapType but got ${dataType.simpleString}")
 
```
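The one-character fix swaps Scala's non-short-circuiting Boolean `|` for the short-circuiting `||`; the result is the same for pure type checks, but `||` is the idiomatic choice and avoids evaluating the right-hand side needlessly. An illustrative sketch (the two helper defs are hypothetical):

```scala
// `||` stops as soon as the result is known; `|` on Booleans always evaluates both sides.
def isStruct: Boolean = { println("checked StructType"); true }
def isMap: Boolean = { println("checked MapType"); false }

val shortCircuit = isStruct || isMap // prints "checked StructType" only
val eager = isStruct | isMap         // prints both messages; same Boolean result
```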

sql/core/src/test/resources/sql-tests/results/json-functions.sql.out

Lines changed: 4 additions & 4 deletions
```
@@ -26,13 +26,13 @@ Extended Usage:
       {"time":"26/08/2015"}
      > SELECT to_json(array(named_struct('a', 1, 'b', 2));
       [{"a":1,"b":2}]
-     > SELECT to_json(map('a',named_struct('b',1)));
+     > SELECT to_json(map('a', named_struct('b', 1)));
       {"a":{"b":1}}
-     > SELECT to_json(map(named_struct('a',1),named_struct('b',2)));
+     > SELECT to_json(map(named_struct('a', 1),named_struct('b', 2)));
       {"[1]":{"b":2}}
-     > SELECT to_json(map('a',1));
+     > SELECT to_json(map('a', 1));
       {"a":1}
-     > SELECT to_json(array((map('a',1))));
+     > SELECT to_json(array((map('a', 1))));
       [{"a":1}]
 
 Since: 2.2.0
```
