Skip to content

Commit 51620e2

Browse files
vinodkc authored and gatorsmile committed
[SPARK-21756][SQL] Add JSON option to allow unquoted control characters
## What changes were proposed in this pull request?

This patch adds the `allowUnquotedControlChars` option in the JSON data source to allow JSON strings to contain unquoted control characters (ASCII characters with value less than 32, including tab and line feed characters).

## How was this patch tested?

Added new test cases.

Author: vinodkc <[email protected]>

Closes apache#19008 from vinodkc/br_fix_SPARK-21756.
1 parent 628bdea commit 51620e2

File tree

6 files changed

+36
-4
lines changed

6 files changed

+36
-4
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
176176
allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
177177
allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
178178
mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
179-
multiLine=None):
179+
multiLine=None, allowUnquotedControlChars=None):
180180
"""
181181
Loads JSON files and returns the results as a :class:`DataFrame`.
182182
@@ -234,6 +234,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
234234
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.
235235
:param multiLine: parse one record, which may span multiple lines, per file. If None is
236236
set, it uses the default value, ``false``.
237+
:param allowUnquotedControlChars: allows JSON Strings to contain unquoted control
238+
characters (ASCII characters with value less than 32,
239+
including tab and line feed characters) or not.
237240
238241
>>> df1 = spark.read.json('python/test_support/sql/people.json')
239242
>>> df1.dtypes
@@ -250,7 +253,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
250253
allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,
251254
allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
252255
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
253-
timestampFormat=timestampFormat, multiLine=multiLine)
256+
timestampFormat=timestampFormat, multiLine=multiLine,
257+
allowUnquotedControlChars=allowUnquotedControlChars)
254258
if isinstance(path, basestring):
255259
path = [path]
256260
if type(path) == list:

python/pyspark/sql/streaming.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
407407
allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
408408
allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
409409
mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
410-
multiLine=None):
410+
multiLine=None, allowUnquotedControlChars=None):
411411
"""
412412
Loads a JSON file stream and returns the results as a :class:`DataFrame`.
413413
@@ -467,6 +467,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
467467
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.
468468
:param multiLine: parse one record, which may span multiple lines, per file. If None is
469469
set, it uses the default value, ``false``.
470+
:param allowUnquotedControlChars: allows JSON Strings to contain unquoted control
471+
characters (ASCII characters with value less than 32,
472+
including tab and line feed characters) or not.
470473
471474
>>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema)
472475
>>> json_sdf.isStreaming
@@ -480,7 +483,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
480483
allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,
481484
allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
482485
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
483-
timestampFormat=timestampFormat, multiLine=multiLine)
486+
timestampFormat=timestampFormat, multiLine=multiLine,
487+
allowUnquotedControlChars=allowUnquotedControlChars)
484488
if isinstance(path, basestring):
485489
return self._df(self._jreader.json(path))
486490
else:

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ private[sql] class JSONOptions(
6464
parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
6565
val allowBackslashEscapingAnyCharacter =
6666
parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false)
67+
private val allowUnquotedControlChars =
68+
parameters.get("allowUnquotedControlChars").map(_.toBoolean).getOrElse(false)
6769
val compressionCodec = parameters.get("compression").map(CompressionCodecs.getCodecClassName)
6870
val parseMode: ParseMode =
6971
parameters.get("mode").map(ParseMode.fromString).getOrElse(PermissiveMode)
@@ -92,5 +94,6 @@ private[sql] class JSONOptions(
9294
factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
9395
factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
9496
allowBackslashEscapingAnyCharacter)
97+
factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, allowUnquotedControlChars)
9598
}
9699
}

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
313313
* (e.g. 00012)</li>
314314
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
315315
* character using backslash quoting mechanism</li>
316+
* <li>`allowUnquotedControlChars` (default `false`): allows JSON Strings to contain unquoted
317+
* control characters (ASCII characters with value less than 32, including tab and line feed
318+
* characters) or not.</li>
316319
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
317320
* during parsing.
318321
* <ul>

sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
195195
* (e.g. 00012)</li>
196196
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
197197
* character using backslash quoting mechanism</li>
198+
* <li>`allowUnquotedControlChars` (default `false`): allows JSON Strings to contain unquoted
199+
* control characters (ASCII characters with value less than 32, including tab and line feed
200+
* characters) or not.</li>
198201
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
199202
* during parsing.
200203
* <ul>

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,21 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
7272
assert(df.first().getString(0) == "Reynold Xin")
7373
}
7474

75+
test("allowUnquotedControlChars off") {
76+
val str = """{"name": "a\u0001b"}"""
77+
val df = spark.read.json(Seq(str).toDS())
78+
79+
assert(df.schema.head.name == "_corrupt_record")
80+
}
81+
82+
test("allowUnquotedControlChars on") {
83+
val str = """{"name": "a\u0001b"}"""
84+
val df = spark.read.option("allowUnquotedControlChars", "true").json(Seq(str).toDS())
85+
86+
assert(df.schema.head.name == "name")
87+
assert(df.first().getString(0) == "a\u0001b")
88+
}
89+
7590
test("allowNumericLeadingZeros off") {
7691
val str = """{"age": 0018}"""
7792
val df = spark.read.json(Seq(str).toDS())

0 commit comments

Comments (0)