Skip to content

Commit 64ad7b8

Browse files
MaxGekk authored and HyukjinKwon committed
[SPARK-23772][FOLLOW-UP][SQL] Provide an option to ignore column of all null values or empty array during JSON schema inference
## What changes were proposed in this pull request? The `dropFieldIfAllNull` parameter of the `json` method wasn't set as an option. This PR fixes that. ## How was this patch tested? I added a test to `sql/test.py` Author: Maxim Gekk <[email protected]> Closes apache#22002 from MaxGekk/drop-field-if-all-null.
1 parent ac527b5 commit 64ad7b8

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
267267
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
268268
timestampFormat=timestampFormat, multiLine=multiLine,
269269
allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep,
270-
samplingRatio=samplingRatio, encoding=encoding)
270+
samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding)
271271
if isinstance(path, basestring):
272272
path = [path]
273273
if type(path) == list:

python/pyspark/sql/tests.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3351,6 +3351,22 @@ def test_checking_csv_header(self):
33513351
finally:
33523352
shutil.rmtree(path)
33533353

def test_ignore_column_of_all_nulls(self):
    """Verify that `dropFieldIfAllNull=True` makes JSON schema inference
    drop any field whose value is null in every record (column `a` here),
    while keeping fields that have at least one non-null value (`b`, `c`).
    """
    path = tempfile.mkdtemp()
    # mkdtemp creates the directory, but df.write.text requires the
    # target path not to exist yet, so remove it first.
    shutil.rmtree(path)
    try:
        df = self.spark.createDataFrame([["""{"a":null, "b":1, "c":3.0}"""],
                                         ["""{"a":null, "b":null, "c":"string"}"""],
                                         ["""{"a":null, "b":null, "c":null}"""]])
        df.write.text(path)
        # Expected inferred schema: field "a" is dropped because it is
        # null in all rows; "b" widens to long, "c" widens to string.
        schema = StructType([
            StructField('b', LongType(), nullable=True),
            StructField('c', StringType(), nullable=True)])
        readback = self.spark.read.json(path, dropFieldIfAllNull=True)
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(readback.schema, schema)
    finally:
        shutil.rmtree(path)
33543370
def test_repr_behaviors(self):
33553371
import re
33563372
pattern = re.compile(r'^ *\|', re.MULTILINE)

0 commit comments

Comments
 (0)