Skip to content

Commit 64ad7b8

Browse files
MaxGekk authored and HyukjinKwon committed
[SPARK-23772][FOLLOW-UP][SQL] Provide an option to ignore column of all null values or empty array during JSON schema inference
## What changes were proposed in this pull request? The `dropFieldIfAllNull` parameter of the `json` method wasn't set as an option. This PR fixes that. ## How was this patch tested? I added a test to `sql/test.py` Author: Maxim Gekk <[email protected]> Closes apache#22002 from MaxGekk/drop-field-if-all-null.
1 parent ac527b5 commit 64ad7b8

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
267267
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
268268
timestampFormat=timestampFormat, multiLine=multiLine,
269269
allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep,
270-
samplingRatio=samplingRatio, encoding=encoding)
270+
samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding)
271271
if isinstance(path, basestring):
272272
path = [path]
273273
if type(path) == list:

python/pyspark/sql/tests.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3351,6 +3351,22 @@ def test_checking_csv_header(self):
33513351
finally:
33523352
shutil.rmtree(path)
33533353

def test_ignore_column_of_all_nulls(self):
    """Verify that `dropFieldIfAllNull=True` makes JSON schema inference
    drop any field whose value is null in every record (column `a` here),
    while keeping fields that have at least one non-null value (`b`, `c`).
    """
    path = tempfile.mkdtemp()
    # mkdtemp creates the directory, but df.write.text requires the
    # target path not to exist yet, so remove it first.
    shutil.rmtree(path)
    try:
        df = self.spark.createDataFrame([["""{"a":null, "b":1, "c":3.0}"""],
                                         ["""{"a":null, "b":null, "c":"string"}"""],
                                         ["""{"a":null, "b":null, "c":null}"""]])
        df.write.text(path)
        # Expected inferred schema: field "a" is dropped because it is
        # null in all rows; "b" widens to long, "c" widens to string.
        schema = StructType([
            StructField('b', LongType(), nullable=True),
            StructField('c', StringType(), nullable=True)])
        readback = self.spark.read.json(path, dropFieldIfAllNull=True)
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(readback.schema, schema)
    finally:
        shutil.rmtree(path)
33543370
def test_repr_behaviors(self):
33553371
import re
33563372
pattern = re.compile(r'^ *\|', re.MULTILINE)

0 commit comments

Comments
 (0)