[SPARK-30113][SQL][PYTHON] Expose mergeSchema option in PySpark's ORC APIs

nchammas · HyukjinKwon · commit c8922d9145a9 · 2019-12-04T11:44:24.000+09:00
### What changes were proposed in this pull request?

This PR is a follow-up to apache#24043 and a cousin of apache#26730. It exposes the `mergeSchema` option directly in the ORC APIs.

### Why are the changes needed?

So the Python API matches the Scala API.

### Does this PR introduce any user-facing change?

Yes, it adds a new option directly in the ORC reader method signatures.

### How was this patch tested?

I tested this manually as follows:

```
>>> spark.range(3).write.orc('test-orc')
>>> spark.range(3).withColumnRenamed('id', 'name').write.orc('test-orc/nested')
>>> spark.read.orc('test-orc', recursiveFileLookup=True, mergeSchema=True)
DataFrame[id: bigint, name: bigint]
>>> spark.read.orc('test-orc', recursiveFileLookup=True, mergeSchema=False)
DataFrame[id: bigint]
>>> spark.conf.set('spark.sql.orc.mergeSchema', True)
>>> spark.read.orc('test-orc', recursiveFileLookup=True)
DataFrame[id: bigint, name: bigint]
>>> spark.read.orc('test-orc', recursiveFileLookup=True, mergeSchema=False)
DataFrame[id: bigint]
```

Closes apache#26755 from nchammas/SPARK-30113-ORC-mergeSchema.

Authored-by: Nicholas Chammas <nicholas.chammas@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
@@ -520,17 +520,20 @@ def func(iterator):
             raise TypeError("path can be only string, list or RDD")
 
     @since(1.5)
-    def orc(self, path, recursiveFileLookup=None):
+    def orc(self, path, mergeSchema=None, recursiveFileLookup=None):
         """Loads ORC files, returning the result as a :class:`DataFrame`.
 
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
+            The default value is specified in ``spark.sql.orc.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
-                                    disables `partition discovery`_.
+            disables `partition discovery`_.
 
         >>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
         >>> df.dtypes
         [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
         """
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         if isinstance(path, basestring):
             path = [path]
         return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
@@ -514,21 +514,24 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             raise TypeError("path can be only a single string")
 
     @since(2.3)
-    def orc(self, path, recursiveFileLookup=None):
+    def orc(self, path, mergeSchema=None, recursiveFileLookup=None):
         """Loads a ORC file stream, returning the result as a :class:`DataFrame`.
 
         .. note:: Evolving.
 
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
+            The default value is specified in ``spark.sql.orc.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
-                                    disables `partition discovery`_.
+            disables `partition discovery`_.
 
         >>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp())
         >>> orc_sdf.isStreaming
         True
         >>> orc_sdf.schema == sdf_schema
         True
         """
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         if isinstance(path, basestring):
             return self._df(self._jreader.orc(path))
         else: