Commit ed72bad

[SPARK-23699][PYTHON][SQL] Raise same type of error caught with Arrow enabled

## What changes were proposed in this pull request?

When using Arrow for createDataFrame or toPandas and an error is encountered with fallback disabled, this will raise the same type of error instead of a RuntimeError. This change also allows the traceback of the error to be retained and prevents the accidental chaining of exceptions with Python 3.

## How was this patch tested?

Updated existing tests to verify error type.

Author: Bryan Cutler <[email protected]>

Closes apache#20839 from BryanCutler/arrow-raise-same-error-SPARK-23699.
1 parent: c68ec4e

File tree: 4 files changed, +31 −23 lines changed
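
The heart of the change, visible in the diffs below, is replacing `raise RuntimeError(msg)` with `warnings.warn(msg)` followed by a bare `raise`. A minimal sketch of why this matters (the function names here are illustrative, not from the patch): wrapping in a `RuntimeError` discards the original exception type and, on Python 3, implicitly chains the two exceptions, whereas a bare `raise` re-raises the caught exception with its type and traceback intact.

```python
import traceback
import warnings


def arrow_convert(value):
    # Hypothetical stand-in for the Arrow conversion step.
    raise TypeError("Unsupported type: %s" % type(value).__name__)


def old_behavior(value):
    try:
        return arrow_convert(value)
    except Exception as e:
        # Discards the original type; on Python 3 the traceback also
        # chains the TypeError onto this RuntimeError.
        raise RuntimeError("Arrow optimization failed: %s" % e)


def new_behavior(value):
    try:
        return arrow_convert(value)
    except Exception as e:
        # Surface the context as a warning, then re-raise the caught
        # exception as-is, preserving its type and traceback.
        warnings.warn("Arrow optimization failed: %s" % e)
        raise


try:
    new_behavior(object())
except TypeError as e:
    print("caller still sees the original TypeError:", e)

try:
    old_behavior(object())
except RuntimeError:
    traceback.print_exc()  # note the chained traceback on Python 3
```

Running the sketch shows the caller catching the original `TypeError` directly, which is exactly what the updated tests below assert.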

python/pyspark/sql/dataframe.py

Lines changed: 13 additions & 12 deletions
```diff
@@ -2007,19 +2007,20 @@ def toPandas(self):
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.enabled' is set to true; however, "
                     "failed by the reason below:\n  %s\n"
-                    "Attempts non-optimization as "
+                    "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.fallback.enabled' is set to "
                     "true." % _exception_message(e))
                 warnings.warn(msg)
                 use_arrow = False
             else:
                 msg = (
                     "toPandas attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed by the reason below:\n  %s\n"
-                    "For fallback to non-optimization automatically, please set true to "
-                    "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and will not continue because automatic fallback "
+                    "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
+                    "false.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise

         # Try to use Arrow optimization when the schema is supported and the required version
         # of PyArrow is found, if 'spark.sql.execution.arrow.enabled' is enabled.
@@ -2042,12 +2043,12 @@ def toPandas(self):
                 # be executed. So, simply fail in this case for now.
                 msg = (
                     "toPandas attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed unexpectedly:\n  %s\n"
-                    "Note that 'spark.sql.execution.arrow.fallback.enabled' does "
-                    "not have an effect in such failure in the middle of "
-                    "computation." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and can not continue. Note that "
+                    "'spark.sql.execution.arrow.fallback.enabled' does not have an effect "
+                    "on failures in the middle of computation.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise

         # Below is toPandas without Arrow optimization.
         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
```

python/pyspark/sql/session.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -674,18 +674,19 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
                     "createDataFrame attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.enabled' is set to true; however, "
                     "failed by the reason below:\n  %s\n"
-                    "Attempts non-optimization as "
+                    "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.fallback.enabled' is set to "
                     "true." % _exception_message(e))
                 warnings.warn(msg)
             else:
                 msg = (
                     "createDataFrame attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed by the reason below:\n  %s\n"
-                    "For fallback to non-optimization automatically, please set true to "
-                    "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and will not continue because automatic fallback "
+                    "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
+                    "false.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise
         data = self._convert_from_pandas(data, schema, timezone)

     if isinstance(schema, StructType):
```

python/pyspark/sql/tests.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -3559,7 +3559,7 @@ def test_toPandas_fallback_enabled(self):
                         warn.message for warn in warns if isinstance(warn.message, UserWarning)]
                     self.assertTrue(len(user_warns) > 0)
                     self.assertTrue(
-                        "Attempts non-optimization" in _exception_message(user_warns[-1]))
+                        "Attempting non-optimization" in _exception_message(user_warns[-1]))
                     self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))

     def test_toPandas_fallback_disabled(self):
@@ -3682,7 +3682,7 @@ def test_createDataFrame_with_incorrect_schema(self):
         pdf = self.create_pandas_data_frame()
         wrong_schema = StructType(list(reversed(self.schema)))
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(RuntimeError, ".*No cast.*string.*timestamp.*"):
+            with self.assertRaisesRegexp(Exception, ".*No cast.*string.*timestamp.*"):
                 self.spark.createDataFrame(pdf, schema=wrong_schema)

     def test_createDataFrame_with_names(self):
@@ -3707,7 +3707,7 @@ def test_createDataFrame_column_name_encoding(self):
     def test_createDataFrame_with_single_data_type(self):
         import pandas as pd
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(RuntimeError, ".*IntegerType.*not supported.*"):
+            with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"):
                 self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int")

     def test_createDataFrame_does_not_modify_input(self):
@@ -3775,14 +3775,14 @@ def test_createDataFrame_fallback_enabled(self):
                         warn.message for warn in warns if isinstance(warn.message, UserWarning)]
                     self.assertTrue(len(user_warns) > 0)
                     self.assertTrue(
-                        "Attempts non-optimization" in _exception_message(user_warns[-1]))
+                        "Attempting non-optimization" in _exception_message(user_warns[-1]))
                     self.assertEqual(df.collect(), [Row(a={u'a': 1})])

     def test_createDataFrame_fallback_disabled(self):
         import pandas as pd

         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, 'Unsupported type'):
+            with self.assertRaisesRegexp(TypeError, 'Unsupported type'):
                 self.spark.createDataFrame(
                     pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")

```
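
With the bare `raise` in place, the tests can pin down the concrete exception type (`ValueError`, `TypeError`) instead of a blanket `RuntimeError`. A self-contained sketch of the same assertion style, assuming the warn-then-raise pattern above (the test class and inner helper are hypothetical, not from the patch):

```python
import unittest
import warnings


class RaiseSameErrorTest(unittest.TestCase):

    def test_original_type_is_visible_to_caller(self):
        def fallback_disabled():
            try:
                raise TypeError("Unsupported type in conversion")
            except Exception as e:
                warnings.warn("Arrow optimization failed: %s" % e)
                raise  # re-raise the TypeError unchanged

        # Before the patch this assertion would have needed RuntimeError.
        with self.assertRaisesRegexp(TypeError, "Unsupported type"):
            fallback_disabled()


if __name__ == "__main__":
    unittest.main()
```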

python/pyspark/sql/utils.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -121,7 +121,10 @@ def require_minimum_pandas_version():
     from distutils.version import LooseVersion
     try:
         import pandas
+        have_pandas = True
     except ImportError:
+        have_pandas = False
+    if not have_pandas:
         raise ImportError("Pandas >= %s must be installed; however, "
                           "it was not found." % minimum_pandas_version)
     if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
@@ -138,7 +141,10 @@ def require_minimum_pyarrow_version():
     from distutils.version import LooseVersion
     try:
         import pyarrow
+        have_arrow = True
     except ImportError:
+        have_arrow = False
+    if not have_arrow:
         raise ImportError("PyArrow >= %s must be installed; however, "
                           "it was not found." % minimum_pyarrow_version)
     if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):
```
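
The `have_pandas`/`have_arrow` flags serve the same anti-chaining goal: raising a fresh `ImportError` inside the `except ImportError:` handler would, on Python 3, chain it to the original failure ("During handling of the above exception, another exception occurred"), while setting a flag and raising after the handler yields a single, clean `ImportError`. A minimal sketch, assuming a hypothetical uninstalled module `missing_dep`:

```python
import traceback


def chained_version():
    try:
        import missing_dep  # hypothetical; assumed not installed
    except ImportError:
        # On Python 3 the traceback prints the original ImportError,
        # then "During handling of the above exception, another
        # exception occurred", then this one.
        raise ImportError("missing_dep must be installed")


def unchained_version():
    try:
        import missing_dep  # noqa: F401
        have_dep = True
    except ImportError:
        have_dep = False
    if not have_dep:
        # Raised outside the handler: the traceback shows one error only.
        raise ImportError("missing_dep must be installed")


try:
    unchained_version()
except ImportError:
    traceback.print_exc()  # single, unchained ImportError
```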
