Commit ed72bad

[SPARK-23699][PYTHON][SQL] Raise same type of error caught with Arrow enabled

## What changes were proposed in this pull request?

When using Arrow for createDataFrame or toPandas and an error is encountered with fallback disabled, this will raise the same type of error instead of a RuntimeError. This change also allows the traceback of the error to be retained and prevents the accidental chaining of exceptions with Python 3.

## How was this patch tested?

Updated existing tests to verify error type.

Author: Bryan Cutler <[email protected]>

Closes apache#20839 from BryanCutler/arrow-raise-same-error-SPARK-23699.
1 parent: c68ec4e

File tree: 4 files changed, +31 −23 lines changed
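
The heart of the change, visible in the diffs below, is replacing `raise RuntimeError(msg)` with `warnings.warn(msg)` followed by a bare `raise`. A minimal sketch of why this matters (the function names here are illustrative, not from the patch): wrapping in a `RuntimeError` discards the original exception type and, on Python 3, implicitly chains the two exceptions, whereas a bare `raise` re-raises the caught exception with its type and traceback intact.

```python
import traceback
import warnings


def arrow_convert(value):
    # Hypothetical stand-in for the Arrow conversion step.
    raise TypeError("Unsupported type: %s" % type(value).__name__)


def old_behavior(value):
    try:
        return arrow_convert(value)
    except Exception as e:
        # Discards the original type; on Python 3 the traceback also
        # chains the TypeError onto this RuntimeError.
        raise RuntimeError("Arrow optimization failed: %s" % e)


def new_behavior(value):
    try:
        return arrow_convert(value)
    except Exception as e:
        # Surface the context as a warning, then re-raise the caught
        # exception as-is, preserving its type and traceback.
        warnings.warn("Arrow optimization failed: %s" % e)
        raise


try:
    new_behavior(object())
except TypeError as e:
    print("caller still sees the original TypeError:", e)

try:
    old_behavior(object())
except RuntimeError:
    traceback.print_exc()  # note the chained traceback on Python 3
```

Running the sketch shows the caller catching the original `TypeError` directly, which is exactly what the updated tests below assert.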

python/pyspark/sql/dataframe.py

Lines changed: 13 additions & 12 deletions
```diff
@@ -2007,19 +2007,20 @@ def toPandas(self):
                     "toPandas attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.enabled' is set to true; however, "
                     "failed by the reason below:\n  %s\n"
-                    "Attempts non-optimization as "
+                    "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.fallback.enabled' is set to "
                     "true." % _exception_message(e))
                 warnings.warn(msg)
                 use_arrow = False
             else:
                 msg = (
                     "toPandas attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed by the reason below:\n  %s\n"
-                    "For fallback to non-optimization automatically, please set true to "
-                    "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and will not continue because automatic fallback "
+                    "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
+                    "false.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise

         # Try to use Arrow optimization when the schema is supported and the required version
         # of PyArrow is found, if 'spark.sql.execution.arrow.enabled' is enabled.
@@ -2042,12 +2043,12 @@ def toPandas(self):
                 # be executed. So, simply fail in this case for now.
                 msg = (
                     "toPandas attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed unexpectedly:\n  %s\n"
-                    "Note that 'spark.sql.execution.arrow.fallback.enabled' does "
-                    "not have an effect in such failure in the middle of "
-                    "computation." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and can not continue. Note that "
+                    "'spark.sql.execution.arrow.fallback.enabled' does not have an effect "
+                    "on failures in the middle of computation.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise

         # Below is toPandas without Arrow optimization.
         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
```

python/pyspark/sql/session.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -674,18 +674,19 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
                     "createDataFrame attempted Arrow optimization because "
                     "'spark.sql.execution.arrow.enabled' is set to true; however, "
                     "failed by the reason below:\n  %s\n"
-                    "Attempts non-optimization as "
+                    "Attempting non-optimization as "
                     "'spark.sql.execution.arrow.fallback.enabled' is set to "
                     "true." % _exception_message(e))
                 warnings.warn(msg)
             else:
                 msg = (
                     "createDataFrame attempted Arrow optimization because "
-                    "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                    "failed by the reason below:\n  %s\n"
-                    "For fallback to non-optimization automatically, please set true to "
-                    "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
-                raise RuntimeError(msg)
+                    "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
+                    "the error below and will not continue because automatic fallback "
+                    "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
+                    "false.\n  %s" % _exception_message(e))
+                warnings.warn(msg)
+                raise
         data = self._convert_from_pandas(data, schema, timezone)

     if isinstance(schema, StructType):
```

python/pyspark/sql/tests.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -3559,7 +3559,7 @@ def test_toPandas_fallback_enabled(self):
                         warn.message for warn in warns if isinstance(warn.message, UserWarning)]
                     self.assertTrue(len(user_warns) > 0)
                     self.assertTrue(
-                        "Attempts non-optimization" in _exception_message(user_warns[-1]))
+                        "Attempting non-optimization" in _exception_message(user_warns[-1]))
                     self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))

     def test_toPandas_fallback_disabled(self):
@@ -3682,7 +3682,7 @@ def test_createDataFrame_with_incorrect_schema(self):
         pdf = self.create_pandas_data_frame()
         wrong_schema = StructType(list(reversed(self.schema)))
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(RuntimeError, ".*No cast.*string.*timestamp.*"):
+            with self.assertRaisesRegexp(Exception, ".*No cast.*string.*timestamp.*"):
                 self.spark.createDataFrame(pdf, schema=wrong_schema)

     def test_createDataFrame_with_names(self):
@@ -3707,7 +3707,7 @@ def test_createDataFrame_column_name_encoding(self):
     def test_createDataFrame_with_single_data_type(self):
         import pandas as pd
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(RuntimeError, ".*IntegerType.*not supported.*"):
+            with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"):
                 self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int")

     def test_createDataFrame_does_not_modify_input(self):
@@ -3775,14 +3775,14 @@ def test_createDataFrame_fallback_enabled(self):
                         warn.message for warn in warns if isinstance(warn.message, UserWarning)]
                     self.assertTrue(len(user_warns) > 0)
                     self.assertTrue(
-                        "Attempts non-optimization" in _exception_message(user_warns[-1]))
+                        "Attempting non-optimization" in _exception_message(user_warns[-1]))
                     self.assertEqual(df.collect(), [Row(a={u'a': 1})])

     def test_createDataFrame_fallback_disabled(self):
         import pandas as pd

         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, 'Unsupported type'):
+            with self.assertRaisesRegexp(TypeError, 'Unsupported type'):
                 self.spark.createDataFrame(
                     pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")

```
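
With the bare `raise` in place, the tests can pin down the concrete exception type (`ValueError`, `TypeError`) instead of a blanket `RuntimeError`. A self-contained sketch of the same assertion style, assuming the warn-then-raise pattern above (the test class and inner helper are hypothetical, not from the patch):

```python
import unittest
import warnings


class RaiseSameErrorTest(unittest.TestCase):

    def test_original_type_is_visible_to_caller(self):
        def fallback_disabled():
            try:
                raise TypeError("Unsupported type in conversion")
            except Exception as e:
                warnings.warn("Arrow optimization failed: %s" % e)
                raise  # re-raise the TypeError unchanged

        # Before the patch this assertion would have needed RuntimeError.
        with self.assertRaisesRegexp(TypeError, "Unsupported type"):
            fallback_disabled()


if __name__ == "__main__":
    unittest.main()
```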

python/pyspark/sql/utils.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -121,7 +121,10 @@ def require_minimum_pandas_version():
     from distutils.version import LooseVersion
     try:
         import pandas
+        have_pandas = True
     except ImportError:
+        have_pandas = False
+    if not have_pandas:
         raise ImportError("Pandas >= %s must be installed; however, "
                           "it was not found." % minimum_pandas_version)
     if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
@@ -138,7 +141,10 @@ def require_minimum_pyarrow_version():
     from distutils.version import LooseVersion
     try:
         import pyarrow
+        have_arrow = True
     except ImportError:
+        have_arrow = False
+    if not have_arrow:
         raise ImportError("PyArrow >= %s must be installed; however, "
                           "it was not found." % minimum_pyarrow_version)
     if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):
```
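
The `have_pandas`/`have_arrow` flags serve the same anti-chaining goal: raising a fresh `ImportError` inside the `except ImportError:` handler would, on Python 3, chain it to the original failure ("During handling of the above exception, another exception occurred"), while setting a flag and raising after the handler yields a single, clean `ImportError`. A minimal sketch, assuming a hypothetical uninstalled module `missing_dep`:

```python
import traceback


def chained_version():
    try:
        import missing_dep  # hypothetical; assumed not installed
    except ImportError:
        # On Python 3 the traceback prints the original ImportError,
        # then "During handling of the above exception, another
        # exception occurred", then this one.
        raise ImportError("missing_dep must be installed")


def unchained_version():
    try:
        import missing_dep  # noqa: F401
        have_dep = True
    except ImportError:
        have_dep = False
    if not have_dep:
        # Raised outside the handler: the traceback shows one error only.
        raise ImportError("missing_dep must be installed")


try:
    unchained_version()
except ImportError:
    traceback.print_exc()  # single, unchained ImportError
```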
