
Commit 3e6a714

viirya authored and HyukjinKwon committed
[SPARK-21766][PYSPARK][SQL] DataFrame toPandas() raises ValueError with nullable int columns
## What changes were proposed in this pull request?

When calling `DataFrame.toPandas()` (without Arrow enabled), if there is an `IntegralType` column (`IntegerType`, `ShortType`, `ByteType`) that has null values, the following exception is thrown:

    ValueError: Cannot convert non-finite values (NA or inf) to integer

This is because the null values are first converted to float NaN during the construction of the Pandas DataFrame in `from_records`, and the subsequent attempt to convert them back to an integer type fails. The fix checks whether such a failing conversion would occur for a column; if so, we skip the conversion and keep the type inferred by Pandas.

Closes apache#18945

## How was this patch tested?

Added pyspark test.

Author: Liang-Chi Hsieh <[email protected]>

Closes apache#19319 from viirya/SPARK-21766.
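For context, the underlying pandas behavior is easy to reproduce. This is a minimal sketch, not part of the patch; the column name `a` and the `np.int32` target dtype are illustrative:

```python
import numpy as np
import pandas as pd

# A None in an integer column becomes NaN, so pandas infers float64,
# just as happens inside toPandas() via from_records.
pdf = pd.DataFrame.from_records([(1,), (None,)], columns=["a"])
print(pdf["a"].dtype)  # float64

# Casting that column back to an integer dtype is the step that used to fail:
try:
    pdf["a"].astype(np.int32, copy=False)
except ValueError as e:
    print(e)  # Cannot convert non-finite values (NA or inf) to integer
```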
1 parent: d2b2932 · commit: 3e6a714

2 files changed: +22, -3 lines

python/pyspark/sql/dataframe.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -37,6 +37,7 @@
 from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column
 from pyspark.sql.readwriter import DataFrameWriter
 from pyspark.sql.streaming import DataStreamWriter
+from pyspark.sql.types import IntegralType
 from pyspark.sql.types import *
 
 __all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"]
@@ -1891,14 +1892,20 @@ def toPandas(self):
                     "if using spark.sql.execution.arrow.enable=true"
                 raise ImportError("%s\n%s" % (e.message, msg))
         else:
+            pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
+
             dtype = {}
             for field in self.schema:
                 pandas_type = _to_corrected_pandas_type(field.dataType)
-                if pandas_type is not None:
+                # SPARK-21766: if an integer field is nullable and has null values, it can be
+                # inferred by pandas as float column. Once we convert the column with NaN back
+                # to integer type e.g., np.int16, we will hit exception. So we use the inferred
+                # float type, not the corrected type from the schema in this case.
+                if pandas_type is not None and \
+                    not(isinstance(field.dataType, IntegralType) and field.nullable and
+                        pdf[field.name].isnull().any()):
                     dtype[field.name] = pandas_type
 
-            pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
-
             for f, t in dtype.items():
                 pdf[f] = pdf[f].astype(t, copy=False)
             return pdf
```
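A hedged usage sketch of the fixed behavior (assuming an existing `SparkSession` named `spark`, Arrow disabled, and an illustrative column name): a nullable integer column containing nulls now keeps the pandas-inferred `float64` dtype instead of raising.

```python
from pyspark.sql.types import StructType, IntegerType

schema = StructType().add("a", IntegerType())
df = spark.createDataFrame([(1,), (None,)], schema)

# The np.int32 correction is skipped because the column is a nullable
# IntegralType with null values, so no ValueError is raised.
pdf = df.toPandas()
print(pdf["a"].dtype)  # float64
```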

python/pyspark/sql/tests.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -2564,6 +2564,18 @@ def test_to_pandas(self):
         self.assertEquals(types[2], np.bool)
         self.assertEquals(types[3], np.float32)
 
+    @unittest.skipIf(not _have_pandas, "Pandas not installed")
+    def test_to_pandas_avoid_astype(self):
+        import numpy as np
+        schema = StructType().add("a", IntegerType()).add("b", StringType())\
+            .add("c", IntegerType())
+        data = [(1, "foo", 16777220), (None, "bar", None)]
+        df = self.spark.createDataFrame(data, schema)
+        types = df.toPandas().dtypes
+        self.assertEquals(types[0], np.float64)  # doesn't convert to np.int32 due to NaN value.
+        self.assertEquals(types[1], np.object)
+        self.assertEquals(types[2], np.float64)
+
     def test_create_dataframe_from_array_of_long(self):
         import array
         data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))]
```
