Skip to content

Commit 33769b8

Browse files
zhengruifeng authored and HyukjinKwon committed
Revert "[SPARK-54182][SQL][PYTHON] Optimize non-arrow conversion of `df.toPandas`"
Reverts #52897 due to a performance regression when the number of columns is small. Closes #53661 from zhengruifeng/revert_54182. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent a911310 commit 33769b8

File tree

1 file changed

+12
-17
lines changed

1 file changed

+12
-17
lines changed

python/pyspark/sql/pandas/conversion.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from typing import (
1919
Any,
2020
Callable,
21-
Iterator,
2221
List,
2322
Optional,
2423
Sequence,
@@ -292,20 +291,18 @@ def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike":
292291

293292
# Below is toPandas without Arrow optimization.
294293
rows = self.collect()
294+
if len(rows) > 0:
295+
pdf = pd.DataFrame.from_records(
296+
rows, index=range(len(rows)), columns=self.columns # type: ignore[arg-type]
297+
)
298+
else:
299+
pdf = pd.DataFrame(columns=self.columns)
295300

296-
if len(self.columns) > 0:
301+
if len(pdf.columns) > 0:
297302
timezone = sessionLocalTimeZone
298303
struct_in_pandas = pandasStructHandlingMode
299304

300-
# Extract columns from rows and apply converters
301-
if len(rows) > 0:
302-
# Use iterator to avoid materializing intermediate data structure
303-
columns_data: Iterator[Any] = iter(zip(*rows))
304-
else:
305-
columns_data = iter([] for _ in self.schema.fields)
306-
307-
# Build DataFrame from columns
308-
pdf = pd.concat(
305+
return pd.concat(
309306
[
310307
_create_converter_to_pandas(
311308
field.dataType,
@@ -316,15 +313,13 @@ def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike":
316313
),
317314
error_on_duplicated_field_names=False,
318315
timestamp_utc_localized=False,
319-
)(pd.Series(col_data, dtype=object))
320-
for col_data, field in zip(columns_data, self.schema.fields)
316+
)(pser)
317+
for (_, pser), field in zip(pdf.items(), self.schema.fields)
321318
],
322-
axis=1,
323-
keys=self.columns,
319+
axis="columns",
324320
)
325-
return pdf
326321
else:
327-
return pd.DataFrame(columns=[], index=range(len(rows)))
322+
return pdf
328323

329324
def toArrow(self) -> "pa.Table":
330325
from pyspark.sql.dataframe import DataFrame

0 commit comments

Comments
 (0)