Skip to content

Commit fe9ffd5

Browse files
zhengruifeng authored and HyukjinKwon committed
[SPARK-54963][PYTHON][CONNECT] Make createDataFrame respect prefer_timestamp_ntz when infer_pandas_dict_as_map
### What changes were proposed in this pull request?

Make `createDataFrame` respect `prefer_timestamp_ntz` when `infer_pandas_dict_as_map` is enabled.

### Why are the changes needed?

In `createDataFrame`, `prefer_timestamp_ntz` is used in some places but not all places. It should always respect `prefer_timestamp_ntz`.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #53731 from zhengruifeng/prefer_tz.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 38d3af2 commit fe9ffd5

File tree

2 files changed

+9
-10
lines changed

2 files changed

+9
-10
lines changed

python/pyspark/sql/connect/session.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -547,10 +547,8 @@ def createDataFrame(
547547
"spark.sql.execution.arrow.useLargeVarTypes",
548548
)
549549
timezone = configs["spark.sql.session.timeZone"]
550-
prefer_timestamp = configs["spark.sql.timestampType"]
551-
prefers_large_types: bool = (
552-
cast(str, configs["spark.sql.execution.arrow.useLargeVarTypes"]).lower() == "true"
553-
)
550+
prefer_timestamp_ntz = configs["spark.sql.timestampType"] == "TIMESTAMP_NTZ"
551+
prefers_large_types = configs["spark.sql.execution.arrow.useLargeVarTypes"] == "true"
554552

555553
_table: Optional[pa.Table] = None
556554

@@ -582,9 +580,12 @@ def createDataFrame(
582580
messageParameters={},
583581
)
584582
arrow_type = field_type.field(0).type
585-
spark_type = MapType(StringType(), from_arrow_type(arrow_type))
583+
spark_type = MapType(
584+
StringType(),
585+
from_arrow_type(arrow_type, prefer_timestamp_ntz),
586+
)
586587
else:
587-
spark_type = from_arrow_type(field_type)
588+
spark_type = from_arrow_type(field_type, prefer_timestamp_ntz)
588589
struct.add(field.name, spark_type, nullable=field.nullable)
589590
schema = struct
590591
elif isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns):
@@ -659,9 +660,7 @@ def createDataFrame(
659660
_num_cols = len(_cols)
660661

661662
if not isinstance(schema, StructType):
662-
schema = from_arrow_schema(
663-
data.schema, prefer_timestamp_ntz=prefer_timestamp == "TIMESTAMP_NTZ"
664-
)
663+
schema = from_arrow_schema(data.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
665664

666665
_table = (
667666
_check_arrow_table_timestamps_localize(data, schema, True, timezone)

python/pyspark/sql/pandas/conversion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ def _create_from_pandas_with_arrow(
849849
StringType(), from_arrow_type(arrow_type, prefer_timestamp_ntz)
850850
)
851851
else:
852-
spark_type = from_arrow_type(field_type)
852+
spark_type = from_arrow_type(field_type, prefer_timestamp_ntz)
853853
struct.add(name, spark_type, nullable=field.nullable)
854854
else:
855855
for name, field in zip(schema, arrow_schema):

0 commit comments

Comments
 (0)