Skip to content

Commit fe9ffd5

Browse files
zhengruifeng authored and HyukjinKwon committed
[SPARK-54963][PYTHON][CONNECT] Make createDataFrame respect prefer_timestamp_ntz when infer_pandas_dict_as_map
### What changes were proposed in this pull request?

Make `createDataFrame` respect `prefer_timestamp_ntz` when `infer_pandas_dict_as_map` is enabled.

### Why are the changes needed?

In `createDataFrame`, `prefer_timestamp_ntz` is used in some places but not all places. It should always respect `prefer_timestamp_ntz`.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #53731 from zhengruifeng/prefer_tz.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent 38d3af2 commit fe9ffd5

File tree

2 files changed

+9
-10
lines changed

2 files changed

+9
-10
lines changed

python/pyspark/sql/connect/session.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -547,10 +547,8 @@ def createDataFrame(
547547
"spark.sql.execution.arrow.useLargeVarTypes",
548548
)
549549
timezone = configs["spark.sql.session.timeZone"]
550-
prefer_timestamp = configs["spark.sql.timestampType"]
551-
prefers_large_types: bool = (
552-
cast(str, configs["spark.sql.execution.arrow.useLargeVarTypes"]).lower() == "true"
553-
)
550+
prefer_timestamp_ntz = configs["spark.sql.timestampType"] == "TIMESTAMP_NTZ"
551+
prefers_large_types = configs["spark.sql.execution.arrow.useLargeVarTypes"] == "true"
554552

555553
_table: Optional[pa.Table] = None
556554

@@ -582,9 +580,12 @@ def createDataFrame(
582580
messageParameters={},
583581
)
584582
arrow_type = field_type.field(0).type
585-
spark_type = MapType(StringType(), from_arrow_type(arrow_type))
583+
spark_type = MapType(
584+
StringType(),
585+
from_arrow_type(arrow_type, prefer_timestamp_ntz),
586+
)
586587
else:
587-
spark_type = from_arrow_type(field_type)
588+
spark_type = from_arrow_type(field_type, prefer_timestamp_ntz)
588589
struct.add(field.name, spark_type, nullable=field.nullable)
589590
schema = struct
590591
elif isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns):
@@ -659,9 +660,7 @@ def createDataFrame(
659660
_num_cols = len(_cols)
660661

661662
if not isinstance(schema, StructType):
662-
schema = from_arrow_schema(
663-
data.schema, prefer_timestamp_ntz=prefer_timestamp == "TIMESTAMP_NTZ"
664-
)
663+
schema = from_arrow_schema(data.schema, prefer_timestamp_ntz=prefer_timestamp_ntz)
665664

666665
_table = (
667666
_check_arrow_table_timestamps_localize(data, schema, True, timezone)

python/pyspark/sql/pandas/conversion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ def _create_from_pandas_with_arrow(
849849
StringType(), from_arrow_type(arrow_type, prefer_timestamp_ntz)
850850
)
851851
else:
852-
spark_type = from_arrow_type(field_type)
852+
spark_type = from_arrow_type(field_type, prefer_timestamp_ntz)
853853
struct.add(name, spark_type, nullable=field.nullable)
854854
else:
855855
for name, field in zip(schema, arrow_schema):

0 commit comments

Comments
 (0)