
Commit 8d7e3d4

Revert "[SPARK-50298][PYTHON][CONNECT] Implement verifySchema parameter of createDataFrame in Spark Connect"
This reverts commit e1477a3.
1 parent f5bb11c commit 8d7e3d4

File tree

4 files changed: +31 -54 lines changed

python/pyspark/sql/connect/conversion.py
python/pyspark/sql/connect/session.py
python/pyspark/sql/tests/connect/test_parity_arrow.py
python/pyspark/sql/tests/test_arrow.py

python/pyspark/sql/connect/conversion.py

Lines changed: 3 additions & 3 deletions
@@ -322,7 +322,7 @@ def convert_other(value: Any) -> Any:
             return lambda value: value

     @staticmethod
-    def convert(data: Sequence[Any], schema: StructType, verifySchema: bool = False) -> "pa.Table":
+    def convert(data: Sequence[Any], schema: StructType) -> "pa.Table":
         assert isinstance(data, list) and len(data) > 0

         assert schema is not None and isinstance(schema, StructType)
@@ -372,8 +372,8 @@ def convert(data: Sequence[Any], schema: StructType, verifySchema: bool = False)
                 ]
             )
         )
-        table = pa.Table.from_arrays(pylist, schema=pa_schema)
-        return table.cast(pa_schema, safe=verifySchema)
+
+        return pa.Table.from_arrays(pylist, schema=pa_schema)


 class ArrowTableToRowsConversion:
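
Note: the reverted line pair relied on PyArrow's safe-cast flag, which is what verifySchema toggled here. Table.cast(schema, safe=True) raises on lossy conversions such as integer overflow, while safe=False lets them through. A minimal standalone sketch of that distinction (plain PyArrow, independent of this codebase):

    import pyarrow as pa

    # An int64 column holding a value that does not fit in 32 bits.
    table = pa.table({"value": [100000000000]})
    target = pa.schema([("value", pa.int32())])

    table.cast(target, safe=False)  # succeeds; the overflowing value wraps silently
    table.cast(target, safe=True)   # raises pyarrow.lib.ArrowInvalid on the overflow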

python/pyspark/sql/connect/session.py

Lines changed: 7 additions & 23 deletions
@@ -50,7 +50,6 @@
 )
 import urllib

-from pyspark._globals import _NoValue, _NoValueType
 from pyspark.sql.connect.dataframe import DataFrame
 from pyspark.sql.dataframe import DataFrame as ParentDataFrame
 from pyspark.sql.connect.logging import logger
@@ -450,7 +449,7 @@ def createDataFrame(
         data: Union["pd.DataFrame", "np.ndarray", "pa.Table", Iterable[Any]],
         schema: Optional[Union[AtomicType, StructType, str, List[str], Tuple[str, ...]]] = None,
         samplingRatio: Optional[float] = None,
-        verifySchema: Union[_NoValueType, bool] = _NoValue,
+        verifySchema: Optional[bool] = None,
     ) -> "ParentDataFrame":
         assert data is not None
         if isinstance(data, DataFrame):
@@ -462,6 +461,9 @@ def createDataFrame(
         if samplingRatio is not None:
             warnings.warn("'samplingRatio' is ignored. It is not supported with Spark Connect.")

+        if verifySchema is not None:
+            warnings.warn("'verifySchema' is ignored. It is not supported with Spark Connect.")
+
         _schema: Optional[Union[AtomicType, StructType]] = None
         _cols: Optional[List[str]] = None
         _num_cols: Optional[int] = None
@@ -574,10 +576,7 @@ def createDataFrame(
             "spark.sql.session.timeZone", "spark.sql.execution.pandas.convertToArrowArraySafely"
         )

-        if verifySchema is _NoValue:
-            verifySchema = safecheck == "true"
-
-        ser = ArrowStreamPandasSerializer(cast(str, timezone), verifySchema)
+        ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true")

         _table = pa.Table.from_batches(
             [
@@ -597,9 +596,6 @@ def createDataFrame(
             ).cast(arrow_schema)

         elif isinstance(data, pa.Table):
-            if verifySchema is _NoValue:
-                verifySchema = False
-
             prefer_timestamp_ntz = is_timestamp_ntz_preferred()

             (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
@@ -617,10 +613,7 @@ def createDataFrame(

             _table = (
                 _check_arrow_table_timestamps_localize(data, schema, True, timezone)
-                .cast(
-                    to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True),
-                    safe=verifySchema,
-                )
+                .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
                 .rename_columns(schema.names)
             )

@@ -659,12 +652,6 @@ def createDataFrame(
             # The _table should already have the proper column names.
             _cols = None

-            if verifySchema is not _NoValue:
-                warnings.warn(
-                    "'verifySchema' is ignored. It is not supported"
-                    " with np.ndarray input on Spark Connect."
-                )
-
         else:
             _data = list(data)

@@ -696,15 +683,12 @@ def createDataFrame(
                     errorClass="CANNOT_DETERMINE_TYPE", messageParameters={}
                 )

-            if verifySchema is _NoValue:
-                verifySchema = True
-
             from pyspark.sql.connect.conversion import LocalDataToArrowConversion

             # Spark Connect will try its best to build the Arrow table with the
             # inferred schema in the client side, and then rename the columns and
             # cast the datatypes in the server side.
-            _table = LocalDataToArrowConversion.convert(_data, _schema, cast(bool, verifySchema))
+            _table = LocalDataToArrowConversion.convert(_data, _schema)

             # TODO: Beside the validation on number of columns, we should also check
             # whether the Arrow Schema is compatible with the user provided Schema.
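
Note: after this revert, verifySchema survives in the Spark Connect createDataFrame signature only for API compatibility; any non-None value just triggers the warning added above, and no client-side safe cast is performed. A usage sketch, assuming spark is a Spark Connect SparkSession:

    import warnings

    import pyarrow as pa

    table = pa.table({"id": [1, 2, 3]})
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df = spark.createDataFrame(table, schema="id long", verifySchema=True)
    # caught now holds: "'verifySchema' is ignored. It is not supported with Spark Connect."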

python/pyspark/sql/tests/connect/test_parity_arrow.py

Lines changed: 2 additions & 1 deletion
@@ -137,8 +137,9 @@ def test_toPandas_udt(self):
     def test_create_dataframe_namedtuples(self):
         self.check_create_dataframe_namedtuples(True)

+    @unittest.skip("Spark Connect does not support verifySchema.")
     def test_createDataFrame_verifySchema(self):
-        self.check_createDataFrame_verifySchema(True)
+        super().test_createDataFrame_verifySchema()


 if __name__ == "__main__":
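
Note: the override is kept and skipped rather than deleted, so the parity suite still documents the gap while delegating the body to the parent test. The same pattern in isolation (hypothetical class names, for illustration only):

    import unittest

    class BaseSuite(unittest.TestCase):
        def test_feature(self):
            self.assertTrue(True)

    class ParitySuite(BaseSuite):
        @unittest.skip("Feature not supported in this environment.")
        def test_feature(self):
            super().test_feature()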

python/pyspark/sql/tests/test_arrow.py

Lines changed: 19 additions & 27 deletions
@@ -533,11 +533,6 @@ def test_createDataFrame_arrow_pandas(self):
         self.assertEqual(df_arrow.collect(), df_pandas.collect())

     def test_createDataFrame_verifySchema(self):
-        for arrow_enabled in [True, False]:
-            with self.subTest(arrow_enabled=arrow_enabled):
-                self.check_createDataFrame_verifySchema(arrow_enabled)
-
-    def check_createDataFrame_verifySchema(self, arrow_enabled):
         data = {"id": [1, 2, 3], "value": [100000000000, 200000000000, 300000000000]}
         # data.value should fail schema validation when verifySchema is True
         schema = StructType(
@@ -552,32 +547,29 @@ def check_createDataFrame_verifySchema(self, arrow_enabled):
         table = pa.table(data)
         df = self.spark.createDataFrame(table, schema=schema)
         self.assertEqual(df.collect(), expected)
+
         with self.assertRaises(Exception):
             self.spark.createDataFrame(table, schema=schema, verifySchema=True)

-        if arrow_enabled:
-            with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": True}):
-                # pandas DataFrame with Arrow optimization
-                pdf = pd.DataFrame(data)
+        # pandas DataFrame with Arrow optimization
+        pdf = pd.DataFrame(data)
+        df = self.spark.createDataFrame(pdf, schema=schema)
+        # verifySchema defaults to `spark.sql.execution.pandas.convertToArrowArraySafely`,
+        # which is false by default
+        self.assertEqual(df.collect(), expected)
+        with self.assertRaises(Exception):
+            with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}):
                 df = self.spark.createDataFrame(pdf, schema=schema)
-                # verifySchema defaults to `spark.sql.execution.pandas.convertToArrowArraySafely`,
-                # which is false by default
-                self.assertEqual(df.collect(), expected)
-                with self.assertRaises(Exception):
-                    with self.sql_conf(
-                        {"spark.sql.execution.pandas.convertToArrowArraySafely": True}
-                    ):
-                        df = self.spark.createDataFrame(pdf, schema=schema)
-                with self.assertRaises(Exception):
-                    df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=True)
-        else:
-            # pandas DataFrame without Arrow optimization
-            with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}):
-                pdf = pd.DataFrame(data)
-                with self.assertRaises(Exception):
-                    self.spark.createDataFrame(pdf, schema=schema)  # verifySchema defaults to True
-                df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=False)
-                self.assertEqual(df.collect(), expected)
+        with self.assertRaises(Exception):
+            df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=True)
+
+        # pandas DataFrame without Arrow optimization
+        with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}):
+            pdf = pd.DataFrame(data)
+            with self.assertRaises(Exception):
+                df = self.spark.createDataFrame(pdf, schema=schema)  # verifySchema defaults to True
+            df = self.spark.createDataFrame(pdf, schema=schema, verifySchema=False)
+            self.assertEqual(df.collect(), expected)

     def _createDataFrame_toggle(self, data, schema=None):
         with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}):
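
Note: the test data is deliberately chosen so that the "value" column overflows a 32-bit integer. The StructType body is elided by the hunk context, but a sketch of the failure mode the test relies on looks like this (the field types below are an assumption, not the elided schema; spark is a classic, non-Connect session):

    import pandas as pd
    from pyspark.sql.types import IntegerType, LongType, StructField, StructType

    data = {"id": [1, 2, 3], "value": [100000000000, 200000000000, 300000000000]}
    schema = StructType(
        [
            StructField("id", LongType(), True),
            StructField("value", IntegerType(), True),  # too narrow for the data above
        ]
    )
    pdf = pd.DataFrame(data)
    df = spark.createDataFrame(pdf, schema=schema, verifySchema=False)  # overflow slips through
    # spark.createDataFrame(pdf, schema=schema, verifySchema=True)  # would raise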
