@@ -228,7 +228,6 @@ def convert(
         assign_cols_by_name: bool = False,
         int_to_decimal_coercion_enabled: bool = False,
         ignore_unexpected_complex_type_values: bool = False,
-        is_udtf: bool = False,
     ) -> "pa.RecordBatch":
         """
         Convert a pandas DataFrame or list of Series/DataFrames to an Arrow RecordBatch.
@@ -255,14 +254,6 @@ def convert(
             Whether to enable int to decimal coercion (default False)
         ignore_unexpected_complex_type_values : bool
             Whether to ignore unexpected complex type values in converter (default False)
-        is_udtf : bool
-            Whether this conversion is for a UDTF. UDTFs use broader Arrow exception
-            handling to allow more type coercions (e.g., struct field casting via
-            ArrowTypeError), and convert errors to UDTF_ARROW_TYPE_CAST_ERROR.
-            # TODO(SPARK-55502): Unify UDTF and regular UDF conversion paths to
-            # eliminate the is_udtf flag.
-            Regular UDFs only catch ArrowInvalid to preserve legacy behavior where
-            e.g. string→decimal must raise an error. (default False)
 
         Returns
         -------
@@ -271,7 +262,7 @@ def convert(
         import pyarrow as pa
         import pandas as pd
 
-        from pyspark.errors import PySparkTypeError, PySparkValueError, PySparkRuntimeError
+        from pyspark.errors import PySparkTypeError, PySparkValueError
         from pyspark.sql.pandas.types import to_arrow_type, _create_converter_from_pandas
 
         # Handle empty schema (0 columns)
@@ -318,7 +309,6 @@ def convert_column(
                     assign_cols_by_name=assign_cols_by_name,
                     int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
                     ignore_unexpected_complex_type_values=ignore_unexpected_complex_type_values,
-                    is_udtf=is_udtf,
                 )
                 # Wrap the nested RecordBatch as a single StructArray column
                 return ArrowBatchTransformer.wrap_struct(nested_batch).column(0)
@@ -343,60 +333,38 @@ def convert_column(
 
             mask = None if hasattr(series.array, "__arrow_array__") else series.isnull()
 
-            if is_udtf:
-                # UDTF path: broad ArrowException catch so that both ArrowInvalid
-                # AND ArrowTypeError (e.g. dict→struct) trigger the cast fallback.
+            # Unified conversion path: broad ArrowException catch so that both ArrowInvalid
+            # AND ArrowTypeError (e.g. dict→struct) trigger the cast fallback.
+            try:
                 try:
-                    try:
-                        return pa.Array.from_pandas(
-                            series, mask=mask, type=arrow_type, safe=safecheck
-                        )
-                    except pa.lib.ArrowException:  # broad: includes ArrowTypeError
-                        if arrow_cast:
-                            return pa.Array.from_pandas(series, mask=mask).cast(
-                                target_type=arrow_type, safe=safecheck
-                            )
-                        raise
-                except pa.lib.ArrowException:  # convert any Arrow error to user-friendly message
-                    raise PySparkRuntimeError(
-                        errorClass="UDTF_ARROW_TYPE_CAST_ERROR",
-                        messageParameters={
-                            "col_name": field_name,
-                            "col_type": str(series.dtype),
-                            "arrow_type": str(arrow_type),
-                        },
-                    ) from None
-            else:
-                # UDF path: only ArrowInvalid triggers the cast fallback.
-                # ArrowTypeError (e.g. string→decimal) must NOT be silently cast.
-                try:
-                    try:
-                        return pa.Array.from_pandas(
-                            series, mask=mask, type=arrow_type, safe=safecheck
-                        )
-                    except pa.lib.ArrowInvalid:  # narrow: skip ArrowTypeError
-                        if arrow_cast:
-                            return pa.Array.from_pandas(series, mask=mask).cast(
-                                target_type=arrow_type, safe=safecheck
-                            )
-                        raise
-                except TypeError as e:  # includes pa.lib.ArrowTypeError
-                    raise PySparkTypeError(
-                        f"Exception thrown when converting pandas.Series ({series.dtype}) "
-                        f"with name '{field_name}' to Arrow Array ({arrow_type})."
-                    ) from e
-                except ValueError as e:  # includes pa.lib.ArrowInvalid
-                    error_msg = (
-                        f"Exception thrown when converting pandas.Series ({series.dtype}) "
-                        f"with name '{field_name}' to Arrow Array ({arrow_type})."
+                    return pa.Array.from_pandas(
+                        series, mask=mask, type=arrow_type, safe=safecheck
                     )
-                    if safecheck:
-                        error_msg += (
-                            " It can be caused by overflows or other unsafe conversions "
-                            "warned by Arrow. Arrow safe type check can be disabled by using "
-                            "SQL config `spark.sql.execution.pandas.convertToArrowArraySafely`."
+                except pa.lib.ArrowException:  # broad: includes ArrowTypeError and ArrowInvalid
+                    if arrow_cast:
+                        return pa.Array.from_pandas(series, mask=mask).cast(
+                            target_type=arrow_type, safe=safecheck
                         )
-                    raise PySparkValueError(error_msg) from e
+                    raise
+            except (TypeError, pa.lib.ArrowTypeError) as e:
+                # ArrowTypeError is a subclass of TypeError
+                raise PySparkTypeError(
+                    f"Exception thrown when converting pandas.Series ({series.dtype}) "
+                    f"with name '{field_name}' to Arrow Array ({arrow_type})."
+                ) from e
+            except (ValueError, pa.lib.ArrowInvalid) as e:
+                # ArrowInvalid is a subclass of ValueError
+                error_msg = (
+                    f"Exception thrown when converting pandas.Series ({series.dtype}) "
+                    f"with name '{field_name}' to Arrow Array ({arrow_type})."
+                )
+                if safecheck:
+                    error_msg += (
+                        " It can be caused by overflows or other unsafe conversions "
+                        "warned by Arrow. Arrow safe type check can be disabled by using "
+                        "SQL config `spark.sql.execution.pandas.convertToArrowArraySafely`."
                    )
+                raise PySparkValueError(error_msg) from e
 
         arrays = [convert_column(col, field) for col, field in zip(columns, schema.fields)]
         return pa.RecordBatch.from_arrays(arrays, schema.names)
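
For reviewers, a minimal standalone sketch of the unified fallback pattern this change settles on. The `to_arrow` helper and its error messages are illustrative stand-ins for `convert_column` and the PySpark error wrappers, not the real API; only the `pyarrow`/`pandas` calls mirror the diff:

```python
import pandas as pd
import pyarrow as pa

# The subclass relationships the new comments rely on: catching TypeError /
# ValueError at the outer level also covers the corresponding Arrow errors.
assert issubclass(pa.lib.ArrowTypeError, TypeError)
assert issubclass(pa.lib.ArrowInvalid, ValueError)


def to_arrow(
    series: pd.Series,
    arrow_type: pa.DataType,
    arrow_cast: bool = True,
    safecheck: bool = True,
) -> pa.Array:
    """Illustrative stand-in for convert_column's unified path."""
    try:
        try:
            # Direct conversion to the target type first.
            return pa.Array.from_pandas(series, type=arrow_type, safe=safecheck)
        except pa.lib.ArrowException:  # broad: ArrowInvalid and ArrowTypeError alike
            if arrow_cast:
                # Fallback: let Arrow infer a type, then cast to the target.
                return pa.Array.from_pandas(series).cast(
                    target_type=arrow_type, safe=safecheck
                )
            raise
    except TypeError as e:  # includes pa.lib.ArrowTypeError
        raise TypeError(f"cannot convert {series.dtype} to {arrow_type}") from e
    except ValueError as e:  # includes pa.lib.ArrowInvalid
        raise ValueError(f"cannot convert {series.dtype} to {arrow_type}") from e


# Well-typed input converts directly:
print(to_arrow(pd.Series([{"a": 1}]), pa.struct([("a", pa.int64())])).type)

# safecheck corresponds to spark.sql.execution.pandas.convertToArrowArraySafely:
# with safe=True, an overflowing int64 -> int32 conversion raises ArrowInvalid
# (a ValueError) rather than silently truncating.
try:
    to_arrow(pd.Series([2**40]), pa.int32(), arrow_cast=False)
except ValueError as e:
    print("unsafe conversion rejected:", e)
```

The behavioral delta versus the old UDF branch is that `ArrowTypeError` now also triggers the cast fallback when `arrow_cast` is set, which is exactly what the removed `is_udtf` flag used to gate.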