@@ -248,31 +248,40 @@ def _flatten_struct_columns(
248248 clear_on_continuation_cols : list [str ],
249249 nested_originated_columns : set [str ],
250250) -> tuple [pd .DataFrame , list [str ]]:
251- """Flatten regular STRUCT columns using pandas accessor ."""
251+ """Flatten regular STRUCT columns."""
252252 result_df = dataframe .copy ()
253253 for col_name in struct_columns :
254- # Use pandas struct accessor to explode the struct column into a DataFrame of its fields
255- exploded_struct = result_df [col_name ].struct .explode ()
254+ col_data = result_df [col_name ]
255+ if isinstance (col_data .dtype , pd .ArrowDtype ):
256+ pa_type = cast (pd .ArrowDtype , col_data .dtype ).pyarrow_dtype
257+
258+ # Use PyArrow to flatten the struct column without row iteration
259+ # pa.array() converts the column to Arrow; flatten() then yields one array per struct field
260+ arrow_array = pa .array (col_data )
261+ flattened_fields = arrow_array .flatten ()
256262
257- # Rename columns to 'parent.child' format
258- exploded_struct .columns = [
259- f"{ col_name } .{ sub_col } " for sub_col in exploded_struct .columns
260- ]
263+ new_cols_to_add = {}
264+ for field_idx in range (pa_type .num_fields ):
265+ field = pa_type .field (field_idx )
266+ new_col_name = f"{ col_name } .{ field .name } "
267+ nested_originated_columns .add (new_col_name )
268+ clear_on_continuation_cols .append (new_col_name )
261269
262- # Update metadata
263- for new_col in exploded_struct .columns :
264- nested_originated_columns .add (new_col )
265- clear_on_continuation_cols .append (new_col )
270+ # Create a new Series from the flattened array
271+ new_cols_to_add [new_col_name ] = pd .Series (
272+ flattened_fields [field_idx ].to_pylist (),
273+ dtype = pd .ArrowDtype (field .type ),
274+ index = result_df .index ,
275+ )
266276
267- # Replace the original struct column with the new field columns
268277 col_idx = result_df .columns .to_list ().index (col_name )
278+ new_cols_df = pd .DataFrame (new_cols_to_add , index = result_df .index )
269279 result_df = pd .concat (
270280 [
271281 result_df .iloc [:, :col_idx ],
272- exploded_struct ,
282+ new_cols_df ,
273283 result_df .iloc [:, col_idx + 1 :],
274284 ],
275285 axis = 1 ,
276286 )
277-
278287 return result_df , clear_on_continuation_cols
0 commit comments