@@ -70,18 +70,33 @@ class ColumnClassification:
7070
7171
@dataclasses.dataclass(frozen=True)
class FlattenArrayOfStructsResult:
    """Immutable bundle produced by flattening array-of-struct columns.

    This is the return type of ``_flatten_array_of_struct_columns``.

    Attributes:
        dataframe: The DataFrame after flattening.
        array_columns: The updated tuple of array column names.
        nested_originated_columns: The updated set of column names that were
            created from nested data.
    """

    # Field order defines the positional constructor signature; keep it stable.
    # NOTE: ``frozen=True`` only blocks attribute rebinding; the DataFrame
    # object itself can still be mutated in place.
    dataframe: pd.DataFrame
    array_columns: tuple[str, ...]
    nested_originated_columns: frozenset[str]
85+
86+
@dataclasses.dataclass(frozen=True)
class FlattenStructsResult:
    """Immutable bundle produced by flattening struct columns.

    This is the return type of ``_flatten_struct_columns``.

    Attributes:
        dataframe: The DataFrame after flattening.
        clear_on_continuation_cols: The updated tuple of column names to clear
            on continuation.
        nested_originated_columns: The updated set of column names that were
            created from nested data.
    """

    # Field order defines the positional constructor signature; keep it stable.
    # NOTE: ``frozen=True`` only blocks attribute rebinding; the DataFrame
    # object itself can still be mutated in place.
    dataframe: pd.DataFrame
    clear_on_continuation_cols: tuple[str, ...]
    nested_originated_columns: frozenset[str]
85100
86101
87102def flatten_nested_data (
@@ -109,27 +124,31 @@ def flatten_nested_data(
109124 classification = _classify_columns (result_df )
110125
111126 # Process ARRAY-of-STRUCT columns into multiple ARRAY columns (one per struct field).
112- result_df , array_cols , nested_cols = _flatten_array_of_struct_columns (
127+ flatten_array_structs_result = _flatten_array_of_struct_columns (
113128 result_df ,
114129 classification .array_of_struct_columns ,
115130 classification .array_columns ,
116131 classification .nested_originated_columns ,
117132 )
133+ result_df = flatten_array_structs_result .dataframe
118134 classification = dataclasses .replace (
119- classification , array_columns = array_cols , nested_originated_columns = nested_cols
135+ classification ,
136+ array_columns = flatten_array_structs_result .array_columns ,
137+ nested_originated_columns = flatten_array_structs_result .nested_originated_columns ,
120138 )
121139
122140 # Flatten top-level STRUCT columns into separate columns.
123- result_df , clear_cols , nested_cols = _flatten_struct_columns (
141+ flatten_structs_result = _flatten_struct_columns (
124142 result_df ,
125143 classification .struct_columns ,
126144 classification .clear_on_continuation_cols ,
127145 classification .nested_originated_columns ,
128146 )
147+ result_df = flatten_structs_result .dataframe
129148 classification = dataclasses .replace (
130149 classification ,
131- clear_on_continuation_cols = clear_cols ,
132- nested_originated_columns = nested_cols ,
150+ clear_on_continuation_cols = flatten_structs_result . clear_on_continuation_cols ,
151+ nested_originated_columns = flatten_structs_result . nested_originated_columns ,
133152 )
134153
135154 # Now handle ARRAY columns (including the newly created ones from ARRAY of STRUCT)
@@ -206,7 +225,7 @@ def _flatten_array_of_struct_columns(
206225 array_of_struct_columns : tuple [str , ...],
207226 array_columns : tuple [str , ...],
208227 nested_originated_columns : frozenset [str ],
209- ) -> tuple [ pd . DataFrame , tuple [ str , ...], frozenset [ str ]] :
228+ ) -> FlattenArrayOfStructsResult :
210229 """Flatten ARRAY of STRUCT columns into separate ARRAY columns for each field.
211230
212231 Args:
@@ -216,7 +235,7 @@ def _flatten_array_of_struct_columns(
216235 nested_originated_columns: Columns tracked as originating from nested data.
217236
218237 Returns:
219- A tuple containing the modified DataFrame, updated array columns, and updated nested columns.
238+ A FlattenArrayOfStructsResult containing the updated DataFrame and columns.
220239 """
221240 result_df = dataframe .copy ()
222241 current_array_columns = list (array_columns )
@@ -245,7 +264,11 @@ def _flatten_array_of_struct_columns(
245264 current_array_columns .remove (col_name )
246265 current_array_columns .extend (new_cols_df .columns .tolist ())
247266
248- return result_df , tuple (current_array_columns ), frozenset (current_nested_columns )
267+ return FlattenArrayOfStructsResult (
268+ dataframe = result_df ,
269+ array_columns = tuple (current_array_columns ),
270+ nested_originated_columns = frozenset (current_nested_columns ),
271+ )
249272
250273
251274def _transpose_list_of_structs (arrow_array : pa .ListArray ) -> dict [str , pa .ListArray ]:
@@ -299,6 +322,21 @@ def _replace_column_in_df(
299322 )
300323
301324
@dataclasses.dataclass(frozen=True)
class ExplodeResult:
    """Immutable bundle describing the outcome of exploding array columns.

    This is the return type of ``_explode_array_columns``.

    Attributes:
        dataframe: The exploded DataFrame.
        row_labels: Labels for the rows.
        continuation_rows: Indices of the rows that are continuation rows.
    """

    # Field order defines the positional constructor signature; keep it stable.
    # NOTE: ``frozen=True`` only blocks attribute rebinding; the list and set
    # stored here can still be mutated in place.
    dataframe: pd.DataFrame
    row_labels: list[str]
    continuation_rows: set[int]
338+
339+
302340def _explode_array_columns (
303341 dataframe : pd .DataFrame , array_columns : list [str ]
304342) -> ExplodeResult :
@@ -407,7 +445,7 @@ def _flatten_struct_columns(
407445 struct_columns : tuple [str , ...],
408446 clear_on_continuation_cols : tuple [str , ...],
409447 nested_originated_columns : frozenset [str ],
410- ) -> tuple [ pd . DataFrame , tuple [ str , ...], frozenset [ str ]] :
448+ ) -> FlattenStructsResult :
411449 """Flatten regular STRUCT columns into separate columns.
412450
413451 Args:
@@ -417,7 +455,7 @@ def _flatten_struct_columns(
417455 nested_originated_columns: Columns tracked as originating from nested data.
418456
419457 Returns:
420- A tuple containing the modified DataFrame, updated clear columns, and updated nested columns.
458+ A FlattenStructsResult containing the updated DataFrame and columns.
421459 """
422460 result_df = dataframe .copy ()
423461 current_clear_cols = list (clear_on_continuation_cols )
@@ -450,4 +488,8 @@ def _flatten_struct_columns(
450488 new_cols_df = pd .DataFrame (new_cols_to_add , index = result_df .index )
451489 result_df = _replace_column_in_df (result_df , col_name , new_cols_df )
452490
453- return result_df , tuple (current_clear_cols ), frozenset (current_nested_cols )
491+ return FlattenStructsResult (
492+ dataframe = result_df ,
493+ clear_on_continuation_cols = tuple (current_clear_cols ),
494+ nested_originated_columns = frozenset (current_nested_cols ),
495+ )
0 commit comments